leonardlin committed
Commit 1e407f0 · 1 Parent(s): eb55039

Add ROCm build artifacts and HIP backend

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitignore +3 -1
  2. README.md +2 -2
  3. build.py +73 -0
  4. build.sh +9 -0
  5. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/__init__.py +202 -0
  6. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/__init__.py +10 -0
  7. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/activation_fn.py +33 -0
  8. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/all_to_all.py +54 -0
  9. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/arguments.py +101 -0
  10. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/common.py +26 -0
  11. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/dmlp_registry.py +42 -0
  12. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/dmoe.py +337 -0
  13. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/gelu.py +52 -0
  14. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/glu.py +244 -0
  15. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/memory_test.py +103 -0
  16. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/memory_test.sh +12 -0
  17. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/mlp.py +587 -0
  18. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/moe.py +507 -0
  19. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/mpu.py +94 -0
  20. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/router.py +116 -0
  21. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/sharedexpert_registry.py +32 -0
  22. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_megablocks_rocm.so +3 -0
  23. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_ops.py +18 -0
  24. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_version.py +6 -0
  25. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/backend/__init__.py +2 -0
  26. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/backend/kernels.py +557 -0
  27. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/bak.__init__.py +23 -0
  28. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/benchmark_util.py +35 -0
  29. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/__init__.py +2 -0
  30. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/backend.py +33 -0
  31. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/ops.py +33 -0
  32. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm_util.py +31 -0
  33. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/layers.py +1225 -0
  34. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/__init__.py +35 -0
  35. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +63 -0
  36. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/all_to_all_benchmark.sh +12 -0
  37. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/binned_gather.py +37 -0
  38. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/binned_scatter.py +59 -0
  39. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/cumsum.py +52 -0
  40. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/gather.py +38 -0
  41. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/histogram.py +27 -0
  42. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/histogram_benchmark.py +78 -0
  43. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/matmul_benchmark.py +415 -0
  44. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_gather.py +55 -0
  45. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_scatter.py +98 -0
  46. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py +66 -0
  47. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/permute_benchmark.py +149 -0
  48. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/repeat.py +10 -0
  49. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/replicate.py +36 -0
  50. build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/round_up.py +14 -0
.gitignore CHANGED
@@ -2,4 +2,6 @@
  __pycache__
  .bak
  megablocks-moe/.bak
- .pytest_cache
+ .pytest_cache
+ .readme_example.py.swp
+ .torch_extensions/
README.md CHANGED
@@ -7,7 +7,7 @@ tags:
  ## Quickstart

  ```bash
- uv run https://huggingface.co/kernels-community/megablocks/raw/main/readme_example.py
+ uv run https://huggingface.co/shisa-ai/megablocks-hip/raw/main/readme_example.py
  ```

  ```python
@@ -30,7 +30,7 @@ torch.manual_seed(42)
  torch.cuda.manual_seed(42)

  # Download optimized kernels from the Hugging Face hub
- megablocks = get_kernel("kernels-community/megablocks")
+ megablocks = get_kernel("shisa-ai/megablocks-hip")
  print("MegaBlocks kernel downloaded successfully.")

  model = megablocks.layers.MegaBlocksMoeMLP()
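Beyond the MoE layer shown in the Quickstart, this package also re-exports the raw kernel wrappers defined in `megablocks/__init__.py` below. A minimal sketch of calling them through the hub-loaded module, assuming the `kernels` package and a ROCm-capable GPU are available:

```python
# Minimal sketch, assuming `kernels` can fetch shisa-ai/megablocks-hip and a HIP GPU is present.
import torch
from kernels import get_kernel

megablocks = get_kernel("shisa-ai/megablocks-hip")

# Exercise the direct kernel exports defined in megablocks/__init__.py below.
top_experts = torch.randint(0, 8, (128,), dtype=torch.int32, device="cuda")
counts = megablocks.histogram(top_experts, 8)       # tokens routed to each of 8 experts
bins = megablocks.cumsum(counts, dim=0)             # inclusive cumsum -> bin boundaries
sorted_ids, order = megablocks.argsort(top_experts) # radix sort with index tracking
print(counts, bins, sorted_ids[:8], order[:8])
```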
build.py ADDED
@@ -0,0 +1,73 @@
+
+ import os
+ import pathlib
+ import shutil
+
+ from torch.utils.cpp_extension import load
+
+ try:
+     from kernels.utils import build_variant
+ except ImportError:  # fallback when kernels is unavailable
+     build_variant = None
+
+ repo = pathlib.Path(__file__).resolve().parent
+ os.environ.setdefault("TORCH_EXTENSIONS_DIR", str(repo / ".torch_extensions"))
+
+ sources = [
+     repo / "torch-ext" / "torch_binding.cpp",
+     repo / "csrc" / "new_cumsum.cu",
+     repo / "csrc" / "new_histogram.cu",
+     repo / "csrc" / "new_indices.cu",
+     repo / "csrc" / "new_replicate.cu",
+     repo / "csrc" / "new_sort.cu",
+     repo / "csrc" / "grouped_gemm" / "grouped_gemm.cu",
+ ]
+
+ mod = load(
+     name="_megablocks_rocm",
+     sources=[str(s) for s in sources],
+     extra_include_paths=[str(repo / "csrc")],
+     extra_cflags=["-O3", "-std=c++17"],
+     extra_cuda_cflags=["-O3"],  # torch switches this to hipcc flags on ROCm builds
+     extra_ldflags=["-lhipblaslt"],
+     verbose=True,
+     is_python_module=False,
+ )
+
+ module_path = pathlib.Path(mod if isinstance(mod, str) else mod.__file__)
+ print("built:", module_path)
+
+ if build_variant is None:
+     print("kernels not available; skipping package staging")
+ else:
+     variant = build_variant()
+     package_root = repo / "build" / variant / "megablocks"
+     if package_root.exists():
+         shutil.rmtree(package_root)
+     shutil.copytree(
+         repo / "torch-ext" / "megablocks",
+         package_root,
+         ignore=shutil.ignore_patterns("__pycache__"),
+     )
+     ops_py = package_root / "_ops.py"
+     ops_py.write_text('''
+ import torch
+ from pathlib import Path
+
+ _LIB_NAME = "_megablocks_rocm.so"
+
+
+ def _load_ops():
+     lib_path = Path(__file__).with_name(_LIB_NAME)
+     torch.ops.load_library(str(lib_path))
+     return torch.ops._megablocks_rocm
+
+
+ ops = _load_ops()
+
+
+ def add_op_namespace_prefix(op_name: str) -> str:
+     return f"_megablocks_rocm::{op_name}"
+ ''')
+     shutil.copy2(module_path, package_root / module_path.name)
+     print(f"staged local kernel under build/{variant}/megablocks")
build.sh ADDED
@@ -0,0 +1,9 @@
+ # export TORCH_EXTENSIONS_DIR=/root/shisa-v2/train/v2.1/megablocks.kernels-community/.torch_extensions; export ROCM_HOME=/opt/rocm-6.4.1; export HIP_HOME=$ROCM_HOME; export TORCH_HIP_ARCH_LIST=gfx942; export HSA_OVERRIDE_GFX_VERSION=gfx942; python megablocks.kernels-community/build.py
+
+ # 3-4 min build
+ export ROCM_HOME=/opt/rocm-6.4.1
+ export HIP_HOME=$ROCM_HOME
+ export TORCH_HIP_ARCH_LIST=gfx942
+ export HSA_OVERRIDE_GFX_VERSION=gfx942
+ export TORCH_EXTENSIONS_DIR="$PWD/megablocks.kernels-community/.torch_extensions"
+ python megablocks.kernels-community/build.py
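Before or after running the script, it can be worth confirming that the installed PyTorch is actually a ROCm build; a small sanity check (a sketch, not part of the build):

```python
import torch

print(torch.version.hip)              # a version string on ROCm builds, None on CUDA builds
print(torch.cuda.is_available())      # True when an AMD GPU is visible through HIP
print(torch.cuda.get_device_name(0))  # should report a gfx942-class device for this script
```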
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/__init__.py ADDED
@@ -0,0 +1,202 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ import torch
+
+ from ._ops import ops
+
+ from .grouped_gemm import backend as gg_backend
+ from .grouped_gemm import ops as gg_ops
+
+
+ from ._layers.arguments import Arguments
+ from ._layers.dmoe import ParallelDroplessMLP, dMoE
+ from ._layers.glu import SparseGLU
+ from ._layers.mlp import MLP, SparseMLP
+ from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+ from . import layers
+
+ # This section contains the direct kernel exports (not included in the original code)
+ def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+     """
+     Compute exclusive cumulative sum along the specified dimension.
+
+     Args:
+         x: Input tensor
+         dim: Dimension along which to compute cumsum
+         out: Output tensor (modified in-place)
+
+     Returns:
+         The output tensor
+     """
+     result = ops.exclusive_cumsum(x, dim)
+     out.copy_(result)
+     return out
+
+
+ def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+     """
+     Compute inclusive cumulative sum along the specified dimension.
+
+     Args:
+         x: Input tensor
+         dim: Dimension along which to compute cumsum
+         out: Output tensor (modified in-place)
+
+     Returns:
+         The output tensor
+     """
+     result = ops.inclusive_cumsum(x, dim)
+     out.copy_(result)
+     return out
+
+
+ def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+     """
+     Compute histogram of input tensor values.
+
+     Args:
+         x: Input tensor
+         num_bins: Number of histogram bins
+
+     Returns:
+         Histogram tensor with counts for each bin
+     """
+     return ops.histogram(x, num_bins)
+
+
+ def indices(
+     padded_bins: torch.Tensor,
+     block_size: int,
+     output_block_rows: int,
+     output_block_columns: int,
+ ) -> torch.Tensor:
+     """
+     Construct indices from padded bins for sparse operations.
+
+     Args:
+         padded_bins: Tensor containing bin boundaries
+         block_size: Size of each block
+         output_block_rows: Number of rows in output blocks
+         output_block_columns: Number of columns in output blocks
+
+     Returns:
+         Tensor containing constructed indices
+     """
+     return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+ def replicate_forward(
+     x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+ ) -> torch.Tensor:
+     """
+     Forward pass of replicate operation - replicate values according to bin sizes.
+
+     Args:
+         x: Input tensor with values to replicate
+         bins: Tensor containing bin sizes
+         out: Output tensor (modified in-place)
+
+     Returns:
+         The output tensor
+     """
+     return ops.replicate_forward(x, bins, out)
+
+
+ def replicate_backward(
+     grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+ ) -> torch.Tensor:
+     """
+     Backward pass of replicate operation - reduce gradients back to bins.
+
+     Args:
+         grad: Gradient tensor to reduce
+         bins: Tensor containing bin sizes
+         out: Output tensor (modified in-place)
+
+     Returns:
+         The output tensor
+     """
+     return ops.replicate_backward(grad, bins, out)
+
+
+ def sort(
+     x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+ ) -> torch.Tensor:
+     """
+     Radix sort with index tracking.
+
+     Args:
+         x: Input tensor to sort
+         end_bit: Number of bits to consider in sorting
+         x_out: Output tensor for sorted values
+         iota_out: Output tensor for sorted indices
+
+     Returns:
+         The sorted values tensor
+     """
+     return ops.sort(x, end_bit, x_out, iota_out)
+
+
+ # Convenience functions for common use cases
+ def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+     """
+     Compute cumulative sum with automatic output allocation.
+
+     Args:
+         x: Input tensor
+         dim: Dimension along which to compute cumsum (default: last dimension)
+         exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+     Returns:
+         New tensor containing the cumulative sum
+     """
+     out = torch.empty_like(x)
+     if exclusive:
+         return exclusive_cumsum(x, dim, out)
+     else:
+         return inclusive_cumsum(x, dim, out)
+
+
+ def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Sort tensor and return both sorted values and indices.
+
+     Args:
+         x: Input tensor to sort
+         end_bit: Number of bits to consider in sorting
+
+     Returns:
+         Tuple of (sorted_values, sorted_indices)
+     """
+     x_out = torch.empty_like(x)
+     iota_out = torch.empty_like(x)
+     sort(x, end_bit, x_out, iota_out)
+     return x_out, iota_out
+
+
+ # Export public API
+ __all__ = [
+     "MyReplacementLayer",
+     # Direct kernel exports
+     "exclusive_cumsum",
+     "inclusive_cumsum",
+     "histogram",
+     "indices",
+     "replicate_forward",
+     "replicate_backward",
+     "sort",
+     "cumsum",
+     "argsort",
+     # Original exports
+     "Arguments",
+     "ParallelDroplessMLP",
+     "dMoE",
+     "SparseGLU",
+     "MLP",
+     "SparseMLP",
+     "MoE",
+     "ParallelMLP",
+     "get_load_balancing_loss",
+ ]
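The wrappers above follow two conventions: the low-level exports write into a caller-provided `out` tensor, while the convenience helpers allocate their own output. A short sketch of both styles, assuming the staged package is importable and a GPU is present:

```python
import torch
import megablocks  # the staged build/<variant>/megablocks package

x = torch.randint(0, 4, (16,), dtype=torch.int32, device="cuda")

# Low-level style: caller supplies the output buffer.
out = torch.empty_like(x)
megablocks.inclusive_cumsum(x, 0, out)

# Convenience style: allocation handled internally.
same = megablocks.cumsum(x, dim=0)
assert torch.equal(out, same)
```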
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/__init__.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ # from megablocks.layers.dmoe import dMoE
+ from .moe import MoE
+
+ __all__ = [
+     'MoE',
+     # 'dMoE',
+ ]
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/activation_fn.py ADDED
@@ -0,0 +1,33 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Any, Callable, Union
+
+ import torch
+ from ..stk import Matrix
+
+
+ def act_fn(
+     x: Matrix,
+     function: Callable,
+     return_grad_fn: bool = False,
+     **kwargs,
+ ) -> Union[tuple[Matrix, Any] | Matrix]:
+     assert isinstance(x, Matrix)
+     with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+         if return_grad_fn:
+             x.data.requires_grad = True
+         out = function(x.data, **kwargs)
+         y = Matrix(
+             x.size(),
+             out,
+             x.row_indices,
+             x.column_indices,
+             x.offsets,
+             x.column_indices_t,
+             x.offsets_t,
+             x.block_offsets_t,
+         )
+         if return_grad_fn:
+             return y, out.backward
+         return y
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/all_to_all.py ADDED
@@ -0,0 +1,54 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ import torch
+ import torch.distributed as dist
+
+
+ class AllToAllOp(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+         out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+         ctx.input_shape = x.shape
+         ctx.output_split_sizes = output_split_sizes
+         ctx.input_split_sizes = input_split_sizes
+         ctx.group = group
+         handle = dist.all_to_all_single(
+             out,
+             x,
+             output_split_sizes=output_split_sizes,
+             input_split_sizes=input_split_sizes,
+             group=group,
+             async_op=async_op,
+         )
+         return out, handle
+
+     @staticmethod
+     def backward(ctx, grad, _):
+         if ctx.needs_input_grad[0]:
+             out = torch.empty(
+                 ctx.input_shape,
+                 device=grad.device,
+                 dtype=grad.dtype,
+             )
+             dist.all_to_all_single(
+                 out,
+                 grad,
+                 output_split_sizes=ctx.input_split_sizes,
+                 input_split_sizes=ctx.output_split_sizes,
+                 group=ctx.group,
+             )
+             return out, None, None, None, None
+         return None, None, None, None, None
+
+
+ def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+     return AllToAllOp.apply(
+         x,
+         output_split_sizes,
+         input_split_sizes,
+         group,
+         async_op,
+     )
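`all_to_all` wraps `dist.all_to_all_single` in an autograd Function so the token permutation is differentiable; the backward pass simply swaps the split sizes. A usage sketch, assuming a process group launched with torchrun (sizes are illustrative):

```python
import torch
import torch.distributed as dist
from megablocks._layers.all_to_all import all_to_all

dist.init_process_group(backend="nccl")  # maps to RCCL on ROCm
rank, world = dist.get_rank(), dist.get_world_size()
torch.cuda.set_device(rank)

x = torch.randn(4 * world, 8, device="cuda", requires_grad=True)
splits = [4] * world  # send 4 rows to every rank, receive 4 rows from every rank

out, handle = all_to_all(x, splits, splits, dist.group.WORLD)
out.sum().backward()  # gradients flow back through the reverse all-to-all
```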
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/arguments.py ADDED
@@ -0,0 +1,101 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ import dataclasses
+ from functools import partial
+ from typing import Any, Callable, Optional, Union
+
+ import torch
+ import torch.distributed as dist
+ import torch.nn.functional as F
+
+ # import megablocks.grouped_gemm_util as grouped_gemm
+ from .. import grouped_gemm_util as grouped_gemm
+
+ # Type annotation for in-place Tensor initialization function.
+ InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+ _ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+ DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+ @dataclasses.dataclass
+ class Arguments:
+     # Model arguments.
+     hidden_size: int = 1024
+     ffn_hidden_size: int = 4096
+     num_layers: int = 1
+     bias: bool = True
+     return_bias: bool = True
+     activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+     # MoE arguments.
+     moe_num_experts: int = 1
+     moe_top_k: int = 1
+     moe_capacity_factor: int = 1
+     moe_normalize_expert_weights: Optional[Union[int, float]] = None
+     moe_loss_weight: float = 0.1
+     moe_jitter_eps: Optional[float] = None
+     moe_lbl_in_fp32: bool = False
+
+     # Parallelism arguments.
+     moe_expert_model_parallelism: bool = False
+     expert_parallel_group: Optional[dist.ProcessGroup] = None
+     pipeline_model_parallel_size: int = 1
+     num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+     # Compute arguments.
+     memory_optimized_mlp: bool = False
+     mlp_type: str = 'mlp'
+     mlp_impl: str = 'sparse'
+
+     # Initialization arguments.
+     fp16: bool = True
+     bf16: bool = False
+     device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+     init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+     output_layer_init_method: InitFn = init_method
+
+     # Benchmarking arguments.
+     uniform_expert_assignment: bool = False
+
+     # Shared expert arguments.
+     shared_expert: bool = False  # enable using shared expert
+     fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+     fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,)  # kwargs for custom fc layers
+     remat_act_fn: bool = True  # enable act fn to be rematerialized instead of stored
+     shared_expert_hidden_size: Optional[
+         int] = None  # hidden size of the shared expert IF we want to set it to something different from hidden_size
+     shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+     # Router Z-loss arguments.
+     moe_zloss_weight: float = 0  # 1e-3 is a reasonable value
+     moe_zloss_in_fp32: bool = False
+
+     def __post_init__(self):
+         # Sparse MLP is not supported with triton >=3.2.0
+         # TODO: Remove this once sparse is supported with triton >=3.2.0
+         if self.__getattribute__('mlp_impl') == 'sparse':
+             try:
+                 import triton
+                 if triton.__version__ >= '3.2.0':
+                     raise ValueError(
+                         'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+                     )
+             except ImportError:
+                 raise ImportError('Triton is required for sparse MLP implementation')
+
+         if self.__getattribute__('mlp_impl') == 'grouped':
+             grouped_gemm.assert_grouped_gemm_is_available()
+
+         if self.shared_expert_hidden_size is None:
+             self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+ def from_megatron(megatron_args: Any):
+     args = Arguments()
+     for field in dataclasses.fields(args):
+         if hasattr(megatron_args, field.name):
+             setattr(args, field.name, getattr(megatron_args, field.name))
+     return args
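These are the same fields that `memory_test.py` below passes when building a dMoE; a minimal single-GPU configuration might look like the following (values are illustrative, not a recommended setup):

```python
import torch
from megablocks import Arguments, dMoE

args = Arguments(
    hidden_size=1024,
    ffn_hidden_size=4096,
    moe_num_experts=8,
    moe_top_k=2,
    mlp_impl="grouped",  # the sparse path requires triton < 3.2.0 per __post_init__
    fp16=False,
    bf16=True,
    device=torch.cuda.current_device(),
)
layer = dMoE(args).cuda()
x = torch.randn(1, 16, 1024, device="cuda", dtype=torch.bfloat16)
out, _ = layer(x)
```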
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/common.py ADDED
@@ -0,0 +1,26 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ import torch
+
+ from .arguments import Arguments
+
+
+ def dtype(args: Arguments):
+     if args.fp16:
+         return torch.float16
+     elif args.bf16:
+         return torch.bfloat16
+     return None
+
+
+ def cast_if_autocast_enabled(tensor):
+     if torch.is_autocast_enabled():
+         if tensor.device.type == 'cuda':
+             dtype = torch.get_autocast_gpu_dtype()
+         elif tensor.device.type == 'cpu':
+             dtype = torch.get_autocast_cpu_dtype()
+         else:
+             raise NotImplementedError()
+         return tensor.to(dtype=dtype)
+     return tensor
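`cast_if_autocast_enabled` mirrors whichever dtype the surrounding autocast context selected; a small sketch of that behaviour (assumes a CUDA/HIP device):

```python
import torch
from megablocks._layers.common import cast_if_autocast_enabled

t = torch.randn(4, 4, device="cuda", dtype=torch.float32)
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    print(cast_if_autocast_enabled(t).dtype)  # torch.bfloat16 inside autocast
print(cast_if_autocast_enabled(t).dtype)      # torch.float32 outside autocast
```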
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/dmlp_registry.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Union
5
+
6
+ from . import glu, mlp
7
+ from .arguments import Arguments
8
+
9
+ MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
10
+
11
+ _REGISTRY = {
12
+ 'mlp': {
13
+ 'grouped': mlp.GroupedMLP,
14
+ 'sparse': mlp.SparseMLP,
15
+ },
16
+ 'glu': {
17
+ 'grouped': glu.GroupedGLU,
18
+ 'sparse': glu.SparseGLU,
19
+ },
20
+ }
21
+
22
+
23
+ def get(args: Arguments) -> MlpType:
24
+ """Returns an MLP for use in a dMoE instance.
25
+
26
+ Uses the provided arguments to instantiate the appropriate
27
+ MLP instance. This only contains MLPs for use in dMoEs
28
+ (ie. only for the dropless versions of MoEs).
29
+
30
+ Args:
31
+ args: propagated Arguments dataclass.
32
+
33
+ Returns:
34
+ An instantiated MLP constructed using the input args.
35
+ """
36
+ if args.mlp_type not in _REGISTRY:
37
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
38
+
39
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
40
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
41
+
42
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
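The registry keys line up with `Arguments.mlp_type` and `Arguments.mlp_impl`, so expert-MLP selection is a plain dictionary lookup. For example (illustrative, and it requires a GPU because the expert weights are allocated on `args.device`):

```python
import torch
from megablocks._layers import dmlp_registry
from megablocks._layers.arguments import Arguments

args = Arguments(
    mlp_type="glu",
    mlp_impl="grouped",
    moe_num_experts=8,
    fp16=False,
    bf16=True,
    device=torch.cuda.current_device(),
)
expert_mlp = dmlp_registry.get(args)  # an instance of glu.GroupedGLU
print(type(expert_mlp).__name__)
```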
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/dmoe.py ADDED
@@ -0,0 +1,337 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+ # try:
8
+ # import stk.ops
9
+ # except ImportError:
10
+ # import warnings
11
+ # warnings.warn(
12
+ # 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
13
+ # )
14
+
15
+ # import megablocks.ops as ops
16
+ # # from megablocks.ops import ops
17
+ # from megablocks.layers import common, dmlp_registry, moe, mpu
18
+ # from megablocks.layers.arguments import Arguments
19
+
20
+ from .. import stk
21
+ from .. import ops
22
+ from . import common, dmlp_registry, moe, mpu
23
+ from .arguments import Arguments
24
+
25
+ def promote_scalar(x):
26
+ return x.view(1) if not len(x.size()) else x
27
+
28
+
29
+ class ParallelDroplessMLP(moe.ParallelMLP):
30
+
31
+ def __init__(self, args: Arguments):
32
+ super(ParallelDroplessMLP, self).__init__(args)
33
+ self.hidden_size = args.hidden_size
34
+ self.ffn_hidden_size = mpu.features_per_rank(args)
35
+ self.blocking = 128
36
+ self.mlp = dmlp_registry.get(args)
37
+
38
+ # Calculate the number of bits needed to represent the column indices
39
+ # in the intermediate sparse matrix.
40
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
41
+ self.transpose_sort_end_bit = max(
42
+ int(np.ceil(np.log2(max_column_index))),
43
+ 1,
44
+ )
45
+
46
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
47
+ block_columns = size[1] // self.blocking
48
+
49
+ # Sort row indices by column indices to get the transposed matrix's
50
+ # column indices.
51
+ #
52
+ # NOTE: Our sort operation uses the same width indices as the input values.
53
+ # To avoid overflow when we have large activation matrices we cast to
54
+ # 32-bit before sorting.
55
+ _, gather_indices = ops.sort(
56
+ column_indices.int(),
57
+ self.transpose_sort_end_bit,
58
+ )
59
+
60
+ # There are a constant number of blocks in every row of the sparse matrix.
61
+ # A blocks offset is:
62
+ #
63
+ # row_index * blocks_per_row + column_index % blocks_per_row
64
+ #
65
+ # Once we have the block offsets ordered for transposition we can divide
66
+ # by blocks_per_row to get the transposed column indices.
67
+ column_indices_t = row_indices.gather(0, gather_indices.long())
68
+ block_offsets_t = gather_indices.int()
69
+
70
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
71
+ nnz_per_column = ops.histogram(column_indices, block_columns)
72
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
73
+ if nnz_per_column.dim() == 0:
74
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
75
+ nnz_per_column = nnz_per_column.unsqueeze(0)
76
+ offsets_t = torch.cat([zero, nnz_per_column])
77
+ return column_indices_t, offsets_t, block_offsets_t
78
+
79
+ def topology(self, x, padded_bins):
80
+ padded_tokens, _ = x.size()
81
+ assert padded_tokens % self.blocking == 0
82
+ if self.ffn_hidden_size % self.blocking != 0:
83
+ raise ValueError(
84
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
85
+ f'the block size {self.blocking}. Please update your configuration.',
86
+ )
87
+
88
+ # Offsets for the sparse matrix. All rows have the
89
+ # same number of nonzero blocks dictated by the
90
+ # dimensionality of a single expert.
91
+ block_rows = padded_tokens // self.blocking
92
+ blocks_per_row = self.ffn_hidden_size // self.blocking
93
+ offsets = torch.arange(
94
+ 0,
95
+ block_rows * blocks_per_row + 1,
96
+ blocks_per_row,
97
+ dtype=torch.int32,
98
+ device=x.device,
99
+ )
100
+
101
+ # Indices for the sparse matrix. The indices for
102
+ # the intermediate matrix are dynamic depending
103
+ # on the mapping of tokens to experts.
104
+ column_indices = ops.topology(
105
+ padded_bins,
106
+ self.blocking,
107
+ block_rows,
108
+ blocks_per_row,
109
+ )
110
+
111
+ # TODO(tgale): This is unused. Remove the need for this in stk.
112
+ # For now, use meta init to save the device memory.
113
+ data = torch.empty(
114
+ column_indices.numel(),
115
+ self.blocking,
116
+ self.blocking,
117
+ dtype=common.dtype(self.args),
118
+ device='meta',
119
+ )
120
+ shape = (
121
+ padded_tokens,
122
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
123
+ )
124
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
125
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
126
+ shape,
127
+ row_indices,
128
+ column_indices,
129
+ offsets,
130
+ )
131
+ return stk.Matrix(
132
+ shape,
133
+ data,
134
+ row_indices,
135
+ column_indices,
136
+ offsets,
137
+ column_indices_t,
138
+ offsets_t,
139
+ block_offsets_t,
140
+ )
141
+
142
+ def indices_and_padded_bins(self, top_experts):
143
+ # Sort the expert ids to produce the scatter/gather
144
+ # indices for the permutation.
145
+ top_experts = top_experts.int()
146
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
147
+
148
+ # Histogram the expert ids to identify the number of
149
+ # tokens routed to each expert.
150
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
151
+
152
+ # Round the token counts up to the block size used in
153
+ # the matrix muliplications. Caculate the starting
154
+ # position of each bin.
155
+ padded_tokens_per_expert = ops.round_up(
156
+ tokens_per_expert,
157
+ self.blocking,
158
+ )
159
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
160
+ padded_bins = promote_scalar(padded_bins)
161
+
162
+ # Calculate the bin bounds for the sorted tokens.
163
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
164
+ bins = promote_scalar(bins)
165
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
166
+
167
+ def sparse_forward_once(self, x, expert_weights, top_experts):
168
+ # x: [sl, bs, hs]
169
+ # expert_weights: [sl * bs, top-k]
170
+ # top_experts: [sl * bs, top-k]
171
+ expert_weights = expert_weights.flatten()
172
+ top_experts = top_experts.flatten()
173
+ with torch.no_grad():
174
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
175
+
176
+ # Route the tokens for MoE computation.
177
+ x = x.view(-1, x.shape[-1])
178
+ x = ops.padded_gather(
179
+ x,
180
+ indices,
181
+ bin_ids,
182
+ bins,
183
+ padded_bins,
184
+ self.top_k,
185
+ )
186
+
187
+ # Create the sparse matrix topology.
188
+ with torch.no_grad():
189
+ topo = self.topology(x, padded_bins)
190
+
191
+ # Perform the expert computation.
192
+ x = self.mlp(x, topo)
193
+
194
+ # Un-route the data for the MoE output.
195
+ x = ops.padded_scatter(
196
+ x,
197
+ indices,
198
+ bin_ids,
199
+ expert_weights,
200
+ bins,
201
+ padded_bins,
202
+ self.top_k,
203
+ )
204
+ return x, tokens_per_expert
205
+
206
+ # For use in the base-class parallel_forward_once.
207
+ def sparse_permute_and_compute(
208
+ self,
209
+ x,
210
+ tokens_per_expert,
211
+ indices,
212
+ bin_ids,
213
+ expert_weights,
214
+ bins,
215
+ expert_capactiy, # unused
216
+ top_k,
217
+ ):
218
+
219
+ # Round the token counts up to the block size used in the matrix
220
+ # multiplication. Calculate the starting position of each bin.
221
+ padded_tokens_per_expert = ops.round_up(
222
+ tokens_per_expert,
223
+ self.blocking,
224
+ )
225
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
226
+ padded_bins = promote_scalar(padded_bins)
227
+
228
+ # Route the tokens for MoE computation.
229
+ x = x.view(-1, x.shape[-1])
230
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
231
+
232
+ # Create the sparse matrix topology.
233
+ with torch.no_grad():
234
+ topo = self.topology(x, padded_bins)
235
+
236
+ # Perform the expert computation.
237
+ x = self.mlp(x, topo)
238
+
239
+ # Un-route the data for the MoE output.
240
+ return ops.padded_scatter(
241
+ x,
242
+ indices,
243
+ bin_ids,
244
+ expert_weights,
245
+ bins,
246
+ padded_bins,
247
+ top_k,
248
+ )
249
+
250
+ def grouped_forward_once(self, x, expert_weights, top_experts):
251
+ # x: [sl, bs, hs]
252
+ # expert_weights: [sl * bs, top-k]
253
+ # top_experts: [sl * bs, top-k]
254
+ expert_weights = expert_weights.flatten()
255
+ top_experts = top_experts.flatten()
256
+ with torch.no_grad():
257
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
258
+
259
+ out = self.grouped_permute_and_compute(
260
+ x,
261
+ tokens_per_expert,
262
+ indices,
263
+ bin_ids,
264
+ expert_weights,
265
+ bins,
266
+ -1, # unused
267
+ self.args.moe_top_k,
268
+ )
269
+ return out, tokens_per_expert
270
+
271
+ def grouped_permute_and_compute(
272
+ self,
273
+ x,
274
+ tokens_per_expert,
275
+ indices,
276
+ bin_ids,
277
+ expert_weights,
278
+ bins,
279
+ expert_capactiy, # unused
280
+ top_k,
281
+ ):
282
+
283
+ # Route the tokens for MoE computation.
284
+ x = x.view(-1, x.shape[-1])
285
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
286
+
287
+ # Perform the expert computation.
288
+ x = self.mlp(x, tokens_per_expert)
289
+
290
+ # Un-route the data for the MoE output.
291
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
292
+
293
+ def forward_once(self, x, expert_weights, top_experts):
294
+ if self.args.mlp_impl == 'sparse':
295
+ return self.sparse_forward_once(x, expert_weights, top_experts)
296
+ else:
297
+ return self.grouped_forward_once(x, expert_weights, top_experts)
298
+
299
+ def permute_and_compute(
300
+ self,
301
+ x,
302
+ tokens_per_expert,
303
+ indices,
304
+ bin_ids,
305
+ expert_weights,
306
+ bins,
307
+ expert_capactiy,
308
+ top_k,
309
+ ):
310
+ if self.args.mlp_impl == 'sparse':
311
+ return self.sparse_permute_and_compute(
312
+ x,
313
+ tokens_per_expert,
314
+ indices,
315
+ bin_ids,
316
+ expert_weights,
317
+ bins,
318
+ expert_capactiy,
319
+ top_k,
320
+ )
321
+ else:
322
+ return self.grouped_permute_and_compute(
323
+ x,
324
+ tokens_per_expert,
325
+ indices,
326
+ bin_ids,
327
+ expert_weights,
328
+ bins,
329
+ expert_capactiy,
330
+ top_k,
331
+ )
332
+
333
+
334
+ class dMoE(moe.MoE):
335
+
336
+ def _init_experts_mlp(self, args: Arguments):
337
+ return ParallelDroplessMLP(args)
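`indices_and_padded_bins` above rounds each expert's token count up to the 128-wide block size before computing bin boundaries. A plain-PyTorch sketch of the same bookkeeping, for illustration only (the real code uses the fused `ops.histogram`, `ops.round_up`, and `ops.inclusive_cumsum` kernels):

```python
import torch

blocking = 128
num_experts = 4
top_experts = torch.randint(0, num_experts, (1000,), device="cuda")

tokens_per_expert = torch.bincount(top_experts, minlength=num_experts)  # ops.histogram
padded = ((tokens_per_expert + blocking - 1) // blocking) * blocking    # ops.round_up
padded_bins = torch.cumsum(padded, 0)                                   # ops.inclusive_cumsum
bins = torch.cumsum(tokens_per_expert, 0)
print(tokens_per_expert.tolist(), padded.tolist(), padded_bins.tolist(), bins.tolist())
```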
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/gelu.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright 2024 Databricks
+ # SPDX-License-Identifier: Apache-2.0
+
+ # try:
+ #     import stk
+ # except ImportError:
+ #     import warnings
+ #     warnings.warn(
+ #         'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+ #     )
+
+ from .. import stk
+
+ import torch
+ import torch.nn.functional as F
+
+
+ @torch.jit.script
+ def _gelu_backward_inplace(g, x):
+     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+     ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+     return g.mul_(ff)
+
+
+ def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+     # NOTE: The two sparse matrices must have the same topology.
+     if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+         return stk.Matrix(
+             x.size(),
+             _gelu_backward_inplace(grad.data, x.data),
+             x.row_indices,
+             x.column_indices,
+             x.offsets,
+             x.column_indices_t,
+             x.offsets_t,
+             x.block_offsets_t,
+         )
+     return _gelu_backward_inplace(grad, x)
+
+
+ def gelu(x: stk.Matrix):
+     assert isinstance(x, stk.Matrix)
+     return stk.Matrix(
+         x.size(),
+         F.gelu(x.data, approximate='tanh'),
+         x.row_indices,
+         x.column_indices,
+         x.offsets,
+         x.column_indices_t,
+         x.offsets_t,
+         x.block_offsets_t,
+     )
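The constants in `_gelu_backward_inplace` above come from differentiating the tanh approximation of GELU: 0.79788456 ≈ sqrt(2/π) and 0.1070322243 ≈ 3 · 0.044715 · sqrt(2/π). Written out:

```latex
u(x) = \sqrt{2/\pi}\,\bigl(x + 0.044715\,x^{3}\bigr)
\mathrm{gelu}(x) = \tfrac{1}{2}\,x\,\bigl(1 + \tanh u(x)\bigr)
\frac{d}{dx}\,\mathrm{gelu}(x)
  = \tfrac{1}{2}\bigl(1 + \tanh u(x)\bigr)
  + \tfrac{1}{2}\,x\,\bigl(1 - \tanh^{2} u(x)\bigr)\,\sqrt{2/\pi}\,\bigl(1 + 3 \cdot 0.044715\,x^{2}\bigr)
```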
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/glu.py ADDED
@@ -0,0 +1,244 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # import stk.ops
5
+ # try:
6
+ # import stk.ops
7
+ # except ImportError:
8
+ # import warnings
9
+ # warnings.warn(
10
+ # 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
11
+ # )
12
+
13
+ from .. import stk
14
+
15
+ import torch
16
+
17
+ # from megablocks import grouped_gemm_util as gg
18
+ # from megablocks.layers import common, mpu
19
+ # from megablocks.layers.activation_fn import act_fn
20
+ # from megablocks.layers.arguments import Arguments
21
+ # from megablocks.layers.mlp import (
22
+ # SharedMLP,
23
+ # SparseMLP,
24
+ # create_dmoe_expert_weights,
25
+ # resolve_dtensor,
26
+ # )
27
+
28
+ from .. import grouped_gemm_util as gg
29
+ from . import common, mpu
30
+ from .activation_fn import act_fn
31
+ from .arguments import Arguments
32
+ from .mlp import (
33
+ SharedMLP,
34
+ SparseMLP,
35
+ create_dmoe_expert_weights,
36
+ resolve_dtensor,
37
+ )
38
+
39
+
40
+ class SparseGLU(SparseMLP):
41
+
42
+ def __init__(self, args: Arguments):
43
+ super().__init__(args)
44
+ self.v1 = torch.nn.Parameter(
45
+ torch.empty(
46
+ self._num_rows_per_rank,
47
+ args.hidden_size,
48
+ device=args.device,
49
+ dtype=common.dtype(args),
50
+ ),
51
+ )
52
+ with torch.no_grad():
53
+ self.v1.copy_(
54
+ create_dmoe_expert_weights(
55
+ args,
56
+ args.moe_num_experts,
57
+ args.ffn_hidden_size,
58
+ args.hidden_size,
59
+ args.init_method,
60
+ ),
61
+ )
62
+
63
+ mpu.set_expert_model_parallel_attributes(
64
+ self.v1,
65
+ self._should_set_parallelism_attribute,
66
+ )
67
+
68
+ def forward(self, x, topo):
69
+ if self.args.memory_optimized_mlp:
70
+ raise NotImplementedError(
71
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
72
+ )
73
+
74
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
75
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
76
+
77
+ # Compute the GLU.
78
+ x1 = stk.ops.sdd(x, w1.t(), topo)
79
+ x2 = stk.ops.sdd(x, v1.t(), topo)
80
+
81
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
82
+ x1 = stk.ops.mul(activation_fn_out, x2)
83
+
84
+ return stk.ops.dsd(x1, w2)
85
+
86
+
87
+ class MemoryOptimizedGroupedGLU(torch.autograd.Function):
88
+ """GroupedMLP with manually scheduled memory reuse."""
89
+
90
+ @staticmethod
91
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
92
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
93
+ # Cast inputs using ctx dtype from AMP
94
+ if ctx._fwd_used_autocast:
95
+ x = x.to(ctx._dtype)
96
+ w1 = w1.to(ctx._dtype)
97
+ v1 = v1.to(ctx._dtype)
98
+ w2 = w2.to(ctx._dtype)
99
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
100
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
101
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
102
+
103
+ # Layer 0: x @ w1.t().
104
+ assert gg.backend is not None
105
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
106
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
107
+
108
+ # GeLU.
109
+ activation_fn_out = activation_fn(sdd_out) * v1_out
110
+
111
+ # Layer 1: x @ w2.
112
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
113
+
114
+ # NOTE: Save the input to the layer and the activation_fn input for
115
+ # gradient computation. We'll re-compute the activation_fn forward
116
+ # pass in the backward pass to avoid materializing another
117
+ # intermediate.
118
+ ctx.x_shape = x.shape
119
+ ctx.sdd_out_shape = sdd_out.shape
120
+ ctx.dtype = x.dtype
121
+ ctx.activation_fn = activation_fn
122
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
123
+ return dsd_out
124
+
125
+ @staticmethod
126
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
127
+ def backward(ctx, ddsd_out):
128
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
129
+ raise ValueError('Expected all MLP inputs to need grad.')
130
+
131
+ # Unpack saved tensors
132
+ # dtype = ctx.dtype
133
+ saved_tensors = ctx.saved_tensors
134
+ w1, v1, w2 = saved_tensors[:3]
135
+ batch_sizes = saved_tensors[3]
136
+ x = saved_tensors[4]
137
+ sdd_out, v1_out = saved_tensors[5:7]
138
+
139
+ # Rematerialize activation_fn output.
140
+ activation_fn = ctx.activation_fn
141
+ with torch.set_grad_enabled(True):
142
+ sdd_out.requires_grad = True
143
+ v1_out.requires_grad = True
144
+ activation_fn_out = activation_fn(sdd_out) * v1_out
145
+ activation_grad_fn = activation_fn_out.backward
146
+
147
+ # Compute dw2 with recomputed activation_fn output.
148
+ assert gg.backend is not None
149
+ dw2 = gg.backend.gmm(
150
+ activation_fn_out,
151
+ ddsd_out,
152
+ batch_sizes,
153
+ trans_a=True,
154
+ )
155
+
156
+ # Compute dactivation_fn_out.
157
+ #
158
+ # NOTE: We reuse the activation_fn_out allocation.
159
+ dactivation_fn_out = activation_fn_out
160
+ gg.backend.gmm(
161
+ ddsd_out,
162
+ w2,
163
+ batch_sizes,
164
+ trans_b=True,
165
+ c=dactivation_fn_out,
166
+ )
167
+
168
+ # Compute dsdd_out.
169
+ #
170
+ # NOTE: This reuses the dactivation_fn_out allocation.
171
+ assert activation_grad_fn is not None
172
+ activation_grad_fn(dactivation_fn_out)
173
+ dsdd_out = sdd_out.grad
174
+ dv1_out = v1_out.grad
175
+
176
+ # Compute dw1.
177
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
178
+
179
+ # Compute dv1.
180
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
181
+
182
+ # Compute dx.
183
+ #
184
+ # NOTE: This reuses the ddsd_out allocation.
185
+ dx = ddsd_out
186
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
187
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
188
+ return dx, dw1, dv1, dw2, None, None
189
+
190
+
191
+ memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
192
+
193
+
194
+ class GroupedGLU(SparseGLU):
195
+
196
+ def forward(self, x, tokens_per_expert):
197
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
198
+ w1, v1, w2 = (
199
+ self.scale_grad(self.w1),
200
+ self.scale_grad(self.v1),
201
+ self.scale_grad(self.w2),
202
+ )
203
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
204
+
205
+ # Re-shape the weights for the grouped GEMMs.
206
+ ne = mpu.experts_per_rank(self.args)
207
+ w1 = w1.view(ne, -1, self.args.hidden_size)
208
+ v1 = v1.view(ne, -1, self.args.hidden_size)
209
+ w2 = w2.view(ne, -1, self.args.hidden_size)
210
+
211
+ if self.args.memory_optimized_mlp:
212
+ return memory_optimized_grouped_glu(
213
+ x,
214
+ w1,
215
+ v1,
216
+ w2,
217
+ batch_sizes,
218
+ self.args.activation_fn,
219
+ )
220
+
221
+ # Compute the MLP.
222
+ assert gg.ops is not None
223
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
224
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
225
+ x1 = self.args.activation_fn(x1) * x2
226
+ return gg.ops.gmm(x1, w2, batch_sizes)
227
+
228
+
229
+ class SharedGLU(SharedMLP):
230
+ """GPU for shared expert.
231
+
232
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
233
+ """
234
+
235
+ def __init__(self, args: Arguments):
236
+ super().__init__(args)
237
+ self.gate_proj = args.fc_cls(
238
+ args.hidden_size,
239
+ self.args.shared_expert_hidden_size,
240
+ **self.fc_kwargs,
241
+ )
242
+
243
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
244
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
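Both the sparse and grouped GLU paths above compute the same gated MLP per expert; as a dense single-expert reference (illustrative shapes, ignoring routing and expert parallelism):

```python
import torch
import torch.nn.functional as F

hidden, ffn = 1024, 4096
x = torch.randn(32, hidden)
w1 = torch.randn(ffn, hidden)  # gate projection weights
v1 = torch.randn(ffn, hidden)  # up projection weights
w2 = torch.randn(ffn, hidden)  # down projection weights, stored as [ffn, hidden]

y = (F.gelu(x @ w1.t(), approximate="tanh") * (x @ v1.t())) @ w2
print(y.shape)  # torch.Size([32, 1024])
```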
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/memory_test.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import gc
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+
9
+ # from megablocks.layers import arguments, dmoe
10
+ from . import arguments, dmoe
11
+
12
+ _TESTS = ((8, 2048, 4096, 4096, 32, 4),)
13
+
14
+
15
+ def get_tensors():
16
+ ptrs = set()
17
+ out = []
18
+ for obj in gc.get_objects():
19
+ if torch.is_tensor(obj):
20
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
21
+ continue
22
+ out.append(obj)
23
+ ptrs.add(obj.data_ptr())
24
+ return out
25
+
26
+
27
+ def test_memory(
28
+ group,
29
+ batch_size,
30
+ sequence_length,
31
+ hidden_size,
32
+ ffn_hidden_size,
33
+ num_experts,
34
+ top_k,
35
+ ):
36
+ args = arguments.Arguments(
37
+ hidden_size=hidden_size,
38
+ ffn_hidden_size=ffn_hidden_size,
39
+ moe_num_experts=num_experts,
40
+ moe_top_k=top_k,
41
+ moe_expert_model_parallelism=True,
42
+ expert_parallel_group=group,
43
+ fp16=False,
44
+ bf16=True,
45
+ device=torch.cuda.current_device(),
46
+ )
47
+ layer = dmoe.dMoE(args).cuda()
48
+
49
+ x = torch.randn((batch_size, sequence_length, hidden_size),
50
+ device=torch.cuda.current_device(),
51
+ dtype=torch.bfloat16).requires_grad_(True)
52
+ torch.cuda.empty_cache()
53
+
54
+ # Run forward + backward.
55
+ # with torch.autograd.detect_anomaly():
56
+ out, _ = layer(x)
57
+ out.mean().backward()
58
+
59
+ # Report peak memory.
60
+ mem = torch.cuda.max_memory_allocated()
61
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
62
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
63
+
64
+ # Calculate weight and gradient memory usage.
65
+ weight_memory = 2 * (
66
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
67
+ )
68
+
69
+ def grad_numel(x):
70
+ if x.grad is not None:
71
+ return x.grad.numel()
72
+ return 0
73
+
74
+ grad_memory = 2 * (
75
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
76
+ )
77
+ weight_memory += grad_memory
78
+
79
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
80
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
81
+
82
+ # Manually calculate GPU memory usage from the garbage
83
+ # collector.
84
+ gc.collect()
85
+ total = 0
86
+ tensors = get_tensors()
87
+ tensors = sorted(tensors, key=lambda x: -x.numel())
88
+ for i, t in enumerate(tensors):
89
+ total += t.numel()
90
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
91
+ del tensors
92
+
93
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
94
+
95
+
96
+ if __name__ == '__main__':
97
+ assert dist.is_available()
98
+ group = dist.init_process_group(backend='nccl')
99
+ local_rank = dist.get_rank(group)
100
+ torch.cuda.set_device(local_rank)
101
+
102
+ for args in _TESTS:
103
+ test_memory(group, *args)
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/memory_test.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+
+ DISTRIBUTED_ARGUMENTS="\
+     --nproc_per_node 1 \
+     --nnodes 1 \
+     --node_rank 0 \
+     --master_addr localhost \
+     --master_port 6000"
+
+ python -m torch.distributed.launch \
+     ${DISTRIBUTED_ARGUMENTS} \
+     megablocks/layers/memory_test.py
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/mlp.py ADDED
@@ -0,0 +1,587 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any
5
+
6
+ # try:
7
+ # import stk
8
+ # import stk.backend.triton_kernels
9
+ # import stk.ops
10
+ # except ImportError:
11
+ # import warnings
12
+ # warnings.warn(
13
+ # 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
14
+ # )
15
+
16
+ from .. import stk
17
+
18
+ import torch
19
+ from packaging import version
20
+
21
+ # from megablocks import grouped_gemm_util as gg
22
+ # from megablocks.layers import common, gelu, mpu
23
+ # from megablocks.layers.activation_fn import act_fn
24
+ # from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
25
+
26
+ from .. import grouped_gemm_util as gg
27
+ from . import common, gelu, mpu
28
+ from .activation_fn import act_fn
29
+ from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
30
+
31
+ class ScaleGradient(torch.autograd.Function):
32
+
33
+ @staticmethod
34
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
35
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
36
+ ctx.scale = scale
37
+ return x
38
+
39
+ @staticmethod
40
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
41
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
42
+ return grad * ctx.scale, None
43
+
44
+
45
+ scale_gradient = ScaleGradient.apply
46
+
47
+
48
+ def resolve_dtensor(weight: torch.Tensor):
49
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
50
+ from torch.distributed._tensor import DTensor
51
+ if isinstance(weight, DTensor):
52
+ return weight.to_local()
53
+ return weight
54
+
55
+
56
+ def create_moe_expert_weights(
57
+ args: Arguments,
58
+ num_experts: int,
59
+ ffn_hidden_size: int,
60
+ hidden_size: int,
61
+ init_method: InitFn,
62
+ ):
63
+ # Create the entire weight matrix such that the sampled weights will
64
+ # not vary between data parallelism and expert model parallelism for
65
+ # the same random seed.
66
+ master_weights = torch.empty(
67
+ num_experts,
68
+ ffn_hidden_size,
69
+ hidden_size,
70
+ device=args.device,
71
+ dtype=common.dtype(args),
72
+ )
73
+ init_method(master_weights)
74
+
75
+ if not args.moe_expert_model_parallelism:
76
+ return master_weights
77
+
78
+ # Calculate the amount of sharding in each dimension.
79
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
80
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
81
+
82
+ # Calculate the experts per rank.
83
+ #
84
+ # NOTE: We assign ranks to be expert parallel before going
85
+ # tensor parallel.
86
+ rank = mpu.get_expert_parallel_rank(args)
87
+ expert_rank = rank % expert_sharding_degree
88
+ num_experts_per_rank = num_experts // expert_sharding_degree
89
+ start_expert = expert_rank * num_experts_per_rank
90
+ end_expert = (expert_rank + 1) * num_experts_per_rank
91
+
92
+ # Calculate the rows per rank.
93
+ row_rank = rank // expert_sharding_degree
94
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
95
+ start_row = row_rank * num_rows_per_rank
96
+ end_row = (row_rank + 1) * num_rows_per_rank
97
+
98
+ # Slice the weight matrix to get the chunk for this rank.
99
+ with torch.no_grad():
100
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
101
+ return weights
102
+
103
+
104
+ class MLP(torch.nn.Module):
105
+
106
+ def __init__(self, args: Arguments):
107
+ super().__init__()
108
+ self.args = args
109
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
110
+ experts_per_rank = mpu.experts_per_rank(args)
111
+
112
+ self.w1 = torch.nn.Parameter(
113
+ torch.empty(
114
+ experts_per_rank,
115
+ args.hidden_size,
116
+ mpu.features_per_rank(args),
117
+ device=args.device,
118
+ dtype=common.dtype(args),
119
+ ),
120
+ )
121
+ self.w2 = torch.nn.Parameter(
122
+ torch.empty(
123
+ experts_per_rank,
124
+ mpu.features_per_rank(args),
125
+ args.hidden_size,
126
+ device=args.device,
127
+ dtype=common.dtype(args),
128
+ ),
129
+ )
130
+ mpu.set_expert_model_parallel_attributes(
131
+ self.w1,
132
+ args.moe_expert_model_parallelism,
133
+ )
134
+ mpu.set_expert_model_parallel_attributes(
135
+ self.w2,
136
+ args.moe_expert_model_parallelism,
137
+ )
138
+
139
+ # Initialize the parameters for the MLP.
140
+ #
141
+ # NOTE: It is important that we create the weight tensors prior
142
+ # to creating the master weights and slicing our the piece for
143
+ # this rank. If the master weights are created first the PyTorch
144
+ # caching allocator appears to use the same memory block for these
145
+ # and the slice which causes large increases in our peak memory
146
+ # usage.
147
+ with torch.no_grad():
148
+ w1 = create_moe_expert_weights(
149
+ args,
150
+ args.moe_num_experts,
151
+ args.ffn_hidden_size,
152
+ args.hidden_size,
153
+ args.init_method,
154
+ )
155
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
156
+ self.w2.copy_(
157
+ create_moe_expert_weights(
158
+ args,
159
+ args.moe_num_experts,
160
+ args.ffn_hidden_size,
161
+ args.hidden_size,
162
+ args.output_layer_init_method,
163
+ ),
164
+ )
165
+
166
+ self.gradient_scale = None
167
+ if self.args.moe_expert_model_parallelism:
168
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
169
+
170
+ def scale_grad(self, w):
171
+ if self.gradient_scale is None:
172
+ return w
173
+ return scale_gradient(w, self.gradient_scale)
174
+
175
+ def forward(self, x):
176
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
177
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
178
+ x = torch.bmm(x, w1)
179
+ x = self.args.activation_fn(x)
180
+ return torch.bmm(x, w2)
181
+
182
+
183
+ def create_dmoe_expert_weights(
184
+ args: Arguments,
185
+ num_experts: int,
186
+ rows: int,
187
+ columns: int,
188
+ init_method: InitFn,
189
+ ):
190
+ weights = create_moe_expert_weights(
191
+ args,
192
+ num_experts,
193
+ rows,
194
+ columns,
195
+ init_method,
196
+ )
197
+ return weights.view([-1, columns])
198
+
199
+
200
+ class MemoryOptimizedMLP(torch.autograd.Function):
201
+ """Sparse MLP with manually scheduled memory reuse."""
202
+
203
+ @staticmethod
204
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
205
+ def forward(ctx, x, w1, w2, topo, activation_fn):
206
+ # Cast inputs using ctx dtype from AMP
207
+ if ctx._fwd_used_autocast:
208
+ x = x.to(ctx._dtype)
209
+ w1 = w1.to(ctx._dtype)
210
+ w2 = w2.to(ctx._dtype)
211
+ # x: [m, k], w1: [n, k], w2: [n, k]
212
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
213
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
214
+
215
+ topo_tensors = (
216
+ topo.row_indices,
217
+ topo.column_indices,
218
+ topo.offsets,
219
+ topo.column_indices_t,
220
+ topo.offsets_t,
221
+ topo.block_offsets_t,
222
+ )
223
+
224
+ # Layer 0: x @ w1.t().
225
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
226
+
227
+ # GeLU.
228
+ activation_fn_out = act_fn(sdd_out, activation_fn)
229
+
230
+ # Layer 1: x @ w2.
231
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
232
+
233
+ # NOTE: Save the input to the layer and the activation_fn input for
234
+ # gradient computation. We'll re-compute the activation_fn forward
235
+ # pass in the backward pass to avoid materializing another
236
+ # intermediate.
237
+ ctx.shape = topo.shape
238
+ ctx.x_shape = x.shape
239
+ ctx.sdd_out_shape = sdd_out.data.shape
240
+ ctx.dtype = x.dtype
241
+ ctx.activation_fn = activation_fn
242
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
243
+ return dsd_out
244
+
245
+ @staticmethod
246
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
247
+ def backward(ctx, ddsd_out):
248
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
249
+ raise ValueError('Expected all MLP inputs to need grad.')
250
+
251
+ # unpack saved tensors
252
+ # dtype = ctx.dtype
253
+ saved_tensors = ctx.saved_tensors
254
+ w1, w2 = saved_tensors[:2]
255
+ topo_tensors = saved_tensors[2:8]
256
+ x = saved_tensors[8]
257
+ sdd_out_data = saved_tensors[9]
258
+
259
+ # rematerialize activation function output
260
+ activation_fn = ctx.activation_fn
261
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
262
+ activation_fn_out, activation_grad_fn = act_fn(
263
+ sdd_out,
264
+ activation_fn,
265
+ return_grad_fn=True,
266
+ )
267
+
268
+ # Compute dw2 with recomputed activation_fn output.
269
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
270
+
271
+ # Compute dactivation_fn_out.
272
+ #
273
+ # NOTE: We reuse the activation_fn_out allocation.
274
+ dactivation_fn_out = activation_fn_out
275
+ stk.backend.triton_kernels.sdd(
276
+ ddsd_out,
277
+ w2.t(),
278
+ dactivation_fn_out.shape,
279
+ dactivation_fn_out.data,
280
+ dactivation_fn_out.offsets,
281
+ dactivation_fn_out.row_indices,
282
+ dactivation_fn_out.column_indices,
283
+ )
284
+
285
+ # Compute dsdd_out.
286
+ #
287
+ # NOTE: This reuses the dactivation_fn_out allocation.
288
+ if activation_fn is DEFAULT_ACTIVATION_FN:
289
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
290
+ else:
291
+ assert activation_grad_fn is not None
292
+ activation_grad_fn(dactivation_fn_out.data)
293
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
294
+
295
+ # Compute dw1.
296
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
297
+
298
+ # Compute dx.
299
+ #
300
+ # NOTE: This reuses the ddsd_out allocation.
301
+ stk.backend.triton_kernels.dsd(
302
+ dsdd_out.shape,
303
+ dsdd_out.data,
304
+ dsdd_out.offsets,
305
+ dsdd_out.row_indices,
306
+ dsdd_out.column_indices,
307
+ dsdd_out.offsets_t,
308
+ dsdd_out.column_indices_t,
309
+ dsdd_out.block_offsets_t,
310
+ False,
311
+ w1,
312
+ ddsd_out,
313
+ )
314
+ dx = ddsd_out
315
+ return dx, dw1, dw2, None, None
316
+
317
+
318
+ memory_optimized_mlp = MemoryOptimizedMLP.apply
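As a reference point for the recompute-in-backward idea used by MemoryOptimizedMLP above, here is a minimal dense sketch using torch.utils.checkpoint instead of a hand-written autograd.Function; all shapes and names are hypothetical and the sparse stk kernels are not involved.

import torch
from torch.utils.checkpoint import checkpoint

# Dense stand-in for the sparse MLP: gelu(x @ w1.t()) @ w2.
x = torch.randn(16, 32, requires_grad=True)
w1 = torch.randn(64, 32, requires_grad=True)
w2 = torch.randn(64, 32, requires_grad=True)

def dense_mlp(inp):
    # The activation output is recomputed during backward instead of being stored.
    return torch.nn.functional.gelu(inp @ w1.t()) @ w2

out = checkpoint(dense_mlp, x, use_reentrant=False)
out.sum().backward()
print(x.grad.shape)  # torch.Size([16, 32])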
319
+
320
+
321
+ class SparseMLP(torch.nn.Module):
322
+
323
+ def __init__(self, args: Arguments):
324
+ super().__init__()
325
+ self.args = args
326
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
327
+
328
+ self.w1 = torch.nn.Parameter(
329
+ torch.empty(
330
+ self._num_rows_per_rank,
331
+ args.hidden_size,
332
+ device=args.device,
333
+ dtype=common.dtype(args),
334
+ ),
335
+ )
336
+ self.w2 = torch.nn.Parameter(
337
+ torch.empty(
338
+ self._num_rows_per_rank,
339
+ args.hidden_size,
340
+ device=args.device,
341
+ dtype=common.dtype(args),
342
+ ),
343
+ )
344
+
345
+ # Initialize the parameters for the MLP.
346
+ #
347
+ # NOTE: It is important that we create the weight tensors prior
348
+ # to creating the master weights and slicing out the piece for
349
+ # this rank. If the master weights are created first the PyTorch
350
+ # caching allocator appears to use the same memory block for these
351
+ # and the slice which causes large increases in our peak memory
352
+ # usage.
353
+ with torch.no_grad():
354
+ self.w1.copy_(
355
+ create_dmoe_expert_weights(
356
+ args,
357
+ args.moe_num_experts,
358
+ args.ffn_hidden_size,
359
+ args.hidden_size,
360
+ args.init_method,
361
+ ),
362
+ )
363
+ self.w2.copy_(
364
+ create_dmoe_expert_weights(
365
+ args,
366
+ args.moe_num_experts,
367
+ args.ffn_hidden_size,
368
+ args.hidden_size,
369
+ args.output_layer_init_method,
370
+ ),
371
+ )
372
+
373
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
374
+ mpu.set_expert_model_parallel_attributes(
375
+ self.w1,
376
+ self._should_set_parallelism_attribute,
377
+ )
378
+ mpu.set_expert_model_parallel_attributes(
379
+ self.w2,
380
+ self._should_set_parallelism_attribute,
381
+ )
382
+
383
+ self.gradient_scale = None
384
+ if self.args.moe_expert_model_parallelism:
385
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
386
+
387
+ def scale_grad(self, w):
388
+ if self.gradient_scale is None:
389
+ return w
390
+ return scale_gradient(w, self.gradient_scale)
391
+
392
+ def forward(self, x, topo):
393
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
394
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
395
+ if self.args.memory_optimized_mlp:
396
+ return memory_optimized_mlp(
397
+ x,
398
+ w1,
399
+ w2,
400
+ topo,
401
+ self.args.activation_fn,
402
+ )
403
+
404
+ # Compute the MLP.
405
+ x = stk.ops.sdd(x, w1.t(), topo)
406
+ activation_fn_out = act_fn(x, self.args.activation_fn)
407
+ return stk.ops.dsd(activation_fn_out, w2)
408
+
409
+
410
+ class MemoryOptimizedGroupedMLP(torch.autograd.Function):
411
+ """GroupedMLP with manually scheduled memory reuse."""
412
+
413
+ @staticmethod
414
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
415
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
416
+ # Cast inputs using ctx dtype from AMP
417
+ if ctx._fwd_used_autocast:
418
+ x = x.to(ctx._dtype)
419
+ w1 = w1.to(ctx._dtype)
420
+ w2 = w2.to(ctx._dtype)
421
+ # x: [m, k], w1: [n, k], w2: [n, k]
422
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
423
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
424
+
425
+ # Layer 0: x @ w1.t().
426
+ assert gg.backend is not None
427
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
428
+
429
+ # activation_fn
430
+ activation_fn_out = activation_fn(sdd_out)
431
+
432
+ # Layer 1: x @ w2.
433
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
434
+
435
+ # NOTE: Save the input to the layer and the activation_fn input for
436
+ # gradient computation. We'll re-compute the activation_fn forward
437
+ # pass in the backward pass to avoid materializing another
438
+ # intermediate.
439
+ ctx.x_shape = x.shape
440
+ ctx.sdd_out_shape = sdd_out.shape
441
+ ctx.dtype = x.dtype
442
+ ctx.activation_fn = activation_fn
443
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
444
+ return dsd_out
445
+
446
+ @staticmethod
447
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
448
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
449
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
450
+ raise ValueError('Expected all MLP inputs to need grad.')
451
+
452
+ # Unpack saved tensors
453
+ # dtype = ctx.dtype
454
+ saved_tensors = ctx.saved_tensors
455
+ w1, w2 = saved_tensors[:2]
456
+ batch_sizes = saved_tensors[2]
457
+ x = saved_tensors[3]
458
+ sdd_out = saved_tensors[4]
459
+
460
+ # Rematerialize activation_fn output.
461
+ activation_fn = ctx.activation_fn
462
+ with torch.set_grad_enabled(True):
463
+ sdd_out.requires_grad = True
464
+ activation_fn_out = activation_fn(sdd_out)
465
+ activation_grad_fn = activation_fn_out.backward
466
+
467
+ # Compute dw2 with recomputed activation_fn output.
468
+ assert gg.backend is not None
469
+ dw2 = gg.backend.gmm(
470
+ activation_fn_out,
471
+ ddsd_out,
472
+ batch_sizes,
473
+ trans_a=True,
474
+ )
475
+
476
+ # Compute dactivation_fn_out.
477
+ #
478
+ # NOTE: We reuse the activation_fn_out allocation.
479
+ dactivation_fn_out = activation_fn_out
480
+ gg.backend.gmm(
481
+ ddsd_out,
482
+ w2,
483
+ batch_sizes,
484
+ trans_b=True,
485
+ c=dactivation_fn_out,
486
+ )
487
+
488
+ # Compute dsdd_out.
489
+ #
490
+ # NOTE: This reuses the dactivation_fn_out allocation.
491
+ if activation_fn is DEFAULT_ACTIVATION_FN:
492
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
493
+ else:
494
+ assert activation_grad_fn is not None
495
+ activation_grad_fn(dactivation_fn_out)
496
+ dsdd_out = sdd_out.grad
497
+
498
+ # Compute dw1.
499
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
500
+
501
+ # Compute dx.
502
+ #
503
+ # NOTE: This reuses the ddsd_out allocation.
504
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
505
+ dx = ddsd_out
506
+ return dx, dw1, dw2, None, None
507
+
508
+
509
+ memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
510
+
511
+
512
+ class GroupedMLP(SparseMLP):
513
+
514
+ def forward(self, x, tokens_per_expert):
515
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
516
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
517
+
518
+ # Re-shape the weights for the grouped GEMMs.
519
+ ne = mpu.experts_per_rank(self.args)
520
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
521
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
522
+
523
+ if self.args.memory_optimized_mlp:
524
+ return memory_optimized_grouped_mlp(
525
+ x,
526
+ w1,
527
+ w2,
528
+ batch_sizes,
529
+ self.args.activation_fn,
530
+ )
531
+
532
+ # Compute the MLP.
533
+ assert gg.ops is not None
534
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
535
+ x = self.args.activation_fn(x)
536
+ return gg.ops.gmm(x, w2, batch_sizes)
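For readers unfamiliar with grouped GEMMs, the following plain-PyTorch sketch shows what a call like gg.ops.gmm(x, w1, batch_sizes, trans_b=True) computes; the sizes are hypothetical and the real kernel fuses the per-expert loop.

import torch

num_experts, hidden, ffn = 2, 8, 16
batch_sizes = torch.tensor([3, 5])               # tokens routed to each expert
x = torch.randn(int(batch_sizes.sum()), hidden)  # tokens grouped by expert
w1 = torch.randn(num_experts, ffn, hidden)       # one weight matrix per expert

outs, start = [], 0
for e, n in enumerate(batch_sizes.tolist()):
    # Each expert multiplies only its contiguous slice of tokens.
    outs.append(x[start:start + n] @ w1[e].t())  # trans_b=True case
    start += n
y = torch.cat(outs)
print(y.shape)  # torch.Size([8, 16])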
537
+
538
+
539
+ class SharedMLP(torch.nn.Module):
540
+ """MLP for shared expert.
541
+
542
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
543
+ """
544
+
545
+ def __init__(self, args: Arguments):
546
+ super().__init__()
547
+ self.args = args
548
+ self.fc_kwargs: dict[str, Any] = {
549
+ 'bias': args.bias,
550
+ 'device': args.device,
551
+ }
552
+ self.fc_kwargs.update(args.fc_kwargs)
553
+
554
+ self.up_proj = args.fc_cls(
555
+ args.hidden_size,
556
+ args.shared_expert_hidden_size,
557
+ **self.fc_kwargs,
558
+ )
559
+ self.act = args.activation_fn
560
+ self.down_proj = args.fc_cls(
561
+ args.shared_expert_hidden_size,
562
+ args.hidden_size,
563
+ **self.fc_kwargs,
564
+ )
565
+ self.down_proj._is_residual = True # a flag for llm-foundry init
566
+
567
+ def add_experts_sharedexpert(
568
+ self,
569
+ shared_expert_out: torch.Tensor,
570
+ expert_out: torch.Tensor,
571
+ ) -> torch.Tensor:
572
+ # Helper function to add expert output to shared expert output
573
+ # with optional weighted sum.
574
+ if self.args.shared_expert_weighted_sum:
575
+ # enable using weighted sum for shared expert output
576
+ # weighted by the number of experts used
577
+ t_experts = self.args.moe_top_k + 1
578
+ sh_mlp_out = shared_expert_out / t_experts
579
+ return sh_mlp_out.add(
580
+ expert_out,
581
+ alpha=(self.args.moe_top_k / t_experts),
582
+ )
583
+
584
+ return shared_expert_out + expert_out
585
+
586
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
587
+ return self.down_proj(self.act(self.up_proj(x)))
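The weighted-sum branch of add_experts_sharedexpert above blends the shared-expert and routed-expert outputs so the coefficients sum to one; a small sketch with hypothetical shapes and moe_top_k = 2:

import torch

top_k = 2
shared_expert_out = torch.randn(4, 8)
expert_out = torch.randn(4, 8)

# Shared output gets weight 1/(k+1), routed output gets k/(k+1).
t_experts = top_k + 1
blended = shared_expert_out / t_experts + expert_out * (top_k / t_experts)
print(blended.shape)  # torch.Size([4, 8])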
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/moe.py ADDED
@@ -0,0 +1,507 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.distributed as dist
8
+
9
+ # import megablocks.ops as ops
10
+ # from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
11
+ # from megablocks.layers.all_to_all import all_to_all
12
+ # from megablocks.layers.arguments import Arguments
13
+
14
+ from ..ops import (
15
+ sort,
16
+ histogram,
17
+ inclusive_cumsum,
18
+ exclusive_cumsum,
19
+ binned_gather,
20
+ binned_scatter,
21
+ gather,
22
+ scatter,
23
+ repeat,
24
+ replicate,
25
+ )
26
+
27
+ from . import common, mlp, mpu, router, sharedexpert_registry
28
+ from .arguments import Arguments
29
+ from .all_to_all import all_to_all
30
+
31
+ _LOAD_BALANCING_LOSS = []
32
+
33
+
34
+ def save_load_balancing_loss(loss):
35
+ global _LOAD_BALANCING_LOSS
36
+ _LOAD_BALANCING_LOSS.append(loss)
37
+
38
+
39
+ def get_load_balancing_loss():
40
+ global _LOAD_BALANCING_LOSS
41
+ return _LOAD_BALANCING_LOSS
42
+
43
+
44
+ def clear_load_balancing_loss():
45
+ global _LOAD_BALANCING_LOSS
46
+ _LOAD_BALANCING_LOSS.clear()
47
+
48
+
49
+ def batched_load_balancing_loss(args: Arguments):
50
+ if args.moe_loss_weight == 0:
51
+ return 0.0
52
+
53
+ # tokens_per_expert[i].shape = (num_experts)
54
+ # expert_scores[i].shape = (tokens, num_experts)
55
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
56
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
57
+ if args.num_layers_per_virtual_pipeline_stage is not None:
58
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
59
+
60
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
61
+ raise ValueError(
62
+ f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
63
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
64
+ f'{args.num_layers}\npipeline_model_parallel_size = '
65
+ f'{args.pipeline_model_parallel_size}\n'
66
+ 'num_layers_per_virtual_pipeline_stage'
67
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
68
+ )
69
+ if len(expert_scores) != num_layers_per_pipeline_stage:
70
+ raise ValueError(
71
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
72
+ f'but found {len(expert_scores)}.\nnum_layers = '
73
+ f'{args.num_layers}\npipeline_model_parallel_size = '
74
+ f'{args.pipeline_model_parallel_size}\n'
75
+ 'num_layers_per_virtual_pipeline_stage'
76
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
77
+ )
78
+
79
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
80
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
81
+
82
+ tokens = expert_scores[0].shape[0]
83
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
84
+
85
+ # Concatenate the contributions of each layer and convert to
86
+ # the correct types and formats for the dot product.
87
+ expert_scores = torch.cat(expert_scores, dim=1)
88
+ if args.moe_lbl_in_fp32:
89
+ expert_scores = expert_scores.float()
90
+ if tokens != 0:
91
+ expert_scores = expert_scores.mean(dim=0)
92
+ else:
93
+ expert_scores = expert_scores.sum(dim=0)
94
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
95
+
96
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
97
+ assert tokens_per_expert.numel() == expected_values
98
+ assert expert_scores.numel() == expected_values
99
+
100
+ # Calculate the total scale across all factors.
101
+ #
102
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
103
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
104
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
105
+ scale = scale_numerator / scale_denominator
106
+ return scale * torch.dot(tokens_per_expert, expert_scores)
107
+
108
+
109
+ # NOTE: This class defines MoE expert computation, including expert model parallel
110
+ # communication. When using FSDP on top of MegaBlocks this is the module that should
111
+ # be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
112
+ # parallel all2all.
113
+ class ParallelMLP(torch.nn.Module):
114
+
115
+ def __init__(self, args: Arguments):
116
+ super(ParallelMLP, self).__init__()
117
+ self.args = args
118
+
119
+ # Calculate the number of experts in total and the number of experts
120
+ # owned by this rank.
121
+ # world_size = mpu.get_expert_parallel_world_size(args)
122
+ self.num_experts = args.moe_num_experts
123
+ self.top_k = self.args.moe_top_k
124
+
125
+ # Calculate the number of bits needed to represent the expert indices
126
+ # so that we can pass it to radix sort.
127
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
128
+
129
+ # Expert MLP.
130
+ self.mlp = mlp.MLP(args)
131
+
132
+ self.bias: Optional[torch.Tensor]
133
+ if self.args.bias:
134
+ # Note that the output bias is not parallelized with expert
135
+ # model parallelism.
136
+ self.bias = torch.nn.Parameter(
137
+ torch.empty(
138
+ args.hidden_size,
139
+ device=args.device,
140
+ dtype=common.dtype(args),
141
+ ),
142
+ )
143
+ torch.nn.init.zeros_(self.bias)
144
+ else:
145
+ self.register_parameter('bias', None)
146
+
147
+ # Select the forward function for the operating mode.
148
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
149
+
150
+ def expert_capacity(self, tokens: int) -> int:
151
+ world_size = mpu.get_expert_parallel_world_size(self.args)
152
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
153
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
154
+
155
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
156
+ """Calculate the load balancing loss contribution."""
157
+ assert len(expert_scores.size()) == 2
158
+ tokens, num_experts = expert_scores.size()
159
+ assert num_experts == self.num_experts
160
+ assert len(tokens_per_expert.size()) == 1
161
+ num_experts, = tokens_per_expert.size()
162
+ assert num_experts == self.num_experts
163
+ scale = self.num_experts / (tokens * self.top_k)
164
+ return scale * torch.dot(
165
+ tokens_per_expert.to(expert_scores.dtype),
166
+ expert_scores.mean(dim=0),
167
+ )
168
+
169
+ def indices_and_bins(self,
170
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
171
+ # Sort the expert ids to produce the scatter/gather
172
+ # indices for the permutation.
173
+ #
174
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
175
+ # prior? Could we place the `torch.max` operation to return
176
+ # 32-bit expert indices?
177
+ top_expert = top_expert.int()
178
+ # output = ops.sort(top_expert, self.sort_end_bit)
179
+ output = sort(top_expert, self.sort_end_bit)
180
+ assert output is not None
181
+ bin_ids, indices = output
182
+
183
+ # Histogram the expert ids to identify the number of
184
+ # tokens routed to each expert.
185
+ #
186
+ # TODO(tgale): Does the sorted data produce a more favorable
187
+ # data distribution for histogram? Or is the op parallelism
188
+ # worth more?
189
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
190
+ tokens_per_expert = histogram(top_expert, self.num_experts)
191
+
192
+ # Calculate the bin bounds for the sorted tokens.
193
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
194
+ bins = inclusive_cumsum(tokens_per_expert, 0)
195
+ assert bins is not None
196
+ bins = bins.view(1) if not len(bins.size()) else bins
197
+
198
+ assert isinstance(indices, torch.Tensor)
199
+ assert isinstance(bin_ids, torch.Tensor)
200
+ assert isinstance(bins, torch.Tensor)
201
+ assert isinstance(tokens_per_expert, torch.Tensor)
202
+
203
+ return indices, bin_ids, bins, tokens_per_expert
204
+
205
+ def permute_and_compute(
206
+ self,
207
+ x: torch.Tensor,
208
+ tokens_per_expert: int, # unused
209
+ indices: torch.Tensor,
210
+ bin_ids: torch.Tensor, # unused
211
+ expert_weights: torch.Tensor,
212
+ bins: torch.Tensor,
213
+ expert_capacity: int,
214
+ top_k: int,
215
+ ):
216
+ # Route the tokens for MoE computation.
217
+ x = x.view(-1, x.shape[-1])
218
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
219
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
220
+ assert output is not None
221
+ x = output
222
+
223
+ # Perform the expert computation. Note that we don't
224
+ # use biases for these linear operations.
225
+ x = self.mlp(x)
226
+
227
+ # Un-route the data for the MoE output.
228
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
229
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
230
+
231
+
232
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
233
+ # x: [sl, bs, hs]
234
+ # expert_weights: [sl * bs, top-k]
235
+ # top_experts: [sl * bs, top-k]
236
+ expert_weights = expert_weights.flatten()
237
+ top_experts = top_experts.flatten()
238
+ with torch.no_grad():
239
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
240
+
241
+ # If expert_capacity is set to zero, set the number of tokens
242
+ # per expert to the maximum we need to avoid dropping tokens.
243
+ sl, bs, _ = x.size()
244
+ expert_capacity = self.expert_capacity(sl * bs)
245
+ if expert_capacity == 0:
246
+ expert_capacity = torch.max(tokens_per_expert).item()
247
+
248
+ x = self.permute_and_compute(
249
+ x,
250
+ tokens_per_expert,
251
+ indices,
252
+ bin_ids,
253
+ expert_weights,
254
+ bins,
255
+ expert_capacity,
256
+ self.top_k,
257
+ )
258
+ return x, tokens_per_expert
259
+
260
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
261
+ # NOTE: This function implements the same computation as forward_once
262
+ # but with expert model parallelism.
263
+ #
264
+ # 1. Permute the tokens locally so that they are grouped by their
265
+ # expert assignments. This allows us to transfer all of the tokens
266
+ # for a remote device in one communication primitive.
267
+ #
268
+ # 2. Permute the tokens across the expert parallel devices. After
269
+ # this is completed each device has all of the tokens assigned to
270
+ # its set of experts in its local HBM.
271
+ #
272
+ # 3. Permute the tokens locally so that they are grouped by their
273
+ # expert assignment. After the distributed permutation the tokens
274
+ # are grouped by which device they came from. We re-order them
275
+ # locally to allow for efficient computation.
276
+ #
277
+ # After this series of permutations we compute the linear layers
278
+ # and then repeat these three steps in reverse to produce the final
279
+ # output.
280
+ #
281
+ # Compute the mapping of local tokens to experts.
282
+ expert_weights = expert_weights.flatten()
283
+ top_experts = top_experts.flatten()
284
+ with torch.no_grad():
285
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
286
+
287
+ # If we're sharding the experts along the hidden dimension,
288
+ # multiple devices own parts of the same sets of experts.
289
+ # Replicate the token counts so every device gets the counts.
290
+ # repeated_tokens_per_expert = ops.repeat(
291
+ repeated_tokens_per_expert = repeat(
292
+ tokens_per_expert,
293
+ (mpu.hidden_sharding_degree(self.args),),
294
+ )
295
+
296
+ # Pass token count information to the device on which the
297
+ # target expert resides.
298
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
299
+ tpe_handle = dist.all_to_all_single(
300
+ parallel_tokens_per_expert,
301
+ repeated_tokens_per_expert,
302
+ group=self.args.expert_parallel_group,
303
+ async_op=True,
304
+ )
305
+
306
+ # Permute locally and without any padding so that tokens for each
307
+ # parallel device are stored contiguously.
308
+ #
309
+ # This view updates the shape of the tensor from [sl, bs, hs] to
310
+ # [sl * bs, hs] prior to the permutation.
311
+ x = x.view(-1, x.shape[-1])
312
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
313
+ output = gather(x, indices, bin_ids, bins, self.top_k)
314
+ assert output is not None
315
+ x = output
316
+
317
+ # Compute the number of tokens that will be received from each
318
+ # device and permute the input data across the devices.
319
+ with torch.no_grad():
320
+ tpe_handle.wait()
321
+ experts_per_rank = mpu.experts_per_rank(self.args)
322
+
323
+ # Reshape to [world_size, num_experts_per_rank].
324
+ world_size = mpu.get_expert_parallel_world_size(self.args)
325
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
326
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
327
+
328
+ # TODO(tgale): It might be faster to do this on the GPU and
329
+ # then communicate the results back to the host.
330
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
331
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
332
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
333
+
334
+ # Convert the send/recv counts to lists.
335
+ send_counts = send_counts.tolist()
336
+ recv_counts = recv_counts.tolist()
337
+ tokens_received = sum(recv_counts)
338
+
339
+ # If we're sharding the experts along the hidden dimension,
340
+ # multiple devices own parts of the same sets of experts.
341
+ # Replicate the token counts so devices that share experts
342
+ # get all of the tokens assigned to them.
343
+ #
344
+ # TODO(tgale): Fuse this into the prior, local permutation.
345
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
346
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
347
+
348
+ # Start the cross-device permutation asynchronously so we can
349
+ # overlap communication with computation.
350
+ parallel_x, parallel_x_handle = all_to_all(
351
+ x,
352
+ recv_counts,
353
+ send_counts,
354
+ self.args.expert_parallel_group,
355
+ async_op=True,
356
+ )
357
+
358
+ with torch.no_grad():
359
+ # After we do the cross-device permutation we have the tokens on the
360
+ # correct device but not yet grouped by expert because we received
361
+ # tokens from each device as contiguous chunks. To group the tokens
362
+ # for expert computation we'll do one more local permutation. The
363
+ # rest of this torch.no_grad() scope sets up the indices and bins
364
+ # for this permutation.
365
+ # replicate_bins = ops.inclusive_cumsum(
366
+ replicate_bins = inclusive_cumsum(
367
+ parallel_tokens_per_expert.flatten(),
368
+ 0,
369
+ )
370
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
371
+
372
+ # Construct the expert indices for the permuted tokens.
373
+ parallel_top_expert = torch.remainder(
374
+ torch.arange(
375
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
376
+ dtype=torch.int32,
377
+ device=indices.device,
378
+ ),
379
+ mpu.experts_per_rank(self.args),
380
+ )
381
+ # parallel_top_expert = ops.replicate(
382
+ parallel_top_expert = replicate(
383
+ parallel_top_expert.unsqueeze(dim=0),
384
+ replicate_bins,
385
+ tokens_received,
386
+ ).flatten()
387
+
388
+ # TODO(tgale): The sort_end_bit here can be reduced.
389
+ # parallel_bin_ids, parallel_indices = ops.sort(
390
+ parallel_bin_ids, parallel_indices = sort(
391
+ parallel_top_expert,
392
+ self.sort_end_bit,
393
+ )
394
+
395
+ # Calculate the bins boundaries from the token counts.
396
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
397
+ dim=0,
398
+ dtype=torch.int,
399
+ )
400
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
401
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
402
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
403
+
404
+ # If expert_capacity is set to zero, set the number of tokens
405
+ # per expert to the maximum we need to avoid dropping tokens.
406
+ tokens, _ = x.size()
407
+ expert_capacity = self.expert_capacity(tokens)
408
+ if expert_capacity == 0:
409
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
410
+
411
+ # Locally permute the tokens and perform the expert computation.
412
+ # Block to make sure that the cross-device permutation is complete.
413
+ if self.args.mlp_impl == 'grouped':
414
+ # GroupedMLP requires counts on CPU. We can use the tensor already
415
+ # moved to CPU for the prior all_to_all, which avoids an extra
416
+ # device synchronization.
417
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
418
+ dim=0,
419
+ dtype=torch.int,
420
+ )
421
+ parallel_x_handle.wait()
422
+ parallel_x = self.permute_and_compute(
423
+ parallel_x,
424
+ parallel_tokens_per_expert,
425
+ parallel_indices,
426
+ parallel_bin_ids,
427
+ None, # expert_weights
428
+ parallel_bins,
429
+ expert_capacity,
430
+ top_k=1,
431
+ )
432
+
433
+ # Un-permute the tokens across the devices.
434
+ x, _ = all_to_all(
435
+ parallel_x,
436
+ send_counts,
437
+ recv_counts,
438
+ self.args.expert_parallel_group,
439
+ )
440
+
441
+ # Reduce along the hidden sharding to get the final outputs.
442
+ #
443
+ # TODO(tgale): Fuse this into the following local permutation.
444
+ shape = (
445
+ mpu.hidden_sharding_degree(self.args),
446
+ -1,
447
+ self.args.hidden_size,
448
+ )
449
+ # x = ops.sum(x.view(shape), dim=0)
450
+ x = x.view(shape).sum(dim=0)
451
+
452
+ # Un-permute locally to setup for the next series of operations.
453
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
454
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
455
+ return x, tokens_per_expert.flatten()
456
+
457
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
458
+ in_shape = x.size()
459
+
460
+ # Compute the experts.
461
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
462
+ if self.training and self.args.moe_loss_weight > 0:
463
+ save_load_balancing_loss((tokens_per_expert, scores))
464
+ x = x.view(in_shape)
465
+ if self.bias is not None:
466
+ if self.args.return_bias:
467
+ return x, self.bias
468
+ return x + self.bias
469
+ return x
470
+
471
+
472
+ class MoE(torch.nn.Module):
473
+
474
+ def __init__(self, args: Arguments):
475
+ super(MoE, self).__init__()
476
+
477
+ # Token router.
478
+ self.router = router.LearnedRouter(args)
479
+
480
+ # Expert computation helper.
481
+ self.experts = self._init_experts_mlp(args)
482
+
483
+ self.shared_expert = None
484
+ if args.shared_expert:
485
+ # SharedExpert computation helper.
486
+ self.shared_expert = sharedexpert_registry.get(args)
487
+
488
+ def _init_experts_mlp(self, args: Arguments):
489
+ return ParallelMLP(args)
490
+
491
+ def forward(self, x: torch.Tensor):
492
+ # NOTE: If we're going to cast the activations to lower precision
493
+ # do it before we permute the tokens to save bandwidth.
494
+ x = common.cast_if_autocast_enabled(x)
495
+
496
+ # Compute the expert scores and assignments.
497
+ scores, expert_weights, top_experts = self.router(x)
498
+
499
+ # Compute the experts.
500
+ out = self.experts(x, scores, expert_weights, top_experts)
501
+ if self.shared_expert is not None:
502
+ shared_expert_out = self.shared_expert(x)
503
+ out = self.shared_expert.add_experts_sharedexpert(
504
+ shared_expert_out,
505
+ out,
506
+ )
507
+ return out
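To make the load-balancing loss above concrete, here is a single-layer sketch of load_balancing_loss with hypothetical sizes; the scores come from a softmax router and tokens_per_expert from the routing histogram.

import torch

num_experts, tokens, top_k = 4, 16, 2
expert_scores = torch.softmax(torch.randn(tokens, num_experts), dim=-1)
top_experts = torch.randint(0, num_experts, (tokens * top_k,))
tokens_per_expert = torch.bincount(top_experts, minlength=num_experts)

# scale * dot(tokens_per_expert, mean expert score), as in load_balancing_loss.
scale = num_experts / (tokens * top_k)
loss = scale * torch.dot(tokens_per_expert.float(), expert_scores.mean(dim=0))
print(loss)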
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/mpu.py ADDED
@@ -0,0 +1,94 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+
9
+ # from megablocks.layers.arguments import Arguments
10
+ from .arguments import Arguments
11
+
12
+
13
+ class MoeParam(torch.Tensor):
14
+
15
+ def __init__(self):
16
+ super().__init__(self)
17
+ self.expert_model_parallel: bool
18
+
19
+
20
+ def is_moe_param(tensor: torch.Tensor) -> bool:
21
+ return hasattr(tensor, 'expert_model_parallel')
22
+
23
+
24
+ def get_expert_parallel_world_size(args: Arguments) -> int:
25
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
26
+
27
+
28
+ def get_expert_parallel_rank(args: Arguments) -> int:
29
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
30
+
31
+
32
+ def set_expert_model_parallel_attributes(
33
+ tensor: torch.Tensor,
34
+ is_parallel: bool,
35
+ ):
36
+ assert not hasattr(tensor, 'expert_model_parallel')
37
+ setattr(tensor, 'expert_model_parallel', is_parallel)
38
+
39
+
40
+ def param_is_expert_model_parallel(param: MoeParam) -> bool:
41
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
42
+
43
+
44
+ def copy_expert_model_parallel_attributes(
45
+ destination_tensor: torch.Tensor,
46
+ source_tensor: torch.Tensor,
47
+ ):
48
+ if hasattr(source_tensor, 'expert_model_parallel'):
49
+ setattr(
50
+ destination_tensor,
51
+ 'expert_model_parallel',
52
+ getattr(source_tensor, 'expert_model_parallel'),
53
+ )
54
+
55
+
56
+ def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
57
+ world_size = dist.get_world_size(group)
58
+ rank = dist.get_rank(group)
59
+ for i in range(world_size):
60
+ dist.barrier(group)
61
+ if i == rank:
62
+ print(f'rank = {rank}', *x)
63
+
64
+
65
+ # Helpers for expert/tensor sharding.
66
+ def expert_sharding_degree(args: Arguments) -> int:
67
+ world_size = get_expert_parallel_world_size(args)
68
+ esd = min(world_size, args.moe_num_experts)
69
+
70
+ if (args.moe_num_experts % esd) != 0:
71
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
72
+ return esd
73
+
74
+
75
+ def hidden_sharding_degree(args: Arguments) -> int:
76
+ world_size = get_expert_parallel_world_size(args)
77
+ esd = expert_sharding_degree(args)
78
+ hsd = world_size // esd
79
+
80
+ if (args.ffn_hidden_size % hsd) != 0:
81
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
82
+ if (esd * hsd) != world_size:
83
+ raise ValueError(
84
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
85
+ )
86
+ return hsd
87
+
88
+
89
+ def experts_per_rank(args: Arguments) -> int:
90
+ return args.moe_num_experts // expert_sharding_degree(args)
91
+
92
+
93
+ def features_per_rank(args: Arguments) -> int:
94
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
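A small worked example of the sharding arithmetic above, with hypothetical sizes: eight expert-parallel ranks and four experts give an expert sharding degree of four and a hidden sharding degree of two.

world_size, moe_num_experts, ffn_hidden_size = 8, 4, 4096

esd = min(world_size, moe_num_experts)      # expert_sharding_degree -> 4
hsd = world_size // esd                     # hidden_sharding_degree -> 2
assert moe_num_experts % esd == 0
assert ffn_hidden_size % hsd == 0 and esd * hsd == world_size

experts_per_rank = moe_num_experts // esd   # 1 expert per rank
features_per_rank = ffn_hidden_size // hsd  # 2048 features per rank
print(experts_per_rank, features_per_rank)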
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/router.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+
7
+ # from megablocks.layers import common
8
+ # from megablocks.layers.arguments import Arguments
9
+ from . import common
10
+ from .arguments import Arguments
11
+
12
+ _ROUTER_LOGITS = []
13
+
14
+
15
+ def _save_router_logits(logits: torch.Tensor, args: Arguments):
16
+ if args.moe_zloss_weight == 0:
17
+ return
18
+ global _ROUTER_LOGITS
19
+ _ROUTER_LOGITS.append(logits)
20
+
21
+
22
+ def clear_router_zloss():
23
+ global _ROUTER_LOGITS
24
+ _ROUTER_LOGITS.clear()
25
+
26
+
27
+ def batched_router_zloss(args: Arguments):
28
+ global _ROUTER_LOGITS
29
+
30
+ if args.moe_zloss_weight == 0:
31
+ import warnings
32
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
33
+ return 0
34
+
35
+ logits_per_router = _ROUTER_LOGITS
36
+
37
+ if args.moe_zloss_in_fp32:
38
+ logits_per_router = [logits.float() for logits in logits_per_router]
39
+
40
+ unscaled_zloss_per_router = torch.stack([
41
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
42
+ ])
43
+
44
+ return args.moe_zloss_weight * unscaled_zloss_per_router
45
+
46
+
47
+ # NOTE: To enable end-to-end benchmarking without convergence we
48
+ # support a flag to force the router to assign tokens uniformly
49
+ # across the experts. We do this with a custom autograd operation
50
+ # so that PyTorch still executes the full set of router operations.
51
+ class _UniformExpertAssignment(torch.autograd.Function):
52
+
53
+ @staticmethod
54
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
55
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
56
+ out = torch.remainder(out, num_experts)
57
+ return out.view(x.shape)
58
+
59
+
60
+ _uniform_expert_assignment = _UniformExpertAssignment.apply
61
+
62
+
63
+ class LearnedRouter(torch.nn.Module):
64
+
65
+ def __init__(self, args: Arguments):
66
+ super().__init__()
67
+ self.args = args
68
+
69
+ # Learned router parameters.
70
+ #
71
+ # NOTE: This weight matrix is not parallelized with expert model
72
+ # parallelism. Each device needs the entire router weight matrix
73
+ # so that it can route its batch of data correctly.
74
+ self.layer = torch.nn.Linear(
75
+ args.hidden_size,
76
+ args.moe_num_experts,
77
+ bias=False,
78
+ dtype=common.dtype(args),
79
+ device=args.device,
80
+ )
81
+ args.init_method(self.layer.weight)
82
+
83
+ def jitter(self, x: torch.Tensor):
84
+ low: float = 1.0 - self.args.moe_jitter_eps
85
+ high: float = 1.0 + self.args.moe_jitter_eps
86
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
87
+ return low + noise * (high - low)
88
+
89
+ def _top_k(self, scores: torch.Tensor):
90
+ if self.args.moe_top_k == 1:
91
+ return scores.max(dim=-1, keepdim=True)
92
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
93
+
94
+ def forward(self, x: torch.Tensor):
95
+ if self.training and self.args.moe_jitter_eps is not None:
96
+ x = x * self.jitter(x)
97
+
98
+ logits = self.layer(x.view(-1, x.shape[-1]))
99
+ _save_router_logits(logits, self.args)
100
+ scores = logits.softmax(dim=-1)
101
+ expert_weights, expert_indices = self._top_k(scores)
102
+ if self.args.moe_normalize_expert_weights:
103
+ expert_weights = expert_weights / torch.norm(
104
+ expert_weights,
105
+ p=self.args.moe_normalize_expert_weights,
106
+ dim=-1,
107
+ keepdim=True,
108
+ )
109
+
110
+ expert_indices = (
111
+ _uniform_expert_assignment(
112
+ expert_indices,
113
+ self.args.moe_num_experts,
114
+ ) if self.args.uniform_expert_assignment else expert_indices
115
+ )
116
+ return scores, expert_weights, expert_indices
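A minimal sketch of the routing math performed by LearnedRouter.forward, with hypothetical sizes (hidden_size = 8, moe_num_experts = 4, moe_top_k = 2) and a random router weight standing in for self.layer:

import torch

x = torch.randn(16, 8)                        # flattened tokens
router_weight = torch.randn(4, 8)             # stands in for self.layer.weight

logits = x @ router_weight.t()
scores = logits.softmax(dim=-1)
expert_weights, expert_indices = torch.topk(scores, k=2, dim=-1)

# Optional re-normalization, as when moe_normalize_expert_weights is set.
expert_weights = expert_weights / expert_weights.norm(p=1, dim=-1, keepdim=True)
print(expert_weights.shape, expert_indices.shape)  # both torch.Size([16, 2])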
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_layers/sharedexpert_registry.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Union
5
+
6
+ # from megablocks.layers import glu, mlp
7
+ # from megablocks.layers.arguments import Arguments
8
+ from . import glu, mlp
9
+ from .arguments import Arguments
10
+
11
+ _REGISTRY = {
12
+ 'mlp': mlp.SharedMLP,
13
+ 'glu': glu.SharedGLU,
14
+ }
15
+
16
+
17
+ def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
18
+ """Returns an SharedMLP for use in a dMoE instance.
19
+
20
+ Uses the provided arguments to instantiate the appropriate
21
+ SharedMLP instance.
22
+
23
+ Args:
24
+ args: propagated Arguments dataclass.
25
+
26
+ Returns:
27
+ An instantiated SharedMLP constructed using the input args.
28
+ """
29
+ if args.mlp_type not in _REGISTRY:
30
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
31
+
32
+ return _REGISTRY[args.mlp_type](args)
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_megablocks_rocm.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1181fa6e502b6f3fa75ce439a68f6dccb691af130934c3dd697f3efa5cb723
3
+ size 6437768
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_ops.py ADDED
@@ -0,0 +1,18 @@
1
+
2
+ import torch
3
+ from pathlib import Path
4
+
5
+ _LIB_NAME = "_megablocks_rocm.so"
6
+
7
+
8
+ def _load_ops():
9
+ lib_path = Path(__file__).with_name(_LIB_NAME)
10
+ torch.ops.load_library(str(lib_path))
11
+ return torch.ops._megablocks_rocm
12
+
13
+
14
+ ops = _load_ops()
15
+
16
+
17
+ def add_op_namespace_prefix(op_name: str) -> str:
18
+ return f"_megablocks_rocm::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/_version.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """The MegaBlocks Version."""
5
+
6
+ __version__ = '0.11.0.dev0'
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/backend/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/backend/kernels.py ADDED
@@ -0,0 +1,557 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+ import triton
6
+ import triton.language as tl
7
+
8
+ # Stub triton autotune when testing in an env that does not have CUDA;
9
+ # this approach preserves the original code but enables testing without a GPU
10
+ if not torch.cuda.is_available():
11
+ import warnings
12
+
13
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
14
+
15
+ def _no_autotune(*args, **kwargs):
16
+ def deco(fn):
17
+ return fn
18
+ return deco
19
+
20
+ triton.autotune = _no_autotune
21
+
22
+
23
+ def assert_is_tensor(x, ndim):
24
+ if x.ndim != ndim:
25
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
26
+
27
+
28
+ def assert_is_matrix(x):
29
+ assert_is_tensor(x, 2)
30
+
31
+
32
+ def assert_is_vector(x):
33
+ if x.ndim != 1:
34
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
35
+
36
+
37
+ def assert_equal(a, b):
38
+ if a != b:
39
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
40
+
41
+
42
+ # a: (tokens, hidden_size), real.
43
+ # indices: (tokens * top_k), integer.
44
+ # bin_ids: (tokens * top_k), integer.
45
+ # weights: (tokens * top_k), real.
46
+ # bins: (num_experts), integer.
47
+ # padded_bins: (num_experts), integer.
48
+ @triton.autotune(
49
+ configs=[
50
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
51
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
52
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
53
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
54
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
55
+ ],
56
+ key=['NUM_COLUMNS'],
57
+ )
58
+ @triton.jit
59
+ def _padded_copy(
60
+ a,
61
+ b,
62
+ indices,
63
+ bin_ids,
64
+ weights,
65
+ bins,
66
+ padded_bins,
67
+ NUM_COLUMNS: tl.constexpr,
68
+ TOP_K: tl.constexpr,
69
+ BLOCK_X: tl.constexpr,
70
+ A_TO_B: tl.constexpr,
71
+ SCALE: tl.constexpr,
72
+ ):
73
+ # Our index into array 'a'.
74
+ index_a = tl.load(indices + tl.program_id(0))
75
+
76
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
77
+ # number of rows since they could be padded.
78
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
79
+
80
+ # Now we know what bin we're assigned to, but we need to know how
81
+ # many threadblocks were assigned to earlier bins so we can offset
82
+ # in our bin properly.
83
+ offset_in_bin = tl.program_id(0)
84
+ if bin_idx > 0:
85
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
86
+
87
+ # Load the starting index of our bin in array 'b'.
88
+ index_b = offset_in_bin
89
+ if bin_idx > 0:
90
+ index_b += tl.load(padded_bins + bin_idx - 1)
91
+
92
+ # Offset the input and output pointers.
93
+ #
94
+ # If we're going from A to B, divide the input index to copy
95
+ # the same input repeatedly. If we're going from B to A we
96
+ # need to reduce the result. Using atomics is slow, so we
97
+ # do the reduce step in a second kernel.
98
+ offset = index_a // TOP_K if A_TO_B else index_a
99
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
100
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
101
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
102
+
103
+ # Load the scale, if requested.
104
+ scale = tl.load(weights + index_a) if SCALE else 1
105
+
106
+ # Swap the pointers depending on the direction.
107
+ iptr = a if A_TO_B else b
108
+ optr = b if A_TO_B else a
109
+
110
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
111
+ for _ in range(iterations):
112
+ mask = offsets < NUM_COLUMNS
113
+ x = tl.load(iptr + offsets, mask=mask)
114
+ x = x.to(tl.float32) * scale.to(tl.float32)
115
+
116
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
117
+
118
+ offsets += BLOCK_X
119
+
120
+
121
+ def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
122
+ # Validate the input shapes.
123
+ assert_is_matrix(x)
124
+ assert_is_vector(indices)
125
+ assert_is_vector(bin_ids)
126
+ assert_is_vector(bins)
127
+ assert_is_vector(padded_bins)
128
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
129
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
130
+ assert_equal(bins.size(), padded_bins.size())
131
+
132
+ if weights is not None:
133
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
134
+
135
+ # NOTE: Because of the padding, the output size is dynamic.
136
+ # We load the final padded bin bound to get the output rows.
137
+ output_rows = padded_bins[-1].cpu().item()
138
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
139
+ _padded_copy[(indices.shape[0],)](
140
+ x,
141
+ out,
142
+ indices,
143
+ bin_ids,
144
+ weights,
145
+ bins,
146
+ padded_bins,
147
+ NUM_COLUMNS=x.shape[1],
148
+ A_TO_B=True,
149
+ TOP_K=top_k,
150
+ SCALE=weights is not None,
151
+ )
152
+ return out
153
+
154
+
155
+ def gather(x, indices, bin_ids, weights, bins, top_k):
156
+ # Validate the input shapes.
157
+ assert_is_matrix(x)
158
+ assert_is_vector(indices)
159
+ assert_is_vector(bin_ids)
160
+ assert_is_vector(bins)
161
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
162
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
163
+
164
+ if weights is not None:
165
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
166
+
167
+ # NOTE: There is no padding so the output rows equals the
168
+ # input rows multiplied by top_k.
169
+ output_rows = x.shape[0] * top_k
170
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
171
+ _padded_copy[(indices.shape[0],)](
172
+ x,
173
+ out,
174
+ indices,
175
+ bin_ids,
176
+ weights,
177
+ bins,
178
+ bins,
179
+ NUM_COLUMNS=x.shape[1],
180
+ A_TO_B=True,
181
+ TOP_K=top_k,
182
+ SCALE=weights is not None,
183
+ )
184
+ return out
185
+
186
+
187
+ def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
188
+ # Validate the input shapes.
189
+ assert_is_matrix(x)
190
+ assert_is_vector(indices)
191
+ assert_is_vector(bin_ids)
192
+ assert_is_vector(bins)
193
+ assert_is_vector(padded_bins)
194
+ assert_equal(indices.shape[0], bin_ids.shape[0])
195
+ assert_equal(bins.size(), padded_bins.size())
196
+
197
+ if weights is not None:
198
+ assert_equal(indices.shape[0], weights.shape[0])
199
+
200
+ tokens = indices.shape[0] // top_k
201
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
202
+ _padded_copy[(indices.shape[0],)](
203
+ out,
204
+ x,
205
+ indices,
206
+ bin_ids,
207
+ weights,
208
+ bins,
209
+ padded_bins,
210
+ NUM_COLUMNS=x.shape[1],
211
+ A_TO_B=False,
212
+ TOP_K=top_k,
213
+ SCALE=weights is not None,
214
+ )
215
+
216
+ # Reduce along the top-k dimension, if needed.
217
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
218
+
219
+
220
+ def scatter(x, indices, bin_ids, weights, bins, top_k):
221
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
222
+
223
+
224
+ # x: (tokens, top_k, hidden_size), real
225
+ # grad: (tokens, hidden_size), real.
226
+ # wgrad: (tokens, top_k), real.
227
+ # indices: (tokens * top_k), integer.
228
+ # bin_ids: (tokens * top_k), integer.
229
+ # bins: (num_experts), integer.
230
+ # padded_bins: (num_experts), integer.
231
+ @triton.autotune(
232
+ configs=[
233
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
234
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
235
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
236
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
237
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
238
+ ],
239
+ key=['NUM_COLUMNS'],
240
+ )
241
+ @triton.jit
242
+ def _padded_copy_wgrad(
243
+ x,
244
+ grad,
245
+ wgrad,
246
+ indices,
247
+ bin_ids,
248
+ bins,
249
+ padded_bins,
250
+ NUM_COLUMNS: tl.constexpr,
251
+ TOP_K: tl.constexpr,
252
+ BLOCK_X: tl.constexpr,
253
+ ):
254
+ # Our index into 'tokens * top_k'.
255
+ index_out = tl.load(indices + tl.program_id(0))
256
+
257
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
258
+ # number of rows since they could be padded.
259
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
260
+
261
+ # Now we know what bin we're assigned to, but we need to know how
262
+ # many threadblocks were assigned to earlier bins so we can offset
263
+ # in our bin properly.
264
+ offset_in_bin = tl.program_id(0)
265
+ if bin_idx > 0:
266
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
267
+
268
+ # Load the starting index of our bin in array 'x'.
269
+ index_x = offset_in_bin
270
+ if bin_idx > 0:
271
+ index_x += tl.load(padded_bins + bin_idx - 1)
272
+
273
+ # Offset the input and output pointers.
274
+ wgrad += index_out
275
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
276
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
277
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
278
+
279
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
280
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
281
+ for _ in range(iterations):
282
+ mask = offsets < NUM_COLUMNS
283
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
284
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
285
+ acc += data * scale
286
+ offsets += BLOCK_X
287
+
288
+ # Reduce to get the final result and store.
289
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
290
+ tl.store(wgrad, out)
291
+
292
+
293
+ def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
294
+ # Validate the input shapes.
295
+ assert_is_matrix(x)
296
+ assert_is_matrix(grad)
297
+ assert_is_vector(indices)
298
+ assert_is_vector(bin_ids)
299
+ assert_is_vector(bins)
300
+ assert_is_vector(padded_bins)
301
+ assert_equal(indices.shape[0], bin_ids.shape[0])
302
+ assert_equal(bins.size(), padded_bins.size())
303
+
304
+ tokens = indices.shape[0] // top_k
305
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
306
+ _padded_copy_wgrad[(indices.shape[0],)](
307
+ x,
308
+ grad,
309
+ out,
310
+ indices,
311
+ bin_ids,
312
+ bins,
313
+ padded_bins,
314
+ NUM_COLUMNS=x.shape[1],
315
+ TOP_K=top_k,
316
+ )
317
+ return out
318
+
319
+
320
+ def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
321
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
322
+
323
+
324
+ # a: (tokens, hidden_size), real.
325
+ # b: (num_experts, expert_capacity, num_columns), real.
326
+ # indices: (tokens * top_k), integer.
327
+ # weights: (tokens * top_k), real.
328
+ # bins: (num_experts), integer.
329
+ @triton.autotune(
330
+ configs=[
331
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
332
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
333
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
334
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
335
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
336
+ ],
337
+ key=['NUM_COLUMNS'],
338
+ )
339
+ @triton.jit
340
+ def _binned_copy(
341
+ a,
342
+ b,
343
+ num_experts,
344
+ expert_capacity,
345
+ indices,
346
+ weights,
347
+ bins,
348
+ NUM_COLUMNS: tl.constexpr,
349
+ TOP_K: tl.constexpr,
350
+ BLOCK_X: tl.constexpr,
351
+ A_TO_B: tl.constexpr,
352
+ SCALE: tl.constexpr,
353
+ ):
354
+ # Load our indices into the output.
355
+ expert_idx = tl.program_id(0)
356
+ entry_idx = tl.program_id(1)
357
+
358
+ # Calculate our offset into the output.
359
+ index_b = expert_idx * expert_capacity + entry_idx
360
+
361
+ # Load the index bounds for our bin and calculate
362
+ # the number of tokens assigned to our expert.
363
+ start = 0
364
+ if expert_idx > 0:
365
+ start = tl.load(bins + expert_idx - 1)
366
+ end = tl.load(bins + expert_idx)
367
+ num_tokens = end - start
368
+
369
+ # Calculate our offset into the input. If we don't
370
+ # have an input exit early.
371
+ if entry_idx >= num_tokens:
372
+ return
373
+ index_a = tl.load(indices + start + entry_idx)
374
+
375
+ # Offset the input and output pointers.
376
+ #
377
+ # If we're going from A to B, divide the input index to copy
378
+ # the same input repeatedly. If we're going from B to A we
379
+ # need to reduce the result. Using atomics is slow, so we
380
+ # do the reduce step in a second kernel.
381
+ offset = index_a // TOP_K if A_TO_B else index_a
382
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
383
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
384
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
385
+
386
+ # Load the scale, if requested.
387
+ scale = tl.load(weights + index_a) if SCALE else 1
388
+
389
+ # Swap the pointers depending on the direction.
390
+ #
391
+ # NOTE: We need to zero the output in both directions.
392
+ iptr = a if A_TO_B else b
393
+ optr = b if A_TO_B else a
394
+
395
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
396
+ for _ in range(iterations):
397
+ mask = offsets < NUM_COLUMNS
398
+ x = tl.load(iptr + offsets, mask=mask)
399
+ x = x.to(tl.float32) * scale.to(tl.float32)
400
+
401
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
402
+
403
+ offsets += BLOCK_X
404
+
405
+
406
+ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
407
+ # Validate the input shapes.
408
+ assert_is_matrix(x)
409
+ assert_is_vector(indices)
410
+ assert_is_vector(bins)
411
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
412
+
413
+ if weights is not None:
414
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
415
+
416
+ num_experts = bins.shape[0]
417
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
418
+
419
+ _binned_copy[(num_experts, expert_capacity)](
420
+ x,
421
+ out,
422
+ num_experts,
423
+ expert_capacity,
424
+ indices,
425
+ weights,
426
+ bins,
427
+ NUM_COLUMNS=x.shape[1],
428
+ A_TO_B=True,
429
+ TOP_K=top_k,
430
+ SCALE=weights is not None,
431
+ )
432
+ return out
433
+
434
+
435
+ def binned_scatter(x, indices, weights, bins, top_k):
436
+ # Validate the input shapes.
437
+ assert_is_tensor(x, 3)
438
+ assert_is_vector(indices)
439
+ assert_is_vector(bins)
440
+ assert_equal(bins.shape[0], x.shape[0])
441
+
442
+ if weights is not None:
443
+ assert_equal(indices.shape[0], weights.shape[0])
444
+
445
+ num_experts, expert_capacity, hidden_size = x.shape
446
+ tokens = indices.shape[0] // top_k
447
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
448
+ _binned_copy[(num_experts, expert_capacity)](
449
+ out,
450
+ x,
451
+ num_experts,
452
+ expert_capacity,
453
+ indices,
454
+ weights,
455
+ bins,
456
+ NUM_COLUMNS=hidden_size,
457
+ A_TO_B=False,
458
+ TOP_K=top_k,
459
+ SCALE=weights is not None,
460
+ )
461
+
462
+ # Reduce along the top-k dimension, if needed.
463
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
464
+
465
+
466
+ # a: (tokens, hidden_size), real.
467
+ # b: (num_experts, expert_capacity, num_columns), real.
468
+ # indices: (tokens * top_k), integer.
469
+ # weights: (tokens * top_k), real.
470
+ # bins: (num_experts), integer.
471
+ @triton.autotune(
472
+ configs=[
473
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
474
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
475
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
476
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
477
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
478
+ ],
479
+ key=['NUM_COLUMNS'],
480
+ )
481
+ @triton.jit
482
+ def _binned_copy_wgrad(
483
+ x,
484
+ grad,
485
+ wgrad,
486
+ num_experts,
487
+ expert_capacity,
488
+ indices,
489
+ bins,
490
+ NUM_COLUMNS: tl.constexpr,
491
+ TOP_K: tl.constexpr,
492
+ BLOCK_X: tl.constexpr,
493
+ ):
494
+ # Load our indices into the output.
495
+ expert_idx = tl.program_id(0)
496
+ entry_idx = tl.program_id(1)
497
+
498
+ # Calculate our offset into the output.
499
+ index_x = expert_idx * expert_capacity + entry_idx
500
+
501
+ # Load the index bounds for our bin and calculate
502
+ # the number of tokens assigned to our expert.
503
+ start = 0
504
+ if expert_idx > 0:
505
+ start = tl.load(bins + expert_idx - 1)
506
+ end = tl.load(bins + expert_idx)
507
+ num_tokens = end - start
508
+
509
+ # Calculate our offset into the input. If we don't
510
+ # have an input, exit early.
511
+ if entry_idx >= num_tokens:
512
+ return
513
+ index_out = tl.load(indices + start + entry_idx)
514
+
515
+ # Offset the input and output pointers.
516
+ wgrad += index_out
517
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
518
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
519
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
520
+
521
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
522
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
523
+ for _ in range(iterations):
524
+ mask = offsets < NUM_COLUMNS
525
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
526
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
527
+ acc += data * scale
528
+ offsets += BLOCK_X
529
+
530
+ # Reduce to get the final result and store.
531
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
532
+ tl.store(wgrad, out)
533
+
534
+
535
+ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
536
+ # Validate the input shapes.
537
+ assert_is_tensor(x, 3)
538
+ assert_is_matrix(grad)
539
+ assert_is_vector(indices)
540
+ assert_is_vector(bins)
541
+ assert_equal(bins.shape[0], x.shape[0])
542
+
543
+ num_experts, expert_capacity, hidden_size = x.shape
544
+ tokens = indices.shape[0] // top_k
545
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
546
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
547
+ x,
548
+ grad,
549
+ out,
550
+ num_experts,
551
+ expert_capacity,
552
+ indices,
553
+ bins,
554
+ NUM_COLUMNS=hidden_size,
555
+ TOP_K=top_k,
556
+ )
557
+ return out
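The binned kernels above are easiest to follow with a tiny round trip. This is an illustrative sketch only, assuming a ROCm/CUDA device with Triton available and using the `binned_gather`/`binned_scatter` wrappers defined in this file; all sizes are arbitrary.

import torch

tokens, hidden, num_experts, top_k = 8, 16, 4, 1
x = torch.randn(tokens, hidden, device="cuda", dtype=torch.float16)
top_expert = torch.randint(0, num_experts, (tokens,), device="cuda")

# Token ids sorted by their assigned expert, plus inclusive-cumsum bin boundaries.
indices = torch.argsort(top_expert).to(torch.int32)
counts = torch.bincount(top_expert, minlength=num_experts)
bins = torch.cumsum(counts, 0).to(torch.int32)
capacity = int(counts.max())

gathered = binned_gather(x, indices, None, bins, capacity, top_k)  # (num_experts, capacity, hidden)
restored = binned_scatter(gathered, indices, None, bins, top_k)    # (tokens, hidden)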
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/bak.__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ from megablocks_moe.megablocks import (
2
+ MoE,
3
+ dMoE,
4
+ get_load_balancing_loss,
5
+ ParallelMLP,
6
+ ParallelDroplessMLP,
7
+ SparseMLP,
8
+ MLP,
9
+ SparseGLU,
10
+ Arguments,
11
+ )
12
+
13
+ __all__ = [
14
+ "MoE",
15
+ "dMoE",
16
+ "get_load_balancing_loss",
17
+ "ParallelMLP",
18
+ "ParallelDroplessMLP",
19
+ "SparseMLP",
20
+ "MLP",
21
+ "SparseGLU",
22
+ "Arguments",
23
+ ]
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/benchmark_util.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+ def log_benchmark(name, arguments, time, std):
9
+ print('=' * 60)
10
+ print(f'{name} Benchmark')
11
+ print('Benchmark Parameters:')
12
+ for (key, value) in arguments.items():
13
+ print(f'{key} = {value}')
14
+ print('Results:')
15
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
16
+ print('=' * 60)
17
+
18
+
19
+ def benchmark_function(fn, iterations=100, warmup=10):
20
+ # Warmup iterations.
21
+ for _ in range(warmup):
22
+ fn()
23
+
24
+ times = []
25
+ for i in range(iterations):
26
+ start = torch.cuda.Event(enable_timing=True)
27
+ end = torch.cuda.Event(enable_timing=True)
28
+
29
+ start.record()
30
+ fn()
31
+ end.record()
32
+
33
+ torch.cuda.synchronize()
34
+ times.append(start.elapsed_time(end))
35
+ return np.mean(times), np.std(times)
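For reference, a minimal usage sketch of the helpers above (illustrative only; it assumes a CUDA/ROCm device, since the timing relies on CUDA events):

import torch

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

mean_ms, std_ms = benchmark_function(lambda: a @ b)
log_benchmark("MatMul", {"m": 1024, "n": 1024, "k": 1024}, mean_ms, std_ms)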
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from . import ops
2
+ from . import backend
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/backend.py ADDED
@@ -0,0 +1,33 @@
1
+ # NOTE: Torch needs to be imported before the custom
2
+ # extensions. Otherwise libc10.so cannot be found.
3
+ import torch
4
+
5
+ # # TODO(tgale): Wrap this in a try-block with better
6
+ # # error message and instructions for building the
7
+ # # c++ operations.
8
+ # import grouped_gemm_backend as backend
9
+
10
+ # We import the backend operations from the megablocks package as
11
+ # grouped_gemm is vendored in megablocks in this repository.
12
+ # from ... import _ops as backend
13
+ # from megablocks._ops import ops as backend # type: ignore
14
+ from .._ops import ops as backend # type: ignore
15
+
16
+ def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
17
+ assert not (trans_a and trans_b)
18
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
19
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
20
+ assert b.ndim == (2 if trans_a else 3)
21
+
22
+ shape = (
23
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
24
+ if trans_a else
25
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
26
+ )
27
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
28
+
29
+ def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
30
+ if c is None:
31
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
32
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
33
+ return c
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm/ops.py ADDED
@@ -0,0 +1,33 @@
1
+ from . import backend
2
+ import torch
3
+
4
+
5
+ class GroupedGemm(torch.autograd.Function):
6
+
7
+ @staticmethod
8
+ def forward(ctx, a, b, batch_sizes, trans_b):
9
+ ctx.save_for_backward(a, b, batch_sizes)
10
+ ctx.trans_b = trans_b
11
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
12
+
13
+ @staticmethod
14
+ def backward(ctx, grad):
15
+ grad = grad.contiguous()
16
+ a, b, batch_sizes = ctx.saved_tensors
17
+ trans_b = ctx.trans_b
18
+
19
+ agrad = None
20
+ if ctx.needs_input_grad[0]:
21
+ agrad = backend.gmm(
22
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
23
+
24
+ bgrad = None
25
+ if ctx.needs_input_grad[1]:
26
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
27
+ bgrad = backend.gmm(
28
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
29
+ return agrad, bgrad, None, None
30
+
31
+
32
+ def gmm(a, b, batch_sizes, trans_b=False):
33
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
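A hedged usage sketch of the autograd wrapper above, assuming the vendored ROCm grouped-GEMM kernel is built for this target and that `batch_sizes` is a CPU int64 tensor, as in the upstream grouped_gemm API:

import torch

num_experts, k, n = 4, 64, 128
batch_sizes = torch.tensor([3, 5, 2, 6], dtype=torch.long)  # rows of `a` per expert, kept on CPU
a = torch.randn(int(batch_sizes.sum()), k, device="cuda", dtype=torch.bfloat16, requires_grad=True)
b = torch.randn(num_experts, k, n, device="cuda", dtype=torch.bfloat16, requires_grad=True)

out = gmm(a, b, batch_sizes)  # (sum(batch_sizes), n)
out.float().sum().backward()  # gradients flow to both `a` and `b`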
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/grouped_gemm_util.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import warnings
4
+
5
+ _grouped_gemm_is_available: bool = False
6
+ try:
7
+ # import grouped_gemm
8
+ pass
9
+ _grouped_gemm_is_available = True
10
+ except ImportError as error:
11
+ warnings.warn('Grouped GEMM not available.')
12
+
13
+
14
+ def grouped_gemm_is_available():
15
+ return _grouped_gemm_is_available
16
+
17
+
18
+ def assert_grouped_gemm_is_available():
19
+ msg = (
20
+ 'Grouped GEMM not available. Please run '
21
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
22
+ )
23
+ assert _grouped_gemm_is_available, msg
24
+
25
+
26
+ # backend = grouped_gemm.backend if grouped_gemm_is_available() else None
27
+ # ops = grouped_gemm.ops if grouped_gemm_is_available() else None
28
+
29
+
30
+ from .grouped_gemm import backend as backend
31
+ from .grouped_gemm import ops as ops
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/layers.py ADDED
@@ -0,0 +1,1225 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+
4
+ from typing import Optional, Any, TYPE_CHECKING
5
+
6
+ from . import _layers
7
+ from . import ops
8
+
9
+ # Conditional import for meta kernel registration
10
+ if TYPE_CHECKING:
11
+
12
+ def register_fake(fn):
13
+ return lambda name: fn
14
+
15
+ else:
16
+ try:
17
+ from torch.library import register_fake
18
+ except ImportError:
19
+ try:
20
+ from torch.library import impl_abstract as register_fake
21
+ except ImportError:
22
+ # Fallback for older PyTorch versions
23
+ def register_fake(op_name):
24
+ def decorator(fn):
25
+ return fn
26
+
27
+ return decorator
28
+
29
+
30
+ # Meta kernel implementations for torch.compile compatibility
31
+ def _install_meta_kernels():
32
+ """Install meta kernels for existing MegaBlocks operations"""
33
+
34
+ # Create wrapper functions that check for compilation and return meta tensors
35
+
36
+ # Patch ops.sort
37
+ if hasattr(ops, "sort"):
38
+ original_sort = ops.sort
39
+
40
+ def sort_with_meta(x, end_bit=None):
41
+ if torch.compiler.is_compiling():
42
+ # print("Using meta kernel for sort")
43
+ # Meta implementation - return tensors with correct shape/dtype/device
44
+ return torch.empty_like(x), torch.empty_like(x)
45
+ # print("Using original sort kernel")
46
+ return original_sort(x, end_bit)
47
+
48
+ ops.sort = sort_with_meta
49
+
50
+ # Patch ops.histogram
51
+ if hasattr(ops, "histogram"):
52
+ original_histogram = ops.histogram
53
+
54
+ def histogram_with_meta(x, max_val):
55
+ if torch.compiler.is_compiling():
56
+ # Meta implementation
57
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
58
+ return original_histogram(x, max_val)
59
+
60
+ ops.histogram = histogram_with_meta
61
+
62
+ # Patch ops.inclusive_cumsum
63
+ if hasattr(ops, "inclusive_cumsum"):
64
+ original_inclusive_cumsum = ops.inclusive_cumsum
65
+
66
+ def inclusive_cumsum_with_meta(x, dim):
67
+ if torch.compiler.is_compiling():
68
+ # Meta implementation
69
+ return torch.empty_like(x)
70
+ return original_inclusive_cumsum(x, dim)
71
+
72
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
73
+
74
+ # Patch ops.binned_gather
75
+ if hasattr(ops, "binned_gather"):
76
+ original_binned_gather = ops.binned_gather
77
+
78
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
79
+ if torch.compiler.is_compiling():
80
+ # Meta implementation - output shape based on bin_size
81
+ if x.dim() >= 2:
82
+ hidden_size = x.size(-1)
83
+ return torch.empty(
84
+ (bin_size, x.size(1), hidden_size),
85
+ dtype=x.dtype,
86
+ device=x.device,
87
+ )
88
+ else:
89
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
90
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
91
+
92
+ ops.binned_gather = binned_gather_with_meta
93
+
94
+ # Patch ops.binned_scatter
95
+ if hasattr(ops, "binned_scatter"):
96
+ original_binned_scatter = ops.binned_scatter
97
+
98
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
99
+ if torch.compiler.is_compiling():
100
+ # Meta implementation - typically reduces to 2D
101
+ if x.dim() >= 3:
102
+ return torch.empty(
103
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
104
+ )
105
+ else:
106
+ return torch.empty_like(x)
107
+ return original_binned_scatter(x, indices, weights, bins, top_k)
108
+
109
+ ops.binned_scatter = binned_scatter_with_meta
110
+
111
+ # Patch ops.gather
112
+ if hasattr(ops, "gather"):
113
+ original_gather = ops.gather
114
+
115
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
116
+ if torch.compiler.is_compiling():
117
+ # Meta implementation
118
+ if x.dim() >= 2:
119
+ hidden_size = x.size(-1)
120
+ return torch.empty(
121
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
122
+ )
123
+ else:
124
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
125
+ return original_gather(x, indices, bin_ids, bins, top_k)
126
+
127
+ ops.gather = gather_with_meta
128
+
129
+ # Patch ops.scatter
130
+ if hasattr(ops, "scatter"):
131
+ original_scatter = ops.scatter
132
+
133
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
134
+ if torch.compiler.is_compiling():
135
+ # Meta implementation - restore sequence shape
136
+ seq_len = (
137
+ indices.size(0) // top_k
138
+ if indices.numel() > 0 and top_k > 0
139
+ else x.size(0)
140
+ )
141
+ if x.dim() >= 2:
142
+ return torch.empty(
143
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
144
+ )
145
+ else:
146
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
147
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
148
+
149
+ ops.scatter = scatter_with_meta
150
+
151
+ # Patch ops.replicate
152
+ if hasattr(ops, "replicate"):
153
+ original_replicate = ops.replicate
154
+
155
+ def replicate_with_meta(x, bins, num_outputs):
156
+ if torch.compiler.is_compiling():
157
+ # Meta implementation
158
+ return torch.empty(
159
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
160
+ )
161
+ return original_replicate(x, bins, num_outputs)
162
+
163
+ ops.replicate = replicate_with_meta
164
+
165
+ # Patch ops.repeat (if it's a regular function)
166
+ if hasattr(ops, "repeat"):
167
+ original_repeat = ops.repeat
168
+
169
+ def repeat_with_meta(x, repeats):
170
+ if torch.compiler.is_compiling():
171
+ # Meta implementation
172
+ if isinstance(repeats, (tuple, list)):
173
+ new_shape = list(x.shape)
174
+ for i, rep in enumerate(repeats):
175
+ if i < len(new_shape):
176
+ new_shape[i] *= rep
177
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
178
+ else:
179
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
180
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
181
+ return original_repeat(x, repeats)
182
+
183
+ ops.repeat = repeat_with_meta
184
+
185
+
186
+ # Install meta kernels on import
187
+ try:
188
+ _install_meta_kernels()
189
+ except Exception as e:
190
+ # If meta kernel installation fails, continue without them
191
+ # torch.compile may not work but the library will still function
192
+ import warnings
193
+
194
+ warnings.warn(
195
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
196
+ )
197
+
198
+
199
+ # Set the expert model parallel attributes on a tensor
200
+ def set_expert_model_parallel_attributes(
201
+ tensor: torch.Tensor,
202
+ is_parallel: bool,
203
+ ):
204
+ assert not hasattr(tensor, "expert_model_parallel")
205
+ setattr(tensor, "expert_model_parallel", is_parallel)
206
+
207
+
208
+ # Get the expert model parallel attributes from a tensor
209
+ def expert_sharding_degree(
210
+ world_size: int,
211
+ moe_num_experts: int,
212
+ ) -> int:
213
+ esd = min(world_size, moe_num_experts)
214
+ if (moe_num_experts % esd) != 0:
215
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
216
+ return esd
217
+
218
+
219
+ # Calculate the hidden sharding degree based on world size and expert sharding degree
220
+ def hidden_sharding_degree(
221
+ world_size: int,
222
+ moe_num_experts: int,
223
+ ffn_hidden_size: int,
224
+ ) -> int:
225
+ esd = expert_sharding_degree(world_size, moe_num_experts)
226
+ hsd = world_size // esd
227
+ if (ffn_hidden_size % hsd) != 0:
228
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
229
+ if (esd * hsd) != world_size:
230
+ raise ValueError(
231
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
232
+ )
233
+ return hsd
234
+
235
+
236
+ # Calculate the number of experts per rank based on world size and expert sharding degree
237
+ def experts_per_rank(
238
+ moe_num_experts: int,
239
+ world_size: int,
240
+ ) -> int:
241
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
242
+
243
+
244
+ # Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
245
+ def features_per_rank(
246
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
247
+ ) -> int:
248
+ return ffn_hidden_size // hidden_sharding_degree(
249
+ world_size, moe_num_experts, ffn_hidden_size
250
+ )
251
+
252
+
253
+ # Apply jitter to the input tensor
254
+ def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
255
+ low = 1.0 - moe_jitter_eps
256
+ high = 1.0 + moe_jitter_eps
257
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
258
+ return x * (low + noise * (high - low))
259
+
260
+
261
+ # Compute the top-k scores from the logits
262
+ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
263
+ if moe_top_k == 1:
264
+ return scores.max(dim=-1, keepdim=True)
265
+ return torch.topk(scores, moe_top_k, dim=-1)
266
+
267
+
268
+ # Route tokens to experts and compute expert weights and indices
269
+ def route_tokens(
270
+ x: torch.Tensor,
271
+ router_weight: torch.Tensor,
272
+ router_bias: torch.Tensor,
273
+ moe_top_k: int,
274
+ moe_num_experts: int,
275
+ moe_jitter_eps: float = None,
276
+ moe_normalize_expert_weights: int = None,
277
+ uniform_expert_assignment: bool = False,
278
+ training: bool = False,
279
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
280
+ if training and moe_jitter_eps is not None:
281
+ x = apply_jitter(x, moe_jitter_eps)
282
+
283
+ x_flat = x.view(-1, x.shape[-1])
284
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
285
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
286
+ expert_weights = expert_weights.softmax(dim=-1)
287
+ if moe_normalize_expert_weights is not None:
288
+ expert_weights = expert_weights / torch.norm(
289
+ expert_weights,
290
+ p=moe_normalize_expert_weights,
291
+ dim=-1,
292
+ keepdim=True,
293
+ )
294
+ if uniform_expert_assignment:
295
+ expert_indices = _layers.router._uniform_expert_assignment(
296
+ expert_indices,
297
+ moe_num_experts,
298
+ )
299
+
300
+ return logits, expert_weights, expert_indices
301
+
302
+
303
+ # Scale the gradient of the weights
304
+ def scale_grad(
305
+ w: torch.Tensor,
306
+ gradient_scale: Optional[float] = None,
307
+ ) -> torch.Tensor:
308
+ if gradient_scale is None:
309
+ return w
310
+ return _layers.mlp.scale_gradient(w, gradient_scale)
311
+
312
+
313
+ # Forward pass for the MLP layer
314
+ def mlp_forward(
315
+ x: torch.Tensor,
316
+ w1: torch.Tensor,
317
+ w2: torch.Tensor,
318
+ w1_bias: torch.Tensor,
319
+ w2_bias: torch.Tensor,
320
+ gradient_scale: Optional[float] = None,
321
+ alpha: float = 1.702,
322
+ limit: float = 7.0,
323
+ ):
324
+ # Scale weights
325
+ w1 = scale_grad(w1, gradient_scale)
326
+ w2 = scale_grad(w2, gradient_scale)
327
+ w1_bias = scale_grad(w1_bias, gradient_scale)
328
+ w2_bias = scale_grad(w2_bias, gradient_scale)
329
+
330
+ # Resolve dtensors
331
+ w1 = _layers.mlp.resolve_dtensor(w1)
332
+ w2 = _layers.mlp.resolve_dtensor(w2)
333
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
334
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
335
+
336
+ # Forward pass
337
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
338
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
339
+ gate = gate.clamp(min=None, max=limit)
340
+ up = up.clamp(min=-limit, max=limit)
341
+ glu = gate * torch.sigmoid(gate * alpha)
342
+ next_states = torch.bmm(((up + 1) * glu), w2)
343
+ next_states += w2_bias[..., None, :]
344
+ return next_states
345
+
346
+ # Shared expert MLP forward pass
347
+ def shared_mlp_forward(
348
+ x: torch.Tensor,
349
+ up_proj_weight: torch.Tensor,
350
+ down_proj_weight: torch.Tensor,
351
+ up_proj_bias: Optional[torch.Tensor] = None,
352
+ down_proj_bias: Optional[torch.Tensor] = None,
353
+ activation_fn: Optional[Any] = None,
354
+ gradient_scale: Optional[float] = None,
355
+ ) -> torch.Tensor:
356
+ # Default activation function
357
+ if activation_fn is None:
358
+ activation_fn = torch.nn.functional.gelu
359
+
360
+ # Scale weights
361
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
362
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
363
+ if up_proj_bias is not None:
364
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
365
+ if down_proj_bias is not None:
366
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
367
+
368
+ # Resolve dtensors
369
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
370
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
371
+ if up_proj_bias is not None:
372
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
373
+ if down_proj_bias is not None:
374
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
375
+
376
+ # Up projection
377
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
378
+
379
+ # Activation
380
+ x = activation_fn(x)
381
+
382
+ # Down projection
383
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
384
+
385
+ return x
386
+
387
+
388
+ # Combine outputs from shared expert and regular experts
389
+ def combine_expert_shared_outputs(
390
+ shared_expert_out: torch.Tensor,
391
+ expert_out: torch.Tensor,
392
+ shared_expert_weighted_sum: bool = False,
393
+ moe_top_k: int = 1,
394
+ ) -> torch.Tensor:
395
+ if shared_expert_weighted_sum:
396
+ # Weighted sum based on number of experts used
397
+ total_experts = moe_top_k + 1
398
+ shared_weight = 1.0 / total_experts
399
+ expert_weight = moe_top_k / total_experts
400
+ return shared_expert_out * shared_weight + expert_out * expert_weight
401
+ else:
402
+ # Simple addition
403
+ return shared_expert_out + expert_out
404
+
405
+
406
+ # Global variable to store load balancing loss
407
+ _LOAD_BALANCING_LOSS = []
408
+
409
+
410
+ def save_load_balancing_loss(loss):
411
+ global _LOAD_BALANCING_LOSS
412
+ _LOAD_BALANCING_LOSS.append(loss)
413
+
414
+
415
+ def get_load_balancing_loss():
416
+ global _LOAD_BALANCING_LOSS
417
+ return _LOAD_BALANCING_LOSS
418
+
419
+
420
+ def clear_load_balancing_loss():
421
+ global _LOAD_BALANCING_LOSS
422
+ _LOAD_BALANCING_LOSS.clear()
423
+
424
+
425
+ def batched_load_balancing_loss(args):
426
+ if args.moe_loss_weight == 0:
427
+ return 0.0
428
+
429
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
430
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
431
+ if args.num_layers_per_virtual_pipeline_stage is not None:
432
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
433
+
434
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
435
+ raise ValueError(
436
+ f"Expected {num_layers_per_pipeline_stage} tokens_per_expert entries "
437
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
438
+ f"{args.num_layers}\npipeline_model_parallel_size = "
439
+ f"{args.pipeline_model_parallel_size}\n"
440
+ "num_layers_per_virtual_pipeline_stage"
441
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
442
+ )
443
+ if len(expert_scores) != num_layers_per_pipeline_stage:
444
+ raise ValueError(
445
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
446
+ f"but found {len(expert_scores)}.\nnum_layers = "
447
+ f"{args.num_layers}\npipeline_model_parallel_size = "
448
+ f"{args.pipeline_model_parallel_size}\n"
449
+ "num_layers_per_virtual_pipeline_stage"
450
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
451
+ )
452
+
453
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
454
+ assert all(
455
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
456
+ )
457
+
458
+ tokens = expert_scores[0].shape[0]
459
+ assert all(
460
+ (
461
+ (
462
+ x.ndim == 2
463
+ and x.shape[1] == args.moe_num_experts
464
+ and x.shape[0] == tokens
465
+ )
466
+ for x in expert_scores
467
+ )
468
+ )
469
+
470
+ # Concatenate the contributions of each layer and convert to
471
+ # the correct types and formats for the dot product.
472
+ expert_scores = torch.cat(expert_scores, dim=1)
473
+ if args.moe_lbl_in_fp32:
474
+ expert_scores = expert_scores.float()
475
+ if tokens != 0:
476
+ expert_scores = expert_scores.mean(dim=0)
477
+ else:
478
+ expert_scores = expert_scores.sum(dim=0)
479
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
480
+
481
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
482
+ assert tokens_per_expert.numel() == expected_values
483
+ assert expert_scores.numel() == expected_values
484
+
485
+ # Calculate the total scale across all factors.
486
+ #
487
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
488
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
489
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
490
+ scale = scale_numerator / scale_denominator
491
+ return scale * torch.dot(tokens_per_expert, expert_scores)
492
+
493
+
494
+ # Calculate the expert capacity based on tokens, top_k, number of experts,
495
+ # expert parallel group, capacity factor, and whether expert model parallelism is used.
496
+ def expert_capacity(
497
+ tokens: int,
498
+ top_k: int,
499
+ num_experts: int,
500
+ expert_parallel_group: int,
501
+ moe_capacity_factor: float,
502
+ moe_expert_model_parallelism: bool,
503
+ ) -> int:
504
+ world_size = (
505
+ dist.get_world_size(expert_parallel_group)
506
+ if moe_expert_model_parallelism
507
+ else 1
508
+ )
509
+
510
+ tokens_per_expert = top_k * tokens * world_size / num_experts
511
+ return int(moe_capacity_factor * tokens_per_expert)
512
+
513
+
514
+ def load_balancing_loss(
515
+ tokens_per_expert: torch.Tensor,
516
+ expert_scores: torch.Tensor,
517
+ top_k: int,
518
+ num_experts: int,
519
+ ):
520
+ assert len(expert_scores.size()) == 2
521
+ tokens, scores_num_experts = expert_scores.size()
522
+ assert scores_num_experts == num_experts
523
+ assert len(tokens_per_expert.size()) == 1
524
+ (counts_num_experts,) = tokens_per_expert.size()
525
+ assert counts_num_experts == num_experts
526
+ scale = num_experts / (tokens * top_k)
527
+ return scale * torch.dot(
528
+ tokens_per_expert.to(expert_scores.dtype),
529
+ expert_scores.mean(dim=0),
530
+ )
531
+
532
+
533
+ def indices_and_bins(
534
+ top_expert: torch.Tensor,
535
+ sort_end_bit: int,
536
+ num_experts: int,
537
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
538
+ top_expert = top_expert.int()
539
+
540
+ # Ensure contiguous memory layout
541
+ top_expert = top_expert.contiguous()
542
+
543
+ # Ensure CUB knows which device to use
544
+ with torch.cuda.device(top_expert.device):
545
+ output = ops.sort(top_expert, sort_end_bit)
546
+ bin_ids, indices = output
547
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
548
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
549
+
550
+ bins = bins.view(1) if not len(bins.size()) else bins
551
+ return indices, bin_ids, bins, tokens_per_expert
552
+
553
+
554
+ def expert_capacity_fn(
555
+ tokens: int,
556
+ top_k: int,
557
+ num_experts: int,
558
+ expert_parallel_group: torch.distributed.ProcessGroup,
559
+ moe_capacity_factor: float = 1.0,
560
+ moe_expert_model_parallelism: bool = False,
561
+ ) -> int:
562
+ world_size = (
563
+ dist.get_world_size(expert_parallel_group)
564
+ if moe_expert_model_parallelism
565
+ else 1
566
+ )
567
+ tokens_per_expert = top_k * tokens * world_size / num_experts
568
+ return int(moe_capacity_factor * tokens_per_expert)
569
+
570
+
571
+ def permute_and_compute(
572
+ x,
573
+ tokens_per_expert,
574
+ indices,
575
+ bin_ids,
576
+ expert_weights,
577
+ bins,
578
+ expert_capacity,
579
+ top_k,
580
+ w1,
581
+ w2,
582
+ w1_bias,
583
+ w2_bias,
584
+ gradient_scale,
585
+ alpha,
586
+ ):
587
+ # Route tokens to experts
588
+ x = x.view(-1, x.shape[-1])
589
+
590
+ # Ensure CUB knows which device to use
591
+ with torch.cuda.device(x.device):
592
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
593
+
594
+ # Expert computation
595
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
596
+
597
+ # Ensure CUB knows which device to use
598
+ with torch.cuda.device(x.device):
599
+ # Route tokens back
600
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
601
+ return out
602
+
603
+
604
+ def forward_once(
605
+ x: torch.Tensor,
606
+ expert_weights: torch.Tensor,
607
+ top_experts: torch.Tensor,
608
+ w1: torch.Tensor,
609
+ w2: torch.Tensor,
610
+ w1_bias: torch.Tensor,
611
+ w2_bias: torch.Tensor,
612
+ gradient_scale: Optional[float] = None,
613
+ alpha: float = 1.702,
614
+ sort_end_bit: int = 0,
615
+ top_k: int = 4,
616
+ num_experts: int = 128,
617
+ expert_parallel_group: int = None,
618
+ moe_capacity_factor: float = 1.0,
619
+ moe_expert_model_parallelism: bool = False,
620
+ mlp_impl: Optional[str] = None,
621
+ ):
622
+ # x: [sl, bs, hs]
623
+ # expert_weights: [sl * bs, top-k]
624
+ # top_experts: [sl * bs, top-k]
625
+ expert_weights = expert_weights.flatten()
626
+ top_experts = top_experts.flatten()
627
+
628
+ with torch.no_grad():
629
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
630
+ top_experts, sort_end_bit, num_experts
631
+ )
632
+
633
+ # Calculate expert capacity
634
+ sl, bs, _ = x.size()
635
+
636
+ expert_capacity = expert_capacity_fn(
637
+ sl * bs,
638
+ top_k,
639
+ num_experts,
640
+ expert_parallel_group,
641
+ moe_capacity_factor,
642
+ moe_expert_model_parallelism,
643
+ )
644
+
645
+ if expert_capacity == 0:
646
+ expert_capacity = torch.max(tokens_per_expert).item()
647
+
648
+ x = permute_and_compute(
649
+ x,
650
+ tokens_per_expert,
651
+ indices,
652
+ bin_ids,
653
+ expert_weights,
654
+ bins,
655
+ expert_capacity,
656
+ top_k,
657
+ w1,
658
+ w2,
659
+ w1_bias,
660
+ w2_bias,
661
+ gradient_scale,
662
+ alpha,
663
+ )
664
+ return x, tokens_per_expert
665
+
666
+
667
+ def parallel_forward_once(
668
+ x: torch.Tensor,
669
+ expert_weights: torch.Tensor,
670
+ top_experts: torch.Tensor,
671
+ w1: torch.Tensor,
672
+ w2: torch.Tensor,
673
+ w1_bias: torch.Tensor,
674
+ w2_bias: torch.Tensor,
675
+ gradient_scale: Optional[float] = None,
676
+ alpha: float = 1.702,
677
+ sort_end_bit: int = 0,
678
+ top_k: int = 4,
679
+ num_experts: int = 128,
680
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
681
+ moe_capacity_factor: float = 1.0,
682
+ moe_expert_model_parallelism: bool = True,
683
+ hidden_size: int = 1152,
684
+ mlp_impl: Optional[str] = "grouped",
685
+ ):
686
+ # Flatten inputs
687
+ expert_weights = expert_weights.flatten()
688
+ top_experts = top_experts.flatten()
689
+
690
+ # TODO: remove debugging var
691
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
692
+
693
+ with torch.no_grad():
694
+ # Step 1: Local permutation setup
695
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
696
+ top_experts, sort_end_bit, num_experts
697
+ )
698
+
699
+ # Calculate sharding parameters
700
+ world_size = dist.get_world_size(expert_parallel_group)
701
+ hidden_sharding_deg = hidden_sharding_degree(
702
+ world_size, num_experts, hidden_size
703
+ )
704
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
705
+
706
+ # Replicate token counts for hidden sharding
707
+ repeated_tokens_per_expert = ops.repeat(
708
+ tokens_per_expert, (hidden_sharding_deg,)
709
+ )
710
+
711
+ # Exchange token counts across devices
712
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
713
+
714
+ # Ensure CUB knows which device to use
715
+ tpe_handle = dist.all_to_all_single(
716
+ parallel_tokens_per_expert,
717
+ repeated_tokens_per_expert,
718
+ group=expert_parallel_group,
719
+ async_op=True,
720
+ )
721
+
722
+ # Step 2: Local permutation - group tokens by target device
723
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
724
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
725
+
726
+ # Step 3: Compute communication counts and exchange tokens
727
+ with torch.no_grad():
728
+ tpe_handle.wait()
729
+
730
+ # Reshape for per-device calculations
731
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
732
+ world_size, experts_per_rank_val
733
+ )
734
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
735
+ world_size, experts_per_rank_val
736
+ )
737
+
738
+ # Calculate send/recv counts
739
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
740
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
741
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
742
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
743
+ tokens_received = sum(recv_counts)
744
+
745
+ # Replicate for hidden sharding
746
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
747
+
748
+ # Cross-device token exchange
749
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
750
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
751
+ )
752
+
753
+ with torch.no_grad():
754
+ # Step 4: Setup for local expert computation
755
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
756
+ replicate_bins = (
757
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
758
+ )
759
+
760
+ # Create expert indices for received tokens
761
+ parallel_top_expert = torch.remainder(
762
+ torch.arange(
763
+ num_experts * hidden_sharding_deg,
764
+ dtype=torch.int32,
765
+ device=indices.device,
766
+ ),
767
+ experts_per_rank_val,
768
+ )
769
+ parallel_top_expert = ops.replicate(
770
+ parallel_top_expert.unsqueeze(dim=0),
771
+ replicate_bins,
772
+ tokens_received,
773
+ ).flatten()
774
+
775
+ # Sort tokens by expert assignment
776
+ parallel_bin_ids, parallel_indices = ops.sort(
777
+ parallel_top_expert,
778
+ sort_end_bit,
779
+ )
780
+
781
+ # Calculate bins for local experts
782
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
783
+ dim=0, dtype=torch.int
784
+ )
785
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
786
+ parallel_bins = (
787
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
788
+ )
789
+
790
+ # Calculate expert capacity
791
+ expert_capacity = expert_capacity_fn(
792
+ tokens_received,
793
+ top_k,
794
+ experts_per_rank_val,
795
+ expert_parallel_group,
796
+ moe_capacity_factor,
797
+ moe_expert_model_parallelism,
798
+ )
799
+ if expert_capacity == 0:
800
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
801
+
802
+ # Locally permute the tokens and perform the expert computation.
803
+ # Block to make sure that the cross-device permutation is complete.
804
+ if mlp_impl == "grouped":
805
+ # GroupedMLP requires counts on CPU. We can use the tensor already
806
+ # moved to CPU for the prior all_to_all, which avoids an extra
807
+ # device synchronization.
808
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
809
+ dim=0,
810
+ dtype=torch.int,
811
+ )
812
+
813
+ # Step 5: Expert computation
814
+ parallel_x_handle.wait()
815
+
816
+ parallel_x = permute_and_compute(
817
+ parallel_x,
818
+ parallel_tokens_per_expert,
819
+ parallel_indices,
820
+ parallel_bin_ids,
821
+ None, # expert_weights
822
+ parallel_bins,
823
+ expert_capacity,
824
+ top_k=1,
825
+ w1=w1,
826
+ w2=w2,
827
+ w1_bias=w1_bias,
828
+ w2_bias=w2_bias,
829
+ gradient_scale=gradient_scale,
830
+ alpha=alpha,
831
+ )
832
+
833
+ # Step 6: Reverse communication - send results back
834
+ x, _ = _layers.all_to_all.all_to_all(
835
+ parallel_x, send_counts, recv_counts, expert_parallel_group
836
+ )
837
+
838
+ # Step 7: Reduce across hidden sharding dimension
839
+ shape = (hidden_sharding_deg, -1, hidden_size)
840
+ x = x.view(shape).sum(dim=0)
841
+
842
+ # Step 8: Final local unpermutation
843
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
844
+
845
+ return x, tokens_per_expert.flatten()
846
+
847
+
848
+ def moe_forward(
849
+ x: torch.Tensor,
850
+ router_weight: torch.Tensor,
851
+ router_bias: Optional[torch.Tensor],
852
+ moe_top_k: int,
853
+ moe_num_experts: int,
854
+ moe_jitter_eps: float = None,
855
+ moe_normalize_expert_weights: int = None,
856
+ uniform_expert_assignment: bool = False,
857
+ training: bool = False,
858
+ w1: torch.Tensor = None,
859
+ w2: torch.Tensor = None,
860
+ w1_bias: torch.Tensor = None,
861
+ w2_bias: torch.Tensor = None,
862
+ gradient_scale: Optional[float] = None,
863
+ alpha: float = 1.702,
864
+ sort_end_bit: int = 0,
865
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
866
+ moe_capacity_factor: float = 1.0,
867
+ moe_expert_model_parallelism: bool = False,
868
+ forward_fn: Any = None,
869
+ hidden_size: int = None,
870
+ mlp_impl: str = "grouped",
871
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
872
+
873
+ # Route tokens to experts
874
+ logits, expert_weights, expert_indices = route_tokens(
875
+ x,
876
+ router_weight,
877
+ router_bias,
878
+ moe_top_k,
879
+ moe_num_experts,
880
+ moe_jitter_eps,
881
+ moe_normalize_expert_weights,
882
+ uniform_expert_assignment,
883
+ training,
884
+ )
885
+
886
+ # Create router scores for output
887
+ router_scores = (
888
+ torch.zeros_like(logits)
889
+ .scatter_(1, expert_indices, expert_weights)
890
+ .transpose(0, 1)
891
+ )
892
+
893
+ in_shape = x.size()
894
+
895
+ # Prepare forward function arguments
896
+ forward_args = {
897
+ "x": x,
898
+ "expert_weights": expert_weights,
899
+ "top_experts": expert_indices,
900
+ "w1": w1,
901
+ "w2": w2,
902
+ "w1_bias": w1_bias,
903
+ "w2_bias": w2_bias,
904
+ "gradient_scale": gradient_scale,
905
+ "alpha": alpha,
906
+ "sort_end_bit": sort_end_bit,
907
+ "top_k": moe_top_k,
908
+ "num_experts": moe_num_experts,
909
+ "expert_parallel_group": expert_parallel_group,
910
+ "moe_capacity_factor": moe_capacity_factor,
911
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
912
+ "mlp_impl": mlp_impl,
913
+ }
914
+
915
+ # Add hidden_size for parallel forward
916
+ if moe_expert_model_parallelism and hidden_size is not None:
917
+ forward_args["hidden_size"] = hidden_size
918
+ elif moe_expert_model_parallelism and hidden_size is None:
919
+ # Infer hidden_size from input shape
920
+ forward_args["hidden_size"] = x.shape[-1]
921
+
922
+ # Compute expert outputs
923
+ x, tokens_per_expert = forward_fn(**forward_args)
924
+
925
+ # Save load balancing loss if needed
926
+ moe_loss_weight = 0.0 # Can be made configurable
927
+ if training and moe_loss_weight > 0:
928
+ save_load_balancing_loss((tokens_per_expert, logits))
929
+
930
+ # Restore original shape
931
+ x = x.view(in_shape)
932
+
933
+ return x, expert_weights, router_scores
934
+
935
+
936
+ def moe_forward_with_shared_expert(
937
+ x: torch.Tensor,
938
+ router_weight: torch.Tensor,
939
+ router_bias: Optional[torch.Tensor],
940
+ moe_top_k: int,
941
+ moe_num_experts: int,
942
+ moe_jitter_eps: float = None,
943
+ moe_normalize_expert_weights: int = None,
944
+ uniform_expert_assignment: bool = False,
945
+ training: bool = False,
946
+ w1: torch.Tensor = None,
947
+ w2: torch.Tensor = None,
948
+ w1_bias: torch.Tensor = None,
949
+ w2_bias: torch.Tensor = None,
950
+ gradient_scale: Optional[float] = None,
951
+ alpha: float = 1.702,
952
+ sort_end_bit: int = 0,
953
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
954
+ moe_capacity_factor: float = 1.0,
955
+ moe_expert_model_parallelism: bool = False,
956
+ forward_fn: Any = None,
957
+ hidden_size: int = None,
958
+ mlp_impl: str = "grouped",
959
+ # Shared expert parameters
960
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
961
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
962
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
963
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
964
+ shared_expert_weighted_sum: bool = False,
965
+ shared_activation_fn: Optional[Any] = None,
966
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
967
+
968
+ # First, compute regular MoE forward pass
969
+ expert_out, expert_weights, router_scores = moe_forward(
970
+ x=x,
971
+ router_weight=router_weight,
972
+ router_bias=router_bias,
973
+ moe_top_k=moe_top_k,
974
+ moe_num_experts=moe_num_experts,
975
+ moe_jitter_eps=moe_jitter_eps,
976
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
977
+ uniform_expert_assignment=uniform_expert_assignment,
978
+ training=training,
979
+ w1=w1,
980
+ w2=w2,
981
+ w1_bias=w1_bias,
982
+ w2_bias=w2_bias,
983
+ gradient_scale=gradient_scale,
984
+ alpha=alpha,
985
+ sort_end_bit=sort_end_bit,
986
+ expert_parallel_group=expert_parallel_group,
987
+ moe_capacity_factor=moe_capacity_factor,
988
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
989
+ forward_fn=forward_fn,
990
+ hidden_size=hidden_size,
991
+ mlp_impl=mlp_impl,
992
+ )
993
+
994
+ # If shared expert weights provided, compute shared expert output
995
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
996
+ shared_expert_out = shared_mlp_forward(
997
+ x=x,
998
+ up_proj_weight=shared_up_proj_weight,
999
+ down_proj_weight=shared_down_proj_weight,
1000
+ up_proj_bias=shared_up_proj_bias,
1001
+ down_proj_bias=shared_down_proj_bias,
1002
+ activation_fn=shared_activation_fn,
1003
+ gradient_scale=gradient_scale,
1004
+ )
1005
+
1006
+ # Combine expert outputs
1007
+ combined_out = combine_expert_shared_outputs(
1008
+ shared_expert_out=shared_expert_out,
1009
+ expert_out=expert_out,
1010
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
1011
+ moe_top_k=moe_top_k,
1012
+ )
1013
+
1014
+ return combined_out, expert_weights, router_scores
1015
+
1016
+ # Return regular MoE output if no shared expert
1017
+ return expert_out, expert_weights, router_scores
1018
+
1019
+
1020
+ def create_shared_expert_weights(
1021
+ hidden_size: int,
1022
+ shared_expert_hidden_size: int,
1023
+ device: torch.device,
1024
+ dtype: torch.dtype,
1025
+ init_method: Any,
1026
+ output_layer_init_method: Any = None,
1027
+ ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
1028
+
1029
+ if output_layer_init_method is None:
1030
+ output_layer_init_method = init_method
1031
+
1032
+ # Create weight tensors
1033
+ up_proj_weight = torch.empty(
1034
+ shared_expert_hidden_size,
1035
+ hidden_size,
1036
+ device=device,
1037
+ dtype=dtype,
1038
+ )
1039
+ down_proj_weight = torch.empty(
1040
+ hidden_size,
1041
+ shared_expert_hidden_size,
1042
+ device=device,
1043
+ dtype=dtype,
1044
+ )
1045
+
1046
+ # Initialize weights
1047
+ init_method(up_proj_weight)
1048
+ output_layer_init_method(down_proj_weight)
1049
+
1050
+ # No bias by default
1051
+ return up_proj_weight, down_proj_weight, None, None
1052
+
1053
+
1054
+ # HACK: Extract device_mesh from pre-hook closure - required for transformers integration
1055
+ # This exists because device_mesh is trapped in hook closures with no model attribute
1056
+ # Fragile - breaks if hook structure changes or Python internals change
1057
+ # TODO: Replace with a more robust solution when available
1058
+ def get_device_mesh(model):
1059
+ # Extract device_mesh from child's unused pre_hook closure
1060
+ try:
1061
+ # Find the pre-hook that contains 'device_mesh' in its closure
1062
+ hook = next(
1063
+ h
1064
+ for h in model.experts._forward_pre_hooks.values()
1065
+ if "device_mesh" in h.__code__.co_freevars
1066
+ )
1067
+ # Extract the device_mesh from the closure
1068
+ return hook.__closure__[
1069
+ hook.__code__.co_freevars.index("device_mesh")
1070
+ ].cell_contents
1071
+ except Exception:
1072
+ return None
1073
+
1074
+
1075
+ class MegaBlocksMoeMLP(torch.nn.Module):
1076
+ can_torch_compile: bool = True
1077
+
1078
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
1079
+ moe_top_k = getattr(self.router, "top_k", 4)
1080
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
1081
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
1082
+ alpha = getattr(self.experts, "alpha", 1.0)
1083
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1084
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1085
+ moe_normalize_expert_weights = getattr(
1086
+ self.experts, "normalize_expert_weights", None
1087
+ )
1088
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1089
+
1090
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
1091
+ if expert_parallel_group is None:
1092
+ device_mesh = get_device_mesh(self)
1093
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
1094
+
1095
+ has_parallel = (
1096
+ expert_parallel_group is not None
1097
+ and dist.is_initialized()
1098
+ and dist.get_world_size(expert_parallel_group) > 1
1099
+ )
1100
+ forward_fn = parallel_forward_once if has_parallel else forward_once
1101
+
1102
+ sort_end_bit = max(
1103
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1104
+ )
1105
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
1106
+ output, expert_weights_out, *_ = moe_forward(
1107
+ x=x,
1108
+ router_weight=self.router.weight,
1109
+ router_bias=self.router.bias,
1110
+ moe_top_k=moe_top_k,
1111
+ moe_num_experts=moe_num_experts,
1112
+ moe_jitter_eps=moe_jitter_eps,
1113
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
1114
+ uniform_expert_assignment=uniform_expert_assignment,
1115
+ training=self.training,
1116
+ w1=self.experts.gate_up_proj,
1117
+ w2=self.experts.down_proj,
1118
+ w1_bias=self.experts.gate_up_proj_bias,
1119
+ w2_bias=self.experts.down_proj_bias,
1120
+ gradient_scale=gradient_scale,
1121
+ alpha=alpha,
1122
+ sort_end_bit=sort_end_bit,
1123
+ expert_parallel_group=expert_parallel_group,
1124
+ moe_capacity_factor=moe_capacity_factor,
1125
+ moe_expert_model_parallelism=has_parallel,
1126
+ forward_fn=forward_fn,
1127
+ hidden_size=self.experts.hidden_size,
1128
+ mlp_impl=mlp_impl,
1129
+ )
1130
+ return output, expert_weights_out
1131
+
1132
+
1133
+ # Export main classes
1134
+ __all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
1135
+
1136
+
1137
+ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1138
+
1139
+ def __init__(self):
1140
+ super().__init__()
1141
+ # Shared expert weights will be set by the user
1142
+ self.shared_up_proj_weight = None
1143
+ self.shared_down_proj_weight = None
1144
+ self.shared_up_proj_bias = None
1145
+ self.shared_down_proj_bias = None
1146
+ self.shared_expert_weighted_sum = False
1147
+ self.shared_activation_fn = None
1148
+
1149
+ def set_shared_expert_weights(
1150
+ self,
1151
+ up_proj_weight: torch.Tensor,
1152
+ down_proj_weight: torch.Tensor,
1153
+ up_proj_bias: Optional[torch.Tensor] = None,
1154
+ down_proj_bias: Optional[torch.Tensor] = None,
1155
+ weighted_sum: bool = False,
1156
+ activation_fn: Optional[Any] = None,
1157
+ ):
1158
+ self.shared_up_proj_weight = up_proj_weight
1159
+ self.shared_down_proj_weight = down_proj_weight
1160
+ self.shared_up_proj_bias = up_proj_bias
1161
+ self.shared_down_proj_bias = down_proj_bias
1162
+ self.shared_expert_weighted_sum = weighted_sum
1163
+ self.shared_activation_fn = activation_fn
1164
+
1165
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
1166
+ moe_top_k = getattr(self.router, "top_k", 4)
1167
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
1168
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
1169
+ alpha = getattr(self.experts, "alpha", 1.0)
1170
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1171
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1172
+ moe_normalize_expert_weights = getattr(
1173
+ self.experts, "normalize_expert_weights", None
1174
+ )
1175
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1176
+
1177
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
1178
+ if expert_parallel_group is None:
1179
+ device_mesh = get_device_mesh(self)
1180
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
1181
+
1182
+ has_parallel = (
1183
+ expert_parallel_group is not None
1184
+ and dist.is_initialized()
1185
+ and dist.get_world_size(expert_parallel_group) > 1
1186
+ )
1187
+ forward_fn = parallel_forward_once if has_parallel else forward_once
1188
+
1189
+ sort_end_bit = max(
1190
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1191
+ )
1192
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
1193
+
1194
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
1195
+ x=x,
1196
+ router_weight=self.router.weight,
1197
+ router_bias=self.router.bias,
1198
+ moe_top_k=moe_top_k,
1199
+ moe_num_experts=moe_num_experts,
1200
+ moe_jitter_eps=moe_jitter_eps,
1201
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
1202
+ uniform_expert_assignment=uniform_expert_assignment,
1203
+ training=self.training,
1204
+ w1=self.experts.gate_up_proj,
1205
+ w2=self.experts.down_proj,
1206
+ w1_bias=self.experts.gate_up_proj_bias,
1207
+ w2_bias=self.experts.down_proj_bias,
1208
+ gradient_scale=gradient_scale,
1209
+ alpha=alpha,
1210
+ sort_end_bit=sort_end_bit,
1211
+ expert_parallel_group=expert_parallel_group,
1212
+ moe_capacity_factor=moe_capacity_factor,
1213
+ moe_expert_model_parallelism=has_parallel,
1214
+ forward_fn=forward_fn,
1215
+ hidden_size=self.experts.hidden_size,
1216
+ mlp_impl=mlp_impl,
1217
+ # Shared expert parameters
1218
+ shared_up_proj_weight=self.shared_up_proj_weight,
1219
+ shared_down_proj_weight=self.shared_down_proj_weight,
1220
+ shared_up_proj_bias=self.shared_up_proj_bias,
1221
+ shared_down_proj_bias=self.shared_down_proj_bias,
1222
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
1223
+ shared_activation_fn=self.shared_activation_fn,
1224
+ )
1225
+ return output, expert_weights_out
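To make the attribute contract of MegaBlocksMoeMLP.forward concrete, here is a single-GPU wiring sketch (illustrative only, assuming the bundled ROCm extension and Triton kernels import cleanly; all sizes are arbitrary). Note that gate_up_proj interleaves gate and up columns, so it carries 2 * ffn output features per expert.

import torch

hidden, ffn, num_experts, top_k = 128, 256, 8, 2

mlp = MegaBlocksMoeMLP()
mlp.router = torch.nn.Linear(hidden, num_experts, device="cuda")
mlp.router.top_k = top_k

experts = torch.nn.Module()
experts.num_experts = num_experts
experts.hidden_size = hidden
experts.gate_up_proj = torch.nn.Parameter(0.02 * torch.randn(num_experts, hidden, 2 * ffn, device="cuda"))
experts.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(num_experts, 2 * ffn, device="cuda"))
experts.down_proj = torch.nn.Parameter(0.02 * torch.randn(num_experts, ffn, hidden, device="cuda"))
experts.down_proj_bias = torch.nn.Parameter(torch.zeros(num_experts, hidden, device="cuda"))
mlp.experts = experts

x = torch.randn(1, 4, hidden, device="cuda")  # [sequence, batch, hidden]
out, expert_weights = mlp(x)                  # out: [1, 4, hidden]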
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from .binned_gather import binned_gather
5
+ from .binned_scatter import binned_scatter
6
+ from .cumsum import exclusive_cumsum, inclusive_cumsum
7
+ from .gather import gather
8
+ from .histogram import histogram
9
+ from .padded_gather import padded_gather
10
+ from .padded_scatter import padded_scatter
11
+ from .repeat import repeat
12
+ from .replicate import replicate
13
+ from .round_up import round_up
14
+ from .scatter import scatter
15
+ from .sort import sort
16
+ from .sum import sum
17
+ from .topology import topology
18
+
19
+ __all__ = [
20
+ 'binned_gather',
21
+ 'binned_scatter',
22
+ 'exclusive_cumsum',
23
+ 'inclusive_cumsum',
24
+ 'gather',
25
+ 'histogram',
26
+ 'padded_gather',
27
+ 'padded_scatter',
28
+ 'repeat',
29
+ 'replicate',
30
+ 'round_up',
31
+ 'scatter',
32
+ 'sort',
33
+ 'sum',
34
+ 'topology',
35
+ ]
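A hedged sketch of how a few of these exported ops compose into routing metadata, mirroring indices_and_bins in layers.py. It assumes the compiled _megablocks_rocm extension loads, a ROCm/CUDA device is present, and that `megablocks.ops` is the import path for this build artifact.

import torch
from megablocks import ops  # assumed import path for this build

num_experts = 4
top_expert = torch.randint(0, num_experts, (16,), device="cuda", dtype=torch.int32)

bin_ids, indices = ops.sort(top_expert, 2)                  # 2 sort bits cover experts 0..3
tokens_per_expert = ops.histogram(top_expert, num_experts)  # tokens routed to each expert
bins = ops.inclusive_cumsum(tokens_per_expert, 0)           # expert bin boundaries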
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/all_to_all_benchmark.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+ import torch.distributed as dist
6
+
7
+ # from megablocks import benchmark_util
8
+ # from megablocks.layers.all_to_all import all_to_all
9
+
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
12
+
13
+ _ALL_TO_ALL_BENCHMARK = (
14
+ (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
+ )
33
+
34
+
35
+ def benchmark_all_to_all(group, sl, hs):
36
+ world_size = dist.get_world_size(group)
37
+ assert (sl % world_size) == 0
38
+ send_recv_sizes = [sl // world_size] * world_size
39
+
40
+ x = torch.randn((sl, hs)).cuda().half()
41
+
42
+ details = {
43
+ 'world_size': world_size,
44
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
45
+ }
46
+
47
+ def benchmark():
48
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
+
50
+ time, std = benchmark_util.benchmark_function(benchmark)
51
+
52
+ if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
54
+
55
+
56
+ if __name__ == '__main__':
57
+ assert dist.is_available()
58
+ group = dist.init_process_group(backend='nccl')
59
+ local_rank = dist.get_rank(group)
60
+ torch.cuda.set_device(local_rank)
61
+
62
+ for args in _ALL_TO_ALL_BENCHMARK:
63
+ benchmark_all_to_all(group, *args)
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/all_to_all_benchmark.sh ADDED
@@ -0,0 +1,12 @@
1
+ #!/bin/bash
2
+
3
+ DISTRIBUTED_ARGUMENTS="\
4
+ --nproc_per_node 8 \
5
+ --nnodes 1 \
6
+ --node_rank 0 \
7
+ --master_addr localhost \
8
+ --master_port 6000"
9
+
10
+ python -m torch.distributed.launch \
11
+ ${DISTRIBUTED_ARGUMENTS} \
12
+ megablocks/ops/all_to_all_benchmark.py
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/binned_gather.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+ from .stk_autocast import custom_bwd, custom_fwd
7
+
8
+ from ..backend import kernels
9
+
10
+
11
+ # Autograd wrapper for binned_gather kernel.
12
+ class BinnedGatherOp(torch.autograd.Function):
13
+
14
+ @staticmethod
15
+ @custom_fwd
16
+ def forward(
17
+ ctx: Any,
18
+ x: torch.Tensor,
19
+ indices: torch.Tensor,
20
+ bins: torch.Tensor,
21
+ bin_size: int,
22
+ top_k: int,
23
+ ):
24
+ ctx.save_for_backward(indices, bins)
25
+ ctx.top_k = top_k
26
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
27
+
28
+ @staticmethod
29
+ @custom_bwd
30
+ def backward(ctx: Any, grad: torch.Tensor):
31
+ grad = grad.contiguous()
32
+ indices, bins = ctx.saved_tensors
33
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
34
+ return out, None, None, None, None
35
+
36
+
37
+ binned_gather = BinnedGatherOp.apply
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/binned_scatter.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+ from .stk_autocast import custom_bwd, custom_fwd
7
+
8
+ from ..backend import kernels
9
+
10
+
11
+ # Autograd wrapper for binned_scatter kernel.
12
+ class BinnedScatterOp(torch.autograd.Function):
13
+
14
+ @staticmethod
15
+ @custom_fwd
16
+ def forward(
17
+ ctx: Any,
18
+ x: torch.Tensor,
19
+ indices: torch.Tensor,
20
+ weights: torch.Tensor,
21
+ bins: torch.Tensor,
22
+ top_k: int,
23
+ ):
24
+ assert len(x.size()) == 3
25
+ ctx.bin_size = x.size(1)
26
+ ctx.top_k = top_k
27
+
28
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
29
+ # calculate the gradient w.r.t. 'weights'.
30
+ ctx.save_for_backward(x, indices, weights, bins)
31
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
32
+
33
+ @staticmethod
34
+ @custom_bwd
35
+ def backward(ctx: Any, grad: torch.Tensor):
36
+ grad = grad.contiguous()
37
+ x, indices, weights, bins = ctx.saved_tensors
38
+ out = kernels.binned_gather(
39
+ grad,
40
+ indices,
41
+ weights,
42
+ bins,
43
+ ctx.bin_size,
44
+ ctx.top_k,
45
+ )
46
+
47
+ wgrad = None
48
+ if ctx.needs_input_grad[2]:
49
+ wgrad = kernels.binned_scatter_wgrad(
50
+ x,
51
+ grad,
52
+ indices,
53
+ bins,
54
+ ctx.top_k,
55
+ )
56
+ return out, None, wgrad, None, None
57
+
58
+
59
+ binned_scatter = BinnedScatterOp.apply
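Illustrative sketch (not in the diff) of how the binned_gather/binned_scatter pair is typically driven, mirroring the permute benchmark later in this build: sort tokens by expert, histogram them, build bins with a cumsum, permute to an expert-major layout, then permute back. It assumes a ROCm/CUDA device, the compiled extension, top_k == 1, and that `from megablocks import ops` resolves to this package.

    import torch
    from megablocks import ops  # assumed import path

    sl, hs, ne, top_k = 1024, 512, 8, 1
    ec = sl // ne  # expert capacity with capacity factor == 1
    x = torch.randn((sl, hs), device='cuda', dtype=torch.half)
    top_expert = torch.randint(0, ne, (sl,), device='cuda', dtype=torch.int32)

    bin_ids, indices = ops.sort(top_expert)            # sorted expert ids + permutation
    tokens_per_expert = ops.histogram(top_expert, ne)
    bins = ops.inclusive_cumsum(tokens_per_expert, 0)

    # (ne, ec, hs) expert-major layout; tokens beyond each expert's capacity are dropped.
    x_expert = ops.binned_gather(x, indices, bins, ec, top_k)
    # Back to (sl, hs); weights=None means an unweighted scatter.
    x_out = ops.binned_scatter(x_expert, indices, None, bins, top_k)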
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/cumsum.py ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any
5
+
6
+ # NOTE: Torch needs to be imported before the custom
7
+ # extensions. Otherwise libc10.so cannot be found.
8
+ import torch
9
+
10
+ # Wrap this in a try-block with better error message and
11
+ # instructions for building the c++ operations.
12
+ try:
13
+ # import megablocks_ops as ops # type: ignore
14
+ from .._ops import ops # type: ignore
15
+ except ModuleNotFoundError as e:
16
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
17
+
18
+
19
+ # Autograd wrappers for cumsum kernels.
20
+ # NOTE: Does not support gradients.
21
+ class ExclusiveCumsumOp(torch.autograd.Function):
22
+
23
+ @staticmethod
24
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
25
+ if len(x.size()) == 1:
26
+ x = x.view([1, -1])
27
+ out = torch.empty_like(x)
28
+ ops.exclusive_cumsum(x, 1, out)
29
+ return out.squeeze()
30
+ out = torch.empty_like(x)
31
+ ops.exclusive_cumsum(x, dim, out)
32
+ return out
33
+
34
+
35
+ exclusive_cumsum = ExclusiveCumsumOp.apply
36
+
37
+
38
+ class InclusiveCumsumOp(torch.autograd.Function):
39
+
40
+ @staticmethod
41
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
42
+ if len(x.size()) == 1:
43
+ x = x.view([1, -1])
44
+ out = torch.empty_like(x)
45
+ ops.inclusive_cumsum(x, 1, out)
46
+ return out.squeeze()
47
+ out = torch.empty_like(x)
48
+ ops.inclusive_cumsum(x, dim, out)
49
+ return out
50
+
51
+
52
+ inclusive_cumsum = InclusiveCumsumOp.apply
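For reference (not part of the committed file), the expected semantics of the two wrappers on a small bin-count vector, illustrated with plain torch as a stand-in; the custom ops themselves require the compiled extension and a GPU tensor.

    import torch

    tokens_per_expert = torch.tensor([3, 0, 5, 2], dtype=torch.int32)

    inclusive = torch.cumsum(tokens_per_expert, dim=0)   # [3, 3, 8, 10]
    exclusive = inclusive - tokens_per_expert            # [0, 3, 3, 8]
    # ops.inclusive_cumsum(t.cuda(), 0) / ops.exclusive_cumsum(t.cuda(), 0)
    # are expected to match these values.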
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/gather.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+ from .stk_autocast import custom_bwd, custom_fwd
7
+
8
+ from ..backend import kernels
9
+
10
+
11
+ # Autograd wrapper for gather kernel.
12
+ class GatherOp(torch.autograd.Function):
13
+
14
+ @staticmethod
15
+ @custom_fwd
16
+ def forward(
17
+ ctx: Any,
18
+ x: torch.Tensor,
19
+ indices: torch.Tensor,
20
+ bin_ids: torch.Tensor,
21
+ bins: torch.Tensor,
22
+ top_k: int,
23
+ ):
24
+ ctx.save_for_backward(indices, bin_ids, bins)
25
+ ctx.top_k = top_k
26
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
27
+
28
+ @staticmethod
29
+ @custom_bwd
30
+ def backward(ctx: Any, grad: torch.Tensor):
31
+ grad = grad.contiguous()
32
+
33
+ indices, bin_ids, bins = ctx.saved_tensors
34
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
35
+ return out, None, None, None, None, None
36
+
37
+
38
+ gather = GatherOp.apply
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/histogram.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any
5
+
6
+ # NOTE: Torch needs to be imported before the custom
7
+ # extensions. Otherwise libc10.so cannot be found.
8
+ import torch
9
+
10
+ # Wrap this in a try-block with better error message and
11
+ # instructions for building the c++ operations.
12
+ try:
13
+ from .._ops import ops # type: ignore
14
+ except ModuleNotFoundError as e:
15
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
16
+
17
+
18
+ # Autograd wrapper for histogram kernel.
19
+ # NOTE: Does not support gradients.
20
+ class HistogramOp(torch.autograd.Function):
21
+
22
+ @staticmethod
23
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
24
+ return ops.histogram(x, max_val)
25
+
26
+
27
+ histogram = HistogramOp.apply
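As used throughout this package, ops.histogram(x, max_val) counts occurrences of each value in [0, max_val). A plain-torch stand-in for the same result (not part of the committed file):

    import torch

    ne = 4
    top_expert = torch.tensor([2, 0, 2, 3, 0, 2])

    reference = torch.bincount(top_expert, minlength=ne)   # tensor([2, 0, 3, 1])
    # ops.histogram(top_expert.cuda().int(), ne) is expected to match this.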
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/histogram_benchmark.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import unittest
5
+
6
+ import numpy as np
7
+ import torch
8
+ from absl.testing import parameterized
9
+
10
+ from .. import ops
11
+
12
+ _HISTOGRAM_TESTS = (
13
+ (16384, torch.int32, 2),
14
+ (16384, torch.int32, 4),
15
+ (16384, torch.int32, 8),
16
+ (16384, torch.int32, 16),
17
+ (16384, torch.int32, 32),
18
+ (16384, torch.int32, 64),
19
+ (16384, torch.int32, 128),
20
+ (16384, torch.int32, 256),
21
+ )
22
+
23
+
24
+ def benchmark_function(fn, iterations=10):
25
+ # Run once to get rid of startup overhead.
26
+ fn()
27
+ times = []
28
+ for _ in range(iterations):
29
+ start = torch.cuda.Event(enable_timing=True)
30
+ end = torch.cuda.Event(enable_timing=True)
31
+ start.record()
32
+ fn()
33
+ end.record()
34
+ torch.cuda.synchronize()
35
+ times.append(start.elapsed_time(end))
36
+ times = np.array(times)
37
+ return times.mean(), times.std(), times.max(), times.min()
38
+
39
+
40
+ def log_benchmark(arguments, mean_t, std_t):
41
+ print('=' * 60)
42
+ print('Benchmark Parameters:')
43
+ for (key, value) in arguments.items():
44
+ print(f'{key} = {value}')
45
+ print('Results:')
46
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
47
+ print('=' * 60)
48
+
49
+
50
+ class HistogramBenchmark(parameterized.TestCase):
51
+
52
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
53
+ def testHistogram(self, n, dtype, max_val):
54
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
+
56
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
+ arguments = {
58
+ 'n': n,
59
+ 'dtype': dtype,
60
+ 'max_val': max_val,
61
+ }
62
+ log_benchmark(arguments, mean_t, std_t)
63
+
64
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
65
+ def testTorchHistogram(self, n, dtype, max_val):
66
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
67
+
68
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
+ arguments = {
70
+ 'n': n,
71
+ 'dtype': dtype,
72
+ 'max_val': max_val,
73
+ }
74
+ log_benchmark(arguments, mean_t, std_t)
75
+
76
+
77
+ if __name__ == '__main__':
78
+ unittest.main()
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/matmul_benchmark.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import unittest
5
+
6
+
7
+ # import stk
8
+
9
+ # try:
10
+ # import stk
11
+ # except ImportError:
12
+ # import warnings
13
+ # warnings.warn(
14
+ # 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
15
+ # )
16
+
17
+ from .. import stk
18
+
19
+ import torch
20
+ from absl.testing import parameterized
21
+
22
+ from .. import benchmark_util, ops
23
+
24
+
25
+ # Calling tensor.t() calls tensor.transpose(0, 1) which calls
26
+ # torch.as_strided(...). Circumvent this chain to avoid an overhead
27
+ # this adds.
28
+ def transpose_view(x):
29
+ return torch.as_strided(
30
+ x,
31
+ (x.shape[1], x.shape[0]),
32
+ (x.stride()[1], x.stride()[0]),
33
+ )
34
+
35
+
36
+ _MATMUL_TESTS = (
37
+ (64 * 1024, 512, 2048, 64),
38
+ (32 * 1024, 768, 3072, 64),
39
+ (8 * 1024, 1024, 4096, 64),
40
+ (4 * 2048, 4096, 4 * 4096, 4),
41
+ )
42
+
43
+
44
+ def log_benchmark(name, arguments, time, std, flops):
45
+ benchmark_util.log_benchmark(name, arguments, time, std)
46
+ print('flops = {:.2f}B'.format(flops / 1e9))
47
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
48
+ print('=' * 60)
49
+
50
+
51
+ class MatmulBenchmark(parameterized.TestCase):
52
+
53
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
+ blocking = 128
55
+ padded_tokens, _ = x.size()
56
+ assert padded_tokens % blocking == 0
57
+ assert fhs % blocking == 0
58
+
59
+ # Offsets for the sparse matrix. All rows have the
60
+ # same number of nonzero blocks dictated by the
61
+ # dimensionality of a single expert.
62
+ block_rows = padded_tokens // blocking
63
+ blocks_per_row = fhs // blocking
64
+ offsets = torch.arange(
65
+ 0,
66
+ block_rows * blocks_per_row + 1,
67
+ blocks_per_row,
68
+ dtype=torch.int32,
69
+ device=x.device,
70
+ )
71
+
72
+ # Indices for the sparse matrix. The indices for
73
+ # the intermediate matrix are dynamic depending
74
+ # on the mapping of tokens to experts.
75
+ column_indices = ops.topology(
76
+ padded_bins,
77
+ blocking,
78
+ block_rows,
79
+ blocks_per_row,
80
+ )
81
+ data = torch.empty(
82
+ column_indices.numel(),
83
+ blocking,
84
+ blocking,
85
+ dtype=torch.float16,
86
+ device=x.device,
87
+ )
88
+ shape = (padded_tokens, fhs * ne)
89
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
+
92
+ def build_input_matrix(self, sl, hs, ne):
93
+ x = torch.randn((sl, hs)).cuda().half()
94
+
95
+ # Assign tokens to experts uniformly.
96
+ top_expert = torch.arange(0, sl).cuda().int() % ne
97
+
98
+ bin_ids, indices = ops.sort(top_expert)
99
+ tokens_per_expert = ops.histogram(top_expert, ne)
100
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
+ return out, padded_bins
105
+
106
+ def build_weight_matrix(self, ne, hs, fhs):
107
+ return torch.randn((hs, ne * fhs)).cuda().half()
108
+
109
+ @parameterized.parameters(*_MATMUL_TESTS)
110
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
+ w = transpose_view(w)
115
+
116
+ def benchmark():
117
+ return stk.ops.sdd(x, w, topo)
118
+
119
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ arguments = {
121
+ 'sequence_length': sl,
122
+ 'hidden_size': hs,
123
+ 'ffn_hidden_size': fhs,
124
+ 'num_experts': ne,
125
+ }
126
+ log_benchmark(
127
+ '0::Fwd::SDD::NT',
128
+ arguments,
129
+ mean_t,
130
+ std_t,
131
+ x.numel() * fhs * 2,
132
+ )
133
+
134
+ @parameterized.parameters(*_MATMUL_TESTS)
135
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
+
140
+ def benchmark():
141
+ return stk.ops.dsd(topo, w)
142
+
143
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
+ arguments = {
145
+ 'sequence_length': sl,
146
+ 'hidden_size': hs,
147
+ 'ffn_hidden_size': fhs,
148
+ 'num_experts': ne,
149
+ }
150
+ log_benchmark(
151
+ '0::GradX::DSD::NN',
152
+ arguments,
153
+ mean_t,
154
+ std_t,
155
+ x.numel() * fhs * 2,
156
+ )
157
+
158
+ @parameterized.parameters(*_MATMUL_TESTS)
159
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
+ topo = topo.t()
163
+
164
+ def benchmark():
165
+ return stk.ops.dsd(topo, x)
166
+
167
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
+ arguments = {
169
+ 'sequence_length': sl,
170
+ 'hidden_size': hs,
171
+ 'ffn_hidden_size': fhs,
172
+ 'num_experts': ne,
173
+ }
174
+ log_benchmark(
175
+ '0::GradW::DSD::TN',
176
+ arguments,
177
+ mean_t,
178
+ std_t,
179
+ x.numel() * fhs * 2,
180
+ )
181
+
182
+ @parameterized.parameters(*_MATMUL_TESTS)
183
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
+
188
+ def benchmark():
189
+ return stk.ops.dsd(x, w)
190
+
191
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
+ arguments = {
193
+ 'sequence_length': sl,
194
+ 'hidden_size': hs,
195
+ 'ffn_hidden_size': fhs,
196
+ 'num_experts': ne,
197
+ }
198
+ log_benchmark(
199
+ '1::Fwd::DSD::NN',
200
+ arguments,
201
+ mean_t,
202
+ std_t,
203
+ x.nnz * hs * 2,
204
+ )
205
+
206
+ @parameterized.parameters(*_MATMUL_TESTS)
207
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
+ out = stk.ops.dsd(x, w)
212
+ w = transpose_view(w)
213
+
214
+ def benchmark():
215
+ return stk.ops.sdd(out, w, x)
216
+
217
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
+ arguments = {
219
+ 'sequence_length': sl,
220
+ 'hidden_size': hs,
221
+ 'ffn_hidden_size': fhs,
222
+ 'num_experts': ne,
223
+ }
224
+ log_benchmark(
225
+ '1::GradX::SDD::NT',
226
+ arguments,
227
+ mean_t,
228
+ std_t,
229
+ x.nnz * hs * 2,
230
+ )
231
+
232
+ @parameterized.parameters(*_MATMUL_TESTS)
233
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
+ out = stk.ops.dsd(x, w)
238
+ x = x.t()
239
+
240
+ def benchmark():
241
+ return stk.ops.dsd(x, out)
242
+
243
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
+ arguments = {
245
+ 'sequence_length': sl,
246
+ 'hidden_size': hs,
247
+ 'ffn_hidden_size': fhs,
248
+ 'num_experts': ne,
249
+ }
250
+ log_benchmark(
251
+ '1::GradW::DSD::TN',
252
+ arguments,
253
+ mean_t,
254
+ std_t,
255
+ x.nnz * hs * 2,
256
+ )
257
+
258
+ @parameterized.parameters(*_MATMUL_TESTS)
259
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
+ assert (sl % ne) == 0
261
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
+ w = torch.randn((ne, hs, fhs)).cuda().half()
263
+
264
+ w = w.transpose(1, 2).contiguous()
265
+ w = w.transpose(1, 2)
266
+
267
+ def benchmark():
268
+ return torch.bmm(x, w)
269
+
270
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
+ arguments = {
272
+ 'sequence_length': sl,
273
+ 'hidden_size': hs,
274
+ 'ffn_hidden_size': fhs,
275
+ 'num_experts': ne,
276
+ }
277
+ log_benchmark(
278
+ '0::Fwd::DDD::NT',
279
+ arguments,
280
+ mean_t,
281
+ std_t,
282
+ x.numel() * fhs * 2,
283
+ )
284
+
285
+ @parameterized.parameters(*_MATMUL_TESTS)
286
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
+ assert (sl % ne) == 0
288
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
+ w = torch.randn((ne, hs, fhs)).cuda().half()
290
+ out = torch.bmm(x, w)
291
+ w = w.transpose(1, 2).contiguous()
292
+
293
+ def benchmark():
294
+ return torch.bmm(out, w)
295
+
296
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
+ arguments = {
298
+ 'sequence_length': sl,
299
+ 'hidden_size': hs,
300
+ 'ffn_hidden_size': fhs,
301
+ 'num_experts': ne,
302
+ }
303
+ log_benchmark(
304
+ '0::GradX::DDD::NN',
305
+ arguments,
306
+ mean_t,
307
+ std_t,
308
+ x.numel() * fhs * 2,
309
+ )
310
+
311
+ @parameterized.parameters(*_MATMUL_TESTS)
312
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
+ assert (sl % ne) == 0
314
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
+ w = torch.randn((ne, hs, fhs)).cuda().half()
316
+ out = torch.bmm(x, w)
317
+ out = out.transpose(1, 2)
318
+
319
+ def benchmark():
320
+ return torch.bmm(out, x)
321
+
322
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
+ arguments = {
324
+ 'sequence_length': sl,
325
+ 'hidden_size': hs,
326
+ 'ffn_hidden_size': fhs,
327
+ 'num_experts': ne,
328
+ }
329
+ log_benchmark(
330
+ '0::GradW::DDD::TN',
331
+ arguments,
332
+ mean_t,
333
+ std_t,
334
+ x.numel() * fhs * 2,
335
+ )
336
+
337
+ @parameterized.parameters(*_MATMUL_TESTS)
338
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
+ assert (sl % ne) == 0
340
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
+ w = torch.randn((ne, fhs, hs)).cuda().half()
342
+
343
+ def benchmark():
344
+ return torch.bmm(x, w)
345
+
346
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
+ arguments = {
348
+ 'sequence_length': sl,
349
+ 'hidden_size': hs,
350
+ 'ffn_hidden_size': fhs,
351
+ 'num_experts': ne,
352
+ }
353
+ log_benchmark(
354
+ '1::Fwd::DDD::NN',
355
+ arguments,
356
+ mean_t,
357
+ std_t,
358
+ x.numel() * hs * 2,
359
+ )
360
+
361
+ @parameterized.parameters(*_MATMUL_TESTS)
362
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
+ assert (sl % ne) == 0
364
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
+ w = torch.randn((ne, fhs, hs)).cuda().half()
366
+ out = torch.bmm(x, w)
367
+ w = torch.transpose(w, 1, 2)
368
+
369
+ def benchmark():
370
+ return torch.bmm(out, w)
371
+
372
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
+ arguments = {
374
+ 'sequence_length': sl,
375
+ 'hidden_size': hs,
376
+ 'ffn_hidden_size': fhs,
377
+ 'num_experts': ne,
378
+ }
379
+ log_benchmark(
380
+ '1::GradX::DDD::NT',
381
+ arguments,
382
+ mean_t,
383
+ std_t,
384
+ x.numel() * hs * 2,
385
+ )
386
+
387
+ @parameterized.parameters(*_MATMUL_TESTS)
388
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
+ assert (sl % ne) == 0
390
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
+ w = torch.randn((ne, fhs, hs)).cuda().half()
392
+ out = torch.bmm(x, w)
393
+ x = torch.transpose(x, 1, 2)
394
+
395
+ def benchmark():
396
+ return torch.bmm(x, out)
397
+
398
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
+ arguments = {
400
+ 'sequence_length': sl,
401
+ 'hidden_size': hs,
402
+ 'ffn_hidden_size': fhs,
403
+ 'num_experts': ne,
404
+ }
405
+ log_benchmark(
406
+ '1::GradW::DDD::TN',
407
+ arguments,
408
+ mean_t,
409
+ std_t,
410
+ x.numel() * hs * 2,
411
+ )
412
+
413
+
414
+ if __name__ == '__main__':
415
+ unittest.main()
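A self-contained check (not in the diff) of the `transpose_view` trick used near the top of this file: swapping sizes and strides via torch.as_strided yields the same view as .t() while skipping the transpose call chain.

    import torch

    x = torch.randn(4, 8)
    xt = torch.as_strided(x, (x.shape[1], x.shape[0]), (x.stride()[1], x.stride()[0]))
    assert torch.equal(xt, x.t())           # same values
    assert xt.data_ptr() == x.data_ptr()    # same storage, no copy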
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_gather.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+ from .stk_autocast import custom_bwd, custom_fwd
7
+
8
+ from ..backend import kernels
9
+
10
+
11
+ # Autograd wrapper for padded_gather kernel.
12
+ class PaddedGatherOp(torch.autograd.Function):
13
+
14
+ @staticmethod
15
+ @custom_fwd
16
+ def forward(
17
+ ctx: Any,
18
+ x: torch.Tensor,
19
+ indices: torch.Tensor,
20
+ bin_ids: torch.Tensor,
21
+ bins: torch.Tensor,
22
+ padded_bins: torch.Tensor,
23
+ top_k: int,
24
+ ):
25
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
26
+ ctx.top_k = top_k
27
+ return kernels.padded_gather(
28
+ x,
29
+ indices,
30
+ bin_ids,
31
+ None,
32
+ bins,
33
+ padded_bins,
34
+ top_k,
35
+ )
36
+
37
+ @staticmethod
38
+ @custom_bwd
39
+ def backward(ctx: Any, grad: torch.Tensor):
40
+ grad = grad.contiguous()
41
+
42
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
43
+ out = kernels.padded_scatter(
44
+ grad,
45
+ indices,
46
+ bin_ids,
47
+ None,
48
+ bins,
49
+ padded_bins,
50
+ ctx.top_k,
51
+ )
52
+ return out, None, None, None, None, None
53
+
54
+
55
+ padded_gather = PaddedGatherOp.apply
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_scatter.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from typing import Any
4
+
5
+ import torch
6
+ from .stk_autocast import custom_bwd, custom_fwd
7
+
8
+ from ..backend import kernels
9
+
10
+
11
+ # Autograd wrapper for padded_scatter kernel.
12
+ class PaddedScatterOp(torch.autograd.Function):
13
+
14
+ @staticmethod
15
+ @custom_fwd
16
+ def forward(
17
+ ctx: Any,
18
+ x: torch.Tensor,
19
+ indices: torch.Tensor,
20
+ bin_ids: torch.Tensor,
21
+ weights: torch.Tensor,
22
+ bins: torch.Tensor,
23
+ padded_bins: torch.Tensor,
24
+ top_k: int,
25
+ ):
26
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
27
+ ctx.save_for_backward(
28
+ indices,
29
+ bin_ids,
30
+ weights,
31
+ bins,
32
+ padded_bins,
33
+ *maybe_x,
34
+ )
35
+ ctx.top_k = top_k
36
+ ctx.x_shape = x.shape
37
+ return kernels.padded_scatter(
38
+ x,
39
+ indices,
40
+ bin_ids,
41
+ weights,
42
+ bins,
43
+ padded_bins,
44
+ top_k,
45
+ )
46
+
47
+ @staticmethod
48
+ @custom_bwd
49
+ def backward(ctx: Any, grad: torch.Tensor):
50
+ grad = grad.contiguous()
51
+ saved_tensors = ctx.saved_tensors
52
+
53
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
54
+ dgrad = None
55
+ if ctx.needs_input_grad[0]:
56
+ dgrad = kernels.padded_gather(
57
+ grad,
58
+ indices,
59
+ bin_ids,
60
+ weights,
61
+ bins,
62
+ padded_bins,
63
+ ctx.top_k,
64
+ )
65
+
66
+ wgrad = None
67
+ if ctx.needs_input_grad[3]: # need wgrad
68
+ x = saved_tensors[-1]
69
+ wgrad = kernels.padded_scatter_wgrad(
70
+ x,
71
+ grad,
72
+ indices,
73
+ bin_ids,
74
+ bins,
75
+ padded_bins,
76
+ ctx.top_k,
77
+ )
78
+ return dgrad, None, None, wgrad, None, None, None, None
79
+
80
+
81
+ def padded_scatter(
82
+ x: torch.Tensor,
83
+ indices: torch.Tensor,
84
+ bin_ids: torch.Tensor,
85
+ weights: torch.Tensor,
86
+ bins: torch.Tensor,
87
+ padded_bins: torch.Tensor,
88
+ top_k: int,
89
+ ):
90
+ return PaddedScatterOp.apply(
91
+ x,
92
+ indices,
93
+ bin_ids,
94
+ weights,
95
+ bins,
96
+ padded_bins,
97
+ top_k,
98
+ )
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import unittest
5
+
6
+ import torch
7
+ from absl.testing import parameterized
8
+
9
+ from .. import benchmark_util, ops
10
+
11
+ _PADDED_SCATTER_BENCHMARK = (
12
+ # dMoE-Medium, 8-way EMP.
13
+ (1024 * 16, 1024, 8, 4),
14
+ # dMoE-Medium, post-all-to-all.
15
+ (1024 * 16 * 4, 1024, 8, 1),
16
+ )
17
+
18
+
19
+ class PaddedScatterTest(parameterized.TestCase):
20
+
21
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
+ def testPaddedScatter(self, sl, hs, ne, top_k):
23
+ # Create the data and indices.
24
+ x = torch.randn((sl, hs)).cuda().half()
25
+
26
+ # Randomly assign tokens to experts.
27
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
+ bin_ids, indices = ops.sort(top_expert)
29
+ tokens_per_expert = ops.histogram(top_expert, ne)
30
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
+
34
+ # Sample weights for the scatter reduce.
35
+ weights = torch.rand((sl * top_k,)).cuda().half()
36
+
37
+ # Gather the data to prepare for backwards.
38
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
+
40
+ def benchmark():
41
+ return ops.padded_scatter(
42
+ x,
43
+ indices,
44
+ bin_ids,
45
+ weights,
46
+ bins,
47
+ padded_bins,
48
+ top_k,
49
+ )
50
+
51
+ time, std = benchmark_util.benchmark_function(benchmark)
52
+ benchmark_util.log_benchmark(
53
+ 'Padded Scatter',
54
+ {
55
+ 'sequence_length': sl,
56
+ 'hidden_size': hs,
57
+ 'num_experts': ne,
58
+ 'top_k': top_k,
59
+ },
60
+ time,
61
+ std,
62
+ )
63
+
64
+
65
+ if __name__ == '__main__':
66
+ unittest.main()
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/permute_benchmark.py ADDED
@@ -0,0 +1,149 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import unittest
5
+
6
+ import torch
7
+ from absl.testing import parameterized
8
+
9
+ from .. import benchmark_util, ops
10
+
11
+ _PERMUTE_TESTS = (
12
+ (16384, 768, 2),
13
+ (16384, 768, 4),
14
+ (16384, 768, 8),
15
+ (16384, 768, 16),
16
+ (16384, 768, 32),
17
+ (16384, 768, 64),
18
+ (16384, 768, 128),
19
+ (16384 * 8, 768, 2),
20
+ (16384 * 8, 768, 4),
21
+ (16384 * 8, 768, 8),
22
+ (16384 * 8, 768, 16),
23
+ (16384 * 8, 768, 32),
24
+ (16384 * 8, 768, 64),
25
+ (16384 * 8, 768, 128),
26
+ )
27
+
28
+
29
+ class PermuteBenchmark(parameterized.TestCase):
30
+
31
+ @parameterized.parameters(*_PERMUTE_TESTS)
32
+ def testBinnedGather(self, sl, hs, ne):
33
+ # NOTE: Capacity factor == 1.
34
+ ec = sl // ne
35
+
36
+ # Create the data and indices.
37
+ x = torch.randn((sl, hs)).cuda().half()
38
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
+ bin_ids, indices = ops.sort(top_expert)
40
+ tokens_per_expert = ops.histogram(indices, ne)
41
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
+
43
+ def benchmark():
44
+ return ops.binned_gather(x, indices, bins, ec)
45
+
46
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
+ arguments = {
48
+ 'sequence_length': sl,
49
+ 'hidden_size': hs,
50
+ 'num_experts': ne,
51
+ }
52
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
+
54
+ @parameterized.parameters(*_PERMUTE_TESTS)
55
+ def testBinnedScatter(self, sl, hs, ne):
56
+ # NOTE: Capacity factor == 1.
57
+ ec = sl // ne
58
+
59
+ # Create the data and indices.
60
+ x = torch.randn((sl, hs)).cuda().half()
61
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
+ bin_ids, indices = ops.sort(top_expert)
63
+ tokens_per_expert = ops.histogram(indices, ne)
64
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+ x = ops.binned_gather(x, indices, bins, ec)
66
+
67
+ def benchmark():
68
+ return ops.binned_scatter(x, indices, bins)
69
+
70
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
+ arguments = {
72
+ 'sequence_length': sl,
73
+ 'hidden_size': hs,
74
+ 'num_experts': ne,
75
+ }
76
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
+
78
+ @parameterized.parameters(*_PERMUTE_TESTS)
79
+ def testPaddedGather(self, sl, hs, ne):
80
+ # Create the data and indices.
81
+ x = torch.randn((sl, hs)).cuda().half()
82
+
83
+ # Randomly assign tokens to experts.
84
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
+ bin_ids, indices = ops.sort(top_expert)
86
+ tokens_per_expert = ops.histogram(top_expert, ne)
87
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+
91
+ def benchmark():
92
+ return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
+
94
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
+ arguments = {
96
+ 'sequence_length': sl,
97
+ 'hidden_size': hs,
98
+ 'num_experts': ne,
99
+ }
100
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
+
102
+ @parameterized.parameters(*_PERMUTE_TESTS)
103
+ def testPaddedScatter(self, sl, hs, ne):
104
+ # Create the data and indices.
105
+ x = torch.randn((sl, hs)).cuda().half()
106
+
107
+ # Randomly assign tokens to experts.
108
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
+ bin_ids, indices = ops.sort(top_expert)
110
+ tokens_per_expert = ops.histogram(top_expert, ne)
111
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
+
116
+ def benchmark():
117
+ return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
+
119
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ arguments = {
121
+ 'sequence_length': sl,
122
+ 'hidden_size': hs,
123
+ 'num_experts': ne,
124
+ }
125
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
+
127
+ @parameterized.parameters(*_PERMUTE_TESTS)
128
+ def testCopy(self, sl, hs, ne):
129
+ # NOTE: Capacity factor == 1.
130
+ # ec = sl // ne
131
+
132
+ # Create the data and indices.
133
+ x = torch.randn((sl, hs)).cuda().half()
134
+ y = x.clone()
135
+
136
+ def benchmark():
137
+ return y.copy_(x)
138
+
139
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
+ arguments = {
141
+ 'sequence_length': sl,
142
+ 'hidden_size': hs,
143
+ 'num_experts': ne,
144
+ }
145
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
+
147
+
148
+ if __name__ == '__main__':
149
+ unittest.main()
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/repeat.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+
6
+
7
+ def repeat(x: torch.Tensor, tiling: torch.Size):
8
+ if all((t == 1 for t in tiling)):
9
+ return x
10
+ return x.repeat(*tiling)
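A quick illustration (not in the diff) of the early-return behaviour of `repeat` above, assuming the helper is importable; the exact module path may differ.

    import torch
    from megablocks.ops.repeat import repeat  # assumed import path

    x = torch.arange(6).reshape(2, 3)
    assert repeat(x, torch.Size((1, 1))) is x              # all-ones tiling: no copy
    assert repeat(x, torch.Size((2, 1))).shape == (4, 3)   # tiled along dim 0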
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/replicate.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any
5
+
6
+ # NOTE: Torch needs to be imported before the custom
7
+ # extensions. Otherwise libc10.so cannot be found.
8
+ import torch
9
+
10
+ # Wrap this in a try-block with better error message and
11
+ # instructions for building the c++ operations.
12
+ try:
13
+ from .._ops import ops # type: ignore
14
+ except ModuleNotFoundError as e:
15
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
16
+
17
+
18
+ # Autograd wrapper for replicate kernel.
19
+ class ReplicateOp(torch.autograd.Function):
20
+
21
+ @staticmethod
22
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
23
+ ctx.save_for_backward(bins)
24
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
25
+ ops.replicate_forward(x, bins, out)
26
+ return out
27
+
28
+ @staticmethod
29
+ def backward(ctx: Any, grad: torch.Tensor):
30
+ bins, = ctx.saved_tensors
31
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
32
+ ops.replicate_backward(grad, bins, out)
33
+ return out, None, None
34
+
35
+
36
+ replicate = ReplicateOp.apply
build/torch28-cxx11-rocm64-x86_64-linux/megablocks/ops/round_up.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+
6
+
7
+ def round_up(x: torch.Tensor, value: int):
8
+ assert isinstance(value, int)
9
+ assert x.dtype == torch.int32
10
+
11
+ # TODO(tgale): If this becomes an issue,
12
+ # do this in a custom kernel. We only expect
13
+ # to use this on arrays of less than 1k elements.
14
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
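For reference (not part of the committed file), the trunc-division formula above rounds each int32 entry up to the next multiple of `value`:

    import torch

    x = torch.tensor([3, 128, 130], dtype=torch.int32)
    value = 128
    rounded = torch.div(x + (value - 1), value, rounding_mode='trunc') * value
    # rounded -> tensor([128, 128, 256], dtype=torch.int32)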