danieldk committed
Commit 29e93ec · 1 Parent(s): 8d5a3ce

Add MoE kernels from vLLM

TODO: add MoE configs, but this requires a change to the builder.

README.md CHANGED
@@ -1,3 +1,7 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ ## MoE
6
+
7
+ MoE kernels from [vLLM](https://github.com/vllm-project/).
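The commit also adds Python bindings (`ext-torch/_custom_ops.py`) that expose these kernels as torch ops. A minimal usage sketch for the `silu_and_mul` activation op, assuming a CUDA device and that the built `moe` package is importable; shapes follow the kernel's documented layout (`input` is `[..., 2 * d]`, `out` is `[..., d]`):

```python
import torch
import moe._custom_ops as ops  # wrapper module added in this commit

d = 128
x = torch.randn(4, 2 * d, dtype=torch.float16, device="cuda")  # [..., 2 * d]
out = torch.empty(4, d, dtype=torch.float16, device="cuda")    # [..., d]
ops.silu_and_mul(out, x)  # out = silu(x[..., :d]) * x[..., d:]
```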
activation/activation_kernels.cu ADDED
@@ -0,0 +1,204 @@
1
+ #include <ATen/cuda/CUDAContext.h>
2
+ #include <torch/all.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+
5
+ #include <cmath>
6
+
7
+ #include "cuda_compat.h"
8
+ #include "dispatch_utils.h"
9
+
10
+ namespace vllm {
11
+
12
+ // Activation and gating kernel template.
13
+ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
14
+ __global__ void act_and_mul_kernel(
15
+ scalar_t* __restrict__ out, // [..., d]
16
+ const scalar_t* __restrict__ input, // [..., 2, d]
17
+ const int d) {
18
+ const int64_t token_idx = blockIdx.x;
19
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
20
+ const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
21
+ const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
22
+ out[token_idx * d + idx] = ACT_FN(x) * y;
23
+ }
24
+ }
25
+
26
+ template <typename T>
27
+ __device__ __forceinline__ T silu_kernel(const T& x) {
28
+ // x * sigmoid(x)
29
+ return (T)(((float)x) / (1.0f + expf((float)-x)));
30
+ }
31
+
32
+ template <typename T>
33
+ __device__ __forceinline__ T gelu_kernel(const T& x) {
34
+ // Equivalent to PyTorch GELU with 'none' approximation.
35
+ // Refer to:
36
+ // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
37
+ const float f = (float)x;
38
+ constexpr float ALPHA = M_SQRT1_2;
39
+ return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
40
+ }
41
+
42
+ template <typename T>
43
+ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
44
+ // Equivalent to PyTorch GELU with 'tanh' approximation.
45
+ // Refer to:
46
+ // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
47
+ const float f = (float)x;
48
+ constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
49
+ constexpr float KAPPA = 0.044715;
50
+ float x_cube = f * f * f;
51
+ float inner = BETA * (f + KAPPA * x_cube);
52
+ return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
53
+ }
54
+
55
+ } // namespace vllm
56
+
57
+ // Launch activation and gating kernel.
58
+ #define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
59
+ int d = input.size(-1) / 2; \
60
+ int64_t num_tokens = input.numel() / input.size(-1); \
61
+ dim3 grid(num_tokens); \
62
+ dim3 block(std::min(d, 1024)); \
63
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
64
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
65
+ VLLM_DISPATCH_FLOATING_TYPES( \
66
+ input.scalar_type(), "act_and_mul_kernel", [&] { \
67
+ vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
68
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
69
+ input.data_ptr<scalar_t>(), d); \
70
+ });
71
+
72
+ void silu_and_mul(torch::Tensor& out, // [..., d]
73
+ torch::Tensor& input) // [..., 2 * d]
74
+ {
75
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
76
+ }
77
+
78
+ //void gelu_and_mul(torch::Tensor& out, // [..., d]
79
+ // torch::Tensor& input) // [..., 2 * d]
80
+ //{
81
+ // LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
82
+ //}
83
+
84
+ //void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
85
+ // torch::Tensor& input) // [..., 2 * d]
86
+ //{
87
+ // LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
88
+ //}
89
+
90
+ namespace vllm {
91
+
92
+ template <typename T>
93
+ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
94
+ const float f = (float)x;
95
+ return (T)(f > threshold ? f : 0.0f);
96
+ }
97
+
98
+ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
99
+ __global__ void act_and_mul_kernel_with_param(
100
+ scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
101
+ const float param) {
102
+ const int64_t token_idx = blockIdx.x;
103
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
104
+ const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
105
+ const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
106
+ out[token_idx * d + idx] = ACT_FN(x, param) * y;
107
+ }
108
+ }
109
+
110
+ } // namespace vllm
111
+
112
+ #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
113
+ int d = input.size(-1) / 2; \
114
+ int64_t num_tokens = input.numel() / input.size(-1); \
115
+ dim3 grid(num_tokens); \
116
+ dim3 block(std::min(d, 1024)); \
117
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
118
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
119
+ VLLM_DISPATCH_FLOATING_TYPES( \
120
+ input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
121
+ vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
122
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
123
+ input.data_ptr<scalar_t>(), d, \
124
+ PARAM); \
125
+ });
126
+
127
+ //void fatrelu_and_mul(torch::Tensor& out, // [..., d],
128
+ // torch::Tensor& input, // [..., 2 * d]
129
+ // double threshold) {
130
+ // LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
131
+ //}
132
+ namespace vllm {
133
+
134
+ // Element-wise activation kernel template.
135
+ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
136
+ __global__ void activation_kernel(
137
+ scalar_t* __restrict__ out, // [..., d]
138
+ const scalar_t* __restrict__ input, // [..., d]
139
+ const int d) {
140
+ const int64_t token_idx = blockIdx.x;
141
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
142
+ const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
143
+ out[token_idx * d + idx] = ACT_FN(x);
144
+ }
145
+ }
146
+
147
+ } // namespace vllm
148
+
149
+ // Launch element-wise activation kernel.
150
+ #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
151
+ int d = input.size(-1); \
152
+ int64_t num_tokens = input.numel() / d; \
153
+ dim3 grid(num_tokens); \
154
+ dim3 block(std::min(d, 1024)); \
155
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
156
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
157
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
158
+ vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
159
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
160
+ input.data_ptr<scalar_t>(), d); \
161
+ });
162
+
163
+ namespace vllm {
164
+
165
+ template <typename T>
166
+ __device__ __forceinline__ T gelu_new_kernel(const T& x) {
167
+ const float x3 = (float)(x * x * x);
168
+ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
169
+ return ((T)0.5) * x * (((T)1.0) + t);
170
+ }
171
+
172
+ template <typename T>
173
+ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
174
+ const float f = (float)x;
175
+ const T t =
176
+ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
177
+ return ((T)0.5) * x * (((T)1.0) + t);
178
+ }
179
+
180
+ template <typename T>
181
+ __device__ __forceinline__ T gelu_quick_kernel(const T& x) {
182
+ // x * sigmoid(1.702 * x)
183
+ return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
184
+ }
185
+
186
+ } // namespace vllm
187
+
188
+ //void gelu_new(torch::Tensor& out, // [..., d]
189
+ // torch::Tensor& input) // [..., d]
190
+ //{
191
+ // LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
192
+ //}
193
+
194
+ //void gelu_fast(torch::Tensor& out, // [..., d]
195
+ // torch::Tensor& input) // [..., d]
196
+ //{
197
+ // LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
198
+ //}
199
+
200
+ //void gelu_quick(torch::Tensor& out, // [..., d]
201
+ // torch::Tensor& input) // [..., d]
202
+ //{
203
+ // LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
204
+ //}
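For reference, a pure-PyTorch sketch of what `act_and_mul_kernel` computes when instantiated with `silu_kernel` (useful as an illustrative correctness check against the CUDA op; not part of the commit):

```python
import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    """Reference for act_and_mul_kernel<..., silu_kernel>: input [..., 2*d] -> output [..., d]."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]
```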
activation/cuda_compat.h ADDED
@@ -0,0 +1,49 @@
1
+ #pragma once
2
+
3
+ #ifdef USE_ROCM
4
+ #include <hip/hip_runtime.h>
5
+ #endif
6
+
7
+ #ifndef USE_ROCM
8
+ #define WARP_SIZE 32
9
+ #else
10
+ #define WARP_SIZE warpSize
11
+ #endif
12
+
13
+ #ifndef USE_ROCM
14
+ #define VLLM_LDG(arg) __ldg(arg)
15
+ #else
16
+ #define VLLM_LDG(arg) *(arg)
17
+ #endif
18
+
19
+ #ifndef USE_ROCM
20
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
21
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask)
22
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
23
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
24
+ #else
25
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
26
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
27
+ __shfl_xor(var, lane_mask, width)
28
+ #endif
29
+
30
+ #ifndef USE_ROCM
31
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
32
+ #else
33
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
34
+ #endif
35
+
36
+ #ifndef USE_ROCM
37
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
38
+ __shfl_down_sync(uint32_t(-1), var, lane_delta)
39
+ #else
40
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
41
+ #endif
42
+
43
+ #ifndef USE_ROCM
44
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
45
+ cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
46
+ #else
47
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
48
+ hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
49
+ #endif
activation/dispatch_utils.h ADDED
@@ -0,0 +1,35 @@
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
4
+ */
5
+ #pragma once
6
+
7
+ #include <torch/all.h>
8
+
9
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
10
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
11
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
12
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
13
+
14
+ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
15
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
16
+
17
+ #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
18
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
19
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
20
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
21
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
22
+
23
+ #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
24
+ AT_DISPATCH_SWITCH(TYPE, NAME, \
25
+ VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
26
+
27
+ #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \
28
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
29
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
30
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
31
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
32
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
33
+
34
+ #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
35
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
build.toml ADDED
@@ -0,0 +1,49 @@
1
+ [general]
2
+ version = "0.0.1"
3
+
4
+ [torch]
5
+ name = "moe"
6
+ src = [
7
+ "core/scalar_type.hpp",
8
+ "ext-torch/registration.h",
9
+ "ext-torch/torch_binding.cpp",
10
+ "ext-torch/torch_binding.h"
11
+ ]
12
+ include = [ "." ]
13
+ pyroot = "ext-torch"
14
+
15
+ [kernel.moe]
16
+ capabilities = [ "7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0" ]
17
+ src = [
18
+ "cuda_compat.h",
19
+ "dispatch_utils.h",
20
+ "moe/moe_align_sum_kernels.cu",
21
+ "moe/topk_softmax_kernels.cu",
22
+ ]
23
+ depends = [ "torch" ]
24
+
25
+ [kernel.moe-marlin]
26
+ capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0" ]
27
+ src = [
28
+ "core/exception.hpp",
29
+ "core/scalar_type.hpp",
30
+ "marlin-moe/marlin_moe_ops.cu",
31
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku4.cu",
32
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu",
33
+ "marlin-moe/marlin_kernels/marlin_moe_kernel.h",
34
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku4.h",
35
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku4b8.h",
36
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu",
37
+ "marlin-moe/marlin_kernels/marlin_moe_kernel_ku8b128.h",
38
+ ]
39
+ include = [ "." ]
40
+ depends = [ "torch" ]
41
+
42
+ [kernel.activation]
43
+ capabilities = [ "7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0" ]
44
+ src = [
45
+ "activation/activation_kernels.cu",
46
+ "activation/cuda_compat.h",
47
+ "activation/dispatch_utils.h",
48
+ ]
49
+ depends = [ "torch" ]
core/exception.hpp ADDED
@@ -0,0 +1,3 @@
1
+ #pragma once
2
+
3
+ #define VLLM_IMPLIES(p, q) (!(p) || (q))
core/scalar_type.hpp ADDED
@@ -0,0 +1,347 @@
1
+ #pragma once
2
+
3
+ // For TORCH_CHECK
4
+ #include <torch/library.h>
5
+
6
+ namespace vllm {
7
+
8
+ //
9
+ // ScalarType can represent a wide range of floating point and integer types,
10
+ // in particular it can be used to represent sub-byte data types (something
11
+ // that torch.dtype currently does not support).
12
+ //
13
+ // The type definitions on the Python side can be found in vllm/scalar_type.py;
14
+ // these type definitions should be kept up to date with any Python API changes
15
+ // here.
16
+ //
17
+ class ScalarType {
18
+ public:
19
+ enum NanRepr : uint8_t {
20
+ NAN_NONE = 0, // nans are not supported
21
+ NAN_IEEE_754 = 1, // nans are: exp all 1s, mantissa not all 0s
22
+ NAN_EXTD_RANGE_MAX_MIN = 2, // nans are: exp all 1s, mantissa all 1s
23
+
24
+ NAN_REPR_ID_MAX
25
+ };
26
+
27
+ constexpr ScalarType(uint8_t exponent, uint8_t mantissa, bool signed_,
28
+ int32_t bias, bool finite_values_only = false,
29
+ NanRepr nan_repr = NAN_IEEE_754)
30
+ : exponent(exponent),
31
+ mantissa(mantissa),
32
+ signed_(signed_),
33
+ bias(bias),
34
+ finite_values_only(finite_values_only),
35
+ nan_repr(nan_repr){};
36
+
37
+ static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
38
+ return ScalarType(0, size_bits - 1, true, bias);
39
+ }
40
+
41
+ static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) {
42
+ return ScalarType(0, size_bits, false, bias);
43
+ }
44
+
45
+ // IEEE 754 compliant floating point type
46
+ static constexpr ScalarType float_IEEE754(uint8_t exponent,
47
+ uint8_t mantissa) {
48
+ TORCH_CHECK(mantissa > 0 && exponent > 0);
49
+ return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
50
+ }
51
+
52
+ // IEEE 754 non-compliant floating point type
53
+ static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa,
54
+ bool finite_values_only,
55
+ NanRepr nan_repr) {
56
+ TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
57
+ TORCH_CHECK(mantissa > 0 && exponent > 0);
58
+ TORCH_CHECK(nan_repr != NAN_IEEE_754,
59
+ "use `float_IEEE754` constructor for floating point types that "
60
+ "follow IEEE 754 conventions");
61
+ return ScalarType(exponent, mantissa, true, 0, finite_values_only,
62
+ nan_repr);
63
+ }
64
+
65
+ uint8_t const exponent; // size of the exponent field (0 for integer types)
66
+ uint8_t const mantissa; // size of the mantissa field (size of the integer
67
+ // excluding the sign bit for integer types)
68
+ bool const signed_; // flag if the type supports negative numbers (i.e. has a
69
+ // sign bit)
70
+ int32_t const bias; // stored values equal value + bias,
71
+ // used for quantized type
72
+
73
+ // Extra Floating point info
74
+ bool const finite_values_only; // i.e. no +/-inf if true
75
+ NanRepr const nan_repr; // how NaNs are represented
76
+ // (not applicable for integer types)
77
+
78
+ using Id = int64_t;
79
+
80
+ private:
81
+ // Field size in id
82
+ template <typename T_>
83
+ static constexpr size_t member_id_field_width() {
84
+ using T = std::decay_t<T_>;
85
+ return std::is_same_v<T, bool> ? 1 : sizeof(T) * 8;
86
+ }
87
+
88
+ template <typename Fn, typename Init, typename Member, typename... Rest>
89
+ static constexpr auto reduce_members_helper(Fn f, Init val, Member member,
90
+ Rest... rest) {
91
+ auto new_val = f(val, member);
92
+ if constexpr (sizeof...(rest) > 0) {
93
+ return reduce_members_helper(f, new_val, rest...);
94
+ } else {
95
+ return new_val;
96
+ };
97
+ }
98
+
99
+ template <typename Fn, typename Init>
100
+ constexpr auto reduce_members(Fn f, Init init) const {
101
+ // Should be in constructor order for `from_id`
102
+ return reduce_members_helper(f, init, exponent, mantissa, signed_, bias,
103
+ finite_values_only, nan_repr);
104
+ };
105
+
106
+ template <typename Fn, typename Init>
107
+ static constexpr auto reduce_member_types(Fn f, Init init) {
108
+ constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE);
109
+ return dummy_type.reduce_members(f, init);
110
+ };
111
+
112
+ static constexpr auto id_size_bits() {
113
+ return reduce_member_types(
114
+ [](int acc, auto member) -> int {
115
+ return acc + member_id_field_width<decltype(member)>();
116
+ },
117
+ 0);
118
+ }
119
+
120
+ public:
121
+ // unique id for this scalar type that can be computed at compile time for
122
+ // c++17 template specialization this is not needed once we migrate to
123
+ // c++20 and can pass literal classes as template parameters
124
+ constexpr Id id() const {
125
+ static_assert(id_size_bits() <= sizeof(Id) * 8,
126
+ "ScalarType id is too large to be stored");
127
+
128
+ auto or_and_advance = [](std::pair<Id, uint32_t> result,
129
+ auto member) -> std::pair<Id, uint32_t> {
130
+ auto [id, bit_offset] = result;
131
+ auto constexpr bits = member_id_field_width<decltype(member)>();
132
+ return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1))
133
+ << bit_offset,
134
+ bit_offset + bits};
135
+ };
136
+ return reduce_members(or_and_advance, std::pair<Id, uint32_t>{}).first;
137
+ }
138
+
139
+ // create a ScalarType from an id, for c++17 template specialization,
140
+ // this is not needed once we migrate to c++20 and can pass literal
141
+ // classes as template parameters
142
+ static constexpr ScalarType from_id(Id id) {
143
+ auto extract_and_advance = [id](auto result, auto member) {
144
+ using T = decltype(member);
145
+ auto [tuple, bit_offset] = result;
146
+ auto constexpr bits = member_id_field_width<T>();
147
+ auto extracted_val = static_cast<T>((int64_t(id) >> bit_offset) &
148
+ ((uint64_t(1) << bits) - 1));
149
+ auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val));
150
+ return std::pair<decltype(new_tuple), int>{new_tuple, bit_offset + bits};
151
+ };
152
+
153
+ auto [tuple_args, _] = reduce_member_types(extract_and_advance,
154
+ std::pair<std::tuple<>, int>{});
155
+ return std::apply([](auto... args) { return ScalarType(args...); },
156
+ tuple_args);
157
+ }
158
+
159
+ constexpr int64_t size_bits() const {
160
+ return mantissa + exponent + is_signed();
161
+ }
162
+ constexpr bool is_signed() const { return signed_; }
163
+ constexpr bool is_integer() const { return exponent == 0; }
164
+ constexpr bool is_floating_point() const { return exponent > 0; }
165
+ constexpr bool is_ieee_754() const {
166
+ return is_floating_point() && finite_values_only == false &&
167
+ nan_repr == NAN_IEEE_754;
168
+ }
169
+ constexpr bool has_nans() const {
170
+ return is_floating_point() && nan_repr != NAN_NONE;
171
+ }
172
+ constexpr bool has_infs() const {
173
+ return is_floating_point() && finite_values_only == false;
174
+ }
175
+ constexpr bool has_bias() const { return bias != 0; }
176
+
177
+ private:
178
+ double _floating_point_max() const {
179
+ TORCH_CHECK(mantissa <= 52 && exponent <= 11,
180
+ "Cannot represent max/min as a double for type ", str());
181
+
182
+ uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
183
+ if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
184
+ max_mantissa -= 1;
185
+ }
186
+
187
+ uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
188
+ if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
189
+ TORCH_CHECK(exponent < 11,
190
+ "Cannot represent max/min as a double for type ", str());
191
+ max_exponent += 1;
192
+ }
193
+
194
+ // adjust the exponent to match that of a double
195
+ // for now we assume the exponent bias is the standard 2^(e-1) -1, (where e
196
+ // is the exponent bits), there is some precedent for non-standard biases,
197
+ // example `float8_e4m3b11fnuz` here: https://github.com/jax-ml/ml_dtypes
198
+ // but to avoid premature over complication we are just assuming the
199
+ // standard exponent bias until there is a need to support non-standard
200
+ // biases
201
+ uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1;
202
+ uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1; // double e = 11
203
+
204
+ uint64_t max_exponent_double =
205
+ max_exponent - exponent_bias + exponent_bias_double;
206
+
207
+ // shift the mantissa into the position for a double and
208
+ // the exponent
209
+ uint64_t double_raw =
210
+ (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52);
211
+
212
+ return *reinterpret_cast<double*>(&double_raw);
213
+ }
214
+
215
+ constexpr std::variant<int64_t, double> _raw_max() const {
216
+ if (is_floating_point()) {
217
+ return {_floating_point_max()};
218
+ } else {
219
+ TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
220
+ "Cannot represent max as a int64_t");
221
+ return {(int64_t(1) << mantissa) - 1};
222
+ }
223
+ }
224
+
225
+ constexpr std::variant<int64_t, double> _raw_min() const {
226
+ if (is_floating_point()) {
227
+ TORCH_CHECK(is_signed(),
228
+ "We currently assume all floating point types are signed");
229
+ constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
230
+
231
+ double max = _floating_point_max();
232
+ uint64_t max_raw = *reinterpret_cast<uint64_t*>(&max);
233
+ uint64_t min_raw = max_raw | sign_bit_double;
234
+ return {*reinterpret_cast<double*>(&min_raw)};
235
+ } else {
236
+ TORCH_CHECK(!is_signed() || size_bits() <= 64,
237
+ "Cannot represent min as a int64_t");
238
+ if (is_signed()) {
239
+ // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
240
+ // then perform an arithmetic shift right to set all the bits above
241
+ // (size_bits() - 1) to 1
242
+ return {INT64_MIN >> (64 - size_bits())};
243
+ } else {
244
+ return {int64_t(0)};
245
+ }
246
+ }
247
+ }
248
+
249
+ public:
250
+ // Max representable value for this scalar type.
251
+ // (accounting for bias if there is one)
252
+ constexpr std::variant<int64_t, double> max() const {
253
+ return std::visit(
254
+ [this](auto x) -> std::variant<int64_t, double> { return {x - bias}; },
255
+ _raw_max());
256
+ }
257
+
258
+ // Min representable value for this scalar type.
259
+ // (accounting for bias if there is one)
260
+ constexpr std::variant<int64_t, double> min() const {
261
+ return std::visit(
262
+ [this](auto x) -> std::variant<int64_t, double> { return {x - bias}; },
263
+ _raw_min());
264
+ }
265
+
266
+ std::string str() const {
267
+ /* naming generally follows: https://github.com/jax-ml/ml_dtypes
268
+ * for floating point types (leading f) the scheme is:
269
+ * `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
270
+ * flags:
271
+ * - no-flags: means it follows IEEE 754 conventions
272
+ * - f: means finite values only (no infinities)
273
+ * - n: means nans are supported (non-standard encoding)
274
+ * for integer types the scheme is:
275
+ * `[u]int<size_bits>[b<bias>]`
276
+ * - if bias is not present it means its zero
277
+ */
278
+ if (is_floating_point()) {
279
+ auto ret = "float" + std::to_string(size_bits()) + "_e" +
280
+ std::to_string(exponent) + "m" + std::to_string(mantissa);
281
+ if (!is_ieee_754()) {
282
+ if (finite_values_only) {
283
+ ret += "f";
284
+ }
285
+ if (nan_repr != NAN_NONE) {
286
+ ret += "n";
287
+ }
288
+ }
289
+ return ret;
290
+ } else {
291
+ auto ret = ((is_signed()) ? "int" : "uint") + std::to_string(size_bits());
292
+ if (has_bias()) {
293
+ ret += "b" + std::to_string(bias);
294
+ }
295
+ return ret;
296
+ }
297
+ }
298
+
299
+ constexpr bool operator==(ScalarType const& other) const {
300
+ return mantissa == other.mantissa && exponent == other.exponent &&
301
+ bias == other.bias && signed_ == other.signed_ &&
302
+ finite_values_only == other.finite_values_only &&
303
+ nan_repr == other.nan_repr;
304
+ }
305
+ };
306
+
307
+ using ScalarTypeId = ScalarType::Id;
308
+
309
+ // "rust style" names generally following:
310
+ // https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
311
+ static inline constexpr auto kS4 = ScalarType::int_(4);
312
+ static inline constexpr auto kU4 = ScalarType::uint(4);
313
+ static inline constexpr auto kU4B8 = ScalarType::uint(4, 8);
314
+ static inline constexpr auto kS8 = ScalarType::int_(8);
315
+ static inline constexpr auto kU8 = ScalarType::uint(8);
316
+ static inline constexpr auto kU8B128 = ScalarType::uint(8, 128);
317
+
318
+ static inline constexpr auto kFE3M2f =
319
+ ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
320
+ static inline constexpr auto kFE4M3fn =
321
+ ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
322
+ static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
323
+ static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
324
+ static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
325
+
326
+ // Fixed width style names, generally following:
327
+ // https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L47-L57
328
+ static inline constexpr auto kInt4 = kS4;
329
+ static inline constexpr auto kUint4 = kU4;
330
+ static inline constexpr auto kUint4b8 = kU4B8;
331
+ static inline constexpr auto kInt8 = kS8;
332
+ static inline constexpr auto kUint8 = kU8;
333
+ static inline constexpr auto kUint8b128 = kU8B128;
334
+
335
+ static inline constexpr auto kFloat6_e3m2f = kFE3M2f;
336
+ static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn;
337
+ static inline constexpr auto kFloat8_e5m2 = kFE5M2;
338
+ static inline constexpr auto kFloat16_e8m7 = kFE8M7;
339
+ static inline constexpr auto kFloat16_e5m10 = kFE5M10;
340
+
341
+ // colloquial names
342
+ static inline constexpr auto kHalf = kFE5M10;
343
+ static inline constexpr auto kFloat16 = kHalf;
344
+ static inline constexpr auto kBFloat16 = kFE8M7;
345
+
346
+ static inline constexpr auto kFloat16Id = kFloat16.id();
347
+ }; // namespace vllm
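To illustrate the `id()` encoding above, here is a hedged Python sketch of the same bit packing (fields in constructor order; widths follow `member_id_field_width`: `bool` takes 1 bit, `uint8_t` 8 bits, `int32_t` 32 bits). It mirrors the C++ only for explanation and is not part of the commit:

```python
def scalar_type_id(exponent: int, mantissa: int, signed: bool, bias: int,
                   finite_values_only: bool = False, nan_repr: int = 1) -> int:
    """Pack ScalarType fields into an integer id, LSB first, in constructor order."""
    fields = [(exponent, 8), (mantissa, 8), (int(signed), 1),
              (bias, 32), (int(finite_values_only), 1), (nan_repr, 8)]
    id_, offset = 0, 0
    for value, bits in fields:
        id_ |= (value & ((1 << bits) - 1)) << offset
        offset += bits
    return id_

# e.g. kU4B8 = ScalarType::uint(4, 8): exponent=0, mantissa=4, unsigned, bias=8
uint4b8_id = scalar_type_id(0, 4, False, 8)
```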
cuda_compat.h ADDED
@@ -0,0 +1,49 @@
1
+ #pragma once
2
+
3
+ #ifdef USE_ROCM
4
+ #include <hip/hip_runtime.h>
5
+ #endif
6
+
7
+ #ifndef USE_ROCM
8
+ #define WARP_SIZE 32
9
+ #else
10
+ #define WARP_SIZE warpSize
11
+ #endif
12
+
13
+ #ifndef USE_ROCM
14
+ #define VLLM_LDG(arg) __ldg(arg)
15
+ #else
16
+ #define VLLM_LDG(arg) *(arg)
17
+ #endif
18
+
19
+ #ifndef USE_ROCM
20
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
21
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask)
22
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
23
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
24
+ #else
25
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
26
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
27
+ __shfl_xor(var, lane_mask, width)
28
+ #endif
29
+
30
+ #ifndef USE_ROCM
31
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
32
+ #else
33
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
34
+ #endif
35
+
36
+ #ifndef USE_ROCM
37
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
38
+ __shfl_down_sync(uint32_t(-1), var, lane_delta)
39
+ #else
40
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
41
+ #endif
42
+
43
+ #ifndef USE_ROCM
44
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
45
+ cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
46
+ #else
47
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
48
+ hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
49
+ #endif
dispatch_utils.h ADDED
@@ -0,0 +1,35 @@
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
4
+ */
5
+ #pragma once
6
+
7
+ #include <torch/all.h>
8
+
9
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
10
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
11
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
12
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
13
+
14
+ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
15
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
16
+
17
+ #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
18
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
19
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
20
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
21
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
22
+
23
+ #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
24
+ AT_DISPATCH_SWITCH(TYPE, NAME, \
25
+ VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
26
+
27
+ #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \
28
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
29
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
30
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
31
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
32
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
33
+
34
+ #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
35
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
ext-torch/__init__.py ADDED
@@ -0,0 +1 @@
1
+ import moe._custom_ops as ops
ext-torch/_custom_ops.py ADDED
@@ -0,0 +1,135 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ import torch
4
+
5
+ # Neuron builds ship a torch version that doesn't even have impl_abstract
6
+ if TYPE_CHECKING:
7
+
8
+ def register_fake(fn):
9
+ return lambda name: fn
10
+
11
+ else:
12
+ try:
13
+ from torch.library import register_fake
14
+ except ImportError:
15
+ from torch.library import impl_abstract as register_fake
16
+
17
+ try:
18
+ from ._ops import ops, add_op_namespace_prefix
19
+ except ImportError as e:
20
+ # Fallback for local development.
21
+ try:
22
+ import _moe
23
+
24
+ ops = torch.ops._moe  # importing the extension registers its ops under torch.ops
25
+
26
+ def add_op_namespace_prefix(op_name: str):
27
+ return f"_quantization::{op_name}"
28
+
29
+ except ImportError:
30
+ raise e
31
+
32
+ from .scalar_type import ScalarType
33
+
34
+ def gptq_marlin_moe_repack(
35
+ b_q_weight: torch.Tensor,
36
+ perm: torch.Tensor,
37
+ size_k: int,
38
+ size_n: int,
39
+ num_bits: int,
40
+ ) -> torch.Tensor:
41
+ num_experts = b_q_weight.shape[0]
42
+ assert size_k % 16 == 0
43
+ output = torch.empty(
44
+ (num_experts, size_k // 16, size_n * (num_bits // 2)),
45
+ device=b_q_weight.device,
46
+ dtype=b_q_weight.dtype,
47
+ )
48
+ for e in range(num_experts):
49
+ output[e] = ops.gptq_marlin_repack(
50
+ b_q_weight[e], perm[e], size_k, size_n, num_bits
51
+ )
52
+ return output
53
+
54
+
55
+ def awq_marlin_moe_repack(
56
+ b_q_weight: torch.Tensor,
57
+ perm: torch.Tensor,
58
+ size_k: int,
59
+ size_n: int,
60
+ num_bits: int,
61
+ ) -> torch.Tensor:
62
+ num_experts = b_q_weight.shape[0]
63
+ assert size_k % 16 == 0
64
+ output = torch.empty(
65
+ (num_experts, size_k // 16, size_n * (num_bits // 2)),
66
+ device=b_q_weight.device,
67
+ dtype=b_q_weight.dtype,
68
+ )
69
+ for e in range(num_experts):
70
+ output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits)
71
+ return output
72
+
73
+
74
+ def moe_sum(input: torch.Tensor, output: torch.Tensor):
75
+ ops.moe_sum(input, output)
76
+
77
+
78
+ def moe_align_block_size(
79
+ topk_ids: torch.Tensor,
80
+ num_experts: int,
81
+ block_size: int,
82
+ sorted_token_ids: torch.Tensor,
83
+ experts_ids: torch.Tensor,
84
+ num_tokens_post_pad: torch.Tensor,
85
+ ) -> None:
86
+ ops.moe_align_block_size(
87
+ topk_ids,
88
+ num_experts,
89
+ block_size,
90
+ sorted_token_ids,
91
+ experts_ids,
92
+ num_tokens_post_pad,
93
+ )
94
+
95
+
96
+ def topk_softmax(
97
+ topk_weights: torch.Tensor,
98
+ topk_ids: torch.Tensor,
99
+ token_expert_indicies: torch.Tensor,
100
+ gating_output: torch.Tensor,
101
+ ) -> None:
102
+ ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
103
+
104
+ if hasattr(ops, "marlin_gemm_moe"):
105
+
106
+ @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
107
+ def marlin_gemm_moe_fake(
108
+ a: torch.Tensor,
109
+ b_q_weights: torch.Tensor,
110
+ sorted_ids: torch.Tensor,
111
+ topk_weights: torch.Tensor,
112
+ topk_ids: torch.Tensor,
113
+ b_scales: torch.Tensor,
114
+ b_zero_points: torch.Tensor,
115
+ g_idx: torch.Tensor,
116
+ perm: torch.Tensor,
117
+ workspace: torch.Tensor,
118
+ b_q_type: ScalarType,
119
+ size_m: torch.SymInt,
120
+ size_n: torch.SymInt,
121
+ size_k: torch.SymInt,
122
+ is_k_full: bool,
123
+ num_experts: int,
124
+ topk: int,
125
+ moe_block_size: int,
126
+ replicate_input: bool,
127
+ apply_weights: bool,
128
+ ) -> torch.Tensor:
129
+ return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
130
+
131
+
132
+
133
+ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
134
+ ops.silu_and_mul(out, x)
135
+ return out
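A hedged usage sketch for the `topk_softmax` wrapper above. The output tensors are assumed to be preallocated as `(num_tokens, topk)` and `gating_output` as `(num_tokens, num_experts)`, following the usual vLLM calling convention; the shapes and dtypes here are assumptions, not taken from this commit:

```python
import torch
import moe._custom_ops as ops

num_tokens, num_experts, topk = 8, 4, 2
gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=torch.float32)
topk_weights = torch.empty(num_tokens, topk, device="cuda", dtype=torch.float32)
topk_ids = torch.empty(num_tokens, topk, device="cuda", dtype=torch.int32)
token_expert_indicies = torch.empty(num_tokens, topk, device="cuda", dtype=torch.int32)

ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
```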
ext-torch/fp8.py ADDED
@@ -0,0 +1,63 @@
1
+ import torch
2
+
3
+ from typing import Tuple, Optional, Union
4
+
5
+
6
+ def is_hip() -> bool:
7
+ return torch.version.hip is not None
8
+
9
+
10
+ def scaled_fp8_quant(
11
+ input: torch.Tensor,
12
+ scale: Optional[torch.Tensor] = None,
13
+ num_token_padding: Optional[int] = None,
14
+ scale_ub: Optional[torch.Tensor] = None,
15
+ use_per_token_if_dynamic: bool = False,
16
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
17
+ """
18
+ Quantize input tensor to FP8 and return quantized tensor and scale.
19
+
20
+ This function supports both static and dynamic quantization: If you
21
+ provide the scale, it will use static scaling and if you omit it,
22
+ the scale will be determined dynamically. The function also allows
23
+ optional padding of the output tensors for downstream kernels that
24
+ will benefit from padding.
25
+
26
+ Args:
27
+ input: The input tensor to be quantized to FP8
28
+ scale: Optional scaling factor for the FP8 quantization
29
+ scale_ub: Optional upper bound for scaling factor in dynamic
30
+ per token case
31
+ num_token_padding: If specified, pad the first dimension
32
+ of the output to at least this value.
33
+ use_per_token_if_dynamic: Whether to do per_tensor or per_token
34
+ in the dynamic quantization case.
35
+
36
+ Returns:
37
+ Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
38
+ scaling factor.
39
+ """
40
+ # This code assumes batch_dim and num_tokens are flattened
41
+ assert input.ndim == 2
42
+ shape: Union[Tuple[int, int], torch.Size] = input.shape
43
+ # For rocm, the output fp8 dtype is torch.float_e3m3fnuz
44
+ out_dtype: torch.dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
45
+ if num_token_padding:
46
+ shape = (max(num_token_padding, input.shape[0]), shape[1])
47
+ output = torch.empty(shape, device=input.device, dtype=out_dtype)
48
+
49
+ if scale is None:
50
+ if use_per_token_if_dynamic:
51
+ scale = torch.empty((shape[0], 1), device=input.device, dtype=torch.float32)
52
+ torch.ops._C.dynamic_per_token_scaled_fp8_quant(
53
+ output, input, scale, scale_ub
54
+ )
55
+ else:
56
+ scale = torch.zeros(1, device=input.device, dtype=torch.float32)
57
+ torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
58
+ else:
59
+ # num_token_padding not implemented for this case
60
+ assert scale.numel() == 1 or num_token_padding is None
61
+ torch.ops._C.static_scaled_fp8_quant(output, input, scale)
62
+
63
+ return output, scale
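A short usage sketch for `scaled_fp8_quant`, assuming a CUDA or ROCm device and a 2-D activation tensor (the function asserts `input.ndim == 2`); illustrative only:

```python
import torch

x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

# Dynamic per-tensor quantization: the scale is computed by the kernel.
x_fp8, scale = scaled_fp8_quant(x)

# Static quantization: reuse a precomputed scale.
x_fp8_static, _ = scaled_fp8_quant(x, scale=scale)
```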
ext-torch/fused_marlin_moe.py ADDED
@@ -0,0 +1,338 @@
1
+ """Fused MoE utilities for GPTQ."""
2
+
3
+ import functools
4
+ from typing import Any, Dict, Optional
5
+
6
+ import torch
7
+
8
+ from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
9
+ from .scalar_type import scalar_types
10
+ import moe._custom_ops as ops
11
+
12
+
13
+ def get_scalar_type(num_bits: int, has_zp: bool):
14
+ if has_zp:
15
+ assert num_bits == 4
16
+ return scalar_types.uint4
17
+ else:
18
+ return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
19
+
20
+
21
+ def single_marlin_moe(
22
+ hidden_states: torch.Tensor,
23
+ w: torch.Tensor,
24
+ scales: torch.Tensor,
25
+ gating_output: torch.Tensor,
26
+ topk: int,
27
+ renormalize: bool,
28
+ g_idx: Optional[torch.Tensor] = None,
29
+ sort_indices: Optional[torch.Tensor] = None,
30
+ w_zeros: Optional[torch.Tensor] = None,
31
+ override_config: Optional[Dict[str, Any]] = None,
32
+ num_bits: int = 8,
33
+ is_k_full: bool = True,
34
+ ) -> torch.Tensor:
35
+ """
36
+ This function computes the multiplication of hidden_states with expert
37
+ weights used in Marlin MoE, using weights w and top-k gating mechanism.
38
+ Its purpose is testing and debugging the fused MoE kernel.
39
+
40
+ Parameters:
41
+ - hidden_states (torch.Tensor): The input tensor to the Marlin Mul.
42
+ - w (torch.Tensor): The set of expert weights.
43
+ - scales (torch.Tensor): The quantization scales.
44
+ - gating_output (torch.Tensor): The output of the gating operation
45
+ (before softmax).
46
+ - g_idx (Optional[torch.Tensor]): Optional act_order indices.
47
+ - sort_indices (Optional[torch.Tensor]): Optional act_order input
48
+ permutation.
49
+ - topk (int): The number of top-k experts to select.
50
+ - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
51
+ - w_zeros (Optional[torch.Tensor]): Optional zero points to be used for w.
52
+ - override_config (Optional[Dict[str, Any]]): Optional override
53
+ for the kernel configuration.
54
+ - num_bits (int): The number of bits used for expert weight quantization.
55
+
56
+ Returns:
57
+ - torch.Tensor: The output tensor after applying the MoE layer.
58
+ """
59
+ # Check constraints.
60
+ assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
61
+ assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch"
62
+ assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
63
+ assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
64
+ assert w.is_contiguous(), "Expert weights must be contiguous"
65
+ assert hidden_states.dtype == torch.float16
66
+ assert num_bits in [4, 8]
67
+
68
+ M, K = hidden_states.shape
69
+ E = w.shape[0]
70
+ N = w.shape[2] // (num_bits // 2)
71
+
72
+ topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize)
73
+
74
+ # This might not be an optimal config for a single MMM
75
+ get_config_func = functools.partial(
76
+ try_get_optimal_moe_config,
77
+ w.shape,
78
+ w.shape,
79
+ topk_ids.shape[1],
80
+ None,
81
+ override_config=override_config,
82
+ is_marlin=True,
83
+ )
84
+ config = get_config_func(M)
85
+
86
+ block_size_m = config["BLOCK_SIZE_M"]
87
+
88
+ sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
89
+
90
+ max_workspace_size = (N // 64) * 16
91
+ workspace = torch.zeros(
92
+ max_workspace_size,
93
+ dtype=torch.int,
94
+ device=hidden_states.device,
95
+ requires_grad=False,
96
+ )
97
+
98
+ has_zero_point = w_zeros is not None
99
+ if w_zeros is None:
100
+ w_zeros = torch.empty(
101
+ (0, 0),
102
+ dtype=hidden_states.dtype,
103
+ device=hidden_states.device,
104
+ requires_grad=False,
105
+ )
106
+
107
+ if g_idx is None:
108
+ g_idx = torch.empty(
109
+ (0, 0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
110
+ )
111
+
112
+ if sort_indices is None:
113
+ sort_indices = torch.empty(
114
+ (0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
115
+ )
116
+
117
+ scalar_type = get_scalar_type(num_bits, has_zero_point)
118
+
119
+ intermediate_cache = ops.ops.marlin_gemm_moe(
120
+ hidden_states,
121
+ w,
122
+ sorted_token_ids,
123
+ topk_weights,
124
+ topk_ids,
125
+ scales,
126
+ w_zeros,
127
+ g_idx,
128
+ sort_indices,
129
+ workspace,
130
+ scalar_type.id,
131
+ M,
132
+ N,
133
+ K,
134
+ is_k_full,
135
+ E,
136
+ topk,
137
+ block_size_m,
138
+ True,
139
+ False,
140
+ )
141
+
142
+ return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
143
+
144
+
145
+ def fused_marlin_moe(
146
+ hidden_states: torch.Tensor,
147
+ w1: torch.Tensor,
148
+ w2: torch.Tensor,
149
+ w1_scale: torch.Tensor,
150
+ w2_scale: torch.Tensor,
151
+ gating_output: torch.Tensor,
152
+ topk_weights: torch.Tensor,
153
+ topk_ids: torch.Tensor,
154
+ g_idx1: Optional[torch.Tensor] = None,
155
+ g_idx2: Optional[torch.Tensor] = None,
156
+ sort_indices1: Optional[torch.Tensor] = None,
157
+ sort_indices2: Optional[torch.Tensor] = None,
158
+ w1_zeros: Optional[torch.Tensor] = None,
159
+ w2_zeros: Optional[torch.Tensor] = None,
160
+ override_config: Optional[Dict[str, Any]] = None,
161
+ num_bits: int = 8,
162
+ is_k_full: bool = True,
163
+ ) -> torch.Tensor:
164
+ """
165
+ This function computes a Mixture of Experts (MoE) layer using two sets of
166
+ weights, w1 and w2, and top-k gating mechanism.
167
+
168
+ Parameters:
169
+ - hidden_states (torch.Tensor): The input tensor to the MoE layer.
170
+ - w1 (torch.Tensor): The first set of expert weights.
171
+ - w2 (torch.Tensor): The second set of expert weights.
172
+ - w1_scale (torch.Tensor): Scale to be used for w1.
173
+ - w2_scale (torch.Tensor): Scale to be used for w2.
174
+ - gating_output (torch.Tensor): The output of the gating operation
175
+ (before softmax).
176
+ - g_idx1 (Optional[torch.Tensor]): The first set of act_order indices.
177
+ - g_idx2 (Optional[torch.Tensor]): The second set of act_order indices.
178
+ - sort_indices1 (Optional[torch.Tensor]): The first act_order input
179
+ permutation.
180
+ - sort_indices2 (Optional[torch.Tensor]): The second act_order input
181
+ permutation.
182
+ - topk_weights (torch.Tensor): Top-k weights.
183
+ - topk_ids (torch.Tensor): Indices of the top-k elements.
184
+ - override_config (Optional[Dict[str, Any]]): Optional override
185
+ for the kernel configuration.
186
+ - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
187
+ - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
188
+ - num_bits (int): The number of bits used for expert weight quantization.
189
+
190
+ Returns:
191
+ - torch.Tensor: The output tensor after applying the MoE layer.
192
+ """
193
+ # Check constraints.
194
+ assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
195
+ assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
196
+ assert hidden_states.shape[1] == w2.shape[2] // (
197
+ num_bits // 2
198
+ ), "Hidden size mismatch w2"
199
+ assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
200
+ assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
201
+ assert w1.is_contiguous(), "Expert weights1 must be contiguous"
202
+ assert w2.is_contiguous(), "Expert weights2 must be contiguous"
203
+ assert hidden_states.dtype == torch.float16
204
+ assert num_bits in [4, 8]
205
+
206
+ has_no_act_order = (
207
+ g_idx1 is None
208
+ and g_idx2 is None
209
+ and sort_indices1 is None
210
+ and sort_indices2 is None
211
+ )
212
+ has_all_act_order = (
213
+ g_idx1 is not None
214
+ and g_idx2 is not None
215
+ and sort_indices1 is not None
216
+ and sort_indices2 is not None
217
+ )
218
+ assert has_no_act_order or has_all_act_order, (
219
+ "g_idx and sorted_indices " "must be all not None or must be all None"
220
+ )
221
+
222
+ has_no_zp = w1_zeros is None and w2_zeros is None
223
+ has_all_zp = w1_zeros is not None and w2_zeros is not None
224
+ assert has_no_zp or has_all_zp, (
225
+ "zero points must be both not None or " "must be both None"
226
+ )
227
+
228
+ M, K = hidden_states.shape
229
+ E = w1.shape[0]
230
+ N = w2.shape[1] * 16
231
+ topk = topk_ids.shape[1]
232
+
233
+ get_config_func = functools.partial(
234
+ try_get_optimal_moe_config,
235
+ w1.shape,
236
+ w2.shape,
237
+ topk_ids.shape[1],
238
+ None,
239
+ override_config=override_config,
240
+ is_marlin=True,
241
+ )
242
+ config = get_config_func(M)
243
+
244
+ block_size_m = config["BLOCK_SIZE_M"]
245
+
246
+ sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
247
+
248
+ max_workspace_size = (max(2 * N, K) // 64) * 16
249
+ workspace = torch.zeros(
250
+ max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False
251
+ )
252
+
253
+ if has_no_zp:
254
+ w1_zeros = torch.empty(
255
+ (0, 0),
256
+ dtype=hidden_states.dtype,
257
+ device=hidden_states.device,
258
+ requires_grad=False,
259
+ )
260
+ w2_zeros = torch.empty(
261
+ (0, 0),
262
+ dtype=hidden_states.dtype,
263
+ device=hidden_states.device,
264
+ requires_grad=False,
265
+ )
266
+
267
+ if has_no_act_order:
268
+ g_idx1 = torch.empty(
269
+ (0, 0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
270
+ )
271
+ g_idx2 = torch.empty(
272
+ (0, 0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
273
+ )
274
+ sort_indices1 = torch.empty(
275
+ (0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
276
+ )
277
+ sort_indices2 = torch.empty(
278
+ (0, 0), dtype=torch.int32, device=hidden_states.device, requires_grad=False
279
+ )
280
+
281
+ scalar_type1 = get_scalar_type(num_bits, has_all_zp)
282
+ scalar_type2 = get_scalar_type(num_bits, has_all_zp)
283
+
284
+ intermediate_cache2 = torch.empty(
285
+ (M * topk_ids.shape[1], N),
286
+ device=hidden_states.device,
287
+ dtype=hidden_states.dtype,
288
+ )
289
+
290
+ intermediate_cache1 = ops.ops.marlin_gemm_moe(
291
+ hidden_states,
292
+ w1,
293
+ sorted_token_ids,
294
+ topk_weights,
295
+ topk_ids,
296
+ w1_scale,
297
+ w1_zeros,
298
+ g_idx1,
299
+ sort_indices1,
300
+ workspace,
301
+ scalar_type1.id,
302
+ M,
303
+ 2 * N,
304
+ K,
305
+ is_k_full,
306
+ E,
307
+ topk,
308
+ block_size_m,
309
+ True,
310
+ False,
311
+ )
312
+
313
+ ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
314
+
315
+ intermediate_cache3 = ops.ops.marlin_gemm_moe(
316
+ intermediate_cache2,
317
+ w2,
318
+ sorted_token_ids,
319
+ topk_weights,
320
+ topk_ids,
321
+ w2_scale,
322
+ w2_zeros,
323
+ g_idx2,
324
+ sort_indices2,
325
+ workspace,
326
+ scalar_type2.id,
327
+ M,
328
+ K,
329
+ N,
330
+ is_k_full,
331
+ E,
332
+ topk,
333
+ block_size_m,
334
+ False,
335
+ True,
336
+ )
337
+
338
+ return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
ext-torch/fused_moe.py ADDED
@@ -0,0 +1,703 @@
1
+ """Fused MoE kernel."""
2
+
3
+ import functools
4
+ import json
5
+ import os
6
+ from typing import Any, Callable, Dict, Optional, Tuple
7
+
8
+ import torch
9
+ import triton
10
+ import triton.language as tl
11
+
12
+ from .platforms import current_platform
13
+ from .fp8 import scaled_fp8_quant
14
+ import moe._custom_ops as ops
15
+
16
+ VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
17
+
18
+
19
+ @triton.jit
20
+ def fused_moe_kernel(
21
+ # Pointers to matrices
22
+ a_ptr,
23
+ b_ptr,
24
+ c_ptr,
25
+ a_scale_ptr,
26
+ b_scale_ptr,
27
+ topk_weights_ptr,
28
+ sorted_token_ids_ptr,
29
+ expert_ids_ptr,
30
+ num_tokens_post_padded_ptr,
31
+ # Matrix dimensions
32
+ N,
33
+ K,
34
+ EM,
35
+ num_valid_tokens,
36
+ # The stride variables represent how much to increase the ptr by when
37
+ # moving by 1 element in a particular dimension. E.g. `stride_am` is
38
+ # how much to increase `a_ptr` by to get the element one row down
39
+ # (A has M rows).
40
+ stride_am,
41
+ stride_ak,
42
+ stride_be,
43
+ stride_bk,
44
+ stride_bn,
45
+ stride_cm,
46
+ stride_cn,
47
+ stride_bse,
48
+ stride_bsn,
49
+ # Meta-parameters
50
+ BLOCK_SIZE_M: tl.constexpr,
51
+ BLOCK_SIZE_N: tl.constexpr,
52
+ BLOCK_SIZE_K: tl.constexpr,
53
+ GROUP_SIZE_M: tl.constexpr,
54
+ MUL_ROUTED_WEIGHT: tl.constexpr,
55
+ top_k: tl.constexpr,
56
+ compute_type: tl.constexpr,
57
+ use_fp8_w8a8: tl.constexpr,
58
+ use_int8_w8a16: tl.constexpr,
59
+ ):
60
+ """
61
+ Implements the fused computation for a Mixture of Experts (MOE) using
62
+ token and expert matrices.
63
+
64
+ Key Parameters:
65
+ - A: The input tensor representing tokens with shape (*, K), where '*' can
66
+ be any shape representing batches and K is the feature dimension of
67
+ each token.
68
+ - B: The stacked MOE weight tensor with shape (E, N, K), where E is
69
+ the number of experts, K is the input feature dimension, and N is
70
+ the output feature dimension.
71
+ - C: The output cache tensor with shape (M, topk, N), where M is the
72
+ total number of tokens post padding, topk is the number of times
73
+ each token is repeated, and N is the output feature dimension.
74
+ - sorted_token_ids: A tensor containing the sorted indices of tokens,
75
+ repeated topk times and arranged by the expert index they are
76
+ assigned to.
77
+ - expert_ids: A tensor containing the indices of the expert for each
78
+ block. It determines which expert matrix from B should be used for
79
+ each block in A.
80
+ This kernel performs the multiplication of a token by its corresponding
81
+ expert matrix as determined by `expert_ids`. The sorting of
82
+ `sorted_token_ids` by expert index and padding ensures divisibility by
83
+ BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
84
+ multiplication across different blocks processed by the same expert.
85
+ """
86
+ # -----------------------------------------------------------
87
+ # Map program ids `pid` to the block of C it should compute.
88
+ # This is done in a grouped ordering to promote L2 data reuse.
89
+ pid = tl.program_id(axis=0)
90
+ num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
91
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
92
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
93
+ group_id = pid // num_pid_in_group
94
+ first_pid_m = group_id * GROUP_SIZE_M
95
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
96
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
97
+ pid_n = (pid % num_pid_in_group) // group_size_m
98
+
99
+ # ----------------------------------------------------------
100
+ # Create pointers for the first blocks of A and B.
101
+ # We will advance this pointer as we move in the K direction
102
+ # and accumulate
103
+ # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
104
+ # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
105
+ num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
106
+ if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
107
+ return
108
+ offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
109
+ offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
110
+ token_mask = offs_token < num_valid_tokens
111
+
112
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
113
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
114
+ a_ptrs = a_ptr + (
115
+ offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
116
+ )
117
+
118
+ off_experts = tl.load(expert_ids_ptr + pid_m)
119
+ b_ptrs = (
120
+ b_ptr
121
+ + off_experts * stride_be
122
+ + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
123
+ )
124
+ if use_int8_w8a16:
125
+ b_scale_ptrs = (
126
+ b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
127
+ )
128
+ b_scale = tl.load(b_scale_ptrs)
129
+
130
+ if use_fp8_w8a8:
131
+ a_scale = tl.load(a_scale_ptr)
132
+ b_scale = tl.load(b_scale_ptr + off_experts)
133
+
134
+ # -----------------------------------------------------------
135
+ # Iterate to compute a block of the C matrix.
136
+ # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
137
+ # of fp32 values for higher accuracy.
138
+ # `accumulator` will be converted back to fp16 after the loop.
139
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
140
+
141
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
142
+ # Load the next block of A and B, generate a mask by checking the
143
+ # K dimension.
144
+ a = tl.load(
145
+ a_ptrs,
146
+ mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
147
+ other=0.0,
148
+ )
149
+ b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
150
+ # We accumulate along the K dimension.
151
+ if use_int8_w8a16:
152
+ accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
153
+ elif use_fp8_w8a8:
154
+ accumulator = tl.dot(a, b, acc=accumulator)
155
+ else:
156
+ accumulator += tl.dot(a, b)
157
+ # Advance the ptrs to the next K block.
158
+ a_ptrs += BLOCK_SIZE_K * stride_ak
159
+ b_ptrs += BLOCK_SIZE_K * stride_bk
160
+
161
+ if MUL_ROUTED_WEIGHT:
162
+ moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
163
+ accumulator = accumulator * moe_weight[:, None]
164
+ if use_int8_w8a16:
165
+ accumulator = (accumulator * b_scale).to(compute_type)
166
+ elif use_fp8_w8a8:
167
+ accumulator = (accumulator * a_scale * b_scale).to(compute_type)
168
+ else:
169
+ accumulator = accumulator.to(compute_type)
170
+ # -----------------------------------------------------------
171
+ # Write back the block of the output
172
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
173
+ c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
174
+ c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
175
+ tl.store(c_ptrs, accumulator, mask=c_mask)
176
+
177
+
178
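The grouped program-id mapping in the kernel above is easiest to see in isolation. Below is a minimal, plain-Python sketch of that index arithmetic with made-up block sizes; it only illustrates the traversal order that promotes L2 reuse and is not part of the kernel itself.

```python
import math

def grouped_launch_order(EM, N, BLOCK_SIZE_M=4, BLOCK_SIZE_N=4, GROUP_SIZE_M=2):
    # Reproduces the pid -> (pid_m, pid_n) mapping used by fused_moe_kernel.
    num_pid_m = math.ceil(EM / BLOCK_SIZE_M)
    num_pid_n = math.ceil(N / BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    order = []
    for pid in range(num_pid_m * num_pid_n):
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
        pid_n = (pid % num_pid_in_group) // group_size_m
        order.append((pid_m, pid_n))
    return order

# Consecutive pids cycle through a small window of pid_m values before moving
# to the next column block, so the same A rows stay hot in L2.
print(grouped_launch_order(EM=16, N=16))
```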
+ def moe_align_block_size(
179
+ topk_ids: torch.Tensor, block_size: int, num_experts: int
180
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
181
+ """
182
+ Aligns the token distribution across experts to be compatible with block
183
+ size for matrix multiplication.
184
+
185
+ Parameters:
186
+ - topk_ids: A tensor of shape [total_tokens, top_k] representing the
187
+ top-k expert indices for each token.
188
+ - block_size: The block size used in block matrix multiplication.
189
+ - num_experts: The total number of experts.
190
+
191
+ Returns:
192
+ - sorted_token_ids: A tensor containing the sorted token indices according
193
+ to their allocated expert.
194
+ - expert_ids: A tensor indicating the assigned expert index for each block.
195
+ - num_tokens_post_padded: The total number of tokens after padding,
196
+ ensuring divisibility by block_size.
197
+
198
+ This function pads the number of tokens that each expert needs to process
199
+ so that it is divisible by block_size.
200
+ Padding ensures that during block matrix multiplication, the dimensions
201
+ align correctly.
202
+
203
+ Example:
204
+ Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
205
+ block_size = 4, and num_experts = 4:
206
+ - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
207
+ with each expert needing to process 3 tokens.
208
+ - As block_size is 4, we pad 1 token for each expert.
209
+ - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
210
+ - Then append one padding token (id 12) for each expert: [12, 12, 12, 12].
211
+ - After sorting by expert index, we obtain token_ids
212
+ [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
213
+ Tokens 12 are non-existent (padding) and are ignored in
214
+ the subsequent matrix multiplication.
215
+ - The padding ensures that the total number of tokens is now divisible
216
+ by block_size for proper block matrix operations.
217
+ """
218
+ max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
219
+ sorted_ids = torch.empty(
220
+ (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
221
+ )
222
+ sorted_ids.fill_(topk_ids.numel())
223
+ max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
224
+ expert_ids = torch.empty(
225
+ (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
226
+ )
227
+ num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
228
+ ops.moe_align_block_size(
229
+ topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
230
+ )
231
+ return sorted_ids, expert_ids, num_tokens_post_pad
232
+
233
+
234
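For intuition, here is a rough pure-PyTorch reference of the padding and sorting that `moe_align_block_size` delegates to the `ops.moe_align_block_size` kernel. It is a sketch for small inputs only (the per-expert loop is not how the kernel works), and the tiny example at the end is invented.

```python
import torch

def moe_align_block_size_ref(topk_ids: torch.Tensor, block_size: int, num_experts: int):
    numel = topk_ids.numel()
    flat = topk_ids.flatten()
    counts = torch.bincount(flat, minlength=num_experts)
    # Pad each expert's token count up to a multiple of block_size.
    padded_counts = ((counts + block_size - 1) // block_size) * block_size
    num_tokens_post_pad = int(padded_counts.sum())

    # Padding slots get the out-of-range id `numel`, mirroring sorted_ids.fill_().
    sorted_ids = torch.full((num_tokens_post_pad,), numel, dtype=torch.int32)
    expert_ids = torch.empty(num_tokens_post_pad // block_size, dtype=torch.int32)

    offset = 0
    for e in range(num_experts):
        idx = (flat == e).nonzero(as_tuple=True)[0]
        sorted_ids[offset : offset + idx.numel()] = idx.to(torch.int32)
        n_blocks = int(padded_counts[e]) // block_size
        expert_ids[offset // block_size : offset // block_size + n_blocks] = e
        offset += int(padded_counts[e])
    return sorted_ids, expert_ids, torch.tensor([num_tokens_post_pad], dtype=torch.int32)

topk_ids = torch.tensor([[0, 1], [1, 2], [2, 0]])
print(moe_align_block_size_ref(topk_ids, block_size=4, num_experts=3))
```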
+ def invoke_fused_moe_kernel(
235
+ A: torch.Tensor,
236
+ B: torch.Tensor,
237
+ C: torch.Tensor,
238
+ A_scale: Optional[torch.Tensor],
239
+ B_scale: Optional[torch.Tensor],
240
+ topk_weights: torch.Tensor,
241
+ topk_ids: torch.Tensor,
242
+ sorted_token_ids: torch.Tensor,
243
+ expert_ids: torch.Tensor,
244
+ num_tokens_post_padded: torch.Tensor,
245
+ mul_routed_weight: bool,
246
+ top_k: int,
247
+ config: Dict[str, Any],
248
+ compute_type: tl.dtype,
249
+ use_fp8_w8a8: bool,
250
+ use_int8_w8a16: bool,
251
+ ) -> None:
252
+ assert topk_weights.stride(1) == 1
253
+ assert sorted_token_ids.stride(0) == 1
254
+
255
+ if use_fp8_w8a8:
256
+ A, A_scale = scaled_fp8_quant(A, A_scale)
257
+ assert B_scale is not None
258
+ elif use_int8_w8a16:
259
+ assert B_scale is not None
260
+ else:
261
+ assert A_scale is None
262
+ assert B_scale is None
263
+
264
+ grid = lambda META: (
265
+ triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])
266
+ * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
267
+ )
268
+
269
+ fused_moe_kernel[grid](
270
+ A,
271
+ B,
272
+ C,
273
+ A_scale,
274
+ B_scale,
275
+ topk_weights,
276
+ sorted_token_ids,
277
+ expert_ids,
278
+ num_tokens_post_padded,
279
+ B.shape[1],
280
+ B.shape[2],
281
+ sorted_token_ids.shape[0],
282
+ topk_ids.numel(),
283
+ A.stride(0),
284
+ A.stride(1),
285
+ B.stride(0),
286
+ B.stride(2),
287
+ B.stride(1),
288
+ C.stride(1),
289
+ C.stride(2),
290
+ B_scale.stride(0) if B_scale is not None and use_int8_w8a16 else 0,
291
+ B_scale.stride(1) if B_scale is not None and use_int8_w8a16 else 0,
292
+ MUL_ROUTED_WEIGHT=mul_routed_weight,
293
+ top_k=top_k,
294
+ compute_type=compute_type,
295
+ use_fp8_w8a8=use_fp8_w8a8,
296
+ use_int8_w8a16=use_int8_w8a16,
297
+ **config,
298
+ )
299
+
300
+
301
+ def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
302
+ device_name = current_platform.get_device_name().replace(" ", "_")
303
+ dtype_selector = "" if not dtype else f",dtype={dtype}"
304
+ return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
305
+
306
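As a concrete, purely illustrative example of the naming scheme: the device string below is invented; in practice it comes from `current_platform.get_device_name()`.

```python
E, N, dtype = 8, 14336, "fp8_w8a8"
device_name = "NVIDIA_A100-SXM4-80GB"  # hypothetical; normally queried from the GPU
dtype_selector = "" if not dtype else f",dtype={dtype}"
print(f"E={E},N={N},device_name={device_name}{dtype_selector}.json")
# -> E=8,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=fp8_w8a8.json
```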
+
307
+ @functools.lru_cache
308
+ def get_moe_configs(E: int, N: int, dtype: Optional[str]) -> Optional[Dict[int, Any]]:
309
+ """
310
+ Return optimized configurations for the fused MoE kernel.
311
+
312
+ The return value will be a dictionary that maps an irregular grid of
313
+ batch sizes to configurations of the fused_moe kernel. To evaluate the
314
+ kernel on a given batch size bs, the closest batch size in the grid should
315
+ be picked and the associated configuration chosen to invoke the kernel.
316
+ """
317
+
318
+ # First look up if an optimized configuration is available in the configs
319
+ # directory
320
+ json_file_name = get_config_file_name(E, N, dtype)
321
+
322
+ config_file_path = os.path.join(
323
+ os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
324
+ )
325
+ if os.path.exists(config_file_path):
326
+ with open(config_file_path) as f:
327
+ # If a configuration has been found, return it
328
+ return {int(key): val for key, val in json.load(f).items()}
329
+
330
+ # If no optimized configuration is available, we will use the default
331
+ # configuration
332
+ return None
333
+
334
+
335
+ def get_default_config(
336
+ M: int,
337
+ E: int,
338
+ N: int,
339
+ K: int,
340
+ topk: int,
341
+ dtype: Optional[str],
342
+ is_marlin: bool,
343
+ ) -> Dict[str, int]:
344
+ config = {
345
+ "BLOCK_SIZE_M": 64,
346
+ "BLOCK_SIZE_N": 64,
347
+ "BLOCK_SIZE_K": 32,
348
+ "GROUP_SIZE_M": 8,
349
+ }
350
+ # A heuristic: fused marlin works faster with this config for small M
351
+ if M <= E or (is_marlin and M <= 32):
352
+ config = {
353
+ "BLOCK_SIZE_M": 16,
354
+ "BLOCK_SIZE_N": 32,
355
+ "BLOCK_SIZE_K": 64,
356
+ "GROUP_SIZE_M": 1,
357
+ }
358
+ return config
359
+
360
+
361
+ def try_get_optimal_moe_config(
362
+ w1_shape: Tuple[int, ...],
363
+ w2_shape: Tuple[int, ...],
364
+ top_k: int,
365
+ dtype: Optional[str],
366
+ M: int,
367
+ override_config: Optional[Dict[str, Any]] = None,
368
+ is_marlin: bool = False,
369
+ ):
370
+ if override_config:
371
+ config = override_config
372
+ else:
373
+ # First try to load optimal config from the file
374
+ E, _, N = w2_shape
375
+ configs = get_moe_configs(E, N, dtype)
376
+
377
+ if configs:
378
+ # If an optimal configuration map has been found, look up the
379
+ # optimal config
380
+ config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
381
+ else:
382
+ # Else use the default config
383
+ config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, is_marlin)
384
+ return config
385
+
386
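The "closest batch size" lookup used above is simple but easy to misread; here is a tiny standalone illustration with made-up keys and abbreviated configs.

```python
# Hypothetical tuned-config map keyed by batch size (values abbreviated).
configs = {1: {"BLOCK_SIZE_M": 16}, 16: {"BLOCK_SIZE_M": 32}, 64: {"BLOCK_SIZE_M": 64}}
M = 24
chosen = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(chosen)  # key 16 is closest to M=24, so its config is used
```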
+
387
+ def fused_topk(
388
+ hidden_states: torch.Tensor,
389
+ gating_output: torch.Tensor,
390
+ topk: int,
391
+ renormalize: bool,
392
+ ):
393
+ assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
394
+
395
+ M, _ = hidden_states.shape
396
+
397
+ topk_weights = torch.empty(
398
+ M, topk, dtype=torch.float32, device=hidden_states.device
399
+ )
400
+ topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
401
+ token_expert_indicies = torch.empty(
402
+ M, topk, dtype=torch.int32, device=hidden_states.device
403
+ )
404
+
405
+ ops.topk_softmax(
406
+ topk_weights,
407
+ topk_ids,
408
+ token_expert_indicies,
409
+ gating_output.float(), # TODO(woosuk): Optimize this.
410
+ )
411
+ del token_expert_indicies # Not used. Will be used in the future.
412
+
413
+ if renormalize:
414
+ topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
415
+
416
+ return topk_weights, topk_ids
417
+
418
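Semantically, `fused_topk` is a softmax over the gating logits followed by a top-k selection; the `ops.topk_softmax` kernel fuses this. A rough PyTorch-only reference (a sketch, not the kernel) is:

```python
import torch

def fused_topk_ref(hidden_states, gating_output, topk, renormalize):
    assert hidden_states.shape[0] == gating_output.shape[0]
    scores = torch.softmax(gating_output.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids.to(torch.int32)

gating = torch.randn(4, 8)                    # 4 tokens, 8 experts
w, ids = fused_topk_ref(torch.randn(4, 16), gating, topk=2, renormalize=True)
print(w.shape, ids.shape)                     # torch.Size([4, 2]) torch.Size([4, 2])
```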
+
419
+ # This is used by the Deepseek-V2 model
420
+ def grouped_topk(
421
+ hidden_states: torch.Tensor,
422
+ gating_output: torch.Tensor,
423
+ topk: int,
424
+ renormalize: bool,
425
+ num_expert_group: int = 0,
426
+ topk_group: int = 0,
427
+ ):
428
+
429
+ assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
430
+
431
+ scores = torch.softmax(gating_output, dim=-1)
432
+ num_token = scores.shape[0]
433
+ group_scores = (
434
+ scores.view(num_token, num_expert_group, -1).max(dim=-1).values
435
+ ) # [n, n_group]
436
+ group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
437
+ 1
438
+ ] # [n, top_k_group]
439
+ group_mask = torch.zeros_like(group_scores) # [n, n_group]
440
+ group_mask.scatter_(1, group_idx, 1) # [n, n_group]
441
+ score_mask = (
442
+ group_mask.unsqueeze(-1)
443
+ .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
444
+ .reshape(num_token, -1)
445
+ ) # [n, e]
446
+ tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
447
+ topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
448
+
449
+ if renormalize:
450
+ topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
451
+
452
+ return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
453
+
454
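A hypothetical call for DeepSeek-V2-style grouped routing, with invented sizes (64 experts in 8 groups, the best 3 groups kept, then the top-6 experts per token):

```python
import torch

hidden = torch.randn(4, 128)   # 4 tokens
gating = torch.randn(4, 64)    # 64 experts
weights, ids = grouped_topk(
    hidden, gating, topk=6, renormalize=True,
    num_expert_group=8, topk_group=3,
)
print(weights.shape, ids.shape)  # torch.Size([4, 6]) torch.Size([4, 6])
```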
+
455
+ def get_config_dtype_str(
456
+ dtype: torch.dtype,
457
+ use_int8_w8a16: Optional[bool] = False,
458
+ use_fp8_w8a8: Optional[bool] = False,
459
+ ):
460
+ if use_fp8_w8a8:
461
+ return "fp8_w8a8"
462
+ elif use_int8_w8a16:
463
+ return "int8_w8a16"
464
+ elif dtype == torch.float:
465
+ # avoid cases where the kernel fails because a float32 MoE
466
+ # would otherwise pick up fp16/bfloat16 tuned configs
467
+ return "float32"
468
+ return None
469
+
470
+
471
+ def fused_experts(
472
+ hidden_states: torch.Tensor,
473
+ w1: torch.Tensor,
474
+ w2: torch.Tensor,
475
+ topk_weights: torch.Tensor,
476
+ topk_ids: torch.Tensor,
477
+ inplace: bool = False,
478
+ override_config: Optional[Dict[str, Any]] = None,
479
+ use_fp8_w8a8: bool = False,
480
+ use_int8_w8a16: bool = False,
481
+ w1_scale: Optional[torch.Tensor] = None,
482
+ w2_scale: Optional[torch.Tensor] = None,
483
+ a1_scale: Optional[torch.Tensor] = None,
484
+ a2_scale: Optional[torch.Tensor] = None,
485
+ ):
486
+ # Check constraints.
487
+ assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
488
+ assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
489
+ assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
490
+ assert w1.is_contiguous(), "Expert weights1 must be contiguous"
491
+ assert w2.is_contiguous(), "Expert weights2 must be contiguous"
492
+ assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
493
+
494
+ num_tokens, _ = hidden_states.shape
495
+ E, N, _ = w1.shape
496
+ # We execute the fused_moe kernel in chunks to circumvent this issue:
497
+ # https://github.com/vllm-project/vllm/issues/5938
498
+ CHUNK_SIZE = VLLM_FUSED_MOE_CHUNK_SIZE
499
+ M = min(num_tokens, CHUNK_SIZE)
500
+ config_dtype = get_config_dtype_str(
501
+ use_fp8_w8a8=use_fp8_w8a8,
502
+ use_int8_w8a16=use_int8_w8a16,
503
+ dtype=hidden_states.dtype,
504
+ )
505
+
506
+ get_config_func = functools.partial(
507
+ try_get_optimal_moe_config,
508
+ w1.shape,
509
+ w2.shape,
510
+ topk_ids.shape[1],
511
+ config_dtype,
512
+ override_config=override_config,
513
+ )
514
+
515
+ config = get_config_func(M)
516
+
517
+ intermediate_cache1 = torch.empty(
518
+ (M, topk_ids.shape[1], N),
519
+ device=hidden_states.device,
520
+ dtype=hidden_states.dtype,
521
+ )
522
+ intermediate_cache2 = torch.empty(
523
+ (M * topk_ids.shape[1], N // 2),
524
+ device=hidden_states.device,
525
+ dtype=hidden_states.dtype,
526
+ )
527
+ intermediate_cache3 = torch.empty(
528
+ (M, topk_ids.shape[1], w2.shape[1]),
529
+ device=hidden_states.device,
530
+ dtype=hidden_states.dtype,
531
+ )
532
+
533
+ compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
534
+
535
+ if inplace:
536
+ out_hidden_states = hidden_states
537
+ else:
538
+ out_hidden_states = torch.empty_like(hidden_states)
539
+
540
+ for chunk in range((num_tokens // CHUNK_SIZE) + 1):
541
+ begin_chunk_idx, end_chunk_idx = (
542
+ chunk * CHUNK_SIZE,
543
+ min((chunk + 1) * CHUNK_SIZE, num_tokens),
544
+ )
545
+ curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
546
+ tokens_in_chunk, _ = curr_hidden_states.shape
547
+
548
+ if tokens_in_chunk == 0:
549
+ break
550
+
551
+ if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
552
+ # Adjust the intermediate cache size and config for the last
553
+ # chunk. Note that in most cases we only have one chunk
554
+ # so the cache size and config are already set correctly and
555
+ # do not need to be adjusted.
556
+ intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
557
+ intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
558
+ intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
559
+ config = get_config_func(tokens_in_chunk)
560
+
561
+ curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
562
+ curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
563
+
564
+ sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
565
+ curr_topk_ids, config["BLOCK_SIZE_M"], E
566
+ )
567
+
568
+ invoke_fused_moe_kernel(
569
+ curr_hidden_states,
570
+ w1,
571
+ intermediate_cache1,
572
+ a1_scale,
573
+ w1_scale,
574
+ curr_topk_weights,
575
+ curr_topk_ids,
576
+ sorted_token_ids,
577
+ expert_ids,
578
+ num_tokens_post_padded,
579
+ False,
580
+ topk_ids.shape[1],
581
+ config,
582
+ compute_type=compute_type,
583
+ use_fp8_w8a8=use_fp8_w8a8,
584
+ use_int8_w8a16=use_int8_w8a16,
585
+ )
586
+
587
+ ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
588
+
589
+ invoke_fused_moe_kernel(
590
+ intermediate_cache2,
591
+ w2,
592
+ intermediate_cache3,
593
+ a2_scale,
594
+ w2_scale,
595
+ curr_topk_weights,
596
+ curr_topk_ids,
597
+ sorted_token_ids,
598
+ expert_ids,
599
+ num_tokens_post_padded,
600
+ True,
601
+ 1,
602
+ config,
603
+ compute_type=compute_type,
604
+ use_fp8_w8a8=use_fp8_w8a8,
605
+ use_int8_w8a16=use_int8_w8a16,
606
+ )
607
+
608
+ ops.moe_sum(
609
+ intermediate_cache3.view(*intermediate_cache3.shape),
610
+ out_hidden_states[begin_chunk_idx:end_chunk_idx],
611
+ )
612
+ return out_hidden_states
613
+
614
+
615
+ def fused_moe(
616
+ hidden_states: torch.Tensor,
617
+ w1: torch.Tensor,
618
+ w2: torch.Tensor,
619
+ gating_output: torch.Tensor,
620
+ topk: int,
621
+ renormalize: bool,
622
+ inplace: bool = False,
623
+ override_config: Optional[Dict[str, Any]] = None,
624
+ use_grouped_topk: bool = False,
625
+ num_expert_group: Optional[int] = None,
626
+ topk_group: Optional[int] = None,
627
+ custom_routing_function: Optional[Callable] = None,
628
+ use_fp8_w8a8: bool = False,
629
+ use_int8_w8a16: bool = False,
630
+ w1_scale: Optional[torch.Tensor] = None,
631
+ w2_scale: Optional[torch.Tensor] = None,
632
+ a1_scale: Optional[torch.Tensor] = None,
633
+ a2_scale: Optional[torch.Tensor] = None,
634
+ ) -> torch.Tensor:
635
+ """
636
+ This function computes a Mixture of Experts (MoE) layer using two sets of
637
+ weights, w1 and w2, and top-k gating mechanism.
638
+
639
+ Parameters:
640
+ - hidden_states (torch.Tensor): The input tensor to the MoE layer.
641
+ - w1 (torch.Tensor): The first set of expert weights.
642
+ - w2 (torch.Tensor): The second set of expert weights.
643
+ - gating_output (torch.Tensor): The output of the gating operation
644
+ (before softmax).
645
+ - topk (int): The number of top-k experts to select.
646
+ - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
647
+ - inplace (bool): If True, perform the operation in-place.
648
+ Defaults to False.
649
+ - override_config (Optional[Dict[str, Any]]): Optional override
650
+ for the kernel configuration.
651
+ - num_expert_group: Optional[int]: additional parameter for grouped_topk
652
+ - topk_group: Optional[int]: additional parameter for grouped_topk
653
+ - use_grouped_topk: If True, use grouped_topk instead of fused_topk
654
+ note: Deepseekv2 model uses grouped_topk
655
+ - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
656
+ products for w1 and w2. Defaults to False.
657
+ - use_int8_w8a16 (bool): If True, use int8 arithmetic to compute the inner
658
+ products for w1 and w2. Defaults to False.
659
+ - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
660
+ w1.
661
+ - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
662
+ w2.
663
+
664
+ Returns:
665
+ - torch.Tensor: The output tensor after applying the MoE layer.
666
+ """
667
+ # Check constraints.
668
+ assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
669
+
670
+ if use_grouped_topk:
671
+ assert num_expert_group is not None and topk_group is not None
672
+ topk_weights, topk_ids = grouped_topk(
673
+ hidden_states,
674
+ gating_output,
675
+ topk,
676
+ renormalize,
677
+ num_expert_group,
678
+ topk_group,
679
+ )
680
+ elif custom_routing_function is None:
681
+ topk_weights, topk_ids = fused_topk(
682
+ hidden_states, gating_output, topk, renormalize
683
+ )
684
+ else:
685
+ topk_weights, topk_ids = custom_routing_function(
686
+ hidden_states, gating_output, topk, renormalize
687
+ )
688
+
689
+ return fused_experts(
690
+ hidden_states,
691
+ w1,
692
+ w2,
693
+ topk_weights,
694
+ topk_ids,
695
+ inplace=inplace,
696
+ override_config=override_config,
697
+ use_fp8_w8a8=use_fp8_w8a8,
698
+ use_int8_w8a16=use_int8_w8a16,
699
+ w1_scale=w1_scale,
700
+ w2_scale=w2_scale,
701
+ a1_scale=a1_scale,
702
+ a2_scale=a2_scale,
703
+ )
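An end-to-end usage sketch with invented shapes (8 experts, hidden size 128, intermediate size 256, top-2 routing; `w1` stacks the gate/up projections, hence the `2 *` factor). It assumes a CUDA device and is only meant to show how the pieces fit together.

```python
import torch

num_tokens, hidden_size, intermediate, E, topk = 16, 128, 256, 8, 2
x = torch.randn(num_tokens, hidden_size, dtype=torch.float16, device="cuda")
w1 = torch.randn(E, 2 * intermediate, hidden_size, dtype=torch.float16, device="cuda")
w2 = torch.randn(E, hidden_size, intermediate, dtype=torch.float16, device="cuda")
gating = torch.randn(num_tokens, E, dtype=torch.float16, device="cuda")

out = fused_moe(x, w1, w2, gating, topk=topk, renormalize=True)
print(out.shape)  # torch.Size([16, 128])
```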
ext-torch/platforms.py ADDED
@@ -0,0 +1,22 @@
1
+ from typing import Callable, ParamSpec, TypeVar
2
+ import os
3
+ from functools import lru_cache, wraps
4
+
5
+ import torch
6
+
7
+ IS_ROCM = torch.version.hip is not None
8
+
9
+ class CudaPlatform:
10
+ @classmethod
11
+ @lru_cache(maxsize=8)
12
+ def get_device_name(cls, device_id: int = 0) -> str:
13
+ return torch.cuda.get_device_name(device_id)
14
+
15
+ class RocmPlatform:
16
+ @classmethod
17
+ @lru_cache(maxsize=8)
18
+ def get_device_name(cls, device_id: int = 0) -> str:
19
+ return torch.cuda.get_device_name(device_id)
20
+
21
+
22
+ current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
ext-torch/registration.h ADDED
@@ -0,0 +1,27 @@
1
+ #pragma once
2
+
3
+ #include <Python.h>
4
+
5
+ #define _CONCAT(A, B) A##B
6
+ #define CONCAT(A, B) _CONCAT(A, B)
7
+
8
+ #define _STRINGIFY(A) #A
9
+ #define STRINGIFY(A) _STRINGIFY(A)
10
+
11
+ // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
12
+ // could be a macro instead of a literal token.
13
+ #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
14
+
15
+ // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
16
+ // could be a macro instead of a literal token.
17
+ #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
18
+ TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
19
+
20
+ // REGISTER_EXTENSION allows the shared library to be loaded and initialized
21
+ // via python's import statement.
22
+ #define REGISTER_EXTENSION(NAME) \
23
+ PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \
24
+ static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \
25
+ STRINGIFY(NAME), nullptr, 0, nullptr}; \
26
+ return PyModule_Create(&module); \
27
+ }
ext-torch/scalar_type.py ADDED
@@ -0,0 +1,330 @@
1
+ import functools
2
+ import struct
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+ from typing import Optional, Union
6
+
7
+
8
+ # Mirrors enum in `core/scalar_type.hpp`
9
+ class NanRepr(Enum):
10
+ NONE = 0 # nans are not supported
11
+ IEEE_754 = 1 # nans are: Exp all 1s, mantissa not all 0s
12
+ EXTD_RANGE_MAX_MIN = 2 # nans are: Exp all 1s, mantissa all 1s
13
+
14
+
15
+ # This ScalarType class is a parallel implementation of the C++ ScalarType
16
+ # class found in csrc/core/scalar_type.hpp. These two classes should be kept
17
+ # in sync until the inductor fully supports custom C++ classes.
18
+ @dataclass(frozen=True)
19
+ class ScalarType:
20
+ """
21
+ ScalarType can represent a wide range of floating point and integer
22
+ types, in particular it can be used to represent sub-byte data types
23
+ (something that torch.dtype currently does not support). It is also
24
+ capable of representing types with a bias, i.e.:
25
+ `stored_value = value + bias`,
26
+ this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias
27
+ of 8). The implementation for this class can be found in
28
+ csrc/core/scalar_type.hpp, these type signatures should be kept in sync
29
+ with that file.
30
+ """
31
+
32
+ exponent: int
33
+ """
34
+ Number of bits in the exponent if this is a floating point type
35
+ (zero if this is an integer type)
36
+ """
37
+
38
+ mantissa: int
39
+ """
40
+ Number of bits in the mantissa if this is a floating point type,
41
+ or the number of bits representing an integer excluding the sign bit if
42
+ this is an integer type.
43
+ """
44
+
45
+ signed: bool
46
+ "If the type is signed (i.e. has a sign bit)"
47
+
48
+ bias: int
49
+ """
50
+ bias used to encode the values in this scalar type
51
+ (value = stored_value - bias, default 0) for example if we store the
52
+ type as an unsigned integer with a bias of 128 then the value 0 will be
53
+ stored as 128 and -1 will be stored as 127 and 1 will be stored as 129.
54
+ """
55
+
56
+ _finite_values_only: bool = False
57
+ """
58
+ Private: to check whether infs are supported, use `has_infs()` instead.
59
+ """
60
+
61
+ nan_repr: NanRepr = NanRepr.IEEE_754
62
+ """
63
+ How NaNs are represented in this scalar type; returns a NanRepr value.
64
+ (not applicable for integer types)
65
+ """
66
+
67
+ def _floating_point_max_int(self) -> int:
68
+ assert (
69
+ self.mantissa <= 52 and self.exponent <= 11
70
+ ), f"Cannot represent max/min as a double for type {self.__str__()}"
71
+
72
+ max_mantissa = (1 << self.mantissa) - 1
73
+ if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN:
74
+ max_mantissa = max_mantissa - 1
75
+
76
+ max_exponent = (1 << self.exponent) - 2
77
+ if (self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN
78
+ or self.nan_repr == NanRepr.NONE):
79
+ assert (
80
+ self.exponent < 11
81
+ ), f"Cannot represent max/min as a double for type {self.__str__()}"
82
+ max_exponent = max_exponent + 1
83
+
84
+ # adjust the exponent to match that of a double
85
+ # for now we assume the exponent bias is the standard 2^(e-1) -1, (where
86
+ # e is the exponent bits), there is some precedent for non-standard
87
+ # biases, example `float8_e4m3b11fnuz` here:
88
+ # https://github.com/jax-ml/ml_dtypes but to avoid premature over
89
+ # complication we are just assuming the standard exponent bias until
90
+ # there is a need to support non-standard biases
91
+ exponent_bias = (1 << (self.exponent - 1)) - 1
92
+ exponent_bias_double = (1 << 10) - 1 # double e = 11
93
+
94
+ max_exponent_double = (max_exponent - exponent_bias +
95
+ exponent_bias_double)
96
+
97
+ # shift the mantissa and exponent into the proper positions for an
98
+ # IEEE double and bitwise-or them together.
99
+ return (max_mantissa <<
100
+ (52 - self.mantissa)) | (max_exponent_double << 52)
101
+
102
+ def _floating_point_max(self) -> float:
103
+ double_raw = self._floating_point_max_int()
104
+ return struct.unpack('!d', struct.pack('!Q', double_raw))[0]
105
+
106
+ def _raw_max(self) -> Union[int, float]:
107
+ if self.is_floating_point():
108
+ return self._floating_point_max()
109
+ else:
110
+ assert (self.size_bits < 64 or self.size_bits == 64
111
+ and self.is_signed()), "Cannot represent max as an int"
112
+ return (1 << self.mantissa) - 1
113
+
114
+ def _raw_min(self) -> Union[int, float]:
115
+ if self.is_floating_point():
116
+ assert self.is_signed(
117
+ ), "We currently assume all floating point types are signed"
118
+ sign_bit_double = 1 << 63
119
+
120
+ max_raw = self._floating_point_max_int()
121
+ min_raw = max_raw | sign_bit_double
122
+ return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
123
+ else:
124
+ assert (not self.is_signed() or
125
+ self.size_bits <= 64), "Cannot represent min as an int64_t"
126
+
127
+ if self.is_signed():
128
+ return -(1 << (self.size_bits - 1))
129
+ else:
130
+ return 0
131
+
132
+ @functools.cached_property
133
+ def id(self) -> int:
134
+ """
135
+ Convert the ScalarType to an int which can be passed to pytorch custom
136
+ ops. This layout of the int must be kept in sync with the C++
137
+ ScalarType's from_id method.
138
+ """
139
+ val = 0
140
+ offset = 0
141
+
142
+ def or_and_advance(member, bit_width):
143
+ nonlocal val
144
+ nonlocal offset
145
+ bit_mask = (1 << bit_width) - 1
146
+ val = val | (int(member) & bit_mask) << offset
147
+ offset = offset + bit_width
148
+
149
+ or_and_advance(self.exponent, 8)
150
+ or_and_advance(self.mantissa, 8)
151
+ or_and_advance(self.signed, 1)
152
+ or_and_advance(self.bias, 32)
153
+ or_and_advance(self._finite_values_only, 1)
154
+ or_and_advance(self.nan_repr.value, 8)
155
+
156
+ assert offset <= 64, \
157
+ f"ScalarType fields too big {offset} to fit into an int64"
158
+
159
+ return val
160
+
161
+ @property
162
+ def size_bits(self) -> int:
163
+ return self.exponent + self.mantissa + int(self.signed)
164
+
165
+ def min(self) -> Union[int, float]:
166
+ """
167
+ Min representable value for this scalar type.
168
+ (accounting for bias if there is one)
169
+ """
170
+ return self._raw_min() - self.bias
171
+
172
+ def max(self) -> Union[int, float]:
173
+ """
174
+ Max representable value for this scalar type.
175
+ (accounting for bias if there is one)
176
+ """
177
+ return self._raw_max() - self.bias
178
+
179
+ def is_signed(self) -> bool:
180
+ """
181
+ If the type is signed (i.e. has a sign bit), same as `signed`
182
+ added for consistency with:
183
+ https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html
184
+ """
185
+ return self.signed
186
+
187
+ def is_floating_point(self) -> bool:
188
+ "If the type is a floating point type"
189
+ return self.exponent != 0
190
+
191
+ def is_integer(self) -> bool:
192
+ "If the type is an integer type"
193
+ return self.exponent == 0
194
+
195
+ def has_bias(self) -> bool:
196
+ "If the type has a non-zero bias"
197
+ return self.bias != 0
198
+
199
+ def has_infs(self) -> bool:
200
+ "If the type is floating point and supports infinity"
201
+ return not self._finite_values_only
202
+
203
+ def has_nans(self) -> bool:
204
+ return self.nan_repr != NanRepr.NONE
205
+
206
+ def is_ieee_754(self) -> bool:
207
+ """
208
+ If the type is a floating point type that follows IEEE 754
209
+ conventions
210
+ """
211
+ return self.nan_repr == NanRepr.IEEE_754 and \
212
+ not self._finite_values_only
213
+
214
+ def __str__(self) -> str:
215
+ """
216
+ naming generally follows: https://github.com/jax-ml/ml_dtypes
217
+ for floating point types (leading f) the scheme is:
218
+ `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
219
+ flags:
220
+ - no-flags: means it follows IEEE 754 conventions
221
+ - f: means finite values only (no infinities)
222
+ - n: means nans are supported (non-standard encoding)
223
+ for integer types the scheme is:
224
+ `[u]int<size_bits>[b<bias>]`
225
+ - if bias is not present it means it is zero
226
+ """
227
+ if self.is_floating_point():
228
+ ret = "float" + str(self.size_bits) + "_e" + str(
229
+ self.exponent) + "m" + str(self.mantissa)
230
+
231
+ if not self.is_ieee_754():
232
+ if self._finite_values_only:
233
+ ret = ret + "f"
234
+ if self.nan_repr != NanRepr.NONE:
235
+ ret = ret + "n"
236
+
237
+ return ret
238
+ else:
239
+ ret = ("int" if self.is_signed() else "uint") + str(self.size_bits)
240
+ if self.has_bias():
241
+ ret = ret + "b" + str(self.bias)
242
+ return ret
243
+
244
+ def __repr__(self) -> str:
245
+ return "ScalarType." + self.__str__()
246
+
247
+ # __len__ needs to be defined (and has to throw TypeError) for pytorch's
248
+ # opcheck to work.
249
+ def __len__(self) -> int:
250
+ raise TypeError
251
+
252
+ #
253
+ # Convenience Constructors
254
+ #
255
+
256
+ @classmethod
257
+ def int_(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
258
+ "Create a signed integer scalar type (size_bits includes sign-bit)."
259
+ ret = cls(0, size_bits - 1, True, bias if bias else 0)
260
+ ret.id # noqa B018: make sure the id is cached
261
+ return ret
262
+
263
+ @classmethod
264
+ def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
265
+ """Create a unsigned integer scalar type."""
266
+ ret = cls(0, size_bits, False, bias if bias else 0)
267
+ ret.id # noqa B018: make sure the id is cached
268
+ return ret
269
+
270
+ @classmethod
271
+ def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType':
272
+ """
273
+ Create a standard floating point type
274
+ (i.e. follows IEEE 754 conventions).
275
+ """
276
+ assert (mantissa > 0 and exponent > 0)
277
+ ret = cls(exponent, mantissa, True, 0)
278
+ ret.id # noqa B018: make sure the id is cached
279
+ return ret
280
+
281
+ @classmethod
282
+ def float_(cls, exponent: int, mantissa: int, finite_values_only: bool,
283
+ nan_repr: NanRepr) -> 'ScalarType':
284
+ """
285
+ Create a non-standard floating point type
286
+ (i.e. does not follow IEEE 754 conventions).
287
+ """
288
+ assert (mantissa > 0 and exponent > 0)
289
+ assert (nan_repr != NanRepr.IEEE_754), (
290
+ "use `float_IEEE754` constructor for floating point types that "
291
+ "follow IEEE 754 conventions")
292
+ ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr)
293
+ ret.id # noqa B018: make sure the id is cached
294
+ return ret
295
+
296
+
297
+ # naming generally follows: https://github.com/jax-ml/ml_dtypes
298
+ # for floating point types (leading f) the scheme is:
299
+ # `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
300
+ # flags:
301
+ # - no-flags: means it follows IEEE 754 conventions
302
+ # - f: means finite values only (no infinities)
303
+ # - n: means nans are supported (non-standard encoding)
304
+ # for integer types the scheme is:
305
+ # `[u]int<size_bits>[b<bias>]`
306
+ # - if bias is not present it means it is zero
307
+
308
+
309
+ class scalar_types:
310
+ int4 = ScalarType.int_(4, None)
311
+ uint4 = ScalarType.uint(4, None)
312
+ int8 = ScalarType.int_(8, None)
313
+ uint8 = ScalarType.uint(8, None)
314
+ float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN)
315
+ float8_e5m2 = ScalarType.float_IEEE754(5, 2)
316
+ float16_e8m7 = ScalarType.float_IEEE754(8, 7)
317
+ float16_e5m10 = ScalarType.float_IEEE754(5, 10)
318
+
319
+ # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
320
+ float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)
321
+
322
+ # "gptq" types
323
+ uint2b2 = ScalarType.uint(2, 2)
324
+ uint3b4 = ScalarType.uint(3, 4)
325
+ uint4b8 = ScalarType.uint(4, 8)
326
+ uint8b128 = ScalarType.uint(8, 128)
327
+
328
+ # colloquial names
329
+ bfloat16 = float16_e8m7
330
+ float16 = float16_e5m10
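A quick illustration of what these types encode, assuming the module above is importable as shown; the values follow from the definitions.

```python
print(scalar_types.uint4b8)              # uint4b8: 4-bit unsigned storage with bias 8
print(scalar_types.uint4b8.min())        # -8  (raw 0..15 maps to -8..7)
print(scalar_types.uint4b8.max())        # 7
print(scalar_types.float8_e4m3fn)        # float8_e4m3fn
print(scalar_types.float8_e4m3fn.max())  # 448.0
```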
ext-torch/torch_binding.cpp ADDED
@@ -0,0 +1,45 @@
1
+ #include <torch/library.h>
2
+
3
+ #include "registration.h"
4
+ #include "torch_binding.h"
5
+
6
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
7
+ // Activation used in fused MoE layers.
8
+ ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
9
+ ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
10
+
11
+ // Apply topk softmax to the gating outputs.
12
+ ops.def("topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
13
+ "token_expert_indices, Tensor gating_output) -> ()");
14
+ ops.impl("topk_softmax", torch::kCUDA, &topk_softmax);
15
+
16
+ // Calculate the result of moe by summing up the partial results
17
+ // from all selected experts.
18
+ ops.def("moe_sum(Tensor! input, Tensor output) -> ()");
19
+ ops.impl("moe_sum", torch::kCUDA, &moe_sum);
20
+
21
+ // Aligning the number of tokens to be processed by each expert such
22
+ // that it is divisible by the block size.
23
+ ops.def("moe_align_block_size(Tensor topk_ids, int num_experts,"
24
+ " int block_size, Tensor! sorted_token_ids,"
25
+ " Tensor! experts_ids,"
26
+ " Tensor! num_tokens_post_pad) -> ()");
27
+ ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
28
+
29
+ #ifndef USE_ROCM
30
+ ops.def("marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
31
+ "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
32
+ "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
33
+ "int b_q_type, SymInt size_m, "
34
+ "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
35
+ "topk, "
36
+ "int moe_block_size, bool replicate_input, bool apply_weights)"
37
+ " -> Tensor");
38
+ #endif
39
+ }
40
+
41
+ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, ops) {
42
+ ops.impl("marlin_gemm_moe", &marlin_gemm_moe);
43
+ }
44
+
45
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
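Once the extension is built, these ops are reachable from Python through `torch.ops.<TORCH_EXTENSION_NAME>`. The namespace below (`_moe`) is a placeholder for whatever name the build chooses, so treat this as a sketch rather than the actual import path.

```python
import torch

d = 128
x = torch.randn(4, 2 * d, dtype=torch.float16, device="cuda")
out = torch.empty(4, d, dtype=torch.float16, device="cuda")
# Matches the schema "silu_and_mul(Tensor! out, Tensor input) -> ()":
# out = silu(x[..., :d]) * x[..., d:]
torch.ops._moe.silu_and_mul(out, x)
```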
ext-torch/torch_binding.h ADDED
@@ -0,0 +1,30 @@
1
+ #pragma once
2
+
3
+ #include <torch/torch.h>
4
+
5
+ #include <core/scalar_type.hpp>
6
+
7
+ void silu_and_mul(torch::Tensor &out, torch::Tensor &input);
8
+
9
+ void topk_softmax(torch::Tensor &topk_weights, torch::Tensor &topk_indices,
10
+ torch::Tensor &token_expert_indices,
11
+ torch::Tensor &gating_output);
12
+
13
+ void moe_sum(torch::Tensor &input, torch::Tensor &output);
14
+
15
+ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
16
+ int64_t block_size, torch::Tensor sorted_token_ids,
17
+ torch::Tensor experts_ids,
18
+ torch::Tensor num_tokens_post_pad);
19
+
20
+ #ifndef USE_ROCM
21
+ torch::Tensor marlin_gemm_moe(
22
+ const torch::Tensor &a, const torch::Tensor &b_q_weights,
23
+ const torch::Tensor &sorted_ids, const torch::Tensor &topk_weights,
24
+ const torch::Tensor &topk_ids, const torch::Tensor &b_scales,
25
+ torch::Tensor &b_zeros, const torch::Tensor &g_idx,
26
+ const torch::Tensor &perm, torch::Tensor &workspace,
27
+ vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n,
28
+ int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
29
+ int64_t moe_block_size, bool replicate_input, bool apply_weights);
30
+ #endif
ext-torch/utils/__init__.py ADDED
File without changes
ext-torch/utils/marlin_utils.py ADDED
@@ -0,0 +1,307 @@
1
+ from typing import List, Optional, Tuple
2
+
3
+ import numpy
4
+ import torch
5
+
6
+ from moe.scalar_type import ScalarType, scalar_types
7
+
8
+ from .quant_utils import pack_cols, unpack_cols
9
+
10
+ GPTQ_MARLIN_TILE = 16
11
+ GPTQ_MARLIN_MIN_THREAD_N = 64
12
+ GPTQ_MARLIN_MIN_THREAD_K = 128
13
+ GPTQ_MARLIN_MAX_PARALLEL = 16
14
+
15
+ GPTQ_MARLIN_24_TILE = 16
16
+ GPTQ_MARLIN_24_MIN_THREAD_N = 128
17
+ GPTQ_MARLIN_24_MIN_THREAD_K = 128
18
+ GPTQ_MARLIN_24_MAX_PARALLEL = 64
19
+
20
+ GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
21
+ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
22
+
23
+ MARLIN_QQQ_TILE = 16
24
+ MARLIN_QQQ_MIN_THREAD_N = 64
25
+ MARLIN_QQQ_MIN_THREAD_K = 128
26
+ MARLIN_QQQ_MAX_PARALLEL = 16
27
+
28
+ MARLIN_QQQ_SUPPORTED_NUM_BITS = [4]
29
+ MARLIN_QQQ_SUPPORTED_GROUP_SIZES = [-1, 128]
30
+ MARLIN_QQQ_SUPPORTED_SYM = [True]
31
+
32
+ MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
33
+
34
+ # In case there is a performance issue with Marlin, the variable below can be
35
+ # changed to False, which allows Marlin to perform global reductions in fp16
36
+ # precision (instead of fp32), and therefore, save on some memory movements.
37
+ USE_FP32_REDUCE_DEFAULT = True
38
+
39
+
40
+ # For binary size and compile time, we don't support the same types for with and
41
+ # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
42
+ # TODO: we may want to move this into the C++ so its closer to the actual impl
43
+ def query_marlin_supported_quant_types(
44
+ has_zp: bool, device_capability: Optional[int] = None
45
+ ):
46
+ if device_capability is None:
47
+ capability_tuple = torch.cuda.get_device_capability()
48
+ device_capability = capability_tuple[0] * 10 + capability_tuple[1]
49
+
50
+ if device_capability < 80:
51
+ return []
52
+
53
+ if has_zp:
54
+ # AWQ style, unsigned + runtime zero-point
55
+ return [scalar_types.uint4, scalar_types.uint8]
56
+ else:
57
+ # GPTQ style, unsigned + symmetric bias
58
+ # TODO: once fp8_marlin is merged into "gptq_marlin" we should be able
59
+ # to add `scalar_types.float8_e4m3fn` here
60
+ return [scalar_types.uint4b8, scalar_types.uint8b128]
61
+
62
+
63
+ def _check_marlin_supported(
64
+ quant_type: ScalarType,
65
+ group_size: Optional[int],
66
+ has_zp: bool,
67
+ device_capability: Optional[int] = None,
68
+ ) -> Tuple[bool, Optional[str]]:
69
+
70
+ if device_capability is None:
71
+ capability_tuple = torch.cuda.get_device_capability()
72
+ device_capability = capability_tuple[0] * 10 + capability_tuple[1]
73
+
74
+ supported_types = query_marlin_supported_quant_types(has_zp, device_capability)
75
+
76
+ if quant_type not in supported_types:
77
+ return (
78
+ False,
79
+ f"Marlin does not support weight_bits = {quant_type}. "
80
+ f"Only types = {supported_types} "
81
+ f"are supported (for group_size = {group_size}, "
82
+ f"device_capability = {device_capability}, zp = {has_zp}).",
83
+ )
84
+ if group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
85
+ return (
86
+ False,
87
+ f"Marlin does not support group_size = {group_size}. "
88
+ f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
89
+ "are supported.",
90
+ )
91
+
92
+ return True, None
93
+
94
+
95
+ def check_marlin_supported(
96
+ quant_type: ScalarType,
97
+ group_size: int,
98
+ has_zp: bool = False,
99
+ device_capability: Optional[int] = None,
100
+ ) -> bool:
101
+ cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
102
+ return cond
103
+
104
+
105
+ def verify_marlin_supported(
106
+ quant_type: ScalarType, group_size: int, has_zp: bool = False
107
+ ) -> None:
108
+ cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
109
+ if not cond:
110
+ assert err_msg is not None
111
+ raise ValueError(err_msg)
112
+
113
+
114
+ def verify_marlin_supports_shape(
115
+ output_size_per_partition: int,
116
+ input_size_per_partition: int,
117
+ input_size: int,
118
+ group_size: int,
119
+ ) -> None:
120
+
121
+ # Validate output_size_per_partition
122
+ if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
123
+ raise ValueError(
124
+ f"Weight output_size_per_partition = "
125
+ f"{output_size_per_partition} is not divisible by "
126
+ f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
127
+ "Consider reducing tensor_parallel_size or running "
128
+ "with --quantization gptq."
129
+ )
130
+
131
+ # Validate input_size_per_partition
132
+ if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
133
+ raise ValueError(
134
+ f"Weight input_size_per_partition = "
135
+ f"{input_size_per_partition} is not divisible "
136
+ f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
137
+ "Consider reducing tensor_parallel_size or running "
138
+ "with --quantization gptq."
139
+ )
140
+
141
+ if group_size < input_size and input_size_per_partition % group_size != 0:
142
+ raise ValueError(
143
+ f"Weight input_size_per_partition = {input_size_per_partition}"
144
+ f" is not divisible by group_size = {group_size}."
145
+ "Consider reducing tensor_parallel_size or running "
146
+ "with --quantization gptq."
147
+ )
148
+
149
+
150
+ def check_marlin_supports_shape(
151
+ output_size_per_partition: int,
152
+ input_size_per_partition: int,
153
+ input_size: int,
154
+ group_size: int,
155
+ ) -> Tuple[bool, Optional[str]]:
156
+ try:
157
+ verify_marlin_supports_shape(
158
+ output_size_per_partition, input_size_per_partition, input_size, group_size
159
+ )
160
+ except ValueError as e:
161
+ return False, e.__str__()
162
+ return True, None
163
+
164
+
165
+ def marlin_make_workspace(
166
+ output_size_per_partition: int, device: torch.device
167
+ ) -> torch.Tensor:
168
+ max_workspace_size = (
169
+ output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
170
+ ) * GPTQ_MARLIN_MAX_PARALLEL
171
+
172
+ return torch.zeros(
173
+ max_workspace_size, dtype=torch.int, device=device, requires_grad=False
174
+ )
175
+
176
+
177
+ def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
178
+ return (not act_order) or (act_order and not is_row_parallel)
179
+
180
+
181
+ def marlin_repeat_scales_on_all_ranks(
182
+ act_order: bool, group_size: int, is_row_parallel: bool
183
+ ) -> bool:
184
+ # Need to repeat scales on every rank if act_ordering or
185
+ # channelwise and RowParallelLinear
186
+ is_channelwise = group_size == -1
187
+ return act_order or (is_channelwise and is_row_parallel)
188
+
189
+
190
+ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
191
+ return torch.nn.Parameter(
192
+ torch.empty(0, dtype=torch.int, device=device), requires_grad=False
193
+ )
194
+
195
+
196
+ def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
197
+ return torch.nn.Parameter(
198
+ torch.empty(0, dtype=torch.int, device=device), requires_grad=False
199
+ )
200
+
201
+
202
+ def marlin_sort_g_idx(g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
203
+ g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
204
+ return g_idx[g_idx_sort_indices], g_idx_sort_indices
205
+
206
+
207
+ def get_scale_perms():
208
+ scale_perm: List[int] = []
209
+ for i in range(8):
210
+ scale_perm.extend([i + 8 * j for j in range(8)])
211
+ scale_perm_single: List[int] = []
212
+ for i in range(4):
213
+ scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
214
+ return scale_perm, scale_perm_single
215
+
216
+
217
+ def marlin_permute_scales(
218
+ s: torch.Tensor, size_k: int, size_n: int, group_size: int
219
+ ) -> torch.Tensor:
220
+
221
+ scale_perm, scale_perm_single = get_scale_perms()
222
+ if group_size < size_k and group_size != -1:
223
+ s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
224
+ else:
225
+ s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
226
+ s = s.reshape((-1, size_n)).contiguous()
227
+
228
+ return s
229
+
230
+
231
+ def marlin_moe_permute_scales(
232
+ s: torch.Tensor,
233
+ size_k: int,
234
+ size_n: int,
235
+ group_size: int,
236
+ ):
237
+ num_experts = s.shape[0]
238
+ output = torch.empty(
239
+ (num_experts, s.shape[1], s.shape[2]),
240
+ device=s.device,
241
+ dtype=s.dtype,
242
+ )
243
+
244
+ for e in range(num_experts):
245
+ output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
246
+ return output
247
+
248
+
249
+ def marlin_zero_points(
250
+ zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
251
+ ) -> torch.Tensor:
252
+ # Permute zero-points in a similar way to scales, but do not use the
253
+ # "single" permutation, since zero-points are applied on every MMA
254
+ scale_perm, _ = get_scale_perms()
255
+ zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]
256
+
257
+ # Interleave column dim (for the dequantize code) and pack it to int32
258
+ if num_bits == 4:
259
+ interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
260
+ elif num_bits == 8:
261
+ interleave = numpy.array([0, 2, 1, 3])
262
+ else:
263
+ raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
264
+
265
+ zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel()
266
+ zp = zp.reshape((-1, size_n)).contiguous()
267
+ zp = pack_cols(zp, num_bits, size_k, size_n)
268
+
269
+ return zp
270
+
271
+
272
+ def awq_to_marlin_zero_points(
273
+ q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
274
+ ) -> torch.Tensor:
275
+ # AWQ zero-points are quantized and packed on the column dim.
276
+ # In addition, the values are permuted based on dequantizer.
277
+ # Here we undo both of these, and then apply marlin permutation
278
+ # and pack it back.
279
+ q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)
280
+
281
+ # Undo interleaving (use argsort(..) to get inverse perm)
282
+ if num_bits == 4:
283
+ undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
284
+ elif num_bits == 8:
285
+ undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))
286
+ else:
287
+ raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
288
+
289
+ q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()
290
+ q_zp = q_zp.reshape((-1, size_n)).contiguous()
291
+
292
+ marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)
293
+ return marlin_zp
294
+
295
+
296
+ def moe_awq_to_marlin_zero_points(
297
+ q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
298
+ ):
299
+ num_experts = q_zp_packed.shape[0]
300
+ output = torch.empty(
301
+ (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
302
+ device=q_zp_packed.device,
303
+ dtype=q_zp_packed.dtype,
304
+ )
305
+ for e in range(num_experts):
306
+ output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n, num_bits)
307
+ return output
ext-torch/utils/marlin_utils_test.py ADDED
@@ -0,0 +1,162 @@
 
1
+ """Utility functions used for tests and benchmarks"""
2
+
3
+ from typing import List, Optional
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from moe.scalar_type import ScalarType
9
+
10
+ from .marlin_utils import GPTQ_MARLIN_TILE, marlin_permute_scales, marlin_zero_points
11
+ from .quant_utils import (
12
+ get_pack_factor,
13
+ gptq_quantize_weights,
14
+ quantize_weights,
15
+ sort_weights,
16
+ )
17
+
18
+
19
+ class MarlinWorkspace:
20
+
21
+ def __init__(self, out_features, min_thread_n, max_parallel):
22
+ assert (
23
+ out_features % min_thread_n == 0
24
+ ), "out_features = {} is undivisible by min_thread_n = {}".format(
25
+ out_features, min_thread_n
26
+ )
27
+
28
+ max_workspace_size = (out_features // min_thread_n) * max_parallel
29
+
30
+ self.scratch = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda")
31
+
32
+
33
+ def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
34
+ assert q_w.shape == (size_k, size_n)
35
+ assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
36
+ assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
37
+
38
+ # Permute weights to 16x64 marlin tiles
39
+ q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
40
+ q_w = q_w.permute((0, 2, 1, 3))
41
+ q_w = q_w.reshape((size_k // tile, size_n * tile))
42
+
43
+ q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
44
+
45
+ return q_w
46
+
47
+
48
+ def marlin_weights(q_w, size_k, size_n, num_bits, perm):
49
+ # Permute
50
+ q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
51
+
52
+ # Pack
53
+ pack_factor = get_pack_factor(num_bits)
54
+ orig_device = q_w.device
55
+
56
+ q_w = q_w.cpu().numpy().astype(np.uint32)
57
+
58
+ q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=np.uint32)
59
+ for i in range(pack_factor):
60
+ q_packed |= q_w[:, i::pack_factor] << num_bits * i
61
+
62
+ q_packed = torch.from_numpy(q_packed.astype(np.int32)).to(orig_device)
63
+
64
+ return q_packed
65
+
66
+
67
+ def get_weight_perm(num_bits: int):
68
+ perm_list: List[int] = []
69
+ for i in range(32):
70
+ perm1: List[int] = []
71
+ col = i // 4
72
+ for block in [0, 1]:
73
+ for row in [
74
+ 2 * (i % 4),
75
+ 2 * (i % 4) + 1,
76
+ 2 * (i % 4 + 4),
77
+ 2 * (i % 4 + 4) + 1,
78
+ ]:
79
+ perm1.append(16 * row + col + 8 * block)
80
+ for j in range(4):
81
+ perm_list.extend([p + 256 * j for p in perm1])
82
+
83
+ perm = np.array(perm_list)
84
+
85
+ if num_bits == 4:
86
+ interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
87
+ elif num_bits == 8:
88
+ interleave = np.array([0, 2, 1, 3])
89
+ else:
90
+ raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
91
+
92
+ perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
93
+ perm = torch.from_numpy(perm)
94
+ return perm
95
+
96
+
97
+ def marlin_quantize(
98
+ w: torch.Tensor,
99
+ quant_type: ScalarType,
100
+ group_size: int,
101
+ act_order: bool,
102
+ test_perm: Optional[torch.Tensor] = None,
103
+ ):
104
+ size_k, size_n = w.shape
105
+ num_bits = quant_type.size_bits
106
+
107
+ # Normalize group_size
108
+ if group_size == -1:
109
+ group_size = size_k
110
+ assert group_size <= size_k
111
+
112
+ # Quantize (and apply act_order if provided)
113
+ w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
114
+ w, quant_type, group_size, act_order, test_perm
115
+ )
116
+
117
+ # For act_order, sort the "weights" and "g_idx" so that group ids are
118
+ # increasing
119
+ sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
120
+ if act_order:
121
+ q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
122
+
123
+ # Reformat to marlin
124
+ weight_perm = get_weight_perm(num_bits)
125
+ marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
126
+ marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
127
+
128
+ # Create result
129
+ res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
130
+ for i in range(len(res_list)):
131
+ res_list[i] = res_list[i].to(w.device)
132
+
133
+ return res_list
134
+
135
+
136
+ def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int):
137
+ size_k, size_n = w.shape
138
+
139
+ # Normalize group_size
140
+ if group_size == -1:
141
+ group_size = size_k
142
+ assert group_size <= size_k
143
+
144
+ # Detect num groups
145
+ assert size_k % group_size == 0
146
+ num_groups = size_k // group_size
147
+
148
+ # Quantize with zp
149
+ w_ref, q_w, s, zp = quantize_weights(w, quant_type, group_size, zero_points=True)
150
+
151
+ # Reformat to marlin
152
+ weight_perm = get_weight_perm(quant_type.size_bits)
153
+ marlin_q_w = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, weight_perm)
154
+ marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
155
+ marlin_zp = marlin_zero_points(zp, num_groups, size_n, quant_type.size_bits)
156
+
157
+ # Create result
158
+ res_list = [w_ref, marlin_q_w, marlin_s, marlin_zp]
159
+ for i in range(len(res_list)):
160
+ res_list[i] = res_list[i].to(w.device)
161
+
162
+ return res_list
ext-torch/utils/quant_utils.py ADDED
@@ -0,0 +1,470 @@
1
+ """This file is used for /tests and /benchmarks"""
2
+
3
+ from typing import List, Optional
4
+
5
+ import numpy
6
+ import torch
7
+
8
+ from moe.scalar_type import ScalarType, scalar_types
9
+
10
+ SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
11
+ SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
12
+
13
+ MARLIN_QQQ_SUPPORTED_NUM_BITS = [4]
14
+
15
+ # Note: this is a hack. We should update each model to register the
16
+ # stacked params and get it from there instead in a future PR.
17
+ # fused_name: List[shard_name]
18
+ FUSED_LAYER_NAME_MAPPING = {
19
+ "qkv_proj": ["q_proj", "k_proj", "v_proj"],
20
+ "gate_up_proj": ["gate_proj", "up_proj"],
21
+ }
22
+
23
+
24
+ def pack_quantized_values_into_int32(
25
+ w_q: torch.Tensor, wtype: ScalarType, packed_dim: int = 0
26
+ ):
27
+ # move dim to pack to the end
28
+ perm = (*[i for i in range(len(w_q.shape)) if i != packed_dim], packed_dim)
29
+ inv_perm = tuple(perm.index(i) for i in range(len(perm)))
30
+ w_q_perm = w_q.permute(perm)
31
+
32
+ pack_factor = 32 // wtype.size_bits
33
+ mask = (1 << wtype.size_bits) - 1
34
+
35
+ new_shape_perm = list(w_q_perm.shape)
36
+ assert w_q_perm.shape[-1] % pack_factor == 0
37
+ new_shape_perm[-1] //= pack_factor
38
+
39
+ res = torch.zeros(new_shape_perm, dtype=torch.int32, device=w_q.device)
40
+ for i in range(pack_factor):
41
+ res |= (w_q_perm[..., i::pack_factor] & mask) << wtype.size_bits * i
42
+
43
+ return res.permute(inv_perm)
44
+
45
+
46
+ def unpack_quantized_values_into_int32(
47
+ w_q: torch.Tensor, wtype: ScalarType, packed_dim: int = 0
48
+ ):
49
+ # move dim to pack to the end
50
+ perm = (*[i for i in range(len(w_q.shape)) if i != packed_dim], packed_dim)
51
+ inv_perm = tuple(perm.index(i) for i in range(len(perm)))
52
+ w_q_perm = w_q.permute(perm)
53
+
54
+ pack_factor = 32 // wtype.size_bits
55
+ mask = (1 << wtype.size_bits) - 1
56
+
57
+ new_shape_perm = list(w_q_perm.shape)
58
+ new_shape_perm[-1] *= pack_factor
59
+
60
+ res = torch.zeros(new_shape_perm, dtype=torch.int32, device=w_q.device)
61
+ for i in range(pack_factor):
62
+ res[..., i::pack_factor] = (w_q_perm >> wtype.size_bits * i) & mask
63
+
64
+ return res.permute(inv_perm)
65
+
66
+
67
+ def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool:
68
+ # prefix: model.layers.0.self_attn.q_proj
69
+ # proj_name: q_proj
70
+ proj_name = prefix.split(".")[-1]
71
+ if proj_name in FUSED_LAYER_NAME_MAPPING:
72
+ shard_prefixes = [
73
+ prefix.replace(proj_name, shard_proj_name)
74
+ for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name]
75
+ ]
76
+
77
+ is_skipped = None
78
+ for shard_prefix in shard_prefixes:
79
+ is_shard_skipped = shard_prefix in ignored_layers
80
+
81
+ if is_skipped is None:
82
+ is_skipped = is_shard_skipped
83
+ elif is_shard_skipped != is_skipped:
84
+ raise ValueError(
85
+ f"Detected some but not all shards of {prefix} "
86
+ "are quantized. All shards of fused layers "
87
+ "to have the same precision."
88
+ )
89
+ else:
90
+ is_skipped = prefix in ignored_layers
91
+
92
+ assert is_skipped is not None
93
+ return is_skipped
94
+
95
+
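`is_layer_skipped` expands a fused projection name to its shard names before consulting the ignore list, so a quantization config that lists the unfused shard names still matches the fused module. A hypothetical usage sketch (the import path is an assumption; the file lives under `ext-torch/utils/`):

```python
# Hypothetical usage; assumes this module is importable as `quant_utils`.
from quant_utils import is_layer_skipped

ignored = [
    "model.layers.0.self_attn.q_proj",
    "model.layers.0.self_attn.k_proj",
    "model.layers.0.self_attn.v_proj",
]

# The fused name is expanded to its shard names before the lookup.
assert is_layer_skipped("model.layers.0.self_attn.qkv_proj", ignored)
assert not is_layer_skipped("model.layers.1.mlp.gate_up_proj", ignored)
```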
96
+ def get_pack_factor(num_bits):
97
+ assert 32 % num_bits == 0, f"Unsupported num_bits = {num_bits}"
98
+ return 32 // num_bits
99
+
100
+
101
+ def permute_rows(
102
+ q_w: torch.Tensor,
103
+ w_ref: torch.Tensor,
104
+ group_size: int,
105
+ test_perm: Optional[torch.Tensor] = None,
106
+ ):
107
+ assert q_w.shape == w_ref.shape
108
+
109
+ orig_device = q_w.device
110
+ k_size, _ = q_w.shape
111
+
112
+ g_idx = torch.zeros((k_size,), dtype=torch.int32)
113
+ for i in range(k_size):
114
+ g_idx[i] = i // group_size
115
+
116
+ # Simulate act_order by doing a random permutation on K
117
+ rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
118
+
119
+ g_idx = g_idx[rand_perm].contiguous()
120
+ q_w = q_w[rand_perm, :].contiguous()
121
+ w_ref = w_ref[rand_perm, :].contiguous()
122
+
123
+ return (
124
+ w_ref.to(device=orig_device),
125
+ q_w.to(device=orig_device),
126
+ g_idx.to(device=orig_device),
127
+ rand_perm.to(device=orig_device),
128
+ )
129
+
130
+
131
+ def quantize_weights(
132
+ w: torch.Tensor,
133
+ quant_type: ScalarType,
134
+ group_size: Optional[int],
135
+ zero_points: bool = False,
136
+ ref_zero_points_after_scales: bool = False,
137
+ ):
138
+ assert (
139
+ quant_type.is_integer()
140
+ ), "Floating point quantization may work but has not been tested"
141
+ assert not zero_points or group_size is not None, (
142
+ "to have group zero points, group_size must be provided "
143
+ "(-1 group_size is channelwise)"
144
+ )
145
+
146
+ orig_device = w.device
147
+ orig_type = w.dtype
148
+ size_k, size_n = w.shape
149
+
150
+ assert w.is_floating_point(), "w must be float"
151
+
152
+ if group_size == -1:
153
+ group_size = size_k
154
+
155
+ # Reshape to [groupsize, -1]
156
+ if group_size is not None and group_size < size_k:
157
+ w = w.reshape((-1, group_size, size_n))
158
+ w = w.permute(1, 0, 2)
159
+ w = w.reshape((group_size, -1))
160
+
161
+ # Compute scale for each group
162
+ max_val = torch.max(w, 0, keepdim=True).values
163
+ min_val = torch.min(w, 0, keepdim=True).values
164
+
165
+ max_q_val = quant_type.max()
166
+ min_q_val = quant_type.min()
167
+
168
+ w_s = torch.Tensor([1.0]).to(w.device) # unscaled case
169
+ maybe_w_zp = None
170
+ if group_size is not None:
171
+ if zero_points:
172
+ assert not quant_type.is_signed() and quant_type.max() > 0
173
+ w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
174
+ maybe_w_zp = (
175
+ torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
176
+ )
177
+ else:
178
+ # If the bias is such that there are no possible negative/positive
179
+ # values, set the max value to inf to avoid divide by 0
180
+ w_s = torch.max(
181
+ abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
182
+ abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
183
+ )
184
+
185
+ # Quantize
186
+ w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
187
+ w_q = torch.clamp(w_q, min_q_val, max_q_val)
188
+
189
+ # Compute ref (dequantized)
190
+ # For some kernels (namely Machete) the zero-points are applied after the
191
+ # scales are applied, for this case computing the reference in similar way
192
+ # allows us to use tighter error tolerances in our unit tests.
193
+ if ref_zero_points_after_scales and maybe_w_zp is not None:
194
+ w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
195
+ else:
196
+ w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
197
+
198
+ if quant_type.has_bias():
199
+ w_q += quant_type.bias
200
+
201
+ # Restore original shapes
202
+ if group_size is not None and group_size < size_k:
203
+
204
+ def reshape_w(w):
205
+ w = w.reshape((group_size, -1, size_n))
206
+ w = w.permute(1, 0, 2)
207
+ w = w.reshape((size_k, size_n)).contiguous()
208
+ return w
209
+
210
+ w_q = reshape_w(w_q)
211
+ w_ref = reshape_w(w_ref)
212
+ w_s = w_s.reshape((-1, size_n)).contiguous()
213
+
214
+ if maybe_w_zp is not None:
215
+ maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
216
+ maybe_w_zp = maybe_w_zp.to(device=orig_device)
217
+
218
+ return (
219
+ w_ref.to(device=orig_device),
220
+ w_q.to(device=orig_device),
221
+ w_s if group_size is not None else None,
222
+ maybe_w_zp,
223
+ )
224
+
225
+
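In the symmetric branch above (no zero-points), the per-group scale is `max(|max_val / max_q|, |min_val / min_q|)`, after which the weights are rounded, clamped, and (for biased types such as `uint4b8`) shifted by the bias. A small self-contained sketch of that scale computation for a single group and a `[-8, 7]` target range:

```python
import torch

# Illustrative only: symmetric scales for one group with a [-8, 7] target
# range (uint4b8 before its +8 bias is applied).
w = torch.tensor([[-1.5, 0.25], [0.75, 3.0]])
max_q_val, min_q_val = 7, -8

max_val = w.max(dim=0, keepdim=True).values
min_val = w.min(dim=0, keepdim=True).values
w_s = torch.max((max_val / max_q_val).abs(), (min_val / min_q_val).abs())

w_q = torch.clamp(torch.round(w / w_s), min_q_val, max_q_val).int()
w_ref = w_q.to(w.dtype) * w_s  # dequantized reference

print(w_s)   # ~[[0.1875, 0.4286]]
print(w_q)   # [[-8, 1], [4, 7]]
```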
226
+ def gptq_quantize_weights(
227
+ w: torch.Tensor,
228
+ quant_type: ScalarType,
229
+ group_size: int,
230
+ act_order: bool,
231
+ test_perm: Optional[torch.Tensor] = None,
232
+ ):
233
+ size_k, _ = w.shape
234
+
235
+ assert w.is_floating_point(), "w must be float"
236
+ assert (
237
+ quant_type in SUPPORTED_GPTQ_QUANT_TYPES
238
+ ), f"Unsupported gptq type = {quant_type}"
239
+ assert group_size in SUPPORTED_GROUP_SIZES + [
240
+ size_k
241
+ ], f"Unsupported groupsize = {group_size}"
242
+
243
+ w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)
244
+
245
+ # Apply act_order
246
+ g_idx = torch.empty(0, dtype=torch.int, device=w.device)
247
+ rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
248
+ if act_order:
249
+ assert (
250
+ group_size < size_k
251
+ ), "For act_order, groupsize = {} must be less than size_k = {}".format(
252
+ group_size, size_k
253
+ )
254
+
255
+ w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, test_perm)
256
+
257
+ return w_ref, w_q, w_s, g_idx, rand_perm
258
+
259
+
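A hypothetical end-to-end use of the GPTQ path with `act_order`, combined with `sort_weights` further below (the module path and CPU execution are assumptions; this should run on CPU with a recent PyTorch build):

```python
# Hypothetical usage; assumes this module is importable as `quant_utils`.
import torch
from quant_utils import gptq_quantize_weights, sort_weights
from moe.scalar_type import scalar_types

size_k, size_n, group_size = 256, 64, 64
w = torch.randn(size_k, size_n, dtype=torch.half)

w_ref, w_q, w_s, g_idx, rand_perm = gptq_quantize_weights(
    w, scalar_types.uint4b8, group_size, act_order=True
)

# act_order randomly permutes the K rows; kernels usually want them re-sorted
# by group index afterwards.
w_q_sorted, g_idx_sorted, sort_indices = sort_weights(w_q, g_idx)
```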
260
+ # QQQ employs different quant schemes for per-group and
261
+ # per-channel quantization.
262
+ def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int):
263
+ orig_device = w.device
264
+ size_k, size_n = w.shape
265
+
266
+ assert w.is_floating_point(), "w must be float"
267
+ assert (
268
+ num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS
269
+ ), f"Unsupported num_bits = {num_bits}"
270
+ assert group_size in SUPPORTED_GROUP_SIZES + [
271
+ size_k
272
+ ], f"Unsupported groupsize = {group_size}"
273
+
274
+ if group_size == -1:
275
+ group_size = size_k
276
+ assert group_size <= size_k
277
+
278
+ if group_size < size_k:
279
+ # Reshape to [groupsize, -1]
280
+ w = w.reshape((-1, group_size, size_n))
281
+ w = w.permute(1, 0, 2)
282
+ w = w.reshape((group_size, -1))
283
+
284
+ max_q_val = 2**num_bits - 1
285
+ half_q_val = (max_q_val + 1) // 2
286
+
287
+ # Compute scale for each group
288
+ s_group = torch.max(torch.abs(w), 0, keepdim=True)[0]
289
+ s_group *= 2 / max_q_val # 2 => symmetric
290
+
291
+ # Quantize
292
+ q_w = torch.round(w / s_group).int()
293
+ q_w += half_q_val
294
+ q_w = torch.clamp(q_w, 0, max_q_val)
295
+ # Compute ref (dequantized)
296
+ w_ref = (q_w - half_q_val).half() * s_group
297
+
298
+ # Restore original shapes
299
+ def reshape_w(w):
300
+ w = w.reshape((group_size, -1, size_n))
301
+ w = w.permute(1, 0, 2)
302
+ w = w.reshape((size_k, size_n)).contiguous()
303
+ return w
304
+
305
+ q_w = reshape_w(q_w)
306
+ w_ref = reshape_w(w_ref)
307
+
308
+ # Compute int8 quantization scale for each channel
309
+ s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0]
310
+ s_channel /= 127.0
311
+ t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8)
312
+ w_ref = t_int8.half() * s_channel
313
+ s_channel = s_channel.reshape(1, -1).to(dtype=torch.float)
314
+
315
+ # Fuse scales
316
+ s_group = (s_group.reshape(-1, size_n).contiguous() / s_channel).to(
317
+ dtype=torch.half
318
+ )
319
+ else:
320
+ max_q_val = 2 ** (num_bits - 1) - 1
321
+
322
+ # Compute scale for each channel
323
+ s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0]
324
+ s_channel /= max_q_val
325
+
326
+ # Quantize
327
+ q_w = torch.round(w / s_channel).int()
328
+ q_w = torch.clamp(q_w, -max_q_val, max_q_val)
329
+ # Compute ref (dequantized)
330
+ w_ref = q_w.half() * s_channel
331
+
332
+ s_group = torch.tensor([], dtype=torch.half)
333
+ # divide by 2 ** (8 - num_bits) to offset the right shift in unpacking
334
+ s_channel /= 2 ** (8 - num_bits)
335
+ s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float)
336
+
337
+ return (
338
+ w_ref.to(device=orig_device),
339
+ q_w.to(device=orig_device),
340
+ s_group.to(device=orig_device),
341
+ s_channel.to(device=orig_device),
342
+ )
343
+
344
+
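A hypothetical call into both QQQ branches; in the per-group case the fp16 group scales come back already divided by the int8 channel scales, while in the per-channel case `s_group` is returned empty (the import path and CPU half-precision support are assumptions):

```python
# Hypothetical usage; assumes this module is importable as `quant_utils`.
import torch
from quant_utils import qqq_quantize_weights

w = torch.randn(128, 64, dtype=torch.half)

# Per-group path: 4-bit weights, fused fp16 group scales, fp32 channel scales.
w_ref, q_w, s_group, s_channel = qqq_quantize_weights(w, num_bits=4, group_size=64)

# Per-channel path: group_size == size_k, so no group scales are produced.
w_ref_c, q_w_c, s_group_c, s_channel_c = qqq_quantize_weights(w, num_bits=4, group_size=128)
assert s_group_c.numel() == 0
```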
345
+ def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
346
+ orig_device = q_w.device
347
+
348
+ sort_indices = torch.argsort(g_idx).to(dtype=torch.int32) # Sort based on g_idx
349
+
350
+ g_idx = g_idx[sort_indices].contiguous()
351
+ q_w = q_w[sort_indices, :].contiguous()
352
+
353
+ return (
354
+ q_w.to(device=orig_device),
355
+ g_idx.to(device=orig_device),
356
+ sort_indices.to(device=orig_device),
357
+ )
358
+
359
+
360
+ def pack_rows(
361
+ q_w: torch.Tensor,
362
+ num_bits: int,
363
+ size_k: int,
364
+ size_n: int,
365
+ ):
366
+ assert q_w.shape == (size_k, size_n)
367
+
368
+ pack_factor = get_pack_factor(num_bits)
369
+ assert size_k % pack_factor == 0
370
+
371
+ orig_device = q_w.device
372
+
373
+ q_w = q_w.cpu().numpy().astype(numpy.uint32)
374
+
375
+ q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)
376
+
377
+ for i in range(pack_factor):
378
+ q_res |= q_w[i::pack_factor, :] << num_bits * i
379
+
380
+ q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
381
+ return q_res
382
+
383
+
384
+ def pack_cols(
385
+ q_w: torch.Tensor,
386
+ num_bits: int,
387
+ size_k: int,
388
+ size_n: int,
389
+ ):
390
+ assert q_w.shape == (size_k, size_n)
391
+
392
+ pack_factor = get_pack_factor(num_bits)
393
+ assert size_n % pack_factor == 0
394
+
395
+ orig_device = q_w.device
396
+
397
+ q_w = q_w.cpu().numpy().astype(numpy.uint32)
398
+
399
+ q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)
400
+
401
+ for i in range(pack_factor):
402
+ q_res |= q_w[:, i::pack_factor] << num_bits * i
403
+
404
+ q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
405
+ q_res = q_res.contiguous()
406
+
407
+ return q_res
408
+
409
+
410
+ def unpack_cols(
411
+ packed_q_w: torch.Tensor,
412
+ num_bits: int,
413
+ size_k: int,
414
+ size_n: int,
415
+ ):
416
+ pack_factor = get_pack_factor(num_bits)
417
+ assert size_n % pack_factor == 0
418
+ assert packed_q_w.shape == (
419
+ size_k,
420
+ size_n // pack_factor,
421
+ ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
422
+ packed_q_w.shape, size_k, size_n, pack_factor
423
+ )
424
+
425
+ orig_device = packed_q_w.device
426
+
427
+ packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32)
428
+ q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32)
429
+
430
+ mask = (1 << num_bits) - 1
431
+ for i in range(pack_factor):
432
+ vals = packed_q_w_cpu & mask
433
+ packed_q_w_cpu >>= num_bits
434
+ q_res[:, i::pack_factor] = vals
435
+
436
+ q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
437
+ q_res = q_res.contiguous()
438
+
439
+ return q_res
440
+
441
+
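`pack_cols` and `unpack_cols` are exact inverses for values that fit in `num_bits`, which the following hypothetical round trip checks (the import path is an assumption):

```python
# Hypothetical round trip; assumes this module is importable as `quant_utils`.
import torch
from quant_utils import pack_cols, unpack_cols

num_bits, size_k, size_n = 4, 16, 32
q_w = torch.randint(0, 2**num_bits, (size_k, size_n), dtype=torch.int32)

packed = pack_cols(q_w, num_bits, size_k, size_n)        # shape (16, 32 // 8)
restored = unpack_cols(packed, num_bits, size_k, size_n)

assert torch.equal(restored, q_w)
```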
442
+ def gptq_pack(
443
+ q_w: torch.Tensor,
444
+ num_bits: int,
445
+ size_k: int,
446
+ size_n: int,
447
+ ):
448
+ return pack_rows(q_w, num_bits, size_k, size_n)
449
+
450
+
451
+ def awq_pack(
452
+ q_w: torch.Tensor,
453
+ num_bits: int,
454
+ size_k: int,
455
+ size_n: int,
456
+ ):
457
+ assert q_w.shape == (size_k, size_n)
458
+
459
+ # Interleave column dim (for the dequantize code) and pack it to int32
460
+ if num_bits == 4:
461
+ interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
462
+ elif num_bits == 8:
463
+ interleave = numpy.array([0, 2, 1, 3])
464
+ else:
465
+ raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
466
+
467
+ q_w = q_w.reshape((-1, len(interleave)))[:, interleave].ravel()
468
+ q_w = q_w.reshape((-1, size_n)).contiguous()
469
+
470
+ return pack_cols(q_w, num_bits, size_k, size_n)
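The AWQ packer first interleaves each run of 8 (4-bit) or 4 (8-bit) columns so that the packed `int32` layout matches what the dequantization code expects. A standalone sketch of the 4-bit column reorder on its own:

```python
import numpy

# Illustrative only: the 4-bit AWQ column interleave used in awq_pack above.
interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])

cols = numpy.arange(16)  # two runs of 8 column indices
reordered = cols.reshape(-1, 8)[:, interleave].ravel()
print(reordered)  # [ 0  2  4  6  1  3  5  7  8 10 12 14  9 11 13 15]
```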
flake.nix ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ description = "Flake for activation kernels";
3
+
4
+ inputs = {
5
+ kernel-builder.url = "git+ssh://git@github.com/huggingface/kernel-builder";
6
+ };
7
+
8
+ outputs =
9
+ {
10
+ self,
11
+ kernel-builder,
12
+ }:
13
+ kernel-builder.lib.genFlakeOutputs ./.;
14
+ }
marlin-moe/marlin_kernels/marlin_moe_kernel.h ADDED
@@ -0,0 +1,1616 @@
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include <c10/cuda/CUDAGuard.h>
7
+ #include <cuda.h>
8
+ #include <cuda_fp16.h>
9
+ #include <cuda_runtime.h>
10
+
11
+ #include <iostream>
12
+
13
+ #include "core/scalar_type.hpp"
14
+
15
+ namespace marlin_moe {
16
+
17
+ constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
18
+
19
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
20
+
21
+ // Instances of `Vec` are used to organize groups of >>registers<<, as needed
22
+ // for instance as inputs to tensor core operations. Consequently, all
23
+ // corresponding index accesses must be compile-time constants, which is why we
24
+ // extensively use `#pragma unroll` throughout the kernel code to guarantee
25
+ // this.
26
+ template <typename T, int n>
27
+ struct Vec {
28
+ T elems[n];
29
+ __device__ T& operator[](int i) { return elems[i]; }
30
+ };
31
+
32
+ using I4 = Vec<int, 4>;
33
+
34
+ // Matrix fragments for tensor core instructions; their precise layout is
35
+ // documented here:
36
+ // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
37
+ using FragA = Vec<half2, 4>;
38
+ using FragB = Vec<half2, 2>;
39
+ using FragC = Vec<float, 4>;
40
+ using FragS = Vec<half2, 1>; // quantization scales
41
+ using FragZP = Vec<half2, 4>;
42
+
43
+ // Predicated asynchronous global->shared copy; used for inputs A where we apply
44
+ // predication to handle batchsizes that are not multiples of 16.
45
+ __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
46
+ bool pred = true) {
47
+ const int BYTES = 16;
48
+ uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
49
+ asm volatile(
50
+ "{\n"
51
+ " .reg .pred p;\n"
52
+ " setp.ne.b32 p, %0, 0;\n"
53
+ " @p cp.async.cg.shared.global [%1], [%2], %3;\n"
54
+ "}\n" ::"r"((int)pred),
55
+ "r"(smem), "l"(glob_ptr), "n"(BYTES));
56
+ }
57
+
58
+ // Asynchronous global->shared copy
59
+ __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
60
+ const int BYTES = 16;
61
+ uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
62
+ asm volatile(
63
+ "{\n"
64
+ " cp.async.cg.shared.global [%0], [%1], %2;\n"
65
+ "}\n" ::"r"(smem),
66
+ "l"(glob_ptr), "n"(BYTES));
67
+ }
68
+
69
+ // Async copy fence.
70
+ __device__ inline void cp_async_fence() {
71
+ asm volatile("cp.async.commit_group;\n" ::);
72
+ }
73
+
74
+ // Wait until at most `n` async copy stages are still pending.
75
+ template <int n>
76
+ __device__ inline void cp_async_wait() {
77
+ asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
78
+ }
79
+
80
+ // m16n8k16 tensor core mma instruction with fp16 inputs and fp32
81
+ // output/accumulation.
82
+ __device__ inline void mma(const FragA& a_frag, const FragB& frag_b,
83
+ FragC& frag_c) {
84
+ const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
85
+ const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
86
+ float* c = reinterpret_cast<float*>(&frag_c);
87
+ asm volatile(
88
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
89
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
90
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
91
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
92
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
93
+ }
94
+
95
+ // Instruction for loading a full 16x16 matrix fragment of operand A from shared
96
+ // memory, directly in tensor core layout.
97
+ __device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
98
+ uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
99
+ uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
100
+ asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
101
+ : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
102
+ : "r"(smem));
103
+ }
104
+
105
+ // Lookup-table based 3-input logical operation; explicitly used for
106
+ // dequantization as the compiler does not seem to automatically recognize it in
107
+ // all cases.
108
+ template <int lut>
109
+ __device__ inline int lop3(int a, int b, int c) {
110
+ int res;
111
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
112
+ : "=r"(res)
113
+ : "r"(a), "r"(b), "r"(c), "n"(lut));
114
+ return res;
115
+ }
116
+
117
+ // Constructs destination register by taking bytes from 2 sources (based on
118
+ // mask)
119
+ template <int start_byte, int mask>
120
+ __device__ inline uint32_t prmt(uint32_t a) {
121
+ uint32_t res;
122
+ asm volatile("prmt.b32 %0, %1, %2, %3;\n"
123
+ : "=r"(res)
124
+ : "r"(a), "n"(start_byte), "n"(mask));
125
+ return res;
126
+ }
127
+
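`lop3` takes its truth table as an immediate; in the dequantization code below, the immediate `(0xf0 & 0xcc) | 0xaa` is exactly the table of the function `(a & b) | c`, because 0xF0, 0xCC, and 0xAA are the canonical truth tables of the three inputs themselves. A standalone check of that encoding:

```python
# Truth-table immediates as used by PTX lop3: 0xF0, 0xCC and 0xAA are the
# tables of inputs a, b and c, so composing them yields the table of any
# three-input boolean function.
ta, tb, tc = 0xF0, 0xCC, 0xAA

lut = (ta & tb) | tc  # table of (a & b) | c
print(hex(lut))       # 0xea

# Verify against a direct evaluation of (a & b) | c over all 8 input patterns.
for bit in range(8):
    a, b, c = (bit >> 2) & 1, (bit >> 1) & 1, bit & 1
    assert ((lut >> bit) & 1) == ((a & b) | c)
```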
128
+ template <vllm::ScalarTypeId w_type_id>
129
+ __device__ inline FragB dequant(int q);
130
+
131
+ // Efficiently dequantize 4bit values packed in an int32 value into a full
132
+ // B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
133
+ // with some small changes:
134
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
135
+ template <>
136
+ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
137
+ const int LO = 0x000f000f;
138
+ const int HI = 0x00f000f0;
139
+ const int EX = 0x64006400;
140
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
141
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
142
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
143
+ // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
144
+ // directly into `SUB` and `ADD`.
145
+ const int SUB = 0x64086408;
146
+ const int MUL = 0x2c002c00;
147
+ const int ADD = 0xd480d480;
148
+ FragB frag_b;
149
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
150
+ *reinterpret_cast<const half2*>(&SUB));
151
+ frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
152
+ *reinterpret_cast<const half2*>(&MUL),
153
+ *reinterpret_cast<const half2*>(&ADD));
154
+ return frag_b;
155
+ }
156
+
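The magic constants above rely on the fp16 bit layout: OR-ing a 4-bit value x into the low mantissa bits of 0x6400 (which is 1024.0 in fp16) yields exactly 1024 + x, and subtracting 0x6408 (1032.0) then produces the signed value x - 8. A standalone numpy check of that identity:

```python
import numpy as np

# Illustrative check of the fp16 "magic number" dequantization used above.
x = np.arange(16, dtype=np.uint16)
halves = (np.uint16(0x6400) | x).view(np.float16)  # bit pattern 0x6400 | x

assert np.array_equal(halves, 1024.0 + x.astype(np.float16))
assert np.array_equal(halves - np.float16(1032.0), x.astype(np.float16) - 8)
```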
157
+ // Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16
158
+ // Reference:
159
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
160
+ template <>
161
+ __device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
162
+ static constexpr uint32_t mask_for_elt_01 = 0x5250;
163
+ static constexpr uint32_t mask_for_elt_23 = 0x5351;
164
+ static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
165
+
166
+ uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
167
+ uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
168
+
169
+ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
170
+
171
+ FragB frag_b;
172
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
173
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
174
+ frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
175
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
176
+ return frag_b;
177
+ }
178
+
179
+ template <>
180
+ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
181
+ const int LO = 0x000f000f;
182
+ const int HI = 0x00f000f0;
183
+ const int EX = 0x64006400;
184
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
185
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
186
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
187
+
188
+ const int SUB = 0x64006400;
189
+ const int MUL = 0x2c002c00;
190
+ const int ADD = 0xd400d400;
191
+ FragB frag_b;
192
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
193
+ *reinterpret_cast<const half2*>(&SUB));
194
+ frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
195
+ *reinterpret_cast<const half2*>(&MUL),
196
+ *reinterpret_cast<const half2*>(&ADD));
197
+ return frag_b;
198
+ }
199
+
200
+ template <>
201
+ __device__ inline FragB dequant<vllm::kU8.id()>(int q) {
202
+ static constexpr uint32_t mask_for_elt_01 = 0x5250;
203
+ static constexpr uint32_t mask_for_elt_23 = 0x5351;
204
+ static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
205
+
206
+ uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
207
+ uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
208
+
209
+ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
210
+
211
+ FragB frag_b;
212
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
213
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
214
+ frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
215
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
216
+ return frag_b;
217
+ }
218
+
219
+ // Multiply dequantized values by the corresponding quantization scale; used
220
+ // only for grouped quantization.
221
+ __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
222
+ half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]);
223
+ frag_b[0] = __hmul2(frag_b[0], s);
224
+ frag_b[1] = __hmul2(frag_b[1], s);
225
+ }
226
+
227
+ __device__ inline void sub_zp(FragB& frag_b, half2& frag_zp, int i) {
228
+ half2 zp = __half2half2(reinterpret_cast<__half*>(&frag_zp)[i]);
229
+ frag_b[0] = __hsub2(frag_b[0], zp);
230
+ frag_b[1] = __hsub2(frag_b[1], zp);
231
+ }
232
+
233
+ // Same as above, but for act_order (each K is multiplied individually)
234
+ __device__ inline void scale4(FragB& frag_b, FragS& frag_s_1, FragS& frag_s_2,
235
+ FragS& frag_s_3, FragS& frag_s_4, int i) {
236
+ __half2 s_val_1_2;
237
+ s_val_1_2.x = reinterpret_cast<__half*>(&frag_s_1)[i];
238
+ s_val_1_2.y = reinterpret_cast<__half*>(&frag_s_2)[i];
239
+
240
+ __half2 s_val_3_4;
241
+ s_val_3_4.x = reinterpret_cast<__half*>(&frag_s_3)[i];
242
+ s_val_3_4.y = reinterpret_cast<__half*>(&frag_s_4)[i];
243
+
244
+ frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
245
+ frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
246
+ }
247
+
248
+ // Given 2 floats multiply by 2 scales (halves)
249
+ __device__ inline void scale_float(float* c, FragS& s) {
250
+ __half* s_ptr = reinterpret_cast<__half*>(&s);
251
+ c[0] = __fmul_rn(c[0], __half2float(s_ptr[0]));
252
+ c[1] = __fmul_rn(c[1], __half2float(s_ptr[1]));
253
+ }
254
+
255
+ // Wait until barrier reaches `count`, then lock for current threadblock.
256
+ __device__ inline void barrier_acquire(int* lock, int count) {
257
+ if (threadIdx.x == 0) {
258
+ int state = -1;
259
+ do
260
+ // Guarantee that subsequent writes by this threadblock will be visible
261
+ // globally.
262
+ asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
263
+ : "=r"(state)
264
+ : "l"(lock));
265
+ while (state != count);
266
+ }
267
+ __syncthreads();
268
+ }
269
+
270
+ // Release barrier and increment visitation count.
271
+ __device__ inline void barrier_release(int* lock, bool reset = false) {
272
+ __syncthreads();
273
+ if (threadIdx.x == 0) {
274
+ if (reset) {
275
+ lock[0] = 0;
276
+ return;
277
+ }
278
+ int val = 1;
279
+ // Make sure that all writes since acquiring this barrier are visible
280
+ // globally, while releasing the barrier.
281
+ asm volatile("fence.acq_rel.gpu;\n");
282
+ asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
283
+ :
284
+ : "l"(lock), "r"(val));
285
+ }
286
+ }
287
+
288
+ template <const vllm::ScalarTypeId w_type_id, // weight ScalarType id
289
+ const int threads, // number of threads in a threadblock
290
+ const int thread_m_blocks, // number of 16x16 blocks in the m
291
+ // dimension (batchsize) of the
292
+ // threadblock
293
+ const int thread_n_blocks, // same for n dimension (output)
294
+ const int thread_k_blocks, // same for k dimension (reduction)
295
+ const int stages, // number of stages for the async global->shared
296
+ // fetch pipeline
297
+ const bool has_act_order, // whether act_order is enabled
298
+ const bool has_zp, // whether zero-points are enabled
299
+ const int group_blocks = -1 // number of consecutive 16x16 blocks
300
+ // with a separate quantization scale
301
+ >
302
+ __device__ void MarlinMoESingle(
303
+ const int4* __restrict__ A, // fp16 input matrix of shape mxk
304
+ const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
305
+ int4* __restrict__ C, // fp16 output buffer of shape mxn
306
+ const int* __restrict__ sorted_ids, // int32 sorted ids of experts
307
+ const float* __restrict__ topk_weights, // float topk weights
308
+ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
309
+ // (k/groupsize)xn
310
+ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape
311
+ // (k/groupsize)x(n/pack_factor)
312
+ const int* __restrict__ g_idx, // int32 group indices of shape k
313
+ const int* __restrict__ expert_offsets,
314
+ int num_groups, // number of scale groups per output channel
315
+ int expert_idx, // idx of current expert
316
+ int num_experts, // number of experts
317
+ int topk, // topk parameter of moe
318
+ int prob_m, // batch dimension m
319
+ int prob_n, // output dimension n
320
+ int prob_k, // reduction dimension k
321
+ int tot_m, // total number of rows in A and C
322
+ int* locks, // extra global storage for barrier synchronization
323
+ bool replicate_input, // do we use the same input for each expert?
324
+ bool apply_weights, // apply weights to output
325
+ int current_m_block // current m block to start kernel computation from
326
+ ) {
327
+ static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
328
+ constexpr int pack_factor = 32 / w_type.size_bits();
329
+
330
+ // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
331
+ // better partitioning with less reductions
332
+ int parallel = 1;
333
+ if (prob_m > 16 * thread_m_blocks) {
334
+ parallel = prob_m / (16 * thread_m_blocks);
335
+ prob_m = 16 * thread_m_blocks;
336
+ }
337
+
338
+ int k_tiles = prob_k / 16 / thread_k_blocks;
339
+ int n_tiles = prob_n / 16 / thread_n_blocks;
340
+ int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x);
341
+
342
+ if constexpr (!has_act_order && group_blocks != -1) {
343
+ if (group_blocks >= thread_k_blocks) {
344
+ // Ensure that the number of tiles in each stripe is a multiple of the
345
+ // groupsize; this avoids an annoying special case where a stripe starts
346
+ // in the middle of group.
347
+ iters = (group_blocks / thread_k_blocks) *
348
+ ceildiv(iters, (group_blocks / thread_k_blocks));
349
+ }
350
+ }
351
+
352
+ int slice_row = (iters * blockIdx.x) % k_tiles;
353
+ int slice_col_par = (iters * blockIdx.x) / k_tiles;
354
+ int slice_col = slice_col_par;
355
+ int slice_iters; // number of threadblock tiles in the current slice
356
+ int slice_count =
357
+ 0; // total number of active threadblocks in the current slice
358
+ int slice_idx; // index of threadblock in current slice; numbered bottom to
359
+ // top
360
+
361
+ // We can easily implement parallel problem execution by just remapping
362
+ // indices and advancing global pointers
363
+ if (slice_col_par >= n_tiles) {
364
+ locks += (slice_col_par / n_tiles) * n_tiles;
365
+ slice_col = slice_col_par % n_tiles;
366
+ sorted_ids += (slice_col_par / n_tiles) * 16 * thread_m_blocks;
367
+ }
368
+
369
+ // Compute all information about the current slice which is required for
370
+ // synchronization.
371
+ auto init_slice = [&]() {
372
+ slice_iters =
373
+ iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
374
+ if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
375
+ if (slice_iters == 0) return;
376
+ if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
377
+ slice_count = 1;
378
+ slice_idx = 0;
379
+ int col_first = iters * ceildiv(k_tiles * slice_col_par, iters);
380
+ if (col_first <= k_tiles * (slice_col_par + 1)) {
381
+ int col_off = col_first - k_tiles * slice_col_par;
382
+ slice_count = ceildiv(k_tiles - col_off, iters);
383
+ if (col_off > 0) slice_count++;
384
+ int delta_first = iters * blockIdx.x - col_first;
385
+ if (delta_first < 0 || (col_off == 0 && delta_first == 0))
386
+ slice_idx = slice_count - 1;
387
+ else {
388
+ slice_idx = slice_count - 1 - delta_first / iters;
389
+ if (col_off > 0) slice_idx--;
390
+ }
391
+ }
392
+ if (slice_col == n_tiles) {
393
+ sorted_ids += 16 * thread_m_blocks;
394
+ locks += n_tiles;
395
+ slice_col = 0;
396
+ }
397
+ };
398
+ init_slice();
399
+
400
+ // A sizes/strides
401
+
402
+ // stride of the A matrix in global memory
403
+ int a_gl_stride = prob_k / 8;
404
+ // stride of an A matrix tile in shared memory
405
+ constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
406
+ // delta between subsequent A tiles in global memory
407
+ constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
408
+ // between subsequent accesses within a tile
409
+ int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
410
+ // between shared memory writes
411
+ constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
412
+ // between shared memory tile reads
413
+ constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
414
+ // within a shared memory tile
415
+ constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
416
+ // overall size of a tile
417
+ constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
418
+ // number of shared write iterations for a tile
419
+ constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
420
+
421
+ // B sizes/strides
422
+ int b_gl_stride = 16 * prob_n / (pack_factor * 4);
423
+ constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
424
+ constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
425
+ constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
426
+
427
+ int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
428
+ int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
429
+ constexpr int b_sh_wr_delta = threads * b_thread_vecs;
430
+ constexpr int b_sh_rd_delta = threads * b_thread_vecs;
431
+ constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
432
+ constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
433
+
434
+ // Scale sizes/strides without act_order
435
+ int s_gl_stride = prob_n / 8;
436
+ constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
437
+ constexpr int s_tb_groups =
438
+ !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
439
+ ? thread_k_blocks / group_blocks
440
+ : 1;
441
+ constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
442
+ int s_gl_rd_delta = s_gl_stride;
443
+ // Scale size/strides with act_order
444
+ constexpr int tb_k = 16 * thread_k_blocks;
445
+ constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
446
+ // constexpr int act_s_row_stride = 1;
447
+ // int act_s_col_stride = act_s_row_stride * num_groups;
448
+ int act_s_col_stride = 1;
449
+ int act_s_col_warp_stride = act_s_col_stride * 8;
450
+ int tb_n_warps = thread_n_blocks / 4;
451
+ int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
452
+
453
+ // Zero-points sizes/strides
454
+ int zp_gl_stride = (prob_n / pack_factor) / 4;
455
+ constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4;
456
+ constexpr int zp_tb_groups = s_tb_groups;
457
+ constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
458
+ int zp_gl_rd_delta = zp_gl_stride;
459
+
460
+ // Global A read index of current thread.
461
+ int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
462
+ (threadIdx.x % a_gl_rd_delta_o);
463
+ a_gl_rd += a_gl_rd_delta_o * slice_row;
464
+ // Shared write index of current thread.
465
+ int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
466
+ (threadIdx.x % a_gl_rd_delta_o);
467
+ // Shared read index.
468
+ int a_sh_rd =
469
+ a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
470
+ a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
471
+
472
+ int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
473
+ (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
474
+ b_gl_rd += b_sh_stride * slice_col;
475
+ b_gl_rd += b_gl_rd_delta_o * slice_row;
476
+ int b_sh_wr = threadIdx.x * b_thread_vecs;
477
+ int b_sh_rd = threadIdx.x * b_thread_vecs;
478
+
479
+ // For act_order
480
+ constexpr int k_iter_size = tb_k / b_sh_wr_iters;
481
+ int slice_k_start = tb_k * slice_row;
482
+ int slice_k_finish = slice_k_start + tb_k * slice_iters;
483
+ int slice_k_start_shared_fetch = slice_k_start;
484
+ int slice_n_offset = act_s_col_tb_stride * slice_col;
485
+
486
+ // No act_order
487
+ int s_gl_rd;
488
+ if constexpr (!has_act_order) {
489
+ if constexpr (group_blocks == -1) {
490
+ s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
491
+ } else {
492
+ s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
493
+ s_sh_stride * slice_col + threadIdx.x;
494
+ }
495
+ }
496
+ int s_sh_wr = threadIdx.x;
497
+ bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
498
+
499
+ // Zero-points
500
+ int zp_gl_rd;
501
+ if constexpr (has_zp) {
502
+ if constexpr (group_blocks == -1) {
503
+ zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
504
+ } else {
505
+ zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
506
+ zp_sh_stride * slice_col + threadIdx.x;
507
+ }
508
+ }
509
+ int zp_sh_wr = threadIdx.x;
510
+ bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
511
+
512
+ // We use a different scale layout for grouped and column-wise quantization as
513
+ // we scale a `half2` tile in column-major layout in the former and in
514
+ // row-major in the latter case.
515
+ int s_sh_rd;
516
+ if constexpr (group_blocks != -1)
517
+ s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
518
+ (threadIdx.x % 32) / 4;
519
+ else
520
+ s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
521
+ (threadIdx.x % 32) % 4;
522
+
523
+ // Zero-points have the same read layout as the scales
524
+ // (without column-wise case)
525
+ constexpr int num_col_threads = 8;
526
+ constexpr int num_row_threads = 4;
527
+ constexpr int num_ints_per_thread = 8 / pack_factor;
528
+ int zp_sh_rd;
529
+ if constexpr (has_zp) {
530
+ zp_sh_rd = num_ints_per_thread * num_col_threads *
531
+ ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
532
+ num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
533
+ }
534
+
535
+ int sh_first_group_id = -1;
536
+ int sh_num_groups = -1;
537
+ constexpr int sh_max_num_groups = 32;
538
+
539
+ extern __shared__ int4 sh[];
540
+ // Shared memory storage for global fetch pipelines.
541
+ int4* sh_a = sh;
542
+ int4* sh_b = sh_a + (stages * a_sh_stage);
543
+ int4* sh_g_idx = sh_b + (stages * b_sh_stage);
544
+ int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
545
+ int4* sh_s = sh_zp + (stages * zp_sh_stage);
546
+
547
+ // Precompute which thread should not read memory in which iterations; this is
548
+ // needed if there are more threads than required for a certain tilesize or
549
+ // when the batchsize is not a multiple of 16.
550
+ bool a_sh_wr_pred[a_sh_wr_iters];
551
+ #pragma unroll
552
+ for (int i = 0; i < a_sh_wr_iters; i++) {
553
+ int a_idx = a_sh_wr_delta * i + a_sh_wr;
554
+ int row = a_idx / a_gl_rd_delta_o;
555
+ if (row >= prob_m) {
556
+ a_sh_wr_pred[i] = false;
557
+ } else {
558
+ a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
559
+ }
560
+ }
561
+
562
+ // To ensure that writing and reading A tiles to/from shared memory, the
563
+ // latter in fragment format, is fully bank conflict free, we need to use a
564
+ // rather fancy XOR-based layout. The key here is that neither reads nor
565
+ // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
566
+ // same shared memory banks. Further, it seems (based on NSight-Compute) that
567
+ // each warp must also write a consecutive memory segment?
568
+ auto transform_a = [&](int i) {
569
+ int row = i / a_gl_rd_delta_o;
570
+ return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
571
+ };
572
+ // Since the computation of this remapping is non-trivial and, due to our main
573
+ // loop unrolls, all shared memory accesses are static, we simply precompute
574
+ // both transformed reads and writes.
575
+ int a_sh_wr_trans[a_sh_wr_iters];
576
+ #pragma unroll
577
+ for (int i = 0; i < a_sh_wr_iters; i++)
578
+ a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
579
+ int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
580
+ #pragma unroll
581
+ for (int i = 0; i < b_sh_wr_iters; i++) {
582
+ #pragma unroll
583
+ for (int j = 0; j < thread_m_blocks; j++)
584
+ a_sh_rd_trans[i][j] =
585
+ transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
586
+ }
587
+
588
+ // Since B-accesses have non-constant stride they have to be computed at
589
+ // runtime; we break dependencies between subsequent accesses within a tile by
590
+ // maintaining multiple pointers (we have enough registers), a tiny
591
+ // optimization.
592
+ const int4* B_ptr[b_sh_wr_iters];
593
+ #pragma unroll
594
+ for (int i = 0; i < b_sh_wr_iters; i++)
595
+ B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
596
+
597
+ // Register storage for double buffer of shared memory reads.
598
+ FragA frag_a[2][thread_m_blocks];
599
+ I4 frag_b_quant[2][b_thread_vecs];
600
+ FragC frag_c[thread_m_blocks][4][2];
601
+ FragS frag_s[2][4]; // No act-order
602
+ FragS act_frag_s[2][4][4]; // For act-order
603
+ int frag_qzp[2][num_ints_per_thread]; // Zero-points
604
+ FragZP frag_zp; // Zero-points in fp16
605
+
606
+ // Zero accumulators.
607
+ auto zero_accums = [&]() {
608
+ #pragma unroll
609
+ for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
610
+ reinterpret_cast<float*>(frag_c)[i] = 0;
611
+ };
612
+
613
+ auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
614
+ int last_group_id) {
615
+ sh_first_group_id = first_group_id;
616
+ sh_num_groups = last_group_id - first_group_id + 1;
617
+
618
+ if (sh_num_groups < sh_max_num_groups) {
619
+ sh_num_groups = sh_max_num_groups;
620
+ }
621
+
622
+ if (sh_first_group_id + sh_num_groups > num_groups) {
623
+ sh_num_groups = num_groups - sh_first_group_id;
624
+ }
625
+
626
+ int row_offset = first_group_id * s_gl_stride;
627
+
628
+ if (is_async) {
629
+ for (int i = 0; i < sh_num_groups; i++) {
630
+ if (threadIdx.x < s_sh_stride) {
631
+ cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
632
+ &scales_ptr[row_offset + (i * s_gl_stride) +
633
+ slice_n_offset + threadIdx.x]);
634
+ }
635
+ }
636
+ } else {
637
+ for (int i = 0; i < sh_num_groups; i++) {
638
+ if (threadIdx.x < s_sh_stride) {
639
+ sh_s[(i * s_sh_stride) + threadIdx.x] =
640
+ scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
641
+ threadIdx.x];
642
+ }
643
+ }
644
+ }
645
+ };
646
+ // Asynchronously fetch the next A, B and s tile from global to the next
647
+ // shared memory pipeline location.
648
+ auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
649
+ if (pred) {
650
+ int4* sh_a_stage = sh_a + a_sh_stage * pipe;
651
+ #pragma unroll
652
+ for (int i = 0; i < a_sh_wr_iters; i++) {
653
+ int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off;
654
+ int row = a_idx / a_gl_stride;
655
+ int sorted_row =
656
+ replicate_input ? sorted_ids[row] / topk : sorted_ids[row];
657
+ int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride;
658
+ if (sorted_row < tot_m * (replicate_input ? 1 : topk) &&
659
+ new_idx < a_gl_stride * tot_m * (replicate_input ? 1 : topk)) {
660
+ cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[new_idx],
661
+ a_sh_wr_pred[i]);
662
+ }
663
+ }
664
+ int4* sh_b_stage = sh_b + b_sh_stage * pipe;
665
+ #pragma unroll
666
+ for (int i = 0; i < b_sh_wr_iters; i++) {
667
+ #pragma unroll
668
+ for (int j = 0; j < b_thread_vecs; j++) {
669
+ cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
670
+ }
671
+ B_ptr[i] += b_gl_rd_delta_o;
672
+ }
673
+
674
+ if constexpr (has_act_order) {
675
+ // Fetch g_idx thread-block portion
676
+ int full_pipe = a_off;
677
+ int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
678
+ if (cur_k < prob_k && cur_k < slice_k_finish) {
679
+ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
680
+
681
+ int4 const* cur_g_idx_stage_ptr =
682
+ reinterpret_cast<int4 const*>(&g_idx[cur_k]);
683
+
684
+ if (threadIdx.x < g_idx_stage) {
685
+ cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
686
+ &cur_g_idx_stage_ptr[threadIdx.x]);
687
+ }
688
+ }
689
+ } else {
690
+ if constexpr (group_blocks != -1) {
691
+ int4* sh_s_stage = sh_s + s_sh_stage * pipe;
692
+
693
+ if constexpr (group_blocks >= thread_k_blocks) {
694
+ // Only fetch scales if this tile starts a new group
695
+ if (pipe % (group_blocks / thread_k_blocks) == 0) {
696
+ if (s_sh_wr_pred) {
697
+ cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
698
+ }
699
+ s_gl_rd += s_gl_rd_delta;
700
+ }
701
+ } else {
702
+ for (int i = 0; i < s_tb_groups; i++) {
703
+ if (s_sh_wr_pred) {
704
+ cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
705
+ &scales_ptr[s_gl_rd]);
706
+ }
707
+ s_gl_rd += s_gl_rd_delta;
708
+ }
709
+ }
710
+ }
711
+
712
+ if constexpr (has_zp && group_blocks != -1) {
713
+ int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
714
+
715
+ if constexpr (group_blocks >= thread_k_blocks) {
716
+ // Only fetch zero-points if this tile starts a new group
717
+ if (pipe % (group_blocks / thread_k_blocks) == 0) {
718
+ if (zp_sh_wr_pred) {
719
+ cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
720
+ }
721
+ zp_gl_rd += zp_gl_rd_delta;
722
+ }
723
+ } else {
724
+ for (int i = 0; i < zp_tb_groups; i++) {
725
+ if (zp_sh_wr_pred) {
726
+ cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr],
727
+ &zp_ptr[zp_gl_rd]);
728
+ }
729
+ zp_gl_rd += zp_gl_rd_delta;
730
+ }
731
+ }
732
+ }
733
+ }
734
+ }
735
+ // Insert a fence even when we are winding down the pipeline to ensure that
736
+ // waiting is also correct at this point.
737
+ cp_async_fence();
738
+ };
739
+
740
+ auto fetch_zp_to_shared = [&]() {
741
+ if (zp_sh_wr_pred) {
742
+ cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]);
743
+ }
744
+ };
745
+
746
+ // Wait until the next thread tile has been loaded to shared memory.
747
+ auto wait_for_stage = [&]() {
748
+ // We only have `stages - 2` active fetches since we are double buffering
749
+ // and can only issue the next fetch when it is guaranteed that the previous
750
+ // shared memory load is fully complete (as it may otherwise be
751
+ // overwritten).
752
+ cp_async_wait<stages - 2>();
753
+ __syncthreads();
754
+ };
755
+
756
+ // Load the next sub-tile from the current location in the shared memory pipe
757
+ // into the current register buffer.
758
+ auto fetch_to_registers = [&](int k, int pipe) {
759
+ int4* sh_a_stage = sh_a + a_sh_stage * pipe;
760
+ #pragma unroll
761
+ for (int i = 0; i < thread_m_blocks; i++)
762
+ ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
763
+ int4* sh_b_stage = sh_b + b_sh_stage * pipe;
764
+
765
+ #pragma unroll
766
+ for (int i = 0; i < b_thread_vecs; i++) {
767
+ frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
768
+ &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
769
+ }
770
+ };
771
+
772
+ bool is_same_group[stages];
773
+ int same_group_id[stages];
774
+
775
+ auto init_same_group = [&](int pipe) {
776
+ if constexpr (!has_act_order) {
777
+ is_same_group[pipe] = false;
778
+ same_group_id[pipe] = 0;
779
+ return;
780
+ }
781
+
782
+ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
783
+ int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
784
+
785
+ int group_id_1 = sh_g_idx_int_ptr[0];
786
+ int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];
787
+
788
+ is_same_group[pipe] = group_id_1 == group_id_2;
789
+ same_group_id[pipe] = group_id_1;
790
+ };
791
+
792
+ auto fetch_scales_to_registers = [&](int k, int full_pipe) {
793
+ int pipe = full_pipe % stages;
794
+
795
+ if constexpr (!has_act_order) {
796
+ // No act-order case
797
+ if constexpr (group_blocks != -1) {
798
+ if constexpr (group_blocks >= thread_k_blocks) {
799
+ int4* sh_s_stage =
800
+ sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
801
+ (pipe / (group_blocks / thread_k_blocks)));
802
+ reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
803
+ } else {
804
+ int warp_id = threadIdx.x / 32;
805
+ int n_warps = thread_n_blocks / 4;
806
+
807
+ int warp_row = warp_id / n_warps;
808
+
809
+ int cur_k = warp_row * 16;
810
+ cur_k += k_iter_size * (k % b_sh_wr_iters);
811
+
812
+ int k_blocks = cur_k / 16;
813
+ int cur_group_id = k_blocks / group_blocks;
814
+
815
+ int4* sh_s_stage = sh_s + s_sh_stage * pipe;
816
+
817
+ reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
818
+ sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
819
+ }
820
+ }
821
+
822
+ return;
823
+ }
824
+
825
+ // Act-order case
826
+
827
+ // Determine K of the "current" thread-block
828
+ int cur_k = slice_k_start + tb_k * full_pipe;
829
+ if (cur_k >= prob_k || cur_k >= slice_k_finish) {
830
+ return;
831
+ }
832
+
833
+ // Reset (to current thread-block) since we read g_idx portion from the
834
+ // shared memory
835
+ cur_k = 0;
836
+
837
+ // Progress to current iteration
838
+ cur_k += k_iter_size * (k % b_sh_wr_iters);
839
+
840
+ // Determine "position" inside the thread-block (based on warp and
841
+ // thread-id)
842
+ int warp_id = threadIdx.x / 32;
843
+ int n_warps =
844
+ thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N
845
+
846
+ int warp_row = warp_id / n_warps;
847
+ int warp_col = warp_id % n_warps;
848
+
849
+ cur_k += warp_row * 16;
850
+
851
+ int th_id = threadIdx.x % 32;
852
+ cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix
853
+
854
+ int s_col_shift =
855
+ /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
856
+ (th_id / 4) * act_s_col_stride;
857
+
858
+ if (is_same_group[pipe]) {
859
+ if (k % 2 == 0) {
860
+ *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
861
+ sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
862
+ s_col_shift];
863
+ } else {
864
+ *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
865
+ *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
866
+ }
867
+
868
+ for (int i = 1; i < 4; i++) {
869
+ *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
870
+ *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
871
+ }
872
+ return;
873
+ }
874
+
875
+ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
876
+ int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
877
+
878
+ constexpr int k_frag_offsets[4] = {0, 1, 8,
879
+ 9}; // Tensor core offsets per thread
880
+
881
+ #pragma unroll
882
+ for (int i = 0; i < 4; i++) {
883
+ int actual_k = cur_k + k_frag_offsets[i];
884
+
885
+ int group_id = sh_g_idx_int_ptr[actual_k];
886
+ int rel_group_id = group_id - sh_first_group_id;
887
+
888
+ *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
889
+ sh_s[rel_group_id * s_sh_stride + s_col_shift];
890
+ }
891
+ };
892
+
893
+ auto fetch_zp_to_registers = [&](int k, int full_pipe) {
894
+ // This code does not handle group_blocks == 0,
895
+ // which signifies act_order.
896
+ // has_zp implies AWQ, which doesn't have act_order.
897
+ static_assert(!has_zp || group_blocks != 0);
898
+
899
+ if constexpr (has_zp) {
900
+ int pipe = full_pipe % stages;
901
+
902
+ if constexpr (group_blocks == -1) {
903
+ for (int i = 0; i < num_ints_per_thread; i++) {
904
+ frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp))[zp_sh_rd + i];
905
+ }
906
+
907
+ } else if constexpr (group_blocks >= thread_k_blocks) {
908
+ int4* sh_zp_stage =
909
+ sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) *
910
+ (pipe / (group_blocks / thread_k_blocks)));
911
+ for (int i = 0; i < num_ints_per_thread; i++) {
912
+ frag_qzp[k % 2][i] =
913
+ (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
914
+ }
915
+ } else {
916
+ int warp_id = threadIdx.x / 32;
917
+ int n_warps = thread_n_blocks / 4;
918
+
919
+ int warp_row = warp_id / n_warps;
920
+
921
+ int cur_k = warp_row * 16;
922
+ cur_k += k_iter_size * (k % b_sh_wr_iters);
923
+
924
+ int k_blocks = cur_k / 16;
925
+ int cur_group_id = 0;
926
+
927
+ // Suppress bogus and persistent divide-by-zero warning
928
+ #pragma nv_diagnostic push
929
+ #pragma nv_diag_suppress divide_by_zero
930
+ cur_group_id = k_blocks / group_blocks;
931
+ #pragma nv_diagnostic pop
932
+
933
+ int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
934
+
935
+ sh_zp_stage += cur_group_id * zp_sh_stride;
936
+
937
+ for (int i = 0; i < num_ints_per_thread; i++) {
938
+ frag_qzp[k % 2][i] =
939
+ (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
940
+ }
941
+ }
942
+ }
943
+ };
944
+
945
+ // Execute the actual tensor core matmul of a sub-tile.
946
+ auto matmul = [&](int k) {
947
+ if constexpr (has_zp) {
948
+ FragB frag_zp_0;
949
+ FragB frag_zp_1;
950
+ int zp_quant_0, zp_quant_1;
951
+
952
+ if constexpr (w_type.size_bits() == 4) {
953
+ zp_quant_0 = frag_qzp[k % 2][0];
954
+ zp_quant_1 = zp_quant_0 >> 8;
955
+ } else {
956
+ static_assert(w_type.size_bits() == 8);
957
+ zp_quant_0 = frag_qzp[k % 2][0];
958
+ zp_quant_1 = frag_qzp[k % 2][1];
959
+ }
960
+
961
+ frag_zp_0 = dequant<w_type_id>(zp_quant_0);
962
+ frag_zp_1 = dequant<w_type_id>(zp_quant_1);
963
+
964
+ frag_zp[0] = frag_zp_0[0];
965
+ frag_zp[1] = frag_zp_0[1];
966
+ frag_zp[2] = frag_zp_1[0];
967
+ frag_zp[3] = frag_zp_1[1];
968
+ }
969
+
970
+ // We have the m dimension as the inner loop in order to encourage overlapping
971
+ // dequantization and matmul operations.
972
+ #pragma unroll
973
+ for (int j = 0; j < 4; j++) {
974
+ int b_quant_0, b_quant_1;
975
+ if constexpr (w_type.size_bits() == 4) {
976
+ b_quant_0 = frag_b_quant[k % 2][0][j];
977
+ b_quant_1 = b_quant_0 >> 8;
978
+ } else {
979
+ static_assert(w_type.size_bits() == 8);
980
+ int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
981
+ b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
982
+ b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
983
+ }
984
+
985
+ FragB frag_b0 = dequant<w_type_id>(b_quant_0);
986
+ FragB frag_b1 = dequant<w_type_id>(b_quant_1);
987
+ // Apply zero-point to frag_b0
988
+ if constexpr (has_zp) {
989
+ sub_zp(frag_b0, frag_zp[j], 0);
990
+ }
991
+
992
+ // Apply scale to frag_b0
993
+ if constexpr (has_act_order) {
994
+ scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
995
+ act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0);
996
+ } else {
997
+ if constexpr (group_blocks != -1) {
998
+ scale(frag_b0, frag_s[k % 2][j], 0);
999
+ }
1000
+ }
1001
+
1002
+ // Apply zero-point to frag_b1
1003
+ if constexpr (has_zp) {
1004
+ sub_zp(frag_b1, frag_zp[j], 1);
1005
+ }
1006
+
1007
+ // Apply scale to frag_b1
1008
+ if constexpr (has_act_order) {
1009
+ scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
1010
+ act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1);
1011
+
1012
+ } else {
1013
+ if constexpr (group_blocks != -1) {
1014
+ scale(frag_b1, frag_s[k % 2][j], 1);
1015
+ }
1016
+ }
1017
+
1018
+ #pragma unroll
1019
+ for (int i = 0; i < thread_m_blocks; i++) {
1020
+ mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
1021
+ mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
1022
+ }
1023
+ }
1024
+ };
1025
+
1026
+ // Since we slice across the k dimension of a tile in order to increase the
1027
+ // number of warps while keeping the n dimension of a tile reasonable, we have
1028
+ // multiple warps that accumulate their partial sums of the same output
1029
+ // location, which we have to reduce over in the end. We do this in shared memory.
1030
+ auto thread_block_reduce = [&]() {
1031
+ constexpr int red_off = threads / b_sh_stride_threads / 2;
1032
+ if (red_off >= 1) {
1033
+ int red_idx = threadIdx.x / b_sh_stride_threads;
1034
+ constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
1035
+ constexpr int red_sh_delta = b_sh_stride_threads;
1036
+ int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
1037
+ (threadIdx.x % b_sh_stride_threads);
1038
+
1039
+ // Parallel logarithmic shared memory reduction. We make sure to avoid any
1040
+ // unnecessary read or write iterations, e.g., for two warps we write only
1041
+ // once by warp 1 and read only once by warp 0.
1042
+
1043
+ #pragma unroll
1044
+ for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
1045
+ #pragma unroll
1046
+ for (int i = red_off; i > 0; i /= 2) {
1047
+ if (i <= red_idx && red_idx < 2 * i) {
1048
+ #pragma unroll
1049
+ for (int j = 0; j < 4 * 2; j++) {
1050
+ int red_sh_wr =
1051
+ red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
1052
+ if (i < red_off) {
1053
+ float* c_rd =
1054
+ reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
1055
+ float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
1056
+ #pragma unroll
1057
+ for (int k = 0; k < 4; k++)
1058
+ reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
1059
+ c_rd[k] + c_wr[k];
1060
+ }
1061
+ sh[red_sh_wr] =
1062
+ reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
1063
+ }
1064
+ }
1065
+ __syncthreads();
1066
+ }
1067
+ if (red_idx == 0) {
1068
+ #pragma unroll
1069
+ for (int i = 0; i < 4 * 2; i++) {
1070
+ float* c_rd =
1071
+ reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
1072
+ #pragma unroll
1073
+ for (int j = 0; j < 4; j++)
1074
+ reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
1075
+ c_rd[j];
1076
+ }
1077
+ }
1078
+ __syncthreads();
1079
+ }
1080
+ }
1081
+ };
1082
+
1083
+ // Since multiple threadblocks may process parts of the same column slice, we
1084
+ // finally have to globally reduce over the results. As the striped
1085
+ // partitioning minimizes the number of such reductions and our outputs are
1086
+ // usually rather small, we perform this reduction serially in L2 cache.
1087
+ auto global_reduce = [&](bool first = false, bool last = false) {
1088
+ // We are very careful here to reduce directly in the output buffer to
1089
+ // maximize L2 cache utilization in this step. To do this, we write out
1090
+ // results in FP16 (but still reduce with FP32 compute).
1091
+ constexpr int active_threads = 32 * thread_n_blocks / 4;
1092
+ if (threadIdx.x < active_threads) {
1093
+ int c_gl_stride = prob_n / 8;
1094
+ int c_gl_wr_delta_o = 8 * c_gl_stride;
1095
+ int c_gl_wr_delta_i = 4 * (active_threads / 32);
1096
+ int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
1097
+ 4 * (threadIdx.x / 32) + threadIdx.x % 4;
1098
+ c_gl_wr += (2 * thread_n_blocks) * slice_col;
1099
+ constexpr int c_sh_wr_delta = active_threads;
1100
+ int c_sh_wr = threadIdx.x;
1101
+
1102
+ int row = (threadIdx.x % 32) / 4;
1103
+
1104
+ if (!first) {
1105
+ // Interestingly, doing direct global accesses here really seems to mess up
1106
+ // the compiler and lead to slowdowns, hence we also use async-copies even
1107
+ // though these fetches are not actually asynchronous.
1108
+ #pragma unroll
1109
+ for (int i = 0; i < thread_m_blocks * 4; i++) {
1110
+ int c_idx =
1111
+ c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2);
1112
+ int sorted_row = sorted_ids[c_idx / c_gl_stride];
1113
+ int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride;
1114
+ cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx],
1115
+ sorted_row < tot_m * topk &&
1116
+ (8 * (i / 2) + row < prob_m &&
1117
+ (i < (thread_m_blocks - 1) * 4 ||
1118
+ sorted_ids[8 * (i / 2) + row] < tot_m * topk)));
1119
+ }
1120
+ cp_async_fence();
1121
+ cp_async_wait<0>();
1122
+ }
1123
+
1124
+ #pragma unroll
1125
+ for (int i = 0; i < thread_m_blocks * 4; i++) {
1126
+ if (8 * (i / 2) + row < prob_m &&
1127
+ (i < (thread_m_blocks - 1) * 4 ||
1128
+ sorted_ids[8 * (i / 2) + row] < tot_m * topk)) {
1129
+ if (!first) {
1130
+ int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
1131
+ #pragma unroll
1132
+ for (int j = 0; j < 2 * 4; j++) {
1133
+ reinterpret_cast<float*>(
1134
+ &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
1135
+ __half2float(reinterpret_cast<__half*>(&c_red)[j]);
1136
+ }
1137
+ }
1138
+ if (!last) {
1139
+ int4 c;
1140
+ #pragma unroll
1141
+ for (int j = 0; j < 2 * 4; j++) {
1142
+ reinterpret_cast<__half*>(&c)[j] =
1143
+ __float2half(reinterpret_cast<float*>(
1144
+ &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
1145
+ }
1146
+ int c_idx =
1147
+ c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2);
1148
+ int row = sorted_ids[c_idx / c_gl_stride];
1149
+ if (row < tot_m * topk) {
1150
+ int new_idx = row * c_gl_stride + c_idx % c_gl_stride;
1151
+ C[new_idx] = c;
1152
+ }
1153
+ }
1154
+ }
1155
+ }
1156
+ }
1157
+ };
1158
+
1159
+ // Write out the reduced final result in the correct layout. We only actually
1160
+ // reshuffle matrix fragments in this step; the reduction above is performed
1161
+ // in fragment layout.
1162
+ auto write_result = [&]() {
1163
+ int c_gl_stride = prob_n / 8;
1164
+ constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
1165
+ int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
1166
+ constexpr int c_sh_rd_delta =
1167
+ c_sh_stride * (threads / (2 * thread_n_blocks));
1168
+
1169
+ int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
1170
+ (threadIdx.x % (2 * thread_n_blocks));
1171
+ c_gl_wr += (2 * thread_n_blocks) * slice_col;
1172
+ int c_sh_wr =
1173
+ (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
1174
+ c_sh_wr += 32 * (threadIdx.x / 32);
1175
+ int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
1176
+ (threadIdx.x % (2 * thread_n_blocks));
1177
+
1178
+ int c_gl_wr_end = c_gl_stride * prob_m;
1179
+
1180
+ // We first reorder in shared memory to guarantee the most efficient final
1181
+ // global write patterns
1182
+ auto write = [&](int idx, float c0, float c1, FragS& s) {
1183
+ half2 res = __halves2half2(__float2half(c0), __float2half(c1));
1184
+
1185
+ // For per-column quantization we finally apply the scale here (only for
1186
+ // 4-bit)
1187
+ if constexpr (!has_act_order && group_blocks == -1 &&
1188
+ w_type.size_bits() == 4) {
1189
+ res = __hmul2(res, s[0]);
1190
+ }
1191
+
1192
+ ((half2*)sh)[idx] = res;
1193
+ };
1194
+ if (threadIdx.x / 32 < thread_n_blocks / 4) {
1195
+ #pragma unroll
1196
+ for (int i = 0; i < thread_m_blocks; i++) {
1197
+ #pragma unroll
1198
+ for (int j = 0; j < 4; j++) {
1199
+ int wr = c_sh_wr + 8 * j;
1200
+ write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
1201
+ frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
1202
+ write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
1203
+ frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
1204
+ write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
1205
+ frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
1206
+ write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
1207
+ frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
1208
+ }
1209
+ c_sh_wr += 16 * (4 * c_sh_stride);
1210
+ }
1211
+ }
1212
+ __syncthreads();
1213
+
1214
+ #pragma unroll
1215
+ for (int i = 0;
1216
+ i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
1217
+ i++) {
1218
+ if (c_gl_wr < c_gl_wr_end) {
1219
+ int row = sorted_ids[c_gl_wr / c_gl_stride];
1220
+ if (row < tot_m * topk) {
1221
+ int off = row * c_gl_stride + c_gl_wr % c_gl_stride;
1222
+ if (!apply_weights) {
1223
+ C[off] = sh[c_sh_rd];
1224
+ } else {
1225
+ __half* ctrg = reinterpret_cast<__half*>(&C[off]);
1226
+ __half* csrc = reinterpret_cast<__half*>(&sh[c_sh_rd]);
1227
+ for (int j = 0; j < 8; ++j) {
1228
+ ctrg[j] = __float2half(topk_weights[row] * __half2float(csrc[j]));
1229
+ }
1230
+ }
1231
+ c_gl_wr += c_gl_wr_delta;
1232
+ c_sh_rd += c_sh_rd_delta;
1233
+ }
1234
+ }
1235
+ }
1236
+ };
1237
+
1238
+ // Start global fetch and register load pipelines.
1239
+ auto start_pipes = [&]() {
1240
+
1241
+ #pragma unroll
1242
+ for (int i = 0; i < stages - 1; i++) {
1243
+ if (has_act_order && i == 0) {
1244
+ int last_g_idx = slice_k_start + stages * tb_k * 2;
1245
+ if (last_g_idx >= prob_k) {
1246
+ last_g_idx = prob_k - 1;
1247
+ }
1248
+ fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
1249
+ }
1250
+
1251
+ if constexpr (has_zp && group_blocks == -1) {
1252
+ if (i == 0) {
1253
+ fetch_zp_to_shared();
1254
+ }
1255
+ }
1256
+ fetch_to_shared(i, i, i < slice_iters);
1257
+ }
1258
+
1259
+ zero_accums();
1260
+ wait_for_stage();
1261
+ init_same_group(0);
1262
+ fetch_to_registers(0, 0);
1263
+ fetch_scales_to_registers(0, 0);
1264
+ fetch_zp_to_registers(0, 0);
1265
+ a_gl_rd += a_gl_rd_delta_o * (stages - 1);
1266
+ slice_k_start_shared_fetch += tb_k * (stages - 1);
1267
+ };
1268
+ if (slice_iters) {
1269
+ start_pipes();
1270
+ }
1271
+
1272
+ // Main loop.
1273
+ while (slice_iters) {
1274
+ // We unroll over both the global fetch and the register load pipeline to
1275
+ // ensure all shared memory accesses are static. Note that both pipelines
1276
+ // have even length meaning that the next iteration will always start at
1277
+ // index 0.
1278
+ #pragma unroll
1279
+ for (int pipe = 0; pipe < stages;) {
1280
+ #pragma unroll
1281
+ for (int k = 0; k < b_sh_wr_iters; k++) {
1282
+ fetch_to_registers(k + 1, pipe % stages);
1283
+ fetch_scales_to_registers(k + 1, pipe);
1284
+ fetch_zp_to_registers(k + 1, pipe);
1285
+ if (k == b_sh_wr_iters - 2) {
1286
+ fetch_to_shared((pipe + stages - 1) % stages, pipe,
1287
+ slice_iters >= stages);
1288
+ pipe++;
1289
+ wait_for_stage();
1290
+ init_same_group(pipe % stages);
1291
+ }
1292
+ matmul(k);
1293
+ }
1294
+ slice_iters--;
1295
+ if (slice_iters == 0) {
1296
+ break;
1297
+ }
1298
+ }
1299
+
1300
+ a_gl_rd += a_gl_rd_delta_o * stages;
1301
+ slice_k_start += tb_k * stages;
1302
+ slice_k_start_shared_fetch += tb_k * stages;
1303
+
1304
+ if constexpr (has_act_order) {
1305
+ int first_group_id = g_idx[slice_k_start];
1306
+ int last_g_idx = slice_k_start + stages * tb_k * 2;
1307
+ if (last_g_idx >= prob_k) {
1308
+ last_g_idx = prob_k - 1;
1309
+ }
1310
+ int last_group_id = g_idx[last_g_idx];
1311
+ if (last_group_id >= sh_first_group_id + sh_num_groups) {
1312
+ fetch_scales_to_shared(false, first_group_id, last_group_id);
1313
+ __syncthreads();
1314
+ }
1315
+ }
1316
+
1317
+ // Process results and, if necessary, proceed to the next column slice.
1318
+ // While this pattern may not be the most readable, other ways of writing
1319
+ // the loop seemed to result in noticeably worse performance after compilation.
1320
+ if (slice_iters == 0) {
1321
+ cp_async_wait<0>();
1322
+ bool last = slice_idx == slice_count - 1;
1323
+ if constexpr (!has_act_order && group_blocks == -1) {
1324
+ if constexpr (w_type.size_bits() == 8) {
1325
+ if (s_sh_wr_pred) {
1326
+ cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
1327
+ }
1328
+ cp_async_fence();
1329
+ } else {
1330
+ // For 4-bit per-column scales, we only fetch them here in the
1331
+ // final step before write-out
1332
+ if (last) {
1333
+ if (s_sh_wr_pred) {
1334
+ cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
1335
+ }
1336
+ cp_async_fence();
1337
+ }
1338
+ }
1339
+ }
1340
+
1341
+ thread_block_reduce();
1342
+ if constexpr (!has_act_order && group_blocks == -1) {
1343
+ if constexpr (w_type.size_bits() == 8) {
1344
+ cp_async_wait<0>();
1345
+ __syncthreads();
1346
+ if (threadIdx.x / 32 < thread_n_blocks / 4) {
1347
+ reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
1348
+ reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
1349
+ }
1350
+
1351
+ } else {
1352
+ if (last) {
1353
+ cp_async_wait<0>();
1354
+ __syncthreads();
1355
+ if (threadIdx.x / 32 < thread_n_blocks / 4) {
1356
+ reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
1357
+ reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
1358
+ }
1359
+ }
1360
+ }
1361
+ }
1362
+
1363
+ // For 8-bit channelwise, we apply the scale before the global reduction
1364
+ // that converts the fp32 results to fp16 (so that we avoid possible
1365
+ // overflow in fp16)
1366
+ if constexpr (!has_act_order && group_blocks == -1 &&
1367
+ w_type.size_bits() == 8) {
1368
+ if (threadIdx.x / 32 < thread_n_blocks / 4) {
1369
+ #pragma unroll
1370
+ for (int i = 0; i < thread_m_blocks; i++) {
1371
+ #pragma unroll
1372
+ for (int j = 0; j < 4; j++) {
1373
+ scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
1374
+ frag_s[j / 2][2 * (j % 2) + 0]);
1375
+ scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
1376
+ frag_s[j / 2][2 * (j % 2) + 0]);
1377
+
1378
+ scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
1379
+ frag_s[j / 2][2 * (j % 2) + 1]);
1380
+ scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
1381
+ frag_s[j / 2][2 * (j % 2) + 1]);
1382
+ }
1383
+ }
1384
+ }
1385
+ }
1386
+
1387
+ if (slice_count > 1) { // only globally reduce if there is more than one
1388
+ // block in a slice
1389
+ barrier_acquire(&locks[slice_col], slice_idx);
1390
+ global_reduce(slice_idx == 0, last);
1391
+ barrier_release(&locks[slice_col], last);
1392
+ }
1393
+ if (last) // only the last block in a slice actually writes the result
1394
+ write_result();
1395
+ slice_row = 0;
1396
+ slice_col_par++;
1397
+ slice_col++;
1398
+ init_slice();
1399
+ if (slice_iters) {
1400
+ a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
1401
+ (threadIdx.x % a_gl_rd_delta_o);
1402
+ #pragma unroll
1403
+ for (int i = 0; i < b_sh_wr_iters; i++)
1404
+ B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
1405
+ if (slice_col == 0) {
1406
+ #pragma unroll
1407
+ for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
1408
+ }
1409
+
1410
+ // Update slice k/n for scales loading
1411
+ if constexpr (has_act_order) {
1412
+ slice_k_start = tb_k * slice_row;
1413
+ slice_k_finish = slice_k_start + tb_k * slice_iters;
1414
+ slice_k_start_shared_fetch = slice_k_start;
1415
+ slice_n_offset = act_s_col_tb_stride * slice_col;
1416
+
1417
+ } else {
1418
+ s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
1419
+ zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
1420
+ }
1421
+
1422
+ start_pipes();
1423
+ }
1424
+ }
1425
+ }
1426
+ }
1427
+
1428
+ template <const vllm::ScalarTypeId w_type_id, // weight ScalarType id
1429
+ const int threads, // number of threads in a threadblock
1430
+ const int thread_n_blocks, // same for n dimension (output)
1431
+ const int thread_k_blocks, // same for k dimension (reduction)
1432
+ const int stages, // number of stages for the async global->shared
1433
+ // fetch pipeline
1434
+ const bool has_act_order, // whether act_order is enabled
1435
+ const bool has_zp, // whether zero-points are enabled
1436
+ const int group_blocks = -1 // number of consecutive 16x16 blocks
1437
+ // with a separate quantization scale
1438
+ >
1439
+ __global__ void MarlinMoE(
1440
+ const int4* __restrict__ A, // fp16 input matrix of shape mxk
1441
+ const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
1442
+ int4* __restrict__ C, // fp16 output buffer of shape mxn
1443
+ const int* __restrict__ sorted_ids_base, // int32 sorted ids of experts
1444
+ const float* __restrict__ topk_weights, // float topk weights
1445
+ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
1446
+ // (k/groupsize)xn
1447
+ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape
1448
+ // (k/groupsize)x(n/pack_factor)
1449
+ const int* __restrict__ g_idx, // int32 group indices of shape k
1450
+ const int* __restrict__ expert_offsets,
1451
+ int num_groups, // number of scale groups per output channel
1452
+ int expert_idx, // idx of current expert
1453
+ int num_experts, // number of experts
1454
+ int topk, // topk parameter of moe
1455
+ int prob_m, // batch dimension m
1456
+ int prob_n, // output dimension n
1457
+ int prob_k, // reduction dimension k
1458
+ int tot_m, // total number of rows in A and C
1459
+ int* locks, // extra global storage for barrier synchronization
1460
+ bool replicate_input, // do we use the same input for each expert?
1461
+ bool apply_weights, // apply weights to output
1462
+ int current_m_block, // current m block to start kernel computation from
1463
+ int max_par, // maximum parallelism
1464
+ int cfg_max_m_blocks // upper bound on m blocks
1465
+ ) {
1466
+ int m_block_ctr = current_m_block;
1467
+
1468
+ const int* sorted_ids_expert =
1469
+ sorted_ids_base + expert_offsets[expert_idx] + m_block_ctr * 4 * max_par;
1470
+ int tot_its = expert_offsets[expert_idx + 1] - expert_offsets[expert_idx];
1471
+ if (tot_its == 0) {
1472
+ return;
1473
+ }
1474
+ int tot_m_blocks = ceildiv(tot_its, 16);
1475
+ int pad = 16 * tot_m_blocks - tot_its;
1476
+
1477
+ if (m_block_ctr >= tot_m_blocks) {
1478
+ return;
1479
+ }
1480
+
1481
+ int max_block = tot_m_blocks - m_block_ctr;
1482
+ prob_m = tot_its - 16 * m_block_ctr;
1483
+
1484
+ int par = 1;
1485
+ if (max_block > cfg_max_m_blocks) {
1486
+ // Note that parallel > 1 currently only works for inputs without any
1487
+ // padding
1488
+ par = (16 * max_block - pad) / (16 * cfg_max_m_blocks);
1489
+ if (par > max_par) par = max_par;
1490
+ prob_m = (16 * cfg_max_m_blocks) * par;
1491
+ m_block_ctr += cfg_max_m_blocks * (par - 1);
1492
+ max_block = cfg_max_m_blocks;
1493
+ }
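+ // The number of m blocks handled per threadblock is a compile-time template
+ // parameter of MarlinMoESingle, so the runtime max_block value (1..4) is
+ // dispatched to the matching instantiation below.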
1494
+
1495
+ if (max_block == 1) {
1496
+ MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
1497
+ stages, has_act_order, has_zp, group_blocks>(
1498
+ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
1499
+ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
1500
+ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
1501
+ current_m_block);
1502
+ } else if (max_block == 2) {
1503
+ MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
1504
+ stages, has_act_order, has_zp, group_blocks>(
1505
+ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
1506
+ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
1507
+ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
1508
+ current_m_block);
1509
+ } else if (max_block == 3) {
1510
+ MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
1511
+ stages, has_act_order, has_zp, group_blocks>(
1512
+ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
1513
+ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
1514
+ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
1515
+ current_m_block);
1516
+ } else {
1517
+ MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
1518
+ stages, has_act_order, has_zp, group_blocks>(
1519
+ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
1520
+ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
1521
+ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
1522
+ current_m_block);
1523
+ }
1524
+ }
1525
+
1526
+ #else
1527
+
1528
+ template <const vllm::ScalarTypeId w_type_id, // weight ScalarType id
1529
+ const int threads, // number of threads in a threadblock
1530
+ const int thread_n_blocks, // same for n dimension (output)
1531
+ const int thread_k_blocks, // same for k dimension (reduction)
1532
+ const int stages, // number of stages for the async global->shared
1533
+ // fetch pipeline
1534
+ const bool has_act_order, // whether act_order is enabled
1535
+ const bool has_zp, // whether zero-points are enabled
1536
+ const int group_blocks = -1 // number of consecutive 16x16 blocks
1537
+ // with a separate quantization scale
1538
+ >
1539
+ __global__ void MarlinMoE(
1540
+ const int4* __restrict__ A, // fp16 input matrix of shape mxk
1541
+ const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
1542
+ int4* __restrict__ C, // fp16 output buffer of shape mxn
1543
+ const int* __restrict__ sorted_ids, // int32 sorted ids of experts
1544
+ const float* __restrict__ topk_weights, // float topk weights
1545
+ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
1546
+ // (k/groupsize)xn
1547
+ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape
1548
+ // (k/groupsize)x(n/pack_factor)
1549
+ const int* __restrict__ g_idx, // int32 group indices of shape k
1550
+ const int* __restrict__ expert_offsets,
1551
+ int num_groups, // number of scale groups per output channel
1552
+ int expert_idx, // idx of current expert
1553
+ int num_experts, // number of experts
1554
+ int topk, // topk parameter of moe
1555
+ int prob_m, // batch dimension m
1556
+ int prob_n, // output dimension n
1557
+ int prob_k, // reduction dimension k
1558
+ int tot_m, // total number of rows in A and C
1559
+ int* locks, // extra global storage for barrier synchronization
1560
+ bool replicate_input, // do we use the same input for each expert?
1561
+ bool apply_weights, // apply weights to output
1562
+ int current_m_block, // current m block to start kernel computation from
1563
+ int max_par, // maximum parallelism
1564
+ int cfg_max_m_blocks // upper bound on m blocks
1565
+ ) {
1566
+ // Marlin is not implemented yet for SM < 8.0
1567
+ assert(false);
1568
+ return;
1569
+ }
1570
+
1571
+ #endif
1572
+
1573
+ // 8 warps are a good choice since every SM has 4 schedulers and having more
1574
+ // than 1 warp per scheduler allows some more latency hiding. At the same time,
1575
+ // we want relatively few warps to have many registers per warp and small tiles.
1576
+ const int USER_THREADS =
1577
+ 256; // Note: This is only used with user-provided thread_k/n
1578
+ const int STAGES = 4; // 4 pipeline stages fit into shared memory
1579
+
1580
+ static constexpr int min_thread_n = 64;
1581
+ static constexpr int min_thread_k = 64;
1582
+
1583
+ #define __CALL_IF_MOE(W_TYPE, THREAD_N_BLOCKS, THREAD_K_BLOCKS, HAS_ACT_ORDER, \
1584
+ HAS_ZP, GROUP_BLOCKS, NUM_THREADS) \
1585
+ else if (q_type == W_TYPE && thread_n_blocks == THREAD_N_BLOCKS && \
1586
+ thread_k_blocks == THREAD_K_BLOCKS && \
1587
+ has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \
1588
+ group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \
1589
+ cudaFuncSetAttribute( \
1590
+ MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
1591
+ STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS>, \
1592
+ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
1593
+ MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
1594
+ STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS> \
1595
+ <<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
1596
+ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \
1597
+ zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \
1598
+ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \
1599
+ replicate_input, apply_weights, m_block, max_par, \
1600
+ cfg_max_m_blocks); \
1601
+ }
1602
+
1603
+ #define GPTQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
1604
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \
1605
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
1606
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \
1607
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \
1608
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)
1609
+
1610
+ #define AWQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
1611
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
1612
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \
1613
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \
1614
+ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)
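+ // Each __CALL_IF_MOE expansion is an `else if` that matches one combination
+ // of weight type, tile shape, act-order/zero-point mode and group size, sets
+ // the kernel's maximum dynamic shared memory and launches the corresponding
+ // MarlinMoE instantiation. The GPTQ/AWQ chains above are intended to follow
+ // an initial `if (false) {}`, as done in the call_marlin_moe_kernel_* files.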
1615
+
1616
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku4.cu ADDED
@@ -0,0 +1,31 @@
1
+ #include "marlin_moe_kernel_ku4.h"
2
+
3
+ namespace marlin_moe {
4
+
5
+ // We return bool so we can create these different kernel calls as a sequence
6
+ // of if-elseif's.
7
+ bool call_marlin_moe_kernel_ku4(
8
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
9
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
10
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16
+ int m_block, int max_par, int cfg_max_m_blocks) {
17
+ bool has_zp = true;
18
+
19
+ if (false) {
20
+ }
21
+ AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
22
+ AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
23
+ AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
24
+ AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
25
+ else {
26
+ return false;
27
+ }
28
+ return true;
29
+ }
30
+
31
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku4.h ADDED
@@ -0,0 +1,20 @@
1
+ #pragma once
2
+
3
+ #include "marlin_moe_kernel.h"
4
+
5
+ namespace marlin_moe {
6
+
7
+ // We return bool so we can create these different kernel calls as a sequence
8
+ // of if-elseif's.
9
+ bool call_marlin_moe_kernel_ku4(
10
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
12
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18
+ int m_block, int max_par, int cfg_max_m_blocks);
19
+
20
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu ADDED
@@ -0,0 +1,31 @@
1
+ #include "marlin_moe_kernel_ku4b8.h"
2
+
3
+ namespace marlin_moe {
4
+
5
+ // We return bool so we can create these different kernel calls as a sequence
6
+ // of if-elseif's.
7
+ bool call_marlin_moe_kernel_ku4b8(
8
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
9
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
10
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16
+ int m_block, int max_par, int cfg_max_m_blocks) {
17
+ bool has_zp = false;
18
+
19
+ if (false) {
20
+ }
21
+ GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
22
+ GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
23
+ GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
24
+ GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
25
+ else {
26
+ return false;
27
+ }
28
+ return true;
29
+ }
30
+
31
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku4b8.h ADDED
@@ -0,0 +1,20 @@
1
+ #pragma once
2
+
3
+ #include "marlin_moe_kernel.h"
4
+
5
+ namespace marlin_moe {
6
+
7
+ // We return bool so we can create these different kernel calls as a sequence
8
+ // of if-elseif's.
9
+ bool call_marlin_moe_kernel_ku4b8(
10
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
12
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18
+ int m_block, int max_par, int cfg_max_m_blocks);
19
+
20
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu ADDED
@@ -0,0 +1,31 @@
1
+ #include "marlin_moe_kernel_ku8b128.h"
2
+
3
+ namespace marlin_moe {
4
+
5
+ // We return bool so we can create these different kernel calls as a sequence
6
+ // of if-elseif's.
7
+ bool call_marlin_moe_kernel_ku8b128(
8
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
9
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
10
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16
+ int m_block, int max_par, int cfg_max_m_blocks) {
17
+ bool has_zp = false;
18
+
19
+ if (false) {
20
+ }
21
+ GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
22
+ GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
23
+ GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
24
+ GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
25
+ else {
26
+ return false;
27
+ }
28
+ return true;
29
+ }
30
+
31
+ } // namespace marlin_moe
marlin-moe/marlin_kernels/marlin_moe_kernel_ku8b128.h ADDED
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include "marlin_moe_kernel.h"
4
+
5
+ namespace marlin_moe {
6
+
7
+ bool call_marlin_moe_kernel_ku8b128(
8
+ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
9
+ bool has_act_order, int group_blocks, int num_threads, int blocks,
10
+ int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11
+ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12
+ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13
+ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14
+ int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15
+ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16
+ int m_block, int max_par, int cfg_max_m_blocks);
17
+
18
+ }
marlin-moe/marlin_moe_ops.cu ADDED
@@ -0,0 +1,584 @@
1
+ /*
2
+ * Modified by Neural Magic
3
+ * Copyright (C) Marlin.2024 Elias Frantar
4
+ *
5
+ * Licensed under the Apache License, Version 2.0 (the "License");
6
+ * you may not use this file except in compliance with the License.
7
+ * You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ */
17
+
18
+ #include <torch/all.h>
19
+
20
+ #include <ATen/cuda/CUDAContext.h>
21
+ #include <c10/cuda/CUDAGuard.h>
22
+ #include <cuda.h>
23
+ #include <cuda_fp16.h>
24
+ #include <cuda_runtime.h>
25
+
26
+ #include <iostream>
27
+
28
+ #include "core/exception.hpp"
29
+ #include "core/scalar_type.hpp"
30
+ #include "marlin_kernels/marlin_moe_kernel_ku4b8.h"
31
+ #include "marlin_kernels/marlin_moe_kernel_ku8b128.h"
32
+ #include "marlin_kernels/marlin_moe_kernel_ku4.h"
33
+
34
+ template <typename T>
35
+ inline std::string str(T x) {
36
+ return std::to_string(x);
37
+ }
38
+
39
+ namespace marlin_moe {
40
+
41
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
42
+
43
+ // For a given "a" of size [M,K], performs a permutation of the K columns based
44
+ // on the given "perm" indices.
45
+ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
46
+ int const* __restrict__ perm_int_ptr,
47
+ int4* __restrict__ out_int4_ptr, int size_m,
48
+ int size_k, int block_rows) {
49
+ int start_row = block_rows * blockIdx.x;
50
+ int finish_row = start_row + block_rows;
51
+ if (finish_row > size_m) {
52
+ finish_row = size_m;
53
+ }
54
+ int cur_block_rows = finish_row - start_row;
55
+
56
+ int row_stride = size_k * sizeof(half) / 16;
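+ // (= size_k / 8: each 16-byte int4 holds eight fp16 values)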
57
+
58
+ auto permute_row = [&](int row) {
59
+ int iters = size_k / blockDim.x;
60
+ int rest = size_k % blockDim.x;
61
+
62
+ int offset = row * row_stride;
63
+
64
+ half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
65
+ half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
66
+
67
+ int base_k = 0;
68
+
69
+ for (int i = 0; i < iters; i++) {
70
+ int cur_k = base_k + threadIdx.x;
71
+ int src_pos = perm_int_ptr[cur_k];
72
+
73
+ out_half[cur_k] = a_row_half[src_pos];
74
+
75
+ base_k += blockDim.x;
76
+ }
77
+
78
+ if (rest) {
79
+ if (threadIdx.x < rest) {
80
+ int cur_k = base_k + threadIdx.x;
81
+ int src_pos = perm_int_ptr[cur_k];
82
+
83
+ out_half[cur_k] = a_row_half[src_pos];
84
+ }
85
+ }
86
+ };
87
+
88
+ for (int i = 0; i < cur_block_rows; i++) {
89
+ int cur_row = start_row + i;
90
+ if (cur_row < size_m) {
91
+ permute_row(cur_row);
92
+ }
93
+ }
94
+ }
95
+
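+ // Counts, per expert, how many of the topk assignments it received and turns
+ // the counts into block-padded start offsets. Illustrative example: with
+ // topk_ids = [0, 1, 2, 1, 2, 3, 0, 3, 4], block_size = 4 and 5 experts, the
+ // per-expert counts are [2, 2, 2, 2, 1]; padding each count up to a multiple
+ // of block_size gives expert_offsets = [0, 4, 8, 12, 16, 20].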
96
+ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
97
+ int* __restrict__ expert_offsets,
98
+ int topk_length, int block_size) {
99
+ int expert_id = threadIdx.x;
100
+ int num_experts = blockDim.x;
101
+
102
+ int occurrences = 0;
103
+ for (int i = 0; i < topk_length; ++i) {
104
+ occurrences += (topk_ids[i] == expert_id);
105
+ }
106
+ expert_offsets[expert_id + 1] = occurrences;
107
+ __syncthreads();
108
+
109
+ if (threadIdx.x == 0) {
110
+ int tot_offset = 0;
111
+ expert_offsets[0] = 0;
112
+ for (int i = 0; i < num_experts; ++i) {
113
+ tot_offset += ceildiv(expert_offsets[i + 1], block_size) * block_size;
114
+ expert_offsets[i + 1] = tot_offset;
115
+ }
116
+ }
117
+ __syncthreads();
118
+ }
119
+
120
+ #else
121
+
122
+ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
123
+ int const* __restrict__ perm_int_ptr,
124
+ int4* __restrict__ out_int4_ptr, int size_m,
125
+ int size_k, int block_rows) {
126
+ // Marlin is not implemented yet for SM < 8.0
127
+ assert(false);
128
+ return;
129
+ }
130
+
131
+ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
132
+ int* __restrict__ expert_offsets,
133
+ int topk_length, int block_size) {
134
+ // Marlin is not implemented yet for SM < 8.0
135
+ assert(false);
136
+ return;
137
+ }
138
+
139
+ #endif
140
+
141
+ typedef struct {
142
+ int thread_k;
143
+ int thread_n;
144
+ int num_threads;
145
+ } thread_config_t;
146
+
147
+ typedef struct {
148
+ int max_m_blocks;
149
+ thread_config_t tb_cfg;
150
+ } exec_config_t;
151
+
152
+ thread_config_t small_batch_thread_configs[] = {
153
+ // Ordered by priority
154
+
155
+ // thread_k, thread_n, num_threads
156
+ {128, 128, 256}, // Default
157
+ {128, 64, 128}, // Reduce N 2X, same K
158
+ {64, 256, 256}, // Reduce K 2X, increase N 2X
159
+ {64, 128, 128}, // Reduce K 2X, same N
160
+ {64, 64, 128}, // Reduce both 2X
161
+ };
162
+
163
+ thread_config_t large_batch_thread_configs[] = {
164
+ // Ordered by priority
165
+
166
+ // thread_k, thread_n, num_threads
167
+ {64, 256, 256}, // Default
168
+ {128, 128, 256}, // Reduce N 2X, increase K 2X
169
+ {64, 128, 128}, // Reduce N 2X, same K
170
+ {128, 64, 128}, // Reduce N 4X, increase K 2X
171
+ {64, 64, 128}, // Reduce N 4X, same K
172
+ };
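+ // determine_thread_config() below walks one of these priority lists and picks
+ // the first entry whose pipeline fits into shared memory, lowering
+ // max_m_blocks from 4 towards 1 if none of the configs fit.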
173
+
174
+ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
175
+ int prob_n, int prob_k, int num_bits, int group_size,
176
+ bool has_act_order, bool is_k_full) {
177
+ bool cache_scales_chunk = has_act_order && !is_k_full;
178
+
179
+ int tb_n = th_config.thread_n;
180
+ int tb_k = th_config.thread_k;
181
+
182
+ // Get max scale groups per thread-block
183
+ int tb_groups;
184
+ if (group_size == -1) {
185
+ tb_groups = 1;
186
+ } else if (group_size == 0) {
187
+ tb_groups = ceildiv(tb_k, 32); // Worst case is 32 group size
188
+ } else {
189
+ tb_groups = ceildiv(tb_k, group_size);
190
+ }
191
+
192
+ if (cache_scales_chunk) {
193
+ int load_groups =
194
+ tb_groups * STAGES * 2; // Chunk size is 2x pipeline over dim K
195
+ load_groups = max(load_groups, 32); // We load at least 32 scale groups
196
+ return load_groups * tb_n * 4;
197
+
198
+ } else {
199
+ int tb_scales = tb_groups * tb_n * 2;
200
+
201
+ return tb_scales * STAGES;
202
+ }
203
+ }
204
+
205
+ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
206
+ int prob_m, int prob_n, int prob_k, int num_bits,
207
+ int scales_cache_size, int max_shared_mem) {
208
+ int pack_factor = 32 / num_bits;
209
+
210
+ // Get B size
211
+ int tb_k = th_config.thread_k;
212
+ int tb_n = th_config.thread_n;
213
+
214
+ int b_size = (tb_k * tb_n / pack_factor) * 4;
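+ // bytes: pack_factor quantized weights are packed into each 32-bit word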
215
+
216
+ // Get A size
217
+ int m_blocks = ceildiv(prob_m, 16);
218
+ int tb_max_m = 16;
219
+
220
+ while (true) {
221
+ if (m_blocks >= max_m_blocks) {
222
+ tb_max_m *= max_m_blocks;
223
+ break;
224
+ }
225
+
226
+ max_m_blocks--;
227
+ if (max_m_blocks == 0) {
228
+ TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
229
+ }
230
+ }
231
+
232
+ int a_size = (tb_max_m * tb_k) * 2;
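+ // bytes: fp16 A tile, 2 bytes per element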
233
+
234
+ float pipe_size = (a_size + b_size) * STAGES;
235
+
236
+ TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
237
+
238
+ return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
239
+ }
240
+
241
+ bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
242
+ int prob_m, int prob_n, int prob_k, int num_bits,
243
+ int group_size, bool has_act_order, bool is_k_full,
244
+ int max_shared_mem) {
245
+ // Sanity
246
+ if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
247
+ th_config.num_threads == -1) {
248
+ return false;
249
+ }
250
+
251
+ // Verify K/N are divisible by thread K/N
252
+ if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
253
+ return false;
254
+ }
255
+
256
+ // thread_k can be only 128 or 64 (because it must be less than groupsize
257
+ // which is 128)
258
+ if (th_config.thread_k != 128 && th_config.thread_k != 64) {
259
+ return false;
260
+ }
261
+
262
+ // Verify min for thread K/N
263
+ if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
264
+ return false;
265
+ }
266
+
267
+ // num_threads must be at least 128 (= 4 warps)
268
+ if (th_config.num_threads < 128) {
269
+ return false;
270
+ }
271
+
272
+ // Determine cache for scales
273
+ int scales_cache_size =
274
+ get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
275
+ group_size, has_act_order, is_k_full);
276
+
277
+ // Check that pipeline fits into cache
278
+ if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
279
+ num_bits, scales_cache_size, max_shared_mem)) {
280
+ return false;
281
+ }
282
+
283
+ return true;
284
+ }
285
+
286
+ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
287
+ int num_bits, int group_size,
288
+ bool has_act_order, bool is_k_full,
289
+ int max_shared_mem) {
290
+ int max_m_blocks = 4;
291
+ while (max_m_blocks > 0) {
292
+ if (prob_m <= 16) {
293
+ for (auto th_config : small_batch_thread_configs) {
294
+ if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
295
+ num_bits, group_size, has_act_order, is_k_full,
296
+ max_shared_mem)) {
297
+ return exec_config_t{max_m_blocks, th_config};
298
+ }
299
+ }
300
+ } else {
301
+ for (auto th_config : large_batch_thread_configs) {
302
+ if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
303
+ num_bits, group_size, has_act_order, is_k_full,
304
+ max_shared_mem)) {
305
+ return exec_config_t{max_m_blocks, th_config};
306
+ }
307
+ }
308
+ }
309
+
310
+ max_m_blocks--;  // Process fewer M blocks per invocation to reduce cache
311
+ // usage
312
+ }
313
+
314
+ return exec_config_t{0, {-1, -1, -1}};
315
+ }
316
+
317
+ #define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \
318
+ else if (KERNEL_FUNCTION( \
319
+ q_type, thread_n_blocks, thread_k_blocks, has_act_order, \
320
+ group_blocks, num_threads, blocks, max_shared_mem, stream, \
321
+ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \
322
+ zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \
323
+ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \
324
+ replicate_input, apply_weights, m_block, max_par, \
325
+ exec_cfg.max_m_blocks)) { \
326
+ }
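+ // Each call_marlin_moe_kernel_* helper returns true only if it handled the
+ // given weight type and configuration, so chaining these macro expansions
+ // after `if (false) {}` selects exactly one implementation and leaves the
+ // final else to report an unsupported shape.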
327
+
328
+ void marlin_mm_moe(const void* A, const void* B, void* C,
329
+ const void* sorted_ids, const void* topk_weights,
330
+ const void* topk_ids, const void* s, void* zp,
331
+ const void* g_idx, const void* perm, void* a_tmp,
332
+ void* expert_offsets, int prob_m, int prob_n, int prob_k,
333
+ void* workspace, vllm::ScalarType const& q_type,
334
+ bool has_act_order, bool is_k_full, bool has_zp,
335
+ int num_groups, int group_size, int num_experts, int topk,
336
+ int moe_block_size, int dev, cudaStream_t stream,
337
+ int thread_k, int thread_n, int sms, int max_par,
338
+ bool replicate_input, bool apply_weights) {
339
+ TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
340
+ ", ", prob_n, ", ", prob_k, "]");
341
+
342
+ if (sms == -1) {
343
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
344
+ }
345
+
346
+ int max_shared_mem = 0;
347
+ cudaDeviceGetAttribute(&max_shared_mem,
348
+ cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
349
+ TORCH_CHECK(max_shared_mem > 0);
350
+
351
+ int num_bits = q_type.size_bits();
352
+
353
+ // Set thread config
354
+ exec_config_t exec_cfg;
355
+ if (thread_k != -1 && thread_n != -1) {
356
+ // User-defined config
357
+ exec_cfg =
358
+ exec_config_t{4, thread_config_t{thread_k, thread_n, USER_THREADS}};
359
+ } else {
360
+ // Auto config
361
+ exec_cfg =
362
+ determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
363
+ has_act_order, is_k_full, max_shared_mem);
364
+ }
365
+
366
+ TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
367
+ is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
368
+ prob_m, prob_n, prob_k, num_bits, group_size,
369
+ has_act_order, is_k_full, max_shared_mem),
370
+ "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
371
+ ", thread_k = ", exec_cfg.tb_cfg.thread_k,
372
+ ", thread_n = ", exec_cfg.tb_cfg.thread_n,
373
+ ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
374
+ prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
375
+ ", group_size = ", group_size,
376
+ ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
377
+ ", max_shared_mem = ", max_shared_mem);
378
+
379
+ int num_threads = exec_cfg.tb_cfg.num_threads;
380
+ thread_k = exec_cfg.tb_cfg.thread_k;
381
+ thread_n = exec_cfg.tb_cfg.thread_n;
382
+
383
+ int thread_k_blocks = thread_k / 16;
384
+ int thread_n_blocks = thread_n / 16;
385
+
386
+ int blocks = sms;
387
+
388
+ TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
389
+ " is not divisible by thread_n = ", thread_n);
390
+ TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
391
+ " is not divisible by thread_k = ", thread_k);
392
+
393
+ int group_blocks = 0;
394
+ if (has_act_order) {
395
+ if (is_k_full) {
396
+ TORCH_CHECK(group_size != -1);
397
+ group_blocks = group_size / 16;
398
+ TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
399
+ " is not divisible by group_blocks = ", group_blocks);
400
+ } else {
401
+ TORCH_CHECK(group_size == 0);
402
+ group_blocks = 0;
403
+ }
404
+
405
+ } else {
406
+ if (group_size == -1) {
407
+ group_blocks = -1;
408
+ } else {
409
+ group_blocks = group_size / 16;
410
+ TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
411
+ " is not divisible by group_blocks = ", group_blocks);
412
+ }
413
+ }
414
+
415
+ int tot_m = prob_m;
416
+
417
+ const int* topk_ids_ptr = (const int*)topk_ids;
418
+ int* expert_offsets_ptr = (int*)expert_offsets;
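+ // One threadblock with one thread per expert: each thread counts the tokens
+ // routed to its expert and thread 0 builds the block-padded prefix sums.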
419
+ compute_expert_offsets<<<1, num_experts, 0, stream>>>(
420
+ topk_ids_ptr, expert_offsets_ptr, tot_m * topk, moe_block_size);
421
+
422
+ bool do_permute_a = has_act_order;
423
+
424
+ // If we have a full K, then we can run the non-act-order version of Marlin
425
+ // (since the weight rows are reordered by increasing group ids, and by
426
+ // having a full K, we have full original groups)
427
+ if (is_k_full) {
428
+ has_act_order = false;
429
+ }
430
+
431
+ int pack_factor = 32 / q_type.size_bits();
432
+
433
+ for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
434
+ const int4* A_ptr = (const int4*)A;
435
+ int4* a_tmp_ptr = (int4*)a_tmp;
436
+ const int4* B_ptr =
437
+ (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx;
438
+ int4* C_ptr = (int4*)C;
439
+ const float* topk_weights_ptr = (const float*)topk_weights;
440
+ const int* sorted_ids_ptr = (const int*)sorted_ids;
441
+ const int4* s_ptr = (const int4*)s + num_groups * prob_n / 8 * expert_idx;
442
+ const int4* zp_ptr =
443
+ (const int4*)zp + num_groups * prob_n / (pack_factor * 4) * expert_idx;
444
+ const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx;
445
+ const int* perm_ptr = (const int*)perm + prob_k * expert_idx;
446
+ int* locks = (int*)workspace;
447
+
448
+ if (do_permute_a) {
449
+ // Permute A columns
450
+ int topk_rows = replicate_input ? tot_m : tot_m * topk;
451
+ int block_rows = ceildiv(topk_rows, blocks);
452
+ permute_cols_kernel<<<blocks, num_threads, 0, stream>>>(
453
+ A_ptr, perm_ptr, a_tmp_ptr, topk_rows, prob_k, block_rows);
454
+ A_ptr = a_tmp_ptr;
455
+ }
456
+
457
+ int tot_m_blocks = ceildiv(tot_m, 16);
458
+ for (int m_block = 0; m_block < tot_m_blocks;
459
+ m_block += 4 * exec_cfg.max_m_blocks) {
460
+ if (false) {
461
+ }
462
+ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4b8)
463
+ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku8b128)
464
+ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4)
465
+ else {
466
+ TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
467
+ str(prob_n) + ", " + str(prob_k) + "]" +
468
+ ", has_act_order = " + str(has_act_order) +
469
+ ", num_groups = " + str(num_groups) +
470
+ ", group_size = " + str(group_size) +
471
+ ", thread_n_blocks = " + str(thread_n_blocks) +
472
+ ", thread_k_blocks = " + str(thread_k_blocks));
473
+ }
474
+ }
475
+ }
476
+ }
477
+
478
+ } // namespace marlin_moe
479
+
480
+ torch::Tensor marlin_gemm_moe(
481
+ const torch::Tensor& a, const torch::Tensor& b_q_weights,
482
+ const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
483
+ const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
484
+ torch::Tensor& b_zeros, const torch::Tensor& g_idx,
485
+ const torch::Tensor& perm, torch::Tensor& workspace,
486
+ vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n,
487
+ int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
488
+ int64_t moe_block_size, bool replicate_input, bool apply_weights) {
489
+ vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
490
+ bool has_zp = b_zeros.size(1) != 0;
491
+ if (has_zp) {
492
+ TORCH_CHECK(
493
+ b_q_type == vllm::kU4,
494
+ "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
495
+ } else {
496
+ TORCH_CHECK(
497
+ b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
498
+ "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type.str());
499
+ }
500
+
501
+ int pack_factor = 32 / b_q_type.size_bits();
502
+
503
+ int max_par = 4;
504
+
505
+ int dev = a.get_device();
506
+
507
+ auto options_dtype =
508
+ torch::TensorOptions().dtype(a.dtype()).device(a.device());
509
+ auto options_int =
510
+ torch::TensorOptions().dtype(torch::kInt).device(a.device());
511
+ torch::Tensor c = torch::zeros({size_m, topk, size_n}, options_dtype);
512
+ torch::Tensor a_tmp =
513
+ replicate_input ? torch::zeros({size_m, size_k}, options_dtype)
514
+ : torch::zeros({size_m, topk, size_k}, options_dtype);
515
+ torch::Tensor expert_offsets = torch::empty({num_experts + 1}, options_int);
516
+
517
+ // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
518
+ // auto -1)
519
+ int thread_k = -1;
520
+ // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
521
+ // auto -1)
522
+ int thread_n = -1;
523
+ // sms: number of SMs to use for the kernel (can usually be left as auto -1)
524
+ int sms = -1;
525
+
526
+ // Detect groupsize and act_order
527
+ int num_groups = -1;
528
+ int group_size = -1;
529
+ bool has_act_order = g_idx.size(1) != 0;
530
+
531
+ int b_rank = b_scales.sizes().size();
532
+ TORCH_CHECK(b_rank == 3, "b_scales rank = ", b_rank, " is not 3");
533
+ TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2),
534
+ " is not size_n = ", size_n);
535
+ num_groups = b_scales.size(1);
536
+
537
+ TORCH_CHECK(VLLM_IMPLIES(!is_k_full, has_act_order),
538
+ "if is_k_full is false, has_act_order must be true");
539
+
540
+ if (has_act_order) {
541
+ if (is_k_full) {
542
+ TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
543
+ TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
544
+ ", is not divisible by num_groups = ", num_groups);
545
+ group_size = size_k / num_groups;
546
+ } else {
547
+ group_size = 0;
548
+ }
549
+
550
+ } else {
551
+ if (num_groups > 1) {
552
+ TORCH_CHECK(
553
+ size_k % num_groups == 0, "size_k = ", size_k,
554
+ ", is not divisible by num_groups = ", num_groups);
555
+ group_size = size_k / num_groups;
556
+ } else {
557
+ group_size = -1;
558
+ }
559
+ }
560
+
561
+ // Verify b_zeros
562
+ if (has_zp) {
563
+ int rank = b_zeros.sizes().size();
564
+ TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
565
+ TORCH_CHECK(b_zeros.size(1) == num_groups,
566
+ "b_zeros dim 1 = ", b_zeros.size(1),
567
+ " is not num_groups = ", num_groups);
568
+ TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor,
569
+ "b_zeros dim 2 = ", b_zeros.size(2),
570
+ " is not size_n / pack_factor = ", size_n / pack_factor);
571
+ }
572
+
573
+ marlin_moe::marlin_mm_moe(
574
+ a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(),
575
+ topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
576
+ b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
577
+ expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
578
+ b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
579
+ num_experts, topk, moe_block_size, dev,
580
+ at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
581
+ replicate_input, apply_weights);
582
+ return c;
583
+ }
584
+
moe/moe_align_sum_kernels.cu ADDED
@@ -0,0 +1,202 @@
1
+ #include <torch/all.h>
2
+ #include <ATen/cuda/CUDAContext.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+
5
+ #include <ATen/ATen.h>
6
+ #include <THC/THCAtomics.cuh>
7
+
8
+ #include "../cuda_compat.h"
9
+ #include "../dispatch_utils.h"
10
+
11
+ #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
12
+
13
+ namespace vllm {
14
+ namespace moe {
15
+
16
+ namespace {
17
+ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
18
+ int32_t col) {
19
+ // don't worry about overflow because num_experts is relatively small
20
+ return row * total_col + col;
21
+ }
22
+ } // namespace
23
+
24
+ template <typename scalar_t>
25
+ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
26
+ int32_t* sorted_token_ids,
27
+ int32_t* expert_ids,
28
+ int32_t* total_tokens_post_pad,
29
+ int32_t num_experts,
30
+ int32_t block_size, size_t numel) {
31
+ const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
32
+ const size_t start_idx = threadIdx.x * tokens_per_thread;
33
+
34
+ extern __shared__ int32_t shared_mem[];
35
+
36
+ int32_t* tokens_cnts =
37
+ shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts)
38
+ int32_t* cumsum =
39
+ shared_mem +
40
+ (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
41
+
42
+ for (int i = 0; i < num_experts; ++i) {
43
+ tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
44
+ }
45
+
46
+ /**
47
+ * In the first step we compute token_cnts[thread_index + 1][expert_index],
48
+ * which counts how many tokens in the token shard of thread_index are
49
+ * assigned to expert expert_index.
50
+ */
51
+ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
52
+ ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
53
+ }
54
+
55
+ __syncthreads();
56
+
57
+ // For each expert we accumulate the token counts from the different threads.
58
+ if (threadIdx.x < num_experts) {
59
+ tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
60
+ for (int i = 1; i <= blockDim.x; ++i) {
61
+ tokens_cnts[index(num_experts, i, threadIdx.x)] +=
62
+ tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
63
+ }
64
+ }
65
+
66
+ __syncthreads();
67
+
68
+ // We accumulate the token counts of all experts in thread 0.
69
+ if (threadIdx.x == 0) {
70
+ cumsum[0] = 0;
71
+ for (int i = 1; i <= num_experts; ++i) {
72
+ cumsum[i] = cumsum[i - 1] +
73
+ CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
74
+ block_size) *
75
+ block_size;
76
+ }
77
+ *total_tokens_post_pad = cumsum[num_experts];
78
+ }
79
+
80
+ __syncthreads();
81
+
82
+ /**
83
+ * For each expert, each thread processes the tokens of the corresponding
84
+ * blocks and stores the corresponding expert_id for each block.
85
+ */
86
+ if (threadIdx.x < num_experts) {
87
+ for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
88
+ i += block_size) {
89
+ expert_ids[i / block_size] = threadIdx.x;
90
+ }
91
+ }
92
+
93
+ /**
94
+ * Each thread processes a token shard, calculating the index of each token
95
+ * after sorting by expert number. Given the example topk_ids =
96
+ * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
97
+ * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
98
+ * padding value(preset in python).
99
+ */
100
+ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
101
+ int32_t expert_id = topk_ids[i];
102
+ /** The cumsum[expert_id] stores the starting index of the tokens that the
103
+ * expert with expert_id needs to process, and
104
+ * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
105
+ * processed by the expert with expert_id within the current thread's token
106
+ * shard.
107
+ */
108
+ int32_t rank_post_pad =
109
+ tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
110
+ cumsum[expert_id];
111
+ sorted_token_ids[rank_post_pad] = i;
112
+ ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
113
+ }
114
+ }
115
+
116
+ template <typename scalar_t, int TOPK>
117
+ __global__ void moe_sum_kernel(
118
+ scalar_t* __restrict__ out, // [..., d]
119
+ const scalar_t* __restrict__ input, // [..., topk, d]
120
+ const int d) {
121
+ const int64_t token_idx = blockIdx.x;
122
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
123
+ scalar_t x = 0.0;
124
+ #pragma unroll
125
+ for (int k = 0; k < TOPK; ++k) {
126
+ x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
127
+ }
128
+ out[token_idx * d + idx] = x;
129
+ }
130
+ }
131
+
132
+ } // namespace moe
133
+ } // namespace vllm
134
+
135
+ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
136
+ int64_t block_size, torch::Tensor sorted_token_ids,
137
+ torch::Tensor experts_ids,
138
+ torch::Tensor num_tokens_post_pad) {
139
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
140
+ VLLM_DISPATCH_INTEGRAL_TYPES(
141
+ topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
142
+ // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
143
+ // tensors
144
+ const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
145
+ const int32_t shared_mem =
146
+ ((num_thread + 1) * num_experts + (num_experts + 1)) *
147
+ sizeof(int32_t);
148
+
149
+ // set dynamic shared mem
150
+ auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
151
+ AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
152
+ (void*)kernel, shared_mem));
153
+ kernel<<<1, num_thread, shared_mem, stream>>>(
154
+ topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
155
+ experts_ids.data_ptr<int32_t>(),
156
+ num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
157
+ topk_ids.numel());
158
+ });
159
+ }
160
+
161
+ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
162
+ torch::Tensor& output) // [num_tokens, hidden_size]
163
+ {
164
+ const int hidden_size = input.size(-1);
165
+ const int num_tokens = output.numel() / hidden_size;
166
+ const int topk = input.size(1);
167
+
168
+ dim3 grid(num_tokens);
169
+ dim3 block(std::min(hidden_size, 1024));
170
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
171
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
172
+
173
+ switch (topk) {
174
+ case 2:
175
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
176
+ vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
177
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
178
+ hidden_size);
179
+ });
180
+ break;
181
+
182
+ case 3:
183
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
184
+ vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
185
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
186
+ hidden_size);
187
+ });
188
+ break;
189
+
190
+ case 4:
191
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
192
+ vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
193
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
194
+ hidden_size);
195
+ });
196
+ break;
197
+
198
+ default:
199
+ at::sum_out(output, input, 1);
200
+ break;
201
+ }
202
+ }
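To make the padded layout produced by `moe_align_block_size` concrete, the sketch below reimplements the sorting/padding logic described in the kernel comments above as plain Python (illustrative only, not part of this commit; the function name `moe_align_block_size_ref` is hypothetical). For `topk_ids = [0, 1, 2, 1, 2, 3, 0, 3, 4]`, `num_experts = 5` and `block_size = 4` it reproduces the `[0, 6, *, *, 1, 3, *, *, ...]` ordering from the comment, with padding slots holding `topk_ids.numel()`.

```python
import torch

def moe_align_block_size_ref(topk_ids: torch.Tensor, num_experts: int, block_size: int):
    """Pure-Python reference for the padded, expert-sorted layout built by the kernel."""
    flat = topk_ids.flatten().tolist()
    counts = [0] * num_experts
    for e in flat:
        counts[e] += 1

    # Round each expert's token count up to a multiple of block_size (CEILDIV * block_size).
    cumsum = [0]
    for e in range(num_experts):
        cumsum.append(cumsum[-1] + -(-counts[e] // block_size) * block_size)
    total_padded = cumsum[-1]

    # Padding slots keep the preset value topk_ids.numel(), as in the Python caller.
    sorted_token_ids = [len(flat)] * total_padded
    expert_ids = [0] * (total_padded // block_size)
    for e in range(num_experts):
        for blk in range(cumsum[e] // block_size, cumsum[e + 1] // block_size):
            expert_ids[blk] = e

    # Scatter each token into its expert's region, preserving original order within an expert.
    offsets = cumsum[:-1]
    for i, e in enumerate(flat):
        sorted_token_ids[offsets[e]] = i
        offsets[e] += 1
    return sorted_token_ids, expert_ids, total_padded
```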
moe/topk_softmax_kernels.cu ADDED
@@ -0,0 +1,506 @@
1
+ /*
2
+ * Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
3
+ * Copyright (c) 2024, The vLLM team.
4
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5
+ * SPDX-License-Identifier: Apache-2.0
6
+ *
7
+ * Licensed under the Apache License, Version 2.0 (the "License");
8
+ * you may not use this file except in compliance with the License.
9
+ * You may obtain a copy of the License at
10
+ *
11
+ * http://www.apache.org/licenses/LICENSE-2.0
12
+ *
13
+ * Unless required by applicable law or agreed to in writing, software
14
+ * distributed under the License is distributed on an "AS IS" BASIS,
15
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ * See the License for the specific language governing permissions and
17
+ * limitations under the License.
18
+ */
19
+ #include <torch/all.h>
20
+ #include <ATen/cuda/CUDAContext.h>
21
+ #include <c10/cuda/CUDAGuard.h>
22
+ #include "../cuda_compat.h"
23
+
24
+ #ifndef USE_ROCM
25
+ #include <cub/util_type.cuh>
26
+ #include <cub/cub.cuh>
27
+ #else
28
+ #include <hipcub/util_type.hpp>
29
+ #include <hipcub/hipcub.hpp>
30
+ #endif
31
+
32
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
33
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
34
+
35
+ namespace vllm {
36
+ namespace moe {
37
+
38
+ /// Aligned array type
39
+ template <
40
+ typename T,
41
+ /// Number of elements in the array
42
+ int N,
43
+ /// Alignment requirement in bytes
44
+ int Alignment = sizeof(T) * N
45
+ >
46
+ class alignas(Alignment) AlignedArray {
47
+ T data[N];
48
+ };
49
+
50
+ // ====================== Softmax things ===============================
51
+ // We have our own implementation of softmax here so we can support transposing the output
52
+ // in the softmax kernel when we extend this module to support expert-choice routing.
53
+ template <int TPB>
54
+ __launch_bounds__(TPB) __global__
55
+ void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
56
+ {
57
+ using BlockReduce = cub::BlockReduce<float, TPB>;
58
+ __shared__ typename BlockReduce::TempStorage tmpStorage;
59
+
60
+ __shared__ float normalizing_factor;
61
+ __shared__ float float_max;
62
+
63
+ const int thread_row_offset = blockIdx.x * num_cols;
64
+
65
+ cub::Sum sum;
66
+ float threadData(-FLT_MAX);
67
+
68
+ // Don't touch finished rows.
69
+ if ((finished != nullptr) && finished[blockIdx.x])
70
+ {
71
+ return;
72
+ }
73
+
74
+ for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
75
+ {
76
+ const int idx = thread_row_offset + ii;
77
+ threadData = max(static_cast<float>(input[idx]), threadData);
78
+ }
79
+
80
+ const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
81
+ if (threadIdx.x == 0)
82
+ {
83
+ float_max = maxElem;
84
+ }
85
+ __syncthreads();
86
+
87
+ threadData = 0;
88
+
89
+ for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
90
+ {
91
+ const int idx = thread_row_offset + ii;
92
+ threadData += exp((static_cast<float>(input[idx]) - float_max));
93
+ }
94
+
95
+ const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
96
+
97
+ if (threadIdx.x == 0)
98
+ {
99
+ normalizing_factor = 1.f / Z;
100
+ }
101
+ __syncthreads();
102
+
103
+ for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
104
+ {
105
+ const int idx = thread_row_offset + ii;
106
+ const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
107
+ output[idx] = val;
108
+ }
109
+ }
110
+
111
+ template <int TPB>
112
+ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
113
+ int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
114
+ {
115
+
116
+ using cub_kvp = cub::KeyValuePair<int, float>;
117
+ using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
118
+ __shared__ typename BlockReduce::TempStorage tmpStorage;
119
+
120
+ cub_kvp thread_kvp;
121
+ cub::ArgMax arg_max;
122
+
123
+ const int num_rows = gridDim.x;
124
+ const int block_row = blockIdx.x;
125
+
126
+ const bool row_is_active = finished ? !finished[block_row] : true;
127
+ const int thread_read_offset = blockIdx.x * num_experts;
128
+ for (int k_idx = 0; k_idx < k; ++k_idx)
129
+ {
130
+ thread_kvp.key = 0;
131
+ thread_kvp.value = -1.f; // This is OK because inputs are probabilities
132
+
133
+ cub_kvp inp_kvp;
134
+ for (int expert = threadIdx.x; expert < num_experts; expert += TPB)
135
+ {
136
+ const int idx = thread_read_offset + expert;
137
+ inp_kvp.key = expert;
138
+ inp_kvp.value = inputs_after_softmax[idx];
139
+
140
+ for (int prior_k = 0; prior_k < k_idx; ++prior_k)
141
+ {
142
+ const int prior_winning_expert = indices[k * block_row + prior_k];
143
+
144
+ if (prior_winning_expert == expert)
145
+ {
146
+ inp_kvp = thread_kvp;
147
+ }
148
+ }
149
+
150
+ thread_kvp = arg_max(inp_kvp, thread_kvp);
151
+ }
152
+
153
+ const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
154
+ if (threadIdx.x == 0)
155
+ {
156
+ // Ignore experts the node isn't responsible for with expert parallelism
157
+ const int expert = result_kvp.key;
158
+ const bool node_uses_expert = expert >= start_expert && expert < end_expert;
159
+ const bool should_process_row = row_is_active && node_uses_expert;
160
+
161
+ const int idx = k * block_row + k_idx;
162
+ output[idx] = result_kvp.value;
163
+ indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
164
+ assert(indices[idx] >= 0);
165
+ source_rows[idx] = k_idx * num_rows + block_row;
166
+ }
167
+ __syncthreads();
168
+ }
169
+ }
170
+
171
+ // ====================== TopK softmax things ===============================
172
+
173
+ /*
174
+ A Top-K gating softmax written to exploit when the number of experts in the MoE layers
175
+ is a small power of 2. This allows us to cleanly share the rows among the threads in
176
+ a single warp and eliminate communication between warps (so no need to use shared mem).
177
+
178
+ It fuses the softmax, max and argmax into a single kernel.
179
+
180
+ Limitations:
181
+ 1) This implementation is intended for when the number of experts is a small power of 2.
182
+ 2) This implementation assumes k is small, but will work for any k.
183
+ */
184
+
185
+ template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
186
+ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
187
+ void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
188
+ int* source_rows, const int k, const int start_expert, const int end_expert)
189
+ {
190
+ // We begin by enforcing compile time assertions and setting up compile time constants.
191
+ static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
192
+ static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
193
+ static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
194
+ static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
195
+
196
+ // Number of bytes each thread pulls in per load
197
+ static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
198
+ static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
199
+ static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
200
+ static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
201
+
202
+ // Restrictions based on previous section.
203
+ static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
204
+ static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
205
+ static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
206
+ static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
207
+
208
+ // We have NUM_EXPERTS elements per row. We specialize for small #experts
209
+ static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
210
+ static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
211
+ static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
212
+
213
+ // Restrictions for previous section.
214
+ static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp");
215
+
216
+ // ===================== From this point, we finally start computing run-time variables. ========================
217
+
218
+ // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps.
219
+ // Thus, each block processes a chunk of rows. We start by computing the start row for each block.
220
+ const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
221
+
222
+ // Now, using the base row per thread block, we compute the base row per warp.
223
+ const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
224
+
225
+ // The threads in a warp are split into sub-groups that will work on a row.
226
+ // We compute row offset for each thread sub-group
227
+ const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
228
+ const int thread_row = warp_base_row + thread_row_in_warp;
229
+
230
+ // Threads with indices out of bounds should early exit here.
231
+ if (thread_row >= num_rows)
232
+ {
233
+ return;
234
+ }
235
+ const bool row_is_active = finished ? !finished[thread_row] : true;
236
+
237
+ // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
238
+ // row it will read.
239
+ const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
240
+
241
+ // Now, we compute the group each thread belongs to in order to determine the first column to start loads.
242
+ const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
243
+ const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
244
+ const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
245
+
246
+ // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
247
+ // this can support all powers of 2 up to 16.
248
+ // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
249
+ // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
250
+ using AccessType = AlignedArray<float, ELTS_PER_LDG>;
251
+
252
+ // Finally, we pull in the data from global mem
253
+ float row_chunk[VPT];
254
+ AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
255
+ const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
256
+ #pragma unroll
257
+ for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
258
+ {
259
+ row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
260
+ }
261
+
262
+ // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
263
+ // convert to float afterwards for the exp + sum reduction.
264
+ float thread_max = row_chunk[0];
265
+ #pragma unroll
266
+ for (int ii = 1; ii < VPT; ++ii)
267
+ {
268
+ thread_max = max(thread_max, row_chunk[ii]);
269
+ }
270
+
271
+ // Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
272
+ #pragma unroll
273
+ for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
274
+ {
275
+ thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
276
+ }
277
+
278
+ // From this point, thread_max in all the threads holds the max within the row.
279
+ // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
280
+ float row_sum = 0;
281
+ #pragma unroll
282
+ for (int ii = 0; ii < VPT; ++ii)
283
+ {
284
+ row_chunk[ii] = expf(row_chunk[ii] - thread_max);
285
+ row_sum += row_chunk[ii];
286
+ }
287
+
288
+ // Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern.
289
+ #pragma unroll
290
+ for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
291
+ {
292
+ row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
293
+ }
294
+
295
+ // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
296
+ // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
297
+ // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
298
+ // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
299
+ // argmax after computing the softmax.
300
+ const float reciprocal_row_sum = 1.f / row_sum;
301
+
302
+ #pragma unroll
303
+ for (int ii = 0; ii < VPT; ++ii)
304
+ {
305
+ row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
306
+ }
307
+
308
+ // At this point, row_chunk contains the softmax of this thread's chunk of the row. Next, we find the top-k elements in each row, along
309
+ // with the max index.
310
+ int start_col = first_elt_read_by_thread;
311
+ static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
312
+
313
+ for (int k_idx = 0; k_idx < k; ++k_idx)
314
+ {
315
+ // First, each thread does the local argmax
316
+ float max_val = row_chunk[0];
317
+ int expert = start_col;
318
+ #pragma unroll
319
+ for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG)
320
+ {
321
+ #pragma unroll
322
+ for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
323
+ {
324
+ float val = row_chunk[ldg * ELTS_PER_LDG + ii];
325
+
326
+ // No check on the experts here since columns with the smallest index are processed first and only
327
+ // updated if > (not >=)
328
+ if (val > max_val)
329
+ {
330
+ max_val = val;
331
+ expert = col + ii;
332
+ }
333
+ }
334
+ }
335
+
336
+ // Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max.
337
+ // This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can
338
+ // then blank out their max with -inf and the warp can run more iterations...
339
+ #pragma unroll
340
+ for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
341
+ {
342
+ float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
343
+ int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
344
+
345
+ // We want lower indices to "win" in every thread so we break ties this way
346
+ if (other_max > max_val || (other_max == max_val && other_expert < expert))
347
+ {
348
+ max_val = other_max;
349
+ expert = other_expert;
350
+ }
351
+ }
352
+
353
+ // Write the max for this k iteration to global memory.
354
+ if (thread_group_idx == 0)
355
+ {
356
+ // Add a guard to ignore experts not included by this node
357
+ const bool node_uses_expert = expert >= start_expert && expert < end_expert;
358
+ const bool should_process_row = row_is_active && node_uses_expert;
359
+
360
+ // The lead thread from each sub-group will write out the final results to global memory. (This will be a
361
+ // single) thread per row of the input/output matrices.
362
+ const int idx = k * thread_row + k_idx;
363
+ output[idx] = max_val;
364
+ indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
365
+ source_rows[idx] = k_idx * num_rows + thread_row;
366
+ }
367
+
368
+ // Finally, we clear the value in the thread with the current max if there is another iteration to run.
369
+ if (k_idx + 1 < k)
370
+ {
371
+ const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
372
+ const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW;
373
+
374
+ // Only the thread in the group which produced the max will reset the "winning" value to -inf.
375
+ if (thread_group_idx == thread_to_clear_in_group)
376
+ {
377
+ const int offset_for_expert = expert % ELTS_PER_LDG;
378
+ // Safe to set to any negative value since row_chunk values must be between 0 and 1.
379
+ row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
380
+ }
381
+ }
382
+ }
383
+ }
384
+
385
+ namespace detail
386
+ {
387
+ // Constructs some constants needed to partition the work across threads at compile time.
388
+ template <int EXPERTS, int BYTES_PER_LDG>
389
+ struct TopkConstants
390
+ {
391
+ static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
392
+ static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
393
+ static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
394
+ static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
395
+ static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
396
+ static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
397
+ };
398
+ } // namespace detail
399
+
400
+ template <int EXPERTS, int WARPS_PER_TB>
401
+ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
402
+ int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
403
+ {
404
+ static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
405
+
406
+ static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
407
+ using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
408
+ static constexpr int VPT = Constants::VPT;
409
+ static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
410
+ const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
411
+ const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
412
+
413
+ dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
414
+ topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
415
+ input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
416
+ }
417
+
418
+ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
419
+ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
420
+ gating_output, nullptr, topk_weights, topk_indicies, \
421
+ token_expert_indices, num_tokens, topk, 0, num_experts, \
422
+ stream);
423
+
424
+ void topkGatingSoftmaxKernelLauncher(
425
+ const float* gating_output,
426
+ float* topk_weights,
427
+ int* topk_indicies,
428
+ int* token_expert_indices,
429
+ float* softmax_workspace,
430
+ const int num_tokens,
431
+ const int num_experts,
432
+ const int topk,
433
+ cudaStream_t stream) {
434
+ static constexpr int WARPS_PER_TB = 4;
435
+ switch (num_experts) {
436
+ case 1:
437
+ LAUNCH_SOFTMAX(1, WARPS_PER_TB);
438
+ break;
439
+ case 2:
440
+ LAUNCH_SOFTMAX(2, WARPS_PER_TB);
441
+ break;
442
+ case 4:
443
+ LAUNCH_SOFTMAX(4, WARPS_PER_TB);
444
+ break;
445
+ case 8:
446
+ LAUNCH_SOFTMAX(8, WARPS_PER_TB);
447
+ break;
448
+ case 16:
449
+ LAUNCH_SOFTMAX(16, WARPS_PER_TB);
450
+ break;
451
+ case 32:
452
+ LAUNCH_SOFTMAX(32, WARPS_PER_TB);
453
+ break;
454
+ case 64:
455
+ LAUNCH_SOFTMAX(64, WARPS_PER_TB);
456
+ break;
457
+ case 128:
458
+ LAUNCH_SOFTMAX(128, WARPS_PER_TB);
459
+ break;
460
+ case 256:
461
+ LAUNCH_SOFTMAX(256, WARPS_PER_TB);
462
+ break;
463
+ default: {
464
+ TORCH_CHECK(softmax_workspace != nullptr,
465
+ "softmax_workspace must be provided for num_experts that are not a power of 2.");
466
+ static constexpr int TPB = 256;
467
+ moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
468
+ gating_output, nullptr, softmax_workspace, num_experts);
469
+ moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
470
+ softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
471
+ num_experts, topk, 0, num_experts);
472
+ }
473
+ }
474
+ }
475
+
476
+ } // namespace moe
477
+ } // namespace vllm
478
+
479
+ void topk_softmax(
480
+ torch::Tensor& topk_weights, // [num_tokens, topk]
481
+ torch::Tensor& topk_indices, // [num_tokens, topk]
482
+ torch::Tensor& token_expert_indices, // [num_tokens, topk]
483
+ torch::Tensor& gating_output) // [num_tokens, num_experts]
484
+ {
485
+ const int num_experts = gating_output.size(-1);
486
+ const int num_tokens = gating_output.numel() / num_experts;
487
+ const int topk = topk_weights.size(-1);
488
+
489
+ const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
490
+ const bool needs_workspace = !is_pow_2 || num_experts > 256;
491
+ const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
492
+
493
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
494
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
495
+ torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
496
+ vllm::moe::topkGatingSoftmaxKernelLauncher(
497
+ gating_output.data_ptr<float>(),
498
+ topk_weights.data_ptr<float>(),
499
+ topk_indices.data_ptr<int>(),
500
+ token_expert_indices.data_ptr<int>(),
501
+ softmax_workspace.data_ptr<float>(),
502
+ num_tokens,
503
+ num_experts,
504
+ topk,
505
+ stream);
506
+ }
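For orientation, the sketch below shows how the `topk_softmax` binding above is typically driven from Python. It mirrors the `opcheck` call in `test/kernels/test_moe.py` later in this commit and assumes the `moe._ops.ops` binding used there; shapes and dtypes follow the C++ signature (float32 weights and gating output, int32 indices).

```python
import torch
from moe._ops import ops  # Python binding for the kernels in this file

num_tokens, num_experts, topk = 33, 64, 2
gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=torch.float32)

# Output buffers: routing weights plus two index tensors, all of shape [num_tokens, topk].
topk_weights = torch.empty(num_tokens, topk, device="cuda", dtype=torch.float32)
topk_indices = torch.empty(num_tokens, topk, device="cuda", dtype=torch.int32)
token_expert_indices = torch.empty(num_tokens, topk, device="cuda", dtype=torch.int32)

# Fused softmax over the experts followed by per-row top-k selection.
ops.topk_softmax(topk_weights, topk_indices, token_expert_indices, gating_output)
```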
test/__init__.py ADDED
File without changes
test/kernels/__init__.py ADDED
File without changes
test/kernels/test_moe.py ADDED
@@ -0,0 +1,220 @@
1
+ """Tests for the MOE layers.
2
+
3
+ Run `pytest test/kernels/test_moe.py`.
4
+ """
5
+
6
+ from typing import List
7
+
8
+ import pytest
9
+ import torch
10
+
11
+ from moe._ops import ops
12
+ from moe.fused_moe import fused_moe, fused_topk, moe_align_block_size
13
+ from moe.fused_marlin_moe import fused_marlin_moe
14
+ from moe.scalar_type import scalar_types
15
+ from moe.utils.marlin_utils_test import marlin_quantize
16
+
17
+ from .utils import compute_max_diff, opcheck
18
+
19
+
20
+ def stack_and_dev(tensors: List[torch.Tensor]):
21
+ dev = tensors[0].device
22
+ return torch.stack(tensors, dim=0).to(dev)
23
+
24
+
25
+ NUM_EXPERTS = [8, 64]
26
+ TOP_KS = [2, 6]
27
+
28
+
29
+ @pytest.mark.parametrize("m", [1, 33, 64, 222])
30
+ @pytest.mark.parametrize("n", [128, 2048])
31
+ @pytest.mark.parametrize("k", [128, 1024])
32
+ @pytest.mark.parametrize("e", NUM_EXPERTS)
33
+ @pytest.mark.parametrize("topk", TOP_KS)
34
+ @pytest.mark.parametrize("group_size", [-1, 32, 128])
35
+ @pytest.mark.parametrize("act_order", [True, False])
36
+ @pytest.mark.parametrize("num_bits", [4, 8])
37
+ @pytest.mark.parametrize("is_k_full", [True, False])
38
+ # @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
39
+ def test_fused_marlin_moe(
40
+ m: int,
41
+ n: int,
42
+ k: int,
43
+ e: int,
44
+ topk: int,
45
+ group_size: int,
46
+ act_order: bool,
47
+ num_bits: int,
48
+ is_k_full: bool,
49
+ ):
50
+ torch.manual_seed(7)
51
+
52
+ # Filter act_order
53
+ if act_order:
54
+ if group_size == -1:
55
+ return
56
+ if group_size in (k, n):
57
+ return
58
+ else:
59
+ if not is_k_full:
60
+ return
61
+
62
+ quant_type = scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
63
+ dtype = torch.float16
64
+ a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
65
+ w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
66
+ w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
67
+
68
+ w_ref1_l = []
69
+ qweight1_l = []
70
+ scales1_l = []
71
+ g_idx1_l = []
72
+ sort_indices1_l = []
73
+
74
+ for i in range(w1.shape[0]):
75
+ test_perm = torch.randperm(k)
76
+ w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
77
+ w1[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
78
+ )
79
+ w_ref1_l.append(w_ref1)
80
+ qweight1_l.append(qweight1)
81
+ scales1_l.append(scales1)
82
+ g_idx1_l.append(g_idx1)
83
+ sort_indices1_l.append(sort_indices1)
84
+
85
+ w_ref1 = stack_and_dev(w_ref1_l)
86
+ qweight1 = stack_and_dev(qweight1_l).contiguous()
87
+ scales1 = stack_and_dev(scales1_l)
88
+ g_idx1 = stack_and_dev(g_idx1_l)
89
+ sort_indices1 = stack_and_dev(sort_indices1_l)
90
+
91
+ w_ref2_l = []
92
+ qweight2_l = []
93
+ scales2_l = []
94
+ g_idx2_l = []
95
+ sort_indices2_l = []
96
+
97
+ for i in range(w2.shape[0]):
98
+ test_perm = torch.randperm(n)
99
+ w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
100
+ w2[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
101
+ )
102
+ w_ref2_l.append(w_ref2)
103
+ qweight2_l.append(qweight2)
104
+ scales2_l.append(scales2)
105
+ g_idx2_l.append(g_idx2)
106
+ sort_indices2_l.append(sort_indices2)
107
+
108
+ w_ref2 = stack_and_dev(w_ref2_l)
109
+ qweight2 = stack_and_dev(qweight2_l).contiguous()
110
+ scales2 = stack_and_dev(scales2_l)
111
+ g_idx2 = stack_and_dev(g_idx2_l)
112
+ sort_indices2 = stack_and_dev(sort_indices2_l)
113
+
114
+ score = torch.randn((m, e), device="cuda", dtype=dtype)
115
+
116
+ topk_weights, topk_ids = fused_topk(a, score, topk, False)
117
+
118
+ triton_output = fused_moe(
119
+ a,
120
+ w_ref1.transpose(1, 2).contiguous(),
121
+ w_ref2.transpose(1, 2).contiguous(),
122
+ score,
123
+ topk,
124
+ renormalize=False,
125
+ )
126
+ marlin_output = fused_marlin_moe(
127
+ a,
128
+ qweight1,
129
+ qweight2,
130
+ scales1,
131
+ scales2,
132
+ score,
133
+ topk_weights,
134
+ topk_ids,
135
+ g_idx1=g_idx1,
136
+ g_idx2=g_idx2,
137
+ sort_indices1=sort_indices1,
138
+ sort_indices2=sort_indices2,
139
+ num_bits=num_bits,
140
+ is_k_full=is_k_full,
141
+ )
142
+
143
+ assert compute_max_diff(marlin_output, triton_output) < 4e-2
144
+
145
+ token_expert_indicies = torch.empty(m, topk, dtype=torch.int32, device=a.device)
146
+
147
+ opcheck(
148
+ ops.topk_softmax,
149
+ (
150
+ topk_weights,
151
+ topk_ids,
152
+ token_expert_indicies,
153
+ score.float(),
154
+ ),
155
+ )
156
+
157
+ block_size_m = 4
158
+
159
+ sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, e)
160
+
161
+ max_workspace_size = ((m + 255) // 256) * (max(2 * n, k) // 64) * 16
162
+ workspace = torch.zeros(
163
+ max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False
164
+ )
165
+
166
+ zp = torch.empty((0, 0), dtype=dtype, device="cuda", requires_grad=False)
167
+ opcheck(
168
+ ops.marlin_gemm_moe,
169
+ (
170
+ a,
171
+ qweight1,
172
+ sorted_token_ids,
173
+ topk_weights,
174
+ topk_ids,
175
+ scales1,
176
+ zp,
177
+ g_idx1,
178
+ sort_indices1,
179
+ workspace,
180
+ quant_type.id,
181
+ m,
182
+ 2 * n,
183
+ k,
184
+ True,
185
+ e,
186
+ topk,
187
+ block_size_m,
188
+ True,
189
+ False,
190
+ ),
191
+ )
192
+
193
+
194
+ def test_moe_align_block_size_opcheck():
195
+ num_experts = 4
196
+ block_size = 4
197
+ topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda")
198
+
199
+ max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
200
+ sorted_ids = torch.empty(
201
+ (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
202
+ )
203
+ sorted_ids.fill_(topk_ids.numel())
204
+ max_num_m_blocks = max_num_tokens_padded // block_size
205
+ expert_ids = torch.empty(
206
+ (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
207
+ )
208
+ num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
209
+
210
+ opcheck(
211
+ ops.moe_align_block_size,
212
+ (
213
+ topk_ids,
214
+ num_experts,
215
+ block_size,
216
+ sorted_ids,
217
+ expert_ids,
218
+ num_tokens_post_pad,
219
+ ),
220
+ )
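As a quick check on the buffer sizing used in `test_moe_align_block_size_opcheck` (illustrative arithmetic only): in the worst case every expert's last block is partially filled, so each expert contributes at most `block_size - 1` padding slots.

```python
# With topk_ids of shape (3, 4), num_experts = 4 and block_size = 4:
numel = 3 * 4                                  # 12 top-k assignments
max_num_tokens_padded = numel + 4 * (4 - 1)    # numel + num_experts * (block_size - 1) = 24
max_num_m_blocks = max_num_tokens_padded // 4  # 24 // 4 = 6 expert_ids entries
```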
test/kernels/utils.py ADDED
@@ -0,0 +1,78 @@
1
+ """Kernel test utils"""
2
+
3
+ import itertools
4
+ import random
5
+ import unittest
6
+ from numbers import Number
7
+ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
8
+
9
+ import pytest
10
+ import torch
11
+ from torch._prims_common import TensorLikeType
12
+
13
+ # For now, disable "test_aot_dispatch_dynamic" since there are some
14
+ # bugs related to this test in PyTorch 2.4.
15
+ DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
16
+ "test_schema",
17
+ "test_autograd_registration",
18
+ "test_faketensor",
19
+ )
20
+
21
+ ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
22
+ "test_schema",
23
+ "test_autograd_registration",
24
+ "test_faketensor",
25
+ "test_aot_dispatch_dynamic",
26
+ )
27
+
28
+
29
+ # Copied/modified from torch._refs.__init__.py
30
+ def fp8_allclose(
31
+ a: TensorLikeType,
32
+ b: TensorLikeType,
33
+ rtol: float = 1e-05,
34
+ atol: float = 1e-08,
35
+ equal_nan: bool = False,
36
+ ) -> bool:
37
+ """
38
+ Reference implementation of torch.allclose
39
+ """
40
+ torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol)
41
+
42
+ return bool(
43
+ torch.all(
44
+ torch.isclose(
45
+ a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan
46
+ )
47
+ ).item()
48
+ )
49
+
50
+
51
+ def compute_max_diff(output, output_ref):
52
+ return torch.mean(torch.abs(output - output_ref)) / torch.mean(
53
+ torch.abs(output_ref))
54
+
55
+
56
+ # A special version of op check that has a restricted default set of test_utils
57
+ # and a patched version of allclose that supports fp8 types.
58
+ def opcheck(
59
+ op: Union[
60
+ torch._ops.OpOverload,
61
+ torch._ops.OpOverloadPacket,
62
+ torch._library.custom_ops.CustomOpDef,
63
+ ],
64
+ args: Tuple[Any, ...],
65
+ kwargs: Optional[Dict[str, Any]] = None,
66
+ *,
67
+ test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
68
+ raise_exception: bool = True,
69
+ cond: bool = True
70
+ ) -> Dict[str, str]:
71
+ with unittest.mock.patch("torch.allclose", new=fp8_allclose):
72
+ return (
73
+ torch.library.opcheck(
74
+ op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
75
+ )
76
+ if cond
77
+ else {}
78
+ )