Guo committed on
Commit e815555
1 Parent(s): 90dabef
Files changed (3):
  1. gate.py +100 -0
  2. modeling_jetmoe.py +5 -5
  3. moe.py +277 -0
gate.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class top_k_gating(nn.Module):
+     def __init__(
+         self,
+         input_size,
+         num_experts,
+         top_k,
+     ):
+         """
+         Initialize the top-k gating mechanism.
+
+         Args:
+             input_size (int): Size of the input.
+             num_experts (int): Number of experts.
+             top_k (int): Number of top experts to select.
+         """
+         super().__init__()
+
+         self.num_experts = num_experts
+         self.input_size = input_size
+         assert top_k <= num_experts
+         self.top_k = top_k
+
+         self.layer = nn.Linear(input_size, num_experts, bias=False)
+
+     def extra_repr(self):
+         """
+         Return extra representation string for the module.
+         """
+         return 'k={}, num_experts={}'.format(
+             self.top_k, self.num_experts)
+
+     def compute_aux_loss(self, probs, logits, gates):
+         """
+         Calculate and return the auxiliary loss for the current batch.
+
+         Args:
+             probs (torch.Tensor): Softmax probabilities over all experts, shape [batch_size, num_experts].
+             logits (torch.Tensor): Raw router logits, shape [batch_size, num_experts].
+             gates (torch.Tensor): Sparse gating values, shape [batch_size, num_experts].
+
+         Returns:
+             torch.Tensor: The calculated auxiliary loss (switch load-balancing loss plus 0.1 * z-loss).
+         """
+         count = logits.size(0)
+         probs = probs.sum(0)
+         freq = (gates > 0).float().sum(0)
+         lsesq = (torch.log(torch.exp(logits).sum(dim=-1)) ** 2).sum()
+
+         switchloss = self.num_experts * (
+             F.normalize(probs, p=1, dim=0) *
+             F.normalize(freq, p=1, dim=0)
+         ).sum()
+         zloss = lsesq / count
+         loss = switchloss + 0.1 * zloss
+
+         return loss
+
+     def forward(self, x):
+         """
+         Compute the top-k gating for the input.
+
+         See paper: https://arxiv.org/abs/1701.06538.
+
+         Args:
+             x (torch.Tensor): Input tensor with shape [batch_size, input_size].
+
+         Returns:
+             torch.Tensor: Top-k expert indices with shape [batch_size, top_k].
+             torch.Tensor: Top-k gating values with shape [batch_size, top_k].
+         """
+         logits = self.layer(x).float()
+         top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)
+         top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(x)
+
+         if self.training:
+             probs = torch.softmax(logits, dim=1)
+             zeros = torch.zeros_like(probs)
+             zeros = zeros.to(top_k_gates.dtype)  # convert zeros to match top_k_gates dtype
+             gates = zeros.scatter(1, top_k_indices, top_k_gates)
+             self.loss = self.compute_aux_loss(probs, logits, gates)
+         else:
+             self.loss = 0
+
+         return top_k_indices, top_k_gates
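
For orientation, the router can be exercised on its own. The sketch below is a minimal, hypothetical smoke test (the sizes 8/4/2 and the batch of 3 are illustrative, not taken from the commit) showing the shapes `forward` returns and that the auxiliary loss is only populated in training mode.

# Hypothetical smoke test for top_k_gating; all sizes are illustrative.
import torch
from gate import top_k_gating  # inside the packaged model this is `from .gate import top_k_gating`

router = top_k_gating(input_size=8, num_experts=4, top_k=2)
router.train()

x = torch.randn(3, 8)                    # [batch_size, input_size]
top_k_indices, top_k_gates = router(x)   # both have shape [batch_size, top_k]

print(top_k_indices.shape, top_k_gates.shape)                 # torch.Size([3, 2]) torch.Size([3, 2])
print(torch.allclose(top_k_gates.sum(dim=1), torch.ones(3)))  # True: softmax over the selected logits
print(router.loss)                       # switch loss + 0.1 * z-loss; set to 0 in eval mode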
modeling_jetmoe.py CHANGED
@@ -9,7 +9,7 @@ from torch import nn
  from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
  from torch.nn import functional as F
 
- import megablocks
+ #import megablocks
  from transformers.modeling_outputs import (
      BaseModelOutputWithPast,
      CausalLMOutputWithPast,
@@ -28,7 +28,7 @@ from transformers.utils import (
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
  from transformers.cache_utils import Cache, DynamicCache
  from .configuration_jetmoe import JetMoEConfig
- from jetmoe_model.utils import moe
+ from . import moe
 
  if is_flash_attn_2_available():
      from flash_attn import flash_attn_func, flash_attn_varlen_func
@@ -701,9 +701,9 @@ class JetMoEBlock(nn.Module):
          self.self_attention = JETMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
          self.post_attention_layernorm = JetMoERMSNorm(config.hidden_size)
 
-         moe_args = megablocks.layers.arguments.from_megatron(config)
-         moe_args.activation_fn = F.silu
-         moe_args.return_bias = False
+         # moe_args = megablocks.layers.arguments.from_megatron(config)
+         # moe_args.activation_fn = F.silu
+         # moe_args.return_bias = False
          # self.mlp = megablocks.layers.dmoe.dMoE(moe_args)
          self.mlp = moe.MoE(
              input_size=config.hidden_size,
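
The hunk above is truncated after `input_size=config.hidden_size,`, so the remaining MoE constructor arguments are not visible in this diff. For orientation only, a hedged sketch of an equivalent standalone construction follows; every value and every argument other than `input_size` (which stands in for `config.hidden_size`) is an illustrative assumption, not something read from the commit.

# Hypothetical standalone construction mirroring the JetMoEBlock change above.
# All sizes below are illustrative assumptions. activation=F.silu is a guess
# motivated by the commented-out megablocks path, which set moe_args.activation_fn = F.silu.
import torch.nn.functional as F
from moe import MoE  # packaged as `from . import moe` inside modeling_jetmoe.py

mlp = MoE(
    input_size=1024,    # stands in for config.hidden_size
    hidden_size=2048,   # assumed per-expert hidden size
    num_experts=8,      # assumed number of experts
    top_k=2,            # assumed experts activated per token
    activation=F.silu,
    glu=True,
)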
moe.py ADDED
@@ -0,0 +1,277 @@
+ import math
+ from typing import List
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ import scattermoe
+ from .gate import top_k_gating
+
+
+ class MoE(nn.Module):
+     """
+     A sparsely gated mixture-of-experts layer with one-layer feed-forward networks as experts.
+
+     Args:
+         input_size: integer - size of the input
+         hidden_size: integer - size of each expert's hidden layer
+         num_experts: an integer - number of experts
+         top_k: an integer - how many experts to use for each batch element
+         bias: a boolean - whether to include bias in linear layers
+         activation: an activation function to apply to the experts' hidden outputs
+         glu: a boolean - whether the experts use a gated linear unit (GLU)
+     """
+
+     def __init__(
+         self,
+         input_size,
+         hidden_size,
+         num_experts,
+         top_k,
+         bias=True,
+         activation=None,
+         glu=True,
+     ):
+         super(MoE, self).__init__()
+
+         self.num_experts = num_experts
+         self.input_size = input_size
+         self.glu = glu
+         if bias:
+             self.bias = torch.nn.Parameter(torch.empty(input_size))
+             torch.nn.init.zeros_(self.bias)
+         else:
+             self.bias = None
+         self.input_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, input_size, hidden_size * 2 if glu else hidden_size)
+         self.output_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, hidden_size, input_size)
+         self.top_k = min(top_k, self.num_experts)
+         self.activation = activation
+
+         self.router = top_k_gating(
+             input_size=input_size,
+             num_experts=num_experts,
+             top_k=top_k,
+         )
+
+     def extra_repr(self):
+         return 'k={}, e={}'.format(
+             self.top_k, self.num_experts)
+
+     def get_aux_loss_and_clear(self):
+         """
+         Get the accumulated auxiliary loss and clear it.
+
+         Returns:
+             float: Accumulated auxiliary loss.
+         """
+         return self.gate.get_aux_loss_and_clear()
+
+     def compute_gate(self, x):
+         top_k_indices, self.top_k_gates = self.router(x)
+
+         with torch.no_grad():
+             self.sorted_expert_idxs, self.sorted_scattered_idxs = scattermoe.kernels.ops.flatten_and_sort(top_k_indices)
+             self.padded_block_idxs, self.expert_offsets = scattermoe.kernels.ops.padded_block_indices(self.sorted_expert_idxs, self.num_experts)
+
+         return self.router.loss
+
+     def batch_forward(self, x):
+         """
+         Forward pass of the mixture-of-experts layer for batched input.
+
+         Args:
+             x (Tensor): Input tensor of shape [batch_size, length, input_size].
+
+         Returns:
+             Tensor: Output tensor of shape [batch_size, length, input_size].
+             Tensor: Auxiliary (load-balancing) loss.
+         """
+         bsz, length, emb_size = x.size()
+         x = x.reshape(-1, emb_size)
+
+         loss = self.compute_gate(x)
+
+         h = self.input_linear(
+             x, self.top_k,
+             self.sorted_expert_idxs, self.sorted_scattered_idxs,
+             self.padded_block_idxs, self.expert_offsets,
+             grouped_out=True
+         )
+
+         if self.glu:
+             h, g = h.chunk(2, dim=-1)
+             h = self.activation(h) * g
+         else:
+             h = self.activation(h)
+
+         y = self.output_linear(
+             h, 1,
+             self.sorted_expert_idxs, self.sorted_scattered_idxs,
+             self.padded_block_idxs, self.expert_offsets,
+             grouped_in=True,
+             gates=self.top_k_gates,
+         )
+
+         y = y.view(bsz, length, self.input_size)
+         if self.bias is not None:
+             y = y + self.bias
+         return y, loss
+
+     def single_forward(self, x):
+         bsz, length, emb_size = x.size()
+
+         x = x.reshape(1, self.input_size)
+         top_k_indices, top_k_gates = self.router(x)
+         loss = self.router.loss
+
+         y_list = []
+         for i in range(self.top_k):
+             expert_idx = top_k_indices[0, i]
+
+             h = F.linear(x, self.input_linear.weight[expert_idx])
+             if self.glu:
+                 h, g = h.chunk(2, dim=-1)
+                 h = self.activation(h) * g
+             else:
+                 h = self.activation(h)
+             y = F.linear(h, self.output_linear.weight[expert_idx]) * top_k_gates[0, i]
+
+             y_list.append(y)
+
+         y = sum(y_list)
+         y = y.view(bsz, length, self.input_size)
+         if self.bias is not None:
+             y = y + self.bias
+         return y, loss
+
+     def forward(self, x):
+         """
+         Forward pass of the mixture-of-experts layer.
+
+         Args:
+             x (Tensor): Input tensor of shape [batch_size, length, input_size].
+
+         Returns:
+             Tensor: Output tensor.
+             Tensor: Auxiliary loss.
+         """
+         bsz, length, emb_size = x.size()
+         if bsz * length == 1:
+             return self.single_forward(x)
+         else:
+             return self.batch_forward(x)
+
+     def batch_map(self, x):
+         """
+         Map input through the experts' input projections.
+
+         Args:
+             x (Tensor): Input tensor of shape [batch_size, length, input_size].
+
+         Returns:
+             Tensor: Mapped tensor of shape [batch_size, length, top_k, -1].
+             Tensor: Auxiliary loss.
+         """
+         bsz, length, emb_size = x.size()
+         x = x.reshape(-1, emb_size)
+         loss = self.compute_gate(x)
+
+         y = self.input_linear(
+             x, self.top_k,
+             self.sorted_expert_idxs, self.sorted_scattered_idxs,
+             self.padded_block_idxs, self.expert_offsets,
+         )
+         y = y.view(bsz, length, self.top_k, -1)
+         return y, loss
+
+     def single_map(self, x):
+         bsz, length, emb_size = x.size()
+
+         x = x.reshape(1, self.input_size)
+         self.top_k_indices, self.top_k_gates = self.router(x)
+         loss = self.router.loss
+
+         y_list = []
+         for i in range(self.top_k):
+             expert_idx = self.top_k_indices[0, i]
+             y = F.linear(x, self.input_linear.weight[expert_idx])
+             y_list.append(y)
+         y = torch.cat(y_list, dim=0)
+         y = y.view(bsz, length, self.top_k, -1)
+         return y, loss
+
+     def map(self, x):
+         """
+         Map input through the experts' input projections.
+
+         Args:
+             x (Tensor): Input tensor of shape [batch_size, length, input_size].
+
+         Returns:
+             Tensor: Mapped tensor of shape [batch_size, length, top_k, -1].
+             Tensor: Auxiliary loss.
+         """
+         bsz, length, emb_size = x.size()
+         if bsz * length == 1:
+             return self.single_map(x)
+         else:
+             return self.batch_map(x)
+
+     def batch_reduce(self, x):
+         """
+         Reduce the mapped output with the experts' output projections and gating weights.
+
+         Args:
+             x (Tensor): Mapped tensor of shape [batch_size, length, top_k, hidden_size].
+
+         Returns:
+             Tensor: Reduced output tensor of shape [batch_size, length, input_size].
+         """
+         bsz, length, k, emb_size = x.size()
+         assert k == self.top_k
+         x = x.reshape(-1, emb_size)
+
+         y = self.output_linear(
+             x, 1,
+             self.sorted_expert_idxs, self.sorted_scattered_idxs,
+             self.padded_block_idxs, self.expert_offsets,
+             gates=self.top_k_gates,
+         )
+         y = y.view(bsz, length, self.input_size)
+         return y
+
+     def single_reduce(self, x):
+         bsz, length, k, emb_size = x.size()
+
+         x = x.reshape(k, emb_size)
+
+         y_list = []
+         for i in range(self.top_k):
+             expert_idx = self.top_k_indices[0, i]
+             y = F.linear(x[i], self.output_linear.weight[expert_idx]) * self.top_k_gates[0, i]
+             y_list.append(y)
+         y = sum(y_list)
+         y = y.view(bsz, length, self.input_size)
+         return y
+
+     def reduce(self, x):
+         """
+         Reduce the mapped output.
+
+         Args:
+             x (Tensor): Mapped tensor of shape [batch_size, length, top_k, hidden_size].
+
+         Returns:
+             Tensor: Reduced output tensor of shape [batch_size, length, input_size].
+         """
+         bsz, length, k, emb_size = x.size()
+         if bsz * length == 1:
+             return self.single_reduce(x)
+         else:
+             return self.batch_reduce(x)
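
To close the loop, here is a hedged usage sketch of the layer above. The sizes are illustrative; the batched path runs through scattermoe's Triton kernels and therefore assumes a CUDA device, while the single-token path (`batch_size * length == 1`) falls back to plain `F.linear` calls. `glu=False` is chosen only to keep the `map`/`reduce` shapes simple, since with `glu=True` the mapped tensor carries `2 * hidden_size` features that the caller must chunk and gate itself.

# Hypothetical usage of the MoE layer above; all sizes are illustrative.
import torch
import torch.nn.functional as F
from moe import MoE  # imported as `from . import moe` inside modeling_jetmoe.py

moe_layer = MoE(input_size=64, hidden_size=128, num_experts=4, top_k=2,
                activation=F.silu, glu=False).cuda()
moe_layer.train()

x = torch.randn(2, 10, 64, device="cuda")   # [batch_size, length, input_size]
y, aux_loss = moe_layer(x)                  # y: [2, 10, 64]; aux_loss is the router's balancing loss

# map/reduce split the same computation in two: `map` applies only the per-expert
# input projections, `reduce` applies the output projections weighted by the stored
# gates, so a caller can insert its own computation between the two calls.
h, aux_loss = moe_layer.map(x)              # [2, 10, top_k, hidden_size]
out = moe_layer.reduce(F.silu(h))           # [2, 10, 64]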