bjoernp commited on
Commit
2989f91
1 Parent(s): 9b9979d

Update modeling_moe_mistral.py

Browse files
Files changed (1) hide show
  1. modeling_moe_mistral.py +4 -4
modeling_moe_mistral.py CHANGED
@@ -220,11 +220,11 @@ class MoE(nn.Module):
220
  flat_expert_indices = expert_indices.view(-1)
221
 
222
  x = x.repeat_interleave(self.num_experts_per_token, dim=0)
223
- x = torch.empty_like(x)
224
  for i, expert in enumerate(self.experts):
225
- x[flat_expert_indices == i] = expert(x[flat_expert_indices == i])
226
- x = (x.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
227
- return x.view(*orig_shape)
228
 
229
 
230
  # Copied from transformers.models.llama.modeling_llama.repeat_kv
 
220
  flat_expert_indices = expert_indices.view(-1)
221
 
222
  x = x.repeat_interleave(self.num_experts_per_token, dim=0)
223
+ y = torch.empty_like(x)
224
  for i, expert in enumerate(self.experts):
225
 + y[flat_expert_indices == i] = expert(x[flat_expert_indices == i])
226
+ y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
227
+ return y.view(*orig_shape)
228
 
229
 
230
  # Copied from transformers.models.llama.modeling_llama.repeat_kv