Crystalcareai
/

GemMoE-Beta-1

Text Generation

Model card Files Files and versions Community

Crystalcareai committed on Mar 15

Commit

bad6a57

•

1 Parent(s): 6f6cbec

Update modeling_gemmoe.py

Files changed (1) hide show

modeling_gemmoe.py +14 -11

modeling_gemmoe.py CHANGED Viewed

@@ -655,31 +655,34 @@ class GemmoeSparseMoeBlock(nn.Module):
         self.num_experts = config.num_local_experts
         self.top_k = 2
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         # Pre-change side of the diff: lines prefixed with "-" are the ones
         # this commit removes. Returns (final_hidden_states, router_logits).
         # The input is unpacked as (batch_size, sequence_length, hidden_dim).
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         # Flatten batch and sequence so that each row is a single token.
         hidden_states = hidden_states.view(-1, hidden_dim)
         # One logit per expert for every token row.
         router_logits = self.gate(hidden_states)
         routing_weights = F.softmax(router_logits, dim=1)
         # Keep the top_k largest routing weights per token (unsorted) together
         # with the expert indices they belong to.
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         # Renormalise so the kept weights of each token sum to 1.
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
-        expert_outputs = []
         for i in range(self.num_experts):
             # NOTE(review): topk_idx has only self.top_k columns, yet i runs
             # over all self.num_experts; the selected values are expert ids
             # but are used here as row indices into hidden_states. This looks
             # unintended -- confirm against the replacement code.
-            expert_input = hidden_states[topk_idx[:, i]]
-            expert_output = self.experts[i](expert_input)
-            expert_outputs.append(expert_output)
         # Stack the per-expert results along a new dim 1, regroup them as
         # (batch, sequence, top_k, -1) and take the weighted sum over dim 2.
-        expert_outputs = torch.stack(expert_outputs, dim=1)
-        expert_outputs = expert_outputs.view(batch_size, sequence_length, self.top_k, -1)
-        topk_weight = topk_weight.view(batch_size, sequence_length, self.top_k, 1)
-        final_hidden_states = (expert_outputs * topk_weight).sum(dim=2)
-        final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
         # Both outputs are cast to the dtype of the flattened hidden_states.
         return final_hidden_states.to(hidden_states.dtype), router_logits.to(hidden_states.dtype)

         self.num_experts = config.num_local_experts
         self.top_k = 2
+        # gating
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         # Route every token to its top_k experts and return the weighted sum
         # of their outputs together with the raw router logits.
         # The input is unpacked as (batch_size, sequence_length, hidden_dim).
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         # Flatten batch and sequence so that each row is a single token.
         hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate(hidden_states)
         routing_weights = F.softmax(router_logits, dim=1)
         # Keep the top_k largest routing weights per token (unsorted) together
         # with the expert indices they belong to.
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         # Renormalise so the kept weights of each token sum to 1.
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
         # Repeat each token row top_k times in place, so that row
         # t * top_k + s lines up with entry [t, s] of the flattened topk_idx.
+        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
         # Uninitialised output buffer: each row carries exactly one expert id
         # and the loop below visits every expert, so every row gets written.
+        y = torch.empty_like(hidden_states)
+        flat_topk_idx = topk_idx.view(-1)
         for i in range(self.num_experts):
+            expert = self.experts[i]
             # Run expert i only on the rows that selected it, then scatter the
             # result back to the same rows. Assumes the expert returns rows of
             # the same width as its input -- TODO confirm.
+            expert_output = expert(hidden_states[flat_topk_idx == i])
+            y[flat_topk_idx == i] = expert_output
         # Regroup to (tokens, top_k, hidden), scale each slot by its routing
         # weight and sum over the top_k slots.
+        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
         # hidden_states now names the repeated tensor; only its dtype is used.
         return final_hidden_states.to(hidden_states.dtype), router_logits.to(hidden_states.dtype)