Crystalcareai committed
Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+4, -1)
@@ -669,7 +669,7 @@ class GemmoeSparseMoeBlock(nn.Module):
 
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         hidden_states = hidden_states.to(self.gate.weight.device)
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
@@ -707,6 +707,9 @@ class GemmoeSparseMoeBlock(nn.Module):
 
         final_hidden_states.index_add_(0, token_indices, current_hidden_states)
 
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+
 
 class GemmoeDecoderLayer(nn.Module):
     def __init__(self, config: GemmoeConfig, layer_idx: int):
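
For context, a minimal, self-contained sketch of the behavior this commit gives GemmoeSparseMoeBlock.forward: flattened tokens are routed to experts, expert outputs are accumulated with index_add_ as in the hunk above, the result is reshaped back to (batch, seq, hidden), and the router logits are returned alongside it (router logits are typically consumed by an auxiliary load-balancing loss). The toy gate, expert layers, and top-2 routing below are assumptions modeled on Mixtral-style MoE code, not the actual Gemmoe implementation:

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

class ToySparseMoeBlock(nn.Module):
    """Stand-in for GemmoeSparseMoeBlock; the linear experts, gate, and
    top-2 routing here are assumptions, not the Gemmoe internals."""

    def __init__(self, hidden_dim: int = 8, num_experts: int = 4, top_k: int = 2):
        super().__init__()
        self.top_k = top_k
        self.gate = nn.Linear(hidden_dim, num_experts, bias=False)
        self.experts = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)]
        )

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        # Flatten (batch, seq) into one token axis, as in the patched forward.
        hidden_states = hidden_states.view(-1, hidden_dim)
        router_logits = self.gate(hidden_states)              # (tokens, experts)
        routing_weights = F.softmax(router_logits, dim=-1)
        weights, selected = routing_weights.topk(self.top_k, dim=-1)
        weights = weights / weights.sum(dim=-1, keepdim=True)  # renormalize top-k

        final_hidden_states = torch.zeros_like(hidden_states)
        for expert_idx, expert in enumerate(self.experts):
            # Tokens (and which of their top-k slots) routed to this expert.
            token_indices, k_slot = (selected == expert_idx).nonzero(as_tuple=True)
            if token_indices.numel() == 0:
                continue
            current_hidden_states = (
                expert(hidden_states[token_indices])
                * weights[token_indices, k_slot].unsqueeze(-1)
            )
            # Same accumulation step shown in the diff.
            final_hidden_states.index_add_(0, token_indices, current_hidden_states)

        # The lines this commit adds: restore (batch, seq, hidden) and
        # return the router logits alongside the hidden states.
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits

moe = ToySparseMoeBlock()
out, logits = moe(torch.randn(2, 5, 8))
print(out.shape, logits.shape)  # torch.Size([2, 5, 8]) torch.Size([10, 4])

With the new Tuple return type, callers unpack both values, matching how decoder layers in Mixtral-style models collect per-layer router logits.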