Update modeling_grok.py
modeling_grok.py  (CHANGED, +4 −4)
@@ -84,7 +84,7 @@ class GrokRMSNorm(nn.Module):
         GrokRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.float32))
         self.variance_epsilon = eps
 
     def forward(self, hidden_states):
@@ -92,7 +92,7 @@ class GrokRMSNorm(nn.Module):
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
+        return (self.weight * hidden_states).to(input_dtype)
 
 
 # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Grok
@@ -338,7 +338,7 @@ class GrokDecoderLayer(nn.Module):
         self.top_k = config.num_experts_per_tok
 
         self.multi_head_attention = GrokAttention(config, layer_idx)
-        self.router = nn.Linear(self.hidden_size, self.num_experts, bias=False)
+        self.router = nn.Linear(self.hidden_size, self.num_experts, dtype=torch.float32, bias=False)
         self.moe = nn.ModuleList([GrokBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
         self.rms_norm = GrokRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -400,7 +400,7 @@ class GrokDecoderLayer(nn.Module):
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
-        router_logits = self.router(hidden_states)
+        router_logits = self.router(hidden_states.to(torch.float))
 
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)