Revert "refactor sparse"
This reverts commit 170c7d7f55aeef1ca17e395ad279ca2098e57d53.

modeling_myolmoe.py (+12 -11)
@@ -223,7 +223,6 @@ class MyOLMoERouting(nn.Module):
         self.hidden_size = config.hidden_size
         self.routing_type = getattr(config, "routing_type", "sparse")
         self.router_temperature = getattr(config, "router_temperature", 1.0)
-        self.norm_topk_prob = getattr(config, "norm_topk_prob", False)

         # Shared components
         self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
@@ -231,13 +230,20 @@ class MyOLMoERouting(nn.Module):
         # For non-deterministic routing
         self.gumbel_noise = getattr(config, "gumbel_noise", 0.1)

-    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
-        print("TEST testtest123")
         hidden_states = hidden_states.view(-1, hidden_dim)
-        print("TEST 4564645testtest123")
         router_logits = self.gate(hidden_states)

+        # Always use softmax, even for "dense" routing
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+
+        if self.norm_topk_prob:
+            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
         if self.routing_type == "dense":
             # Dense routing - use all experts equally
             routing_weights = torch.ones_like(router_logits) / self.num_experts
@@ -256,16 +262,11 @@ class MyOLMoERouting(nn.Module):

         else: # Default sparse routing
             # Standard sparse top-k routing
-            routing_weights = F.softmax(router_logits, dim=-1
+            routing_weights = F.softmax(router_logits, dim=-1)
             routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)

-            if self.norm_topk_prob:
-                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-
-            routing_weights = routing_weights.to(hidden_states.dtype)
-
         return routing_weights, selected_experts, router_logits
-
+
 class OlmoeRotaryEmbedding(nn.Module):
     def __init__(self, config: OlmoeConfig, device=None):
         super().__init__()
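For reference, the routing path restored by this revert is standard top-k gating: softmax over the router logits, keep the top_k probabilities per token, optionally renormalize them (the norm_topk_prob branch), and cast back to the activation dtype. Below is a minimal standalone sketch of that sequence; the sizes (num_experts=8, top_k=2, hidden_dim=32) are illustrative only and are not taken from the actual config.

import torch
import torch.nn.functional as F

num_experts, top_k = 8, 2                     # illustrative sizes, not the real config
hidden_dim, num_tokens = 32, 64
gate = torch.nn.Linear(hidden_dim, num_experts, bias=False)

# hidden_states already flattened to (tokens, hidden_dim), as in forward()
hidden_states = torch.randn(num_tokens, hidden_dim)
router_logits = gate(hidden_states)           # (tokens, num_experts)

routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

# norm_topk_prob branch: rescale the kept probabilities so they sum to 1 per token
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
routing_weights = routing_weights.to(hidden_states.dtype)

print(routing_weights.shape, selected_experts.shape)  # torch.Size([64, 2]) torch.Size([64, 2])

The renormalization only matters when top_k < num_experts: without it, the selected weights sum to less than 1, which changes the overall scale of the expert mixture.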