Crystalcareai
/

Gemma-7b-Fixed

Text Generation

Transformers

Safetensors

gemmoe

custom_code

Model card Files Files and versions Community

Crystalcareai committed on Mar 18, 2024

Commit

8f32857

verified ·

1 Parent(s): fe54712

Update modeling_gemmoe.py

Browse files

Files changed (1) hide show

modeling_gemmoe.py +6 -17

modeling_gemmoe.py CHANGED Viewed

@@ -85,14 +85,9 @@ class GemmoeRMSNorm(nn.Module):
         self.eps = eps
         self.weight = nn.Parameter(torch.zeros(dim))
-    def _norm(self, x):
         x_float = x.float()
         normed_x = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
-        return normed_x
-    def forward(self, x):
-        normed_x = self._norm(x)
-        # Downcast the result to the original dtype at the end
         normed_x = normed_x.type_as(x)
         return normed_x * (self.weight + 1)
@@ -108,11 +103,10 @@ class GemmoeRotaryEmbedding(nn.Module):
     def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
-        freq_exponents = (2.0 / self.dim) * (torch.arange(self.dim // 2, dtype=torch.float32, device="cpu").float())
         timescale = self.base ** freq_exponents
-        positions = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.float32).float()
-        radians_new = positions[..., None] / timescale[None, None, :]
-        radians_new = radians_new.squeeze(0)
         emb = torch.cat((radians_new, radians_new), dim=-1)
         cos = emb.cos().to(device=device, dtype=dtype, non_blocking=True)
         sin = emb.sin().to(device=device, dtype=dtype, non_blocking=True)
@@ -120,20 +114,15 @@ class GemmoeRotaryEmbedding(nn.Module):
         self.register_buffer("sin_cached", sin, persistent=False)
     def forward(self, x, position_ids=None, seq_len=None):
-        if seq_len is None:
-            seq_len = x.size(2)
         if seq_len > self.max_seq_len_cached:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
         return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
-# Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     seq_len, dim = q.shape[-2], q.shape[-1]
     cos = cos[:seq_len].view(1, 1, seq_len, dim)

         self.eps = eps
         self.weight = nn.Parameter(torch.zeros(dim))
+    def forward(self, x):
         x_float = x.float()
         normed_x = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
         normed_x = normed_x.type_as(x)
         return normed_x * (self.weight + 1)
     def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
+        freq_exponents = (2.0 / self.dim) * torch.arange(self.dim // 2, dtype=torch.float32, device="cpu")
         timescale = self.base ** freq_exponents
+        positions = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.float32)
+        radians_new = positions.view(-1, 1) / timescale.view(1, -1)
         emb = torch.cat((radians_new, radians_new), dim=-1)
         cos = emb.cos().to(device=device, dtype=dtype, non_blocking=True)
         sin = emb.sin().to(device=device, dtype=dtype, non_blocking=True)
         self.register_buffer("sin_cached", sin, persistent=False)
     def forward(self, x, position_ids=None, seq_len=None):
+        seq_len = x.size(2) if seq_len is None else seq_len
         if seq_len > self.max_seq_len_cached:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
         return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
 def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
     return torch.cat((-x2, x1), dim=-1)
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     seq_len, dim = q.shape[-2], q.shape[-1]
     cos = cos[:seq_len].view(1, 1, seq_len, dim)