fix(modeling_phi3): Fixes inv_freq not being re-computed for extended RoPE.
modeling_phi3.py  CHANGED  (+14 -14)
@@ -163,18 +163,18 @@ class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
 
     @torch.no_grad()
     def forward(self, x, position_ids, seq_len=None):
-
-        if
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.original_max_position_embeddings:
             ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
         else:
             ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
 
-
-
-
-
-        )
+        self.inv_freq = 1.0 / (
+            ext_factors
+            * self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+        )
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
 
         # Force float32 since bfloat16 loses precision on long contexts
         # See https://github.com/huggingface/transformers/pull/29285

@@ -215,18 +215,18 @@ class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
 
     @torch.no_grad()
     def forward(self, x, position_ids, seq_len=None):
-
-        if
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.original_max_position_embeddings:
             ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
         else:
             ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
 
-
-
-
-
-        )
+        self.inv_freq = 1.0 / (
+            ext_factors
+            * self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+        )
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
 
         # Force float32 since bfloat16 loses precision on long contexts
         # See https://github.com/huggingface/transformers/pull/29285
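
Both hunks make the same change: self.inv_freq is rebuilt from the selected rescale factors on every forward call, so the frequencies actually switch when torch.max(position_ids) + 1 crosses self.original_max_position_embeddings. Below is a minimal, self-contained sketch of that recomputation outside the model classes; scaled_inv_freq and the toy dim/base/factor values are illustrative stand-ins, not part of modeling_phi3.py and not the real Phi-3 configuration.

import torch

def scaled_inv_freq(dim, base, seq_len, original_max_pos, short_factor, long_factor, device="cpu"):
    # Hypothetical helper (not in modeling_phi3.py): mirrors the recomputation
    # that the patched forward() now performs on every call.
    factors = long_factor if seq_len > original_max_pos else short_factor
    ext_factors = torch.tensor(factors, dtype=torch.float32, device=device)
    # inv_freq[i] = 1 / (factor[i] * base^(2i / dim)); a factor > 1 shrinks the
    # frequency, i.e. stretches that rotary wavelength for the extended context.
    exponents = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    return 1.0 / (ext_factors * base**exponents)

# Toy values (not the real Phi-3 settings), with len(factors) == dim // 2.
dim, base, original_max_pos = 8, 10000.0, 4096
short_factors = [1.0, 1.0, 1.0, 1.0]
long_factors = [1.0, 2.0, 4.0, 8.0]

inside = scaled_inv_freq(dim, base, seq_len=1024, original_max_pos=original_max_pos,
                         short_factor=short_factors, long_factor=long_factors)
extended = scaled_inv_freq(dim, base, seq_len=8192, original_max_pos=original_max_pos,
                           short_factor=short_factors, long_factor=long_factors)
print(inside)    # frequencies built from short_factor
print(extended)  # differ, so a cached inv_freq is stale once the boundary is crossed

Since long_factor and short_factor generally differ, an inv_freq computed in one regime is wrong in the other; recomputing it inside forward, as both Phi3SuScaledRotaryEmbedding and Phi3YarnScaledRotaryEmbedding now do, is what the commit title refers to.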