BucketOfFish committed
Commit 455129a
1 Parent(s): 76e8ee6

Edited comments

Files changed (2)
  1. attention.py +2 -10
  2. phi2_model.py +3 -3
attention.py CHANGED
@@ -19,9 +19,7 @@ except ImportError:
 
 
 class RotaryEmbedding(nn.Module):
-    """Rotary positional embedding (RoPE) from Phi2.
-    See https://www.youtube.com/watch?v=C6rV8BsrrCc
-    """
+    """Rotary positional embedding (RoPE). See https://www.youtube.com/watch?v=C6rV8BsrrCc"""
 
     def __init__(
         self,
@@ -129,8 +127,6 @@ class RotaryEmbedding(nn.Module):
 
 
 class SelfAttention(nn.Module):
-    """Self-attention layer, taken from Phi2 model."""
-
     def __init__(
         self,
         qk_scale: float | None = None, # will use 1/sqrt(d) if set to None
@@ -174,8 +170,6 @@ class SelfAttention(nn.Module):
 
 
 class CrossAttention(nn.Module):
-    """Cross-attention layer, taken from Phi2 model."""
-
     def __init__(
         self,
         qk_scale: float | None = None, # will use 1/sqrt(d) if set to None
@@ -225,8 +219,6 @@ class CrossAttention(nn.Module):
 
 
 class MLP(nn.Module):
-    """Taken from Phi2 as well."""
-
     def __init__(
         self,
         d_embedding: int,
@@ -489,7 +481,7 @@ class MHA(nn.Module):
 
 
 class ParallelAttentionBlock(nn.Module):
-    """From Phi2. Calculates attention and MLP in parallel. See 'Simplifying Transformer Blocks', Fig. 1 'Parallel'."""
+    """Calculates attention and MLP in parallel."""
 
     def __init__(
         self,
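For orientation, since the trimmed docstring now only links a video: RoPE rotates each pair of query/key channels by an angle that grows linearly with the token position, so relative position shows up as a phase difference in the attention dot product. The sketch below is illustrative only and is not code from attention.py; the function name, the (batch, seq_len, n_heads, d_head) layout, and the half-split channel pairing are assumptions.

import torch


def apply_rope_sketch(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Rotate channel pairs of x by position-dependent angles. x: (batch, seq_len, n_heads, d_head)."""
    _, seq_len, _, d_head = x.shape
    half = d_head // 2  # assumes d_head is even
    # One frequency per channel pair: base ** (-i / half) for i = 0 .. half - 1.
    inv_freq = base ** (-torch.arange(half, dtype=torch.float32) / half)
    # One rotation angle per (position, frequency) combination.
    angles = torch.arange(seq_len, dtype=torch.float32)[:, None] * inv_freq  # (seq_len, half)
    cos = angles.cos()[None, :, None, :]  # broadcast over batch and heads
    sin = angles.sin()[None, :, None, :]
    x1, x2 = x[..., :half], x[..., half:]
    # Standard 2D rotation applied to each (x1_i, x2_i) channel pair.
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

Applied to queries and keys of shape (1, 16, 8, 64), for example, the shapes are unchanged; position is encoded purely through the rotation.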
 
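The shortened ParallelAttentionBlock docstring also drops the 'Simplifying Transformer Blocks' reference, so a reminder of the layout: attention and the MLP both read the same normalized input, and their outputs are added to the residual stream in one step rather than sequentially. Below is a minimal sketch under assumed names and sizes (nn.MultiheadAttention, a 4x MLP width, one shared pre-norm); it is not the repo's class.

import torch
import torch.nn as nn


class ParallelBlockSketch(nn.Module):
    """Illustrative parallel transformer block: attention and MLP share one residual add."""

    def __init__(self, d_embedding: int, n_heads: int):
        super().__init__()
        self.ln = nn.LayerNorm(d_embedding)
        self.attn = nn.MultiheadAttention(d_embedding, n_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(d_embedding, 4 * d_embedding),
            nn.GELU(),
            nn.Linear(4 * d_embedding, d_embedding),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.ln(x)  # one shared pre-norm over the block input
        attn_out, _ = self.attn(h, h, h, need_weights=False)
        # Parallel formulation: both branch outputs join the residual together.
        return x + attn_out + self.mlp(h)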
phi2_model.py CHANGED
@@ -37,7 +37,7 @@ class Phi2PreTrainedModel(PreTrainedModel):
         input_ids: torch.LongTensor, # dim: (batch_size, seq_len)
         kv_cache: KVCache | None = None,
         key_padding_mask: torch.LongTensor | torch.BoolTensor | None = None,
-        **kwargs,
+        **kwargs, # has to be here
     ) -> dict[str, Any]:
         if not kv_cache:
             kv_cache = KVCache(
@@ -61,7 +61,7 @@ class Phi2PreTrainedModel(PreTrainedModel):
 
 
 class Embedding(nn.Module):
-    """Token embedding with dropout from Phi2."""
+    """Token embedding with dropout."""
 
     def __init__(
         self,
@@ -150,7 +150,7 @@ class Phi2ModelForCausalLM(Phi2PreTrainedModel):
         kv_cache: KVCache | None = None,
         key_padding_mask: torch.BoolTensor | None = None,
         labels: torch.LongTensor | None = None,
-        **kwargs,
+        **kwargs, # has to be here
     ) -> CausalLMOutputWithPast:
         x = self.model(input_ids, kv_cache=kv_cache, key_padding_mask=key_padding_mask)
         x = self.lm_head_layer_norm(x)
 
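The new "has to be here" comments flag that **kwargs is load-bearing: during generation, Hugging Face's utilities forward extra keyword arguments (for example use_cache or attention_mask) into model methods like these even when the model ignores them, and a signature without a catch-all would raise a TypeError. A minimal, hypothetical illustration (function names made up):

def forward_strict(input_ids, kv_cache=None):
    return input_ids


def forward_permissive(input_ids, kv_cache=None, **kwargs):
    # Generation-time arguments this model does not use are silently swallowed here.
    return input_ids


forward_permissive(input_ids=[1, 2, 3], use_cache=True)   # works
# forward_strict(input_ids=[1, 2, 3], use_cache=True)     # TypeError: unexpected keyword argument 'use_cache'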