Tags: Text Generation · Transformers · PyTorch · Safetensors · English · gpt_refact · code · custom_code · Eval Results · 4 papers
Commit 94a3f6d (parent: 38cebfc), committed by svakhreev

Upload GPTRefactForCausalLM

config.json CHANGED
@@ -2,15 +2,13 @@
   "architectures": [
     "GPTRefactForCausalLM"
   ],
-  "attention_softmax_in_fp32": false,
-  "attn_pdrop": 0.1,
+  "attention_bias_in_fp32": true,
+  "attention_softmax_in_fp32": true,
   "auto_map": {
     "AutoConfig": "configuration_gpt_refact.GPTRefactConfig",
     "AutoModelForCausalLM": "modeling_gpt_refact.GPTRefactForCausalLM"
   },
-  "bos_token_id": -1,
   "do_sample": true,
-  "embd_pdrop": 0.1,
   "eos_token_id": 0,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
@@ -21,10 +19,8 @@
   "n_inner": null,
   "n_layer": 32,
   "n_positions": 4096,
-  "resid_pdrop": 0.1,
-  "scale_attention_softmax_in_fp32": false,
-  "scale_attn_weights": true,
-  "torch_dtype": "float16",
+  "scale_attention_softmax_in_fp32": true,
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.31.0",
   "use_cache": true,
   "vocab_size": 49216
configuration_gpt_refact.py CHANGED
@@ -1,7 +1,6 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -16,26 +15,23 @@ class GPTRefactConfig(PretrainedConfig):
     }
 
     def __init__(
-        self,
-        vocab_size: int = 49216,
-        n_positions: int = 4096,
-        n_embd: int = 1024,
-        n_layer: int = 32,
-        n_head: int = 64,
-        max_position_embeddings: int = 4096,
-        multi_query: bool = True,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        scale_attn_weights=True,
-        use_cache=True,
-        bos_token_id=-1,
-        eos_token_id=0,
-        attention_softmax_in_fp32=True,
-        scale_attention_softmax_in_fp32=True,
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        **kwargs,
+        self,
+        vocab_size: int = 49216,
+        n_positions: int = 4096,
+        n_embd: int = 1024,
+        n_layer: int = 32,
+        n_head: int = 64,
+        max_position_embeddings: int = 4096,
+        multi_query: bool = True,
+        layer_norm_epsilon: float = 1e-5,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        eos_token_id: int = 0,
+        attention_softmax_in_fp32: bool = True,
+        scale_attention_softmax_in_fp32: bool = True,
+        attention_bias_in_fp32: bool = True,
+        torch_dtype: str = 'bfloat16',
+        **kwargs,
     ):
         self.vocab_size = vocab_size
         self.n_positions = n_positions
@@ -43,19 +39,13 @@ class GPTRefactConfig(PretrainedConfig):
         self.n_layer = n_layer
         self.n_head = n_head
         self.n_inner = None
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-        self.scale_attn_weights = scale_attn_weights
         self.use_cache = use_cache
         self.attention_softmax_in_fp32 = attention_softmax_in_fp32
         self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
+        self.attention_bias_in_fp32 = attention_bias_in_fp32
         self.multi_query = multi_query
         self.max_position_embeddings = max_position_embeddings
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.torch_dtype = torch_dtype
+        super().__init__(eos_token_id=eos_token_id, **kwargs)
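
The constructor now carries the same defaults as config.json and no longer accepts the dropout, scale_attn_weights, or bos_token_id arguments. A small sketch of what instantiation looks like after this change, assuming the file is importable from the working directory:

from configuration_gpt_refact import GPTRefactConfig

config = GPTRefactConfig()                # defaults mirror the shipped config.json
print(config.attention_bias_in_fp32)      # True  -> ALiBi biases are built in float32
print(config.attention_softmax_in_fp32)   # True  -> attention softmax is upcast to float32
print(config.eos_token_id)                # 0     (bos_token_id is no longer set here)
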
generation_config.json CHANGED
@@ -1,6 +1,5 @@
 {
   "_from_model_config": true,
-  "bos_token_id": -1,
   "do_sample": true,
   "eos_token_id": 0,
   "transformers_version": "4.31.0"
modeling_gpt_refact.py CHANGED
@@ -21,29 +21,23 @@ logger = logging.get_logger(__name__)
 
 @torch.jit.script
 def upcast_masked_softmax(
-    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
+    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, softmax_dtype: torch.dtype
 ):
     input_dtype = x.dtype
-    x = x.to(softmax_dtype) * scale
+    x = x.to(softmax_dtype)
     x = torch.where(mask, x, mask_value)
     x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
     return x
 
 
 @torch.jit.script
-def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
+def upcast_softmax(x: torch.Tensor, softmax_dtype: torch.dtype):
     input_dtype = x.dtype
-    x = x.to(softmax_dtype) * scale
+    x = x.to(softmax_dtype)
     x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
     return x
 
 
-@torch.jit.script
-def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
-    x = torch.where(mask, x, mask_value)
-    x = torch.nn.functional.softmax(x, dim=-1)
-    return x
-
 @torch.jit.script
 def _get_slopes(attn_heads: int, dev: torch.device) -> torch.Tensor:
     """
@@ -76,7 +70,6 @@ def _get_slopes(attn_heads: int, dev: torch.device) -> torch.Tensor:
     m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (attn_heads - n), 2, device=dev))
     # Concatenate the slopes with the remaining slopes.
     m = torch.cat([m, m_hat])
-
     return m
 
 @torch.jit.script
@@ -85,8 +78,7 @@ get_alibi_biases(
         T: int,
         attn_heads: int,
         dev: torch.device,
-        dtype: torch.dtype,
-        causal: bool = True) -> torch.Tensor:
+        dtype: torch.dtype) -> torch.Tensor:
     """
     ## Calculate the attention biases matrix
     * `n_heads` is the number of heads in the attention layer
@@ -95,28 +87,26 @@ get_alibi_biases(
     """
 
     # Get slopes $m$ for each head
-    if causal:
-        mask = (torch.triu(torch.ones((T, T), device=dev)) == 1).transpose(0, 1)
-    else:
-        mask = torch.ones((T, T), device=dev, dtype=torch.bool)
+    mask = torch.ones((T, T), device=dev, dtype=torch.bool)
 
-    m = _get_slopes(attn_heads, dev)
+    m = _get_slopes(attn_heads, dev).to(dtype)
 
     # Calculate distances $[0, 1, \dots, N]$
     # Here we calculate the distances using the mask.
     #
     # Since it's causal mask we can just use $[0, 1, \dots, N]$ too.
     # `distance = torch.arange(mask.shape[1], dtype=torch.long, device=mask.device)[None, :]`
-    distance = mask.cumsum(dim=-1)
+    distance = mask.cumsum(dim=-1).to(dtype)
 
     # Multiply them pair-wise to get the AliBi bias matrix
     biases = distance[:, :, None] * m[None, None, :]
     biases = biases.permute(2, 0, 1)[None, :, :T, :T]
     biases = biases.repeat(B, 1, 1, 1)
-    return biases.to(dtype).contiguous()
+    return biases.contiguous()
 
 
 class Attention(nn.Module):
+
     def __init__(self, config, layer_idx=None):
         super().__init__()
         self.mask_value = None
@@ -126,7 +116,7 @@ class Attention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         self.kv_attn_heads = 1
 
-        self.scale = self.head_dim ** -0.5
+        self.scale_factor = self.head_dim ** -0.5
 
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
@@ -139,41 +129,64 @@ class Attention(nn.Module):
         self.scale_attention_softmax_in_fp32 = (
             config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
         )
+        self.attention_bias_in_fp32 = config.attention_bias_in_fp32
 
         self.q = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.k = nn.Linear(self.embed_dim, self.head_dim, bias=False)
         self.v = nn.Linear(self.embed_dim, self.head_dim, bias=False)
         self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
 
+    def _get_mask_value(self, device, dtype):
+        # torch.where expects a tensor. We use a cache to avoid recreating it every time.
+        if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
+            self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
+        return self.mask_value
+
     def _attn(self, query, key, value, attention_mask=None, alibi=None):
         dtype = query.dtype
         softmax_dtype = torch.float32 if self.attention_softmax_in_fp32 else dtype
+        mask_value = self._get_mask_value(query.device, softmax_dtype)
         upcast = dtype != softmax_dtype
-        unscale = self.layer_idx + 1 if self.scale_attention_softmax_in_fp32 and upcast else 1
 
-        attn_weights = (alibi + torch.matmul(query * self.scale, key)).to(query.dtype)
+        query_shape = query.shape
+        batch_size = query_shape[0]
+        key_length = key.size(-1)
+
+        # (batch_size, query_length, num_heads, head_dim) x (batch_size, head_dim, key_length)
+        # -> (batch_size, query_length, num_heads, key_length)
+        query_length = query_shape[1]
+        attn_shape = (batch_size, query_length, self.num_heads, key_length)
+        attn_view = (batch_size, query_length * self.num_heads, key_length)
+        # No copy needed for MQA 2, or when layer_past is provided.
+        query = query.reshape(batch_size, query_length * self.num_heads, self.head_dim)
+
+        alibi = alibi.transpose(2, 1).reshape(alibi.shape[0], -1, alibi.shape[-1])
+        initial_dtype = query.dtype
+        new_dtype = torch.float32 if self.attention_bias_in_fp32 else initial_dtype
+        attn_weights = alibi.baddbmm(
+            batch1=query.to(new_dtype),
+            batch2=key.to(new_dtype),
+            beta=1,
+            alpha=self.scale_factor
+        ).view(attn_shape).to(initial_dtype)
 
         if upcast:
+            # Use a fused kernel to prevent a large overhead from casting and scaling.
+            # Sub-optimal when the key length is not a multiple of 8.
             if attention_mask is None:
-                attn_weights = upcast_softmax(attn_weights, unscale, softmax_dtype)
+                attn_weights = upcast_softmax(attn_weights, softmax_dtype)
             else:
-                mask_value = self._get_mask_value(attn_weights.device, softmax_dtype)
-                attn_weights = upcast_masked_softmax(attn_weights, attention_mask, mask_value, unscale, softmax_dtype)
+                attn_weights = upcast_masked_softmax(attn_weights, attention_mask, mask_value, softmax_dtype)
         else:
            if attention_mask is not None:
-                attn_weights = torch.masked_fill(attn_weights, attention_mask, -10000)
-
+                # The fused kernel is very slow when the key length is not a multiple of 8, so we skip fusion.
+                attn_weights = torch.where(attention_mask, attn_weights, mask_value)
            attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
 
-        attn_output = torch.matmul(attn_weights, value)
+        attn_output = torch.bmm(attn_weights.view(attn_view), value).view(query_shape)
 
         return attn_output, attn_weights
 
-    def _split_heads(self, tensor):
-        new_shape = tensor.shape[:-1] + (self.num_heads, self.head_dim)
-        tensor = tensor.view(new_shape)
-        return tensor.permute(0, 2, 1, 3)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -186,13 +199,9 @@ class Attention(nn.Module):
         Tuple[torch.Tensor, Optional[torch.Tensor]],
         Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
     ]:
-        b, t, _ = hidden_states.shape
         query = self.q(hidden_states)
         key = self.k(hidden_states)
         value = self.v(hidden_states)
-        query = self._split_heads(query)
-        key = key.view(b, t, self.kv_attn_heads, self.head_dim).permute(0, 2, 1, 3)
-        value = value.view(b, t, self.kv_attn_heads, self.head_dim).permute(0, 2, 1, 3)
 
         if layer_past is not None:
             past_key, past_value = layer_past
@@ -205,18 +214,18 @@ class Attention(nn.Module):
             present = None
 
         attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, alibi)
-
-        attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape)
         attn_output = self.c_proj(attn_output)
 
         outputs = (attn_output, present)
         if output_attentions:
+            attn_weights = attn_weights.transpose(1, 2)
             outputs += (attn_weights,)
 
         return outputs  # a, present, (attentions)
 
 
 class MLP(nn.Module):
+
     def __init__(self, intermediate_size, config, multiple_of: int = 256):
         super().__init__()
         embed_dim = config.hidden_size
@@ -227,7 +236,7 @@ class MLP(nn.Module):
         self.linear_3 = nn.Linear(embed_dim, hidden_dim, bias=False)
         self.c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
 
-    def forward(self, x: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         x1 = F.silu(self.linear_1(x))
         x2 = self.linear_3(x)
         x = self.c_proj(x1 * x2)
@@ -297,6 +306,7 @@ class GPTRefactBlock(nn.Module):
 
 
 class GPTRefactPreTrainedModel(PreTrainedModel):
+
     config_class = GPTRefactConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
@@ -337,6 +347,7 @@ class GPTRefactPreTrainedModel(PreTrainedModel):
 
 
 class GPTRefactModel(GPTRefactPreTrainedModel):
+
     def __init__(self, config):
         super().__init__(config)
         self.embed_dim = config.hidden_size
@@ -347,6 +358,7 @@ class GPTRefactModel(GPTRefactPreTrainedModel):
         self.h = nn.ModuleList([GPTRefactBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
 
         self.max_positions = config.max_position_embeddings
+        self.attention_bias_in_fp32 = config.attention_bias_in_fp32
         self.register_buffer(
             "bias", torch.tril(torch.ones((self.max_positions, self.max_positions), dtype=torch.bool)),
             persistent=False
@@ -357,16 +369,6 @@ class GPTRefactModel(GPTRefactPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @staticmethod
-    def _make_mask(seq_len: int, past_key_values_length: int):
-        # prompt
-        if past_key_values_length == 0:
-            mask = torch.ones((seq_len, seq_len + past_key_values_length), dtype=torch.bool)
-            mask = torch.triu(mask, 1)
-        else:
-            mask = torch.zeros((seq_len, seq_len + past_key_values_length), dtype=torch.bool)
-        return mask
-
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -408,19 +410,25 @@ class GPTRefactModel(GPTRefactPreTrainedModel):
         else:
             past_length = past_key_values[0][0].size(-2)
 
-        # Self-attention mask.
         query_length = input_shape[-1]
-
         seq_length_with_past = past_length + query_length
-        if attention_mask is None:
-            attention_mask = self._make_mask(query_length, past_length).to(device)
-        else:
-            attention_mask = attention_mask.to(device)
+
+        # Self-attention mask.
+        key_length = past_length + query_length
+        self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
+        if attention_mask is not None:
+            self_attention_mask = self_attention_mask * attention_mask.view(batch_size, 1, -1).to(
+                dtype=torch.bool, device=self_attention_mask.device
+            )
+
+        # MQA models: (batch_size, query_length, n_heads, key_length)
+        attention_mask = self_attention_mask.unsqueeze(2)
 
         hidden_states = self.wte(input_ids) if inputs_embeds is None else inputs_embeds
 
+        alibi_dtype = torch.float32 if self.attention_bias_in_fp32 else self.wte.weight.dtype
         alibi = get_alibi_biases(hidden_states.shape[0], seq_length_with_past,
-                                 self.num_heads, device, torch.float32)[:, :, -query_length:, :]
+                                 self.num_heads, device, alibi_dtype)[:, :, -query_length:, :]
 
         output_shape = input_shape + (hidden_states.size(-1),)
 
@@ -489,6 +497,7 @@ class GPTRefactModel(GPTRefactPreTrainedModel):
 
 
 class GPTRefactForCausalLM(GPTRefactPreTrainedModel):
+
     _tied_weights_keys = ["lm_head.weight", "ln_f.weight"]
 
     def __init__(self, config):
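
The heart of the rewrite is Attention._attn: instead of splitting heads and adding ALiBi after a plain matmul, the multi-query layout is flattened and the ALiBi bias tensor becomes the accumulator of a single baddbmm call (beta=1, alpha=head_dim ** -0.5), carried out in float32 when attention_bias_in_fp32 is set. A standalone shape-and-dtype sketch with random tensors that mirrors, but does not import, the module code:

import torch

batch, q_len, k_len, n_heads, head_dim = 2, 5, 5, 4, 8

query = torch.randn(batch, q_len, n_heads * head_dim, dtype=torch.bfloat16)  # all query heads, unsplit
key = torch.randn(batch, k_len, head_dim, dtype=torch.bfloat16)              # single shared KV head (MQA)
alibi = torch.randn(batch, n_heads, q_len, k_len, dtype=torch.float32)       # stand-in for get_alibi_biases output

# Flatten queries to (batch, q_len * n_heads, head_dim) and the bias to (batch, q_len * n_heads, k_len).
q = query.reshape(batch, q_len * n_heads, head_dim)
bias = alibi.transpose(2, 1).reshape(batch, q_len * n_heads, k_len)

# attention_bias_in_fp32=True: accumulate bias + scaled Q·K^T in float32, then cast back.
scores = bias.baddbmm(
    batch1=q.to(torch.float32),
    batch2=key.transpose(-1, -2).to(torch.float32),
    beta=1,
    alpha=head_dim ** -0.5,
).view(batch, q_len, n_heads, k_len).to(query.dtype)

print(scores.shape)  # torch.Size([2, 5, 4, 5]) == (batch, q_len, n_heads, k_len)

The same flattened view is reused on the output side, which is why _attn now finishes with a torch.bmm against the single value head rather than a per-head matmul followed by a transpose.
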
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c9761aabc16466fdf738d4fe42f12ee6844a360db07bde307ca808d0bfb6b8a
-size 6343461637
+oid sha256:1092c5efe56fe5b04360ba0d4ac231e8b03f9d1d0b8633b8ed678f73bdcb021a
+size 3171776281
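
The re-uploaded checkpoint is roughly half its previous size (3.17 GB vs 6.34 GB), consistent with the weights now being stored in a 16-bit dtype; that reading is an inference from the size and the new torch_dtype, not something stated in the commit. A quick local check, assuming the file has been downloaded:

import torch

state_dict = torch.load("pytorch_model.bin", map_location="cpu")
print(next(iter(state_dict.values())).dtype)        # expected torch.bfloat16 if the assumption holds
print(sum(t.numel() for t in state_dict.values()))  # total parameter count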