Text Generation
Transformers
PyTorch
mpt
Composer
MosaicML
llm-foundry
custom_code
text-generation-inference
Alex Birch committed
Commit
9f0a20b
1 Parent(s): 512b004

add support for AutoModelForCausalLM#from_pretrained()'s device_map='auto'. support gradient checkpointing (probably). add lots of type hints so I could understand what's going on. split long method signatures/calls over multiple lines (for easier comparison between the checkpointed and non-checkpointed variants, and because these lines got even longer once I added type hints). make MPTForCausalLM#forward accept additional kwargs, since PeftModelForCausalLM#forward tries to pass it inputs_embeds=None, which it didn't like too much.
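For context (not part of this commit): the intent is that the checkpoint can now be loaded with Accelerate-style weight dispatch and fine-tuned with activation checkpointing, roughly as in the hedged sketch below. The repo id is a placeholder, and trust_remote_code=True is needed because this is custom modelling code.

# Hedged usage sketch, not from the repo; device_map='auto' additionally requires `accelerate`.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b',        # placeholder repo id hosting this modelling code
    trust_remote_code=True,   # custom_code: loads modeling_mpt.py from the repo
    device_map='auto',        # enabled by _no_split_modules = ['MPTBlock'] below
)
model.gradient_checkpointing_enable()  # routed through supports_gradient_checkpointing / _set_gradient_checkpointing
tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b', trust_remote_code=True)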

Files changed (4)
  1. attention.py +195 -18
  2. blocks.py +9 -4
  3. is_torch_version.py +56 -0
  4. modeling_mpt.py +68 -5
attention.py CHANGED
@@ -1,13 +1,72 @@
"""Attention layers."""
import math
import warnings
- from typing import Optional
+ from typing import Optional, Dict, Any, NamedTuple, Protocol, Tuple, Union
import torch
import torch.nn as nn
from einops import rearrange
from packaging import version
from torch import nn
+ from torch.utils.checkpoint import checkpoint
from .norm import LPLayerNorm
+ from .is_torch_version import is_torch_version
+
+ class PastKeyValue(NamedTuple):
+     key: torch.Tensor
+     value: torch.Tensor
+
+ class AttnFnOutput(NamedTuple):
+     attns: torch.Tensor
+     attn_probs: Optional[torch.Tensor]
+
+ class AttnFn(Protocol):
+     def __call__(
+         self,
+         query: torch.Tensor,
+         key: torch.Tensor,
+         value: torch.Tensor,
+         n_heads: int,
+         softmax_scale: Optional[float] = None,
+         attn_bias: Optional[torch.Tensor] = None,
+         key_padding_mask: Optional[torch.ByteTensor] = None,
+         is_causal = False,
+         dropout_p = 0.0,
+         training = False,
+         needs_weights = False,
+         multiquery = False,
+     ) -> AttnFnOutput: ...
+
+ class AttnFnCheckpointed(Protocol):
+     def __call__(
+         self,
+         query: torch.Tensor,
+         key: torch.Tensor,
+         value: torch.Tensor,
+         n_heads: int,
+         softmax_scale: Optional[float],
+         attn_bias: Optional[torch.Tensor],
+         key_padding_mask: Optional[torch.ByteTensor],
+         is_causal: bool,
+         dropout_p: float,
+         training: bool,
+         needs_weights: bool,
+     ) -> AttnFnOutput: ...
+
+ class AttnOutput(NamedTuple):
+     projected_context: torch.Tensor
+     attn_weights: Optional[torch.Tensor]
+     past_key_value: Union[PastKeyValue, Tuple, None]
+
+ class Attn(Protocol):
+     def __call__(
+         self,
+         x: torch.Tensor,
+         past_key_value: Union[PastKeyValue, Tuple, None] = None,
+         attn_bias: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.ByteTensor] = None,
+         is_causal = True,
+         needs_weights = False,
+     ) -> AttnOutput: ...

def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
    if original_is_causal and num_query_tokens != num_key_tokens:
@@ -17,7 +76,20 @@ def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_cau
            return False
    return original_is_causal

- def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+ def scaled_multihead_dot_product_attention(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     n_heads: int,
+     softmax_scale: Optional[float] = None,
+     attn_bias: Optional[torch.Tensor] = None,
+     key_padding_mask: Optional[torch.ByteTensor] = None,
+     is_causal = False,
+     dropout_p = 0.0,
+     training = False,
+     needs_weights = False,
+     multiquery = False,
+ ) -> AttnFnOutput:
    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
    k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
    v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
@@ -33,7 +105,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_s
        attn_weight = attn_weight + attn_bias
    if key_padding_mask is not None:
        if attn_bias is not None:
-             warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+             warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
    if is_causal:
        s = max(s_q, s_k)
@@ -50,7 +122,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_s
    out = rearrange(out, 'b h s d -> b s (h d)')
    if needs_weights:
        return (out, attn_weight)
-     return (out, None)
+     return AttnFnOutput(out, None)

def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
    for tensor in tensors:
@@ -59,7 +131,20 @@ def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
        if not tensor.is_cuda:
            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')

- def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+ def flash_attn_fn(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     n_heads: int,
+     softmax_scale: Optional[float] = None,
+     attn_bias: Optional[torch.Tensor] = None,
+     key_padding_mask: Optional[torch.ByteTensor] = None,
+     is_causal = False,
+     dropout_p = 0.0,
+     training = False,
+     needs_weights = False,
+     multiquery = False,
+ ) -> AttnFnOutput:
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except:
@@ -84,9 +169,22 @@ def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
-     return (output, None)
+     return AttnFnOutput(output, None)

- def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+ def triton_flash_attn_fn(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     n_heads: int,
+     softmax_scale: Optional[float] = None,
+     attn_bias: Optional[torch.Tensor] = None,
+     key_padding_mask: Optional[torch.ByteTensor] = None,
+     is_causal = False,
+     dropout_p = 0.0,
+     training = False,
+     needs_weights = False,
+     multiquery = False,
+ ) -> AttnFnOutput:
    try:
        from .flash_attn_triton import flash_attn_func
    except:
@@ -119,14 +217,16 @@ def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bi
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    output = attn_output.view(*attn_output.shape[:2], -1)
-     return (output, None)
+     return AttnFnOutput(output, None)

- class MultiheadAttention(nn.Module):
+ class MultiheadAttention(nn.Module, Attn):
    """Multi-head self attention.

    Using torch or triton attention implemetation enables user to also use
    additive bias.
    """
+     gradient_checkpointing = False
+     attn_fn: AttnFn

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        super().__init__()
@@ -160,7 +260,15 @@ class MultiheadAttention(nn.Module):
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

-     def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
+     def forward(
+         self,
+         x: torch.Tensor,
+         past_key_value: Union[PastKeyValue, Tuple, None] = None,
+         attn_bias: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.ByteTensor] = None,
+         is_causal = True,
+         needs_weights = False,
+     ) -> AttnOutput:
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
@@ -174,13 +282,73 @@ class MultiheadAttention(nn.Module):
            if len(past_key_value) != 0:
                key = torch.cat([past_key_value[0], key], dim=1)
                value = torch.cat([past_key_value[1], value], dim=1)
-             past_key_value = (key, value)
+             past_key_value = PastKeyValue(key, value)
        if attn_bias is not None:
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
-         (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
-         return (self.out_proj(context), attn_weights, past_key_value)
+         if self.training and self.gradient_checkpointing:
+             ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
+             def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
+                 def custom_forward(
+                     query: torch.Tensor,
+                     key: torch.Tensor,
+                     value: torch.Tensor,
+                     n_heads: int,
+                     softmax_scale: Optional[float],
+                     attn_bias: Optional[torch.Tensor],
+                     key_padding_mask: Optional[torch.ByteTensor],
+                     is_causal: bool,
+                     dropout_p: float,
+                     training: bool,
+                     needs_weights: bool,
+                 ):
+                     return attn_fn(
+                         query,
+                         key,
+                         value,
+                         n_heads,
+                         softmax_scale,
+                         attn_bias,
+                         key_padding_mask,
+                         is_causal,
+                         dropout_p,
+                         training,
+                         needs_weights,
+                         False, # multiquery
+                     )
+                 return custom_forward
+             attn_out: AttnOutput = checkpoint(
+                 create_custom_forward(self.attn_fn),
+                 query,
+                 key,
+                 value,
+                 self.n_heads,
+                 self.softmax_scale,
+                 attn_bias,
+                 key_padding_mask,
+                 is_causal,
+                 self.attn_dropout_p,
+                 self.training,
+                 needs_weights,
+                 **ckpt_kwargs,
+             )
+         else:
+             attn_out: AttnOutput = self.attn_fn(
+                 query,
+                 key,
+                 value,
+                 self.n_heads,
+                 softmax_scale=self.softmax_scale,
+                 attn_bias=attn_bias,
+                 key_padding_mask=key_padding_mask,
+                 is_causal=is_causal,
+                 dropout_p=self.attn_dropout_p,
+                 training=self.training,
+                 needs_weights=needs_weights,
+             )
+         context, attn_weights = attn_out
+         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)

- class MultiQueryAttention(nn.Module):
+ class MultiQueryAttention(nn.Module, Attn):
    """Multi-Query self attention.

    Using torch or triton attention implemetation enables user to also use
@@ -220,7 +388,15 @@ class MultiQueryAttention(nn.Module):
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

-     def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
+     def forward(
+         self,
+         x: torch.Tensor,
+         past_key_value: Union[PastKeyValue, Tuple, None] = None,
+         attn_bias: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.ByteTensor] = None,
+         is_causal = True,
+         needs_weights = False,
+     ) -> AttnOutput:
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
@@ -234,11 +410,12 @@ class MultiQueryAttention(nn.Module):
            if len(past_key_value) != 0:
                key = torch.cat([past_key_value[0], key], dim=1)
                value = torch.cat([past_key_value[1], value], dim=1)
-             past_key_value = (key, value)
+             past_key_value = PastKeyValue(key, value)
        if attn_bias is not None:
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
-         (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
-         return (self.out_proj(context), attn_weights, past_key_value)
+         attn_fn_output: AttnFnOutput = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
+         context, attn_weights = attn_fn_output
+         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)

def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    if attn_impl == 'flash':
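Aside (not part of the diff): the checkpointed branch in MultiheadAttention.forward follows the standard torch.utils.checkpoint pattern, shown in isolation below with made-up tensors. Activations inside the wrapped function are dropped during the forward pass and recomputed during backward, and use_reentrant=False (torch >= 1.11) is the variant that also tolerates inputs which don't require grad.

# Standalone illustration of the pattern; not code from this repo.
import torch
from torch.utils.checkpoint import checkpoint

def attn_like_fn(q, k, v):
    # stands in for self.attn_fn: any differentiable function works
    return torch.softmax(q @ k.transpose(-2, -1), dim=-1) @ v

q = torch.randn(2, 8, 16, requires_grad=True)
k = torch.randn(2, 8, 16, requires_grad=True)
v = torch.randn(2, 8, 16, requires_grad=True)
out = checkpoint(attn_like_fn, q, k, v, use_reentrant=False)  # saves memory vs. calling attn_like_fn(q, k, v) directly
out.sum().backward()  # the attention activations are recomputed here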
blocks.py CHANGED
@@ -1,10 +1,14 @@
"""GPT Blocks used for the GPT Model."""
- from typing import Dict, Optional, Tuple
+ from typing import Dict, Optional, Tuple, NamedTuple, Union
import torch
import torch.nn as nn
- from .attention import ATTN_CLASS_REGISTRY
+ from .attention import ATTN_CLASS_REGISTRY, Attn, PastKeyValue
from .norm import NORM_CLASS_REGISTRY

+ class MPTBlockOutput(NamedTuple):
+     hidden_states: torch.Tensor
+     past_key_value: Union[PastKeyValue, Tuple, None]
+
class MPTMLP(nn.Module):

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
@@ -18,6 +22,7 @@ class MPTMLP(nn.Module):
        return self.down_proj(self.act(self.up_proj(x)))

class MPTBlock(nn.Module):
+     attn: Attn

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
        del kwargs
@@ -31,11 +36,11 @@ class MPTBlock(nn.Module):
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

-     def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
+     def forward(self, x: torch.Tensor, past_key_value: Union[PastKeyValue, Tuple, None] = None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> MPTBlockOutput:
        a = self.norm_1(x)
        (b, _, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
        x = x + self.resid_attn_dropout(b)
        m = self.norm_2(x)
        n = self.ffn(m)
        x = x + self.resid_ffn_dropout(n)
-         return (x, past_key_value)
+         return MPTBlockOutput(x, past_key_value)
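Aside (not part of the diff): returning a NamedTuple such as MPTBlockOutput keeps positional unpacking and indexing working for existing callers while adding named fields, e.g.:

# Illustration only: NamedTuple outputs still behave like plain tuples.
import torch
from typing import NamedTuple, Tuple, Union

class MPTBlockOutput(NamedTuple):
    hidden_states: torch.Tensor
    past_key_value: Union[Tuple, None]

out = MPTBlockOutput(torch.zeros(1, 4, 8), None)
x, past_key_value = out               # positional unpacking, as done in modeling_mpt.py
assert out[0] is out.hidden_states    # named access is also available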
is_torch_version.py ADDED
@@ -0,0 +1,56 @@
+ import sys
+ import logging
+ import operator as op
+ from packaging import version
+ from packaging.version import Version, parse
+ from typing import Union
+ import importlib.util
+
+ # The package importlib_metadata is in a different place, depending on the python version.
+ if sys.version_info < (3, 8):
+     import importlib_metadata
+ else:
+     import importlib.metadata as importlib_metadata
+
+ STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
+
+ logger = logging.getLogger(__name__)
+
+ _torch_available = importlib.util.find_spec("torch") is not None
+ if _torch_available:
+     try:
+         _torch_version = importlib_metadata.version("torch")
+         logger.info(f"PyTorch version {_torch_version} available.")
+     except importlib_metadata.PackageNotFoundError:
+         _torch_available = False
+
+ # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
+ def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
+     """
+     Args:
+     Compares a library version to some requirement using a given operation.
+         library_or_version (`str` or `packaging.version.Version`):
+             A library name or a version to check.
+         operation (`str`):
+             A string representation of an operator, such as `">"` or `"<="`.
+         requirement_version (`str`):
+             The version to compare the library version against
+     """
+     if operation not in STR_OPERATION_TO_FUNC.keys():
+         raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
+     operation = STR_OPERATION_TO_FUNC[operation]
+     if isinstance(library_or_version, str):
+         library_or_version = parse(importlib_metadata.version(library_or_version))
+     return operation(library_or_version, parse(requirement_version))
+
+ # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
+ def is_torch_version(operation: str, version: str):
+     """
+     Args:
+     Compares the current PyTorch version to a given reference with an operation.
+         operation (`str`):
+             A string representation of an operator, such as `">"` or `"<="`
+         version (`str`):
+             A string version of PyTorch
+     """
+     return compare_versions(parse(_torch_version), operation, version)
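This helper (vendored from accelerate) exists so the checkpointing call sites can choose their kwargs per torch version; it is used in attention.py and modeling_mpt.py like so:

# Usage as in this commit: use_reentrant only exists on torch >= 1.11.
from typing import Any, Dict
from .is_torch_version import is_torch_version

ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}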
modeling_mpt.py CHANGED
@@ -4,25 +4,45 @@ Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import warnings
- from typing import List, Optional, Tuple, Union
+ from typing import Any, List, Optional, Tuple, Union, Protocol, Dict
import torch
import torch.nn as nn
import torch.nn.functional as F
+ from torch.utils.checkpoint import checkpoint
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
- from .attention import attn_bias_shape, build_attn_bias
- from .blocks import MPTBlock
+ from transformers.utils import logging
+ from .attention import attn_bias_shape, build_attn_bias, PastKeyValue
+ from .blocks import MPTBlock, MPTBlockOutput
from .norm import NORM_CLASS_REGISTRY
from .configuration_mpt import MPTConfig
from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
from .meta_init_context import init_empty_weights
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
+ from .is_torch_version import is_torch_version
+
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

+ logger = logging.get_logger(__name__)
+
+ class MPTBlockCheckpointedForward(Protocol):
+     def __call__(
+         x: torch.Tensor,
+         past_key_value: Union[PastKeyValue, Tuple, None],
+         attn_bias: Optional[torch.Tensor],
+         attention_mask: Optional[torch.ByteTensor],
+         is_causal: bool,
+     ) -> MPTBlockOutput: ...
+
class MPTPreTrainedModel(PreTrainedModel):
    config_class = MPTConfig
    base_model_prefix = 'model'
+     _no_split_modules = ['MPTBlock']
+     supports_gradient_checkpointing = True
+     def _set_gradient_checkpointing(self, module: nn.Module, value=False) -> None:
+         if isinstance(module, MPTModel):
+             module.gradient_checkpointing = value

class MPTModel(MPTPreTrainedModel):

@@ -64,6 +84,7 @@ class MPTModel(MPTPreTrainedModel):
        if self.config.init_config['verbose'] > 1:
            init_fn_name = self.config.init_config['name']
            warnings.warn(f'Using {init_fn_name} initialization.')
+         self.gradient_checkpointing = False

    def get_input_embeddings(self):
        return self.wte
@@ -130,6 +151,12 @@ class MPTModel(MPTPreTrainedModel):
    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 logger.warning_once(
+                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                 )
+                 use_cache = False
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        if prefix_mask is not None:
@@ -180,7 +207,43 @@ class MPTModel(MPTPreTrainedModel):
                assert all_hidden_states is not None
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = past_key_values[b_idx] if past_key_values is not None else None
-             (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
+             if self.gradient_checkpointing and self.training:
+                 ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
+                 def create_custom_forward(module: MPTBlock) -> MPTBlockCheckpointedForward:
+                     def custom_forward(
+                         x: torch.Tensor,
+                         past_key_value: Union[PastKeyValue, Tuple, None],
+                         attn_bias: Optional[torch.Tensor],
+                         attention_mask: Optional[torch.ByteTensor],
+                         is_causal: bool
+                     ):
+                         return module.forward(
+                             x,
+                             past_key_value,
+                             attn_bias,
+                             attention_mask,
+                             is_causal,
+                         )
+                     return custom_forward
+                 block_out: MPTBlockOutput = checkpoint(
+                     create_custom_forward(block),
+                     x,
+                     past_key_value,
+                     attn_bias,
+                     attention_mask,
+                     self.is_causal,
+                     **ckpt_kwargs,
+                 )
+             else:
+                 block_out: MPTBlockOutput = block(
+                     x,
+                     past_key_value=past_key_value,
+                     attn_bias=attn_bias,
+                     attention_mask=attention_mask,
+                     is_causal=self.is_causal,
+                 )
+             x, past_key_value = block_out
+             del block_out
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value
        x = self.norm_f(x)
@@ -231,7 +294,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
    def get_decoder(self):
        return self.transformer

-     def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
+     def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, *args, **kwargs):
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
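Aside (not part of the commit): the *args/**kwargs added to MPTForCausalLM#forward is what lets PEFT wrap the model, since PeftModelForCausalLM#forward forwards inputs_embeds=None (among other kwargs) to the base model. A hedged sketch of the kind of wrapping this is meant to unblock, with a placeholder repo id and illustrative LoRA hyperparameters:

# Hedged sketch; assumes the peft library, hyperparameters and repo id are placeholders.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', trust_remote_code=True, device_map='auto')
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=['Wqkv'],  # the fused q/k/v projection defined in attention.py
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)
peft_model = get_peft_model(model, lora_config)
# peft_model(input_ids=..., labels=...) now reaches MPTForCausalLM#forward, which
# tolerates the extra kwargs (e.g. inputs_embeds=None) that PEFT passes along.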