Upload MosaicGPT

Browse files

Files changed (9) hide show

attention.py +408 -0
config.json +46 -0
configuration_mosaic_gpt.py +168 -0
generation_config.json +5 -0
gpt_blocks.py +90 -0
low_precision_layernorm.py +31 -0
mosaic_gpt.py +446 -0
param_init_fns.py +464 -0
pytorch_model.bin +3 -0

attention.py ADDED Viewed

	@@ -0,0 +1,408 @@

+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layers."""
+import math
+import warnings
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch import nn
+from .low_precision_layernorm import LPLayerNorm
+def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
+                     original_is_causal: bool):
+    if original_is_causal and num_query_tokens != num_key_tokens:
+        if num_query_tokens != 1:
+            raise NotImplementedError(
+                'MosaicGPT does not support query and key with different number of tokens, unless number of query tokens is 1.'
+            )
+        else:
+            return False
+    return original_is_causal
+def scaled_multihead_dot_product_attention(
+    query,
+    key,
+    value,
+    n_heads,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+):
+    """Multi-head scaled dot-product attention written with plain torch ops.
+    Args:
+        query: Tensor laid out as ``b s (h d)``; split into ``n_heads`` heads.
+        key: Tensor laid out as ``b s (h d)``.
+        value: Tensor laid out as ``b s (h d)``.
+        n_heads: Number of attention heads ``h``.
+        softmax_scale: Multiplier applied to the ``q @ k`` scores. Defaults to
+            ``1 / sqrt(d)`` where ``d`` is the per-head dimension.
+        attn_bias: Optional additive bias. Its last dim must be 1 or the key
+            length and its second-to-last dim must be 1 or the query length.
+        key_padding_mask: Optional mask viewed as ``(b, 1, 1, s_k)``; positions
+            where it is False are filled with the dtype's minimum value.
+        is_causal: If truthy, scores above the (bottom-right aligned) diagonal
+            are filled with the dtype's minimum value.
+        dropout_p: Dropout probability applied to the attention weights.
+        training: Forwarded to ``torch.nn.functional.dropout``.
+        needs_weights: If truthy, the attention weights are returned as well.
+    Returns:
+        Tuple ``(out, attn_weight)`` where ``out`` is laid out as ``b s (h d)``
+        and ``attn_weight`` is ``None`` unless ``needs_weights`` is truthy.
+    Raises:
+        RuntimeError: If ``attn_bias`` does not satisfy the shape check above.
+    """
+    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    k = rearrange(key, 'b s (h d) -> b h d s', h=n_heads)  # includes key.t()
+    v = rearrange(value, 'b s (h d) -> b h s d', h=n_heads)
+    # Most negative finite value of the working dtype; used as the "masked" score.
+    min_val = torch.finfo(q.dtype).min
+    b, _, s_q, d = q.shape
+    s_k = k.size(-1)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(d)
+    # Raw attention scores of shape (b, h, s_q, s_k).
+    attn_weight = q.matmul(k) * softmax_scale
+    if attn_bias is not None:
+        # Only the trailing two dims are validated here; the leading dims are
+        # left to broadcasting in the addition below.
+        if (attn_bias.size(-1) != 1 and
+                attn_bias.size(-1) != s_k) or (attn_bias.size(-2) != 1 and
+                                               attn_bias.size(-2) != s_q):
+            raise RuntimeError(
+                f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.'
+            )
+        attn_weight = attn_weight + attn_bias
+    if key_padding_mask is not None:
+        if attn_bias is not None:
+            warnings.warn(
+                'Propogating key_padding_mask to the attention module ' +\
+                'and applying it within the attention module can cause ' +\
+                'unneccessary computation/memory usage. Consider integrating ' +\
+                'into attn_bias once and passing that to each attention ' +\
+                'module instead.'
+            )
+        # `~` inverts the mask, so False entries of key_padding_mask are the
+        # positions that get suppressed.
+        attn_weight = attn_weight.masked_fill(
+            ~key_padding_mask.view((b, 1, 1, s_k)), min_val)
+    if is_causal:
+        # Build a square lower-triangular mask of the larger length, invert it
+        # so True marks disallowed positions, then keep the bottom-right
+        # (s_q, s_k) corner so the last query row lines up with the last key.
+        s = max(s_q, s_k)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
+        causal_mask = causal_mask.tril()
+        causal_mask = causal_mask.to(torch.bool)
+        causal_mask = ~causal_mask
+        causal_mask = causal_mask[-s_q:, -s_k:]
+        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k),
+                                              min_val)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        # NOTE(review): dropout runs in place on the softmax output, so the
+        # weights returned with needs_weights=True are the post-dropout ones.
+        # Confirm the in-place update is compatible with autograd when training.
+        attn_weight = torch.nn.functional.dropout(attn_weight,
+                                                  p=dropout_p,
+                                                  training=training,
+                                                  inplace=True)
+    out = attn_weight.matmul(v)
+    # Merge the heads back into the feature dimension.
+    out = rearrange(out, 'b h s d -> b s (h d)')
+    if needs_weights:
+        return out, attn_weight
+    return out, None
+def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
+    for tensor in tensors:
+        if tensor.dtype not in valid_dtypes:
+            raise TypeError(f'{tensor.dtype=} must be in {valid_dtypes=}.')
+        if not tensor.is_cuda:
+            raise TypeError(f'Inputs must be cuda tensors ({tensor.is_cuda=}).')
+def flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+):
+    try:
+        from flash_attn import bert_padding, flash_attn_interface
+    except:
+        raise RuntimeError('Please install flash_attn==0.2.8')
+    check_valid_inputs(query, key, value)
+    if attn_bias is not None:
+        raise NotImplementedError(f'attn_bias not implemented for flash attn.')
+    batch_size, seqlen = query.shape[:2]
+    if key_padding_mask is None:
+        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
+    query_padding_mask = key_padding_mask[:, -query.size(1):]
+    query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
+        query, query_padding_mask)
+    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
+        key, key_padding_mask)
+    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
+    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
+        query_unpad,
+        key_unpad,
+        value_unpad,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale=softmax_scale,
+        causal=reset_is_causal,
+        return_attn_probs=needs_weights)
+    output = bert_padding.pad_input(
+        rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
+        seqlen)
+    return output, None
+def triton_flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+):
+    try:
+        from flash_attn import flash_attn_triton  # type: ignore
+    except:
+        raise RuntimeError('Please install flash_attn==0.2.8 and triton==2.0.0.dev20221202.')
+    check_valid_inputs(query, key, value)
+    if dropout_p:
+        raise NotImplementedError(
+            f'Dropout not implemented for attn_impl: triton.')
+    if needs_weights:
+        raise NotImplementedError(
+            f'attn_impl: triton cannot return attn weights.')
+    if key_padding_mask is not None:
+        warnings.warn(
+            'Propagating key_padding_mask to the attention module ' +\
+            'and applying it within the attention module can cause ' +\
+            'unnecessary computation/memory usage. Consider integrating ' +\
+            'into attn_bias once and passing that to each attention ' +\
+            'module instead.'
+        )
+        b_size, s_k = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        attn_bias = attn_bias.masked_fill(
+            ~key_padding_mask.view((b_size, 1, 1, s_k)),
+            torch.finfo(query.dtype).min)
+    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
+    key = rearrange(key, 'b s (h d) -> b s h d', h=n_heads)
+    value = rearrange(value, 'b s (h d) -> b s h d', h=n_heads)
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_triton.flash_attn_func(query, key, value,
+                                                    attn_bias, reset_is_causal,
+                                                    softmax_scale)
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return output, None
+class MultiheadAttention(nn.Module):
+    """Multi-head self attention.
+    Using torch or triton attention implementation enables user to also use
+    additive bias.
+    Queries, keys and values come from a single fused linear projection
+    (``Wqkv``) and the attention output goes through ``out_proj``.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str = 'triton',
+        attn_clip_qkv: Optional[float] = None,
+        attn_qk_ln: bool = False,
+        softmax_scale: Optional[float] = None,
+        attn_pdrop: float = 0.0,
+        low_precision_layernorm: bool = False,
+        device: Optional[str] = None,
+    ):
+        """Build the projections and select the attention function.
+        Args:
+            d_model (int): Model (embedding) dimension.
+            n_heads (int): Number of attention heads.
+            attn_impl (str): One of 'flash', 'triton' or 'torch'.
+            attn_clip_qkv (Optional[float]): If truthy, the fused qkv projection
+                is clamped to ``[-attn_clip_qkv, attn_clip_qkv]``.
+            attn_qk_ln (bool): If True, layer norm is applied to queries and keys.
+            softmax_scale (Optional[float]): Score multiplier; defaults to
+                ``1 / sqrt(d_model / n_heads)``.
+            attn_pdrop (float): Dropout probability passed to the attention function.
+            low_precision_layernorm (bool): Use ``LPLayerNorm`` instead of
+                ``nn.LayerNorm`` for the q/k layer norms.
+            device (Optional[str]): Device on which the parameters are created.
+        Raises:
+            ValueError: If ``attn_impl`` is not a recognized setting.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = attn_clip_qkv
+        self.attn_qk_ln = attn_qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.attn_dropout_p = attn_pdrop
+        # Single projection producing q, k and v concatenated on the last dim.
+        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
+        # for param init fn; enables shape based init of fused layers
+        fuse_splits = (d_model, 2 * d_model)
+        self.Wqkv._fused = (0, fuse_splits)  # type: ignore
+        if self.attn_qk_ln:
+            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = layernorm_class(self.d_model, device=device)
+            self.k_ln = layernorm_class(self.d_model, device=device)
+        # Pick the attention kernel; all three share the same call signature.
+        if self.attn_impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+            warnings.warn(
+                'While `attn_impl: triton` can be faster than `attn_impl: flash` ' +\
+                'it uses more memory. When training larger models this can trigger '  +\
+                'alloc retries which hurts performance. If encountered, we recommend ' +\
+                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
+        elif self.attn_impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available():
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or ' +\
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' +\
+                    'we recommend using `attn_impl: triton`.'
+                )
+        else:
+            raise ValueError(f'{attn_impl=} is an invalid setting.')
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        # Marker attribute; presumably read by the param init fn - TODO confirm.
+        self.out_proj._is_residual = True  # type: ignore
+    def forward(self,
+                x,
+                past_key_value=None,
+                attn_bias=None,
+                attention_mask=None,
+                is_causal=True,
+                needs_weights=False):
+        """Apply self attention to ``x``.
+        Args:
+            x: Input tensor; the fused projection is split in three along dim 2.
+            past_key_value: Optional cached ``(key, value)`` pair. A non-empty
+                cache is concatenated in front of the new keys/values along dim 1.
+            attn_bias: Optional additive bias; sliced to the last
+                ``query.size(1)`` rows and last ``key.size(1)`` columns.
+            attention_mask: Passed to the attention function as ``key_padding_mask``.
+            is_causal: Whether causal masking is requested.
+            needs_weights: Whether the attention function should return weights.
+        Returns:
+            Tuple ``(output, attn_weights, past_key_value)``. ``past_key_value``
+            is the updated ``(key, value)`` pair when a cache was passed in,
+            otherwise ``None``.
+        """
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            # In-place clamp of the fused projection.
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = qkv.chunk(3, dim=2)
+        key_padding_mask = attention_mask
+        if self.attn_qk_ln:
+            # Applying layernorm to qk
+            dtype = query.dtype
+            # Cast back in case the layer norm returned a different dtype.
+            query = self.q_ln(query).to(dtype)
+            key = self.k_ln(key).to(dtype)
+        if past_key_value is not None:
+            # An empty container means "caching requested, nothing cached yet".
+            if len(past_key_value) != 0:
+                key = torch.cat([past_key_value[0], key], dim=1)
+                value = torch.cat([past_key_value[1], value], dim=1)
+            past_key_value = (key, value)
+        if attn_bias is not None:
+            # Keep only the part of the bias matching the current query/key lengths.
+            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
+        context, attn_weights = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+        )
+        return self.out_proj(context), attn_weights, past_key_value
+def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal,
+                    use_sequence_id):
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            if (prefix_lm or not causal) or use_sequence_id:
+                return (1, n_heads, seq_len, seq_len)
+            return (1, n_heads, 1, seq_len)
+        elif prefix_lm or use_sequence_id:
+            return (1, 1, seq_len, seq_len)
+        return None
+    else:
+        raise ValueError(f'{attn_impl=} is an invalid setting.')
+def attn_bias(attn_impl,
+              attn_bias,
+              n_heads,
+              seq_len,
+              causal=False,
+              alibi=False,
+              alibi_bias_max=8):
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            # in place add alibi to attn bias
+            device, dtype = attn_bias.device, attn_bias.dtype
+            attn_bias = attn_bias.add(
+                alibi_bias(n_heads,
+                           seq_len,
+                           full=not causal,
+                           alibi_bias_max=alibi_bias_max,
+                           device=device,
+                           dtype=dtype))
+        return attn_bias
+    else:
+        raise ValueError(f'{attn_impl=} is an invalid setting.')
+def alibi_bias(n_heads,
+               seq_len,
+               full=False,
+               alibi_bias_max=8,
+               device=None,
+               dtype=None):
+    alibi_bias = torch.arange(1 - seq_len, 1, dtype=dtype,
+                              device=device).view(1, 1, 1, seq_len)
+    if full:
+        # generate 1 x Heads x SeqLen x SeqLen alibi bias mask
+        # otherwise the mask is 1 x Heads x 1 x SeqLen (which is broadcast to the appropriate size)
+        alibi_bias = alibi_bias - torch.arange(
+            1 - seq_len, 1, dtype=dtype, device=device).view(1, 1, seq_len, 1)
+        alibi_bias = alibi_bias.abs().mul(-1)
+    m = torch.arange(1, n_heads + 1, dtype=dtype, device=device)
+    m = m.mul(alibi_bias_max / n_heads)
+    alibi_bias = alibi_bias * (1. / (2**m.view(1, n_heads, 1, 1)))
+    return alibi_bias

config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "_name_or_path": "mosaicml/mosaic-llama-redpajama-final-candidate",
+  "alibi": true,
+  "alibi_bias_max": 8,
+  "architectures": [
+    "MosaicGPT"
+  ],
+  "attn_clip_qkv": null,
+  "attn_impl": "torch",
+  "attn_pdrop": 0,
+  "attn_qk_ln": true,
+  "attn_uses_sequence_id": false,
+  "auto_map": {
+    "AutoConfig": "configuration_mosaic_gpt.MosaicGPTConfig",
+    "AutoModelForCausalLM": "mosaic_gpt.MosaicGPT"
+  },
+  "d_model": 2048,
+  "emb_init_std": null,
+  "emb_init_uniform_lim": null,
+  "emb_pdrop": 0,
+  "embedding_fraction": 1.0,
+  "fan_mode": "fan_in",
+  "init_device": "cpu",
+  "init_div_is_residual": true,
+  "init_gain": 0,
+  "init_nonlinearity": "relu",
+  "init_std": 0.02,
+  "logit_scale": null,
+  "low_precision_layernorm": true,
+  "max_seq_len": 2048,
+  "mlp_ratio": 4,
+  "model_type": "mosaic_gpt",
+  "n_heads": 16,
+  "n_layers": 24,
+  "no_bias": true,
+  "param_init_fn": "kaiming_normal_",
+  "prefix_lm": false,
+  "resid_pdrop": 0,
+  "softmax_scale": null,
+  "tokenizer_name": "EleutherAI/gpt-neox-20b",
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.4",
+  "use_cache": false,
+  "verbose": 0,
+  "vocab_size": 50432
+}

configuration_mosaic_gpt.py ADDED Viewed

	@@ -0,0 +1,168 @@

+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+"""A HuggingFace-style model configuration."""
+from typing import Optional, Tuple, Union
+from transformers import PretrainedConfig
+class MosaicGPTConfig(PretrainedConfig):
+    model_type = 'mosaic_gpt'
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        mlp_ratio: int = 4,
+        max_seq_len: int = 2048,
+        vocab_size: int = 50368,
+        attn_pdrop: float = 0.0,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_impl: str = 'triton',
+        attn_qk_ln: bool = False,
+        attn_clip_qkv: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        prefix_lm: Optional[bool] = False,
+        attn_uses_sequence_id: Optional[bool] = False,
+        alibi: bool = False,
+        alibi_bias_max: int = 8,
+        init_device: str = 'cpu',
+        logit_scale: Optional[Union[float, str]] = None,
+        no_bias: bool = False,
+        verbose: int = 0,
+        param_init_fn: str = 'kaiming_normal_',
+        init_div_is_residual: Union[int, float, str, bool] = True,
+        init_std: float = 0.02,
+        emb_init_std: Optional[float] = None,
+        emb_init_uniform_lim: Optional[Union[Tuple[float, float],
+                                             float]] = None,
+        init_gain: float = 0,
+        fan_mode: str = 'fan_in',
+        init_nonlinearity: str = 'relu',
+        embedding_fraction: float = 1.0,
+        low_precision_layernorm: bool = True,
+        use_cache: bool = False,
+        **kwargs,
+    ):
+        """The MosaicGPT configuration class.
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            mlp_ratio (int): The ratio of the up/down scale in the MLP.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            attn_pdrop (float): The dropout probability for the attention layers.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+            attn_qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+            attn_clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                this value.
+            softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                use the default scale of ``1/sqrt(d_keys)``.
+            prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+            attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                which sub-sequence each token belongs to.
+                Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+            alibi (bool): Whether to use the alibi bias instead of position embeddings.
+            alibi_bias_max (int): The maximum value of the alibi bias.
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to use bias in all layers.
+            verbose (int): The verbosity level. 0 is silent.
+            param_init_fn (str): The parameter initialization scheme to use. One of 'default_', 'baseline_', 'kaiming_uniform_',
+                'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 'xavier_normal_'.
+            init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+            init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                if using the baseline_ parameter initialization scheme.
+            emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+            emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+            init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+            fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+            init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            low_precision_layernorm (bool): Whether to use low precision layer normalization.
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.mlp_ratio = mlp_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.attn_pdrop = attn_pdrop
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.attn_impl = attn_impl
+        self.attn_qk_ln = attn_qk_ln
+        self.attn_clip_qkv = attn_clip_qkv
+        self.softmax_scale = softmax_scale
+        self.prefix_lm = prefix_lm
+        self.attn_uses_sequence_id = attn_uses_sequence_id
+        self.alibi = alibi
+        self.alibi_bias_max = alibi_bias_max
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.verbose = verbose
+        self.param_init_fn = param_init_fn
+        self.init_div_is_residual = init_div_is_residual
+        self.init_std = init_std
+        self.emb_init_std = emb_init_std
+        self.emb_init_uniform_lim = emb_init_uniform_lim
+        self.init_std = init_std
+        self.init_gain = init_gain
+        self.fan_mode = fan_mode
+        self.init_nonlinearity = init_nonlinearity
+        self.embedding_fraction = embedding_fraction
+        self.low_precision_layernorm = low_precision_layernorm
+        self.use_cache = use_cache
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        super().__init__(**kwargs)
+        self._validate_config()
+    def _validate_config(self):
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any(prob < 0 or prob > 1
+               for prob in [self.attn_pdrop, self.resid_pdrop, self.emb_pdrop]):
+            raise ValueError(
+                'attn_pdrop, resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1'
+            )
+        if self.attn_impl not in ['torch', 'flash', 'triton']:
+            raise ValueError(f'Unknown attn_impl={self.attn_impl}')
+        if self.prefix_lm and self.attn_impl not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'prefix_lm only implemented with torch and triton attention.')
+        if self.alibi and self.attn_impl not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'alibi only implemented with torch and triton attention.')
+        if self.attn_uses_sequence_id and self.attn_impl not in [
+                'torch', 'triton'
+        ]:
+            raise NotImplementedError(
+                'attn_uses_sequence_id only implemented with torch and triton attention.'
+            )
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
+            )
+        if isinstance(self.logit_scale,
+                      str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(
+                f"{self.logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+            )

generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.27.4",
+  "use_cache": false
+}

gpt_blocks.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+"""GPT Blocks used for the GPT Model."""
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import MultiheadAttention
+from .low_precision_layernorm import LPLayerNorm
+class GPTMLP(nn.Module):
+    def __init__(self,
+                 d_model: int,
+                 mlp_ratio: int,
+                 device: Optional[str] = None):
+        super().__init__()
+        self.mlp_up = nn.Linear(d_model, mlp_ratio * d_model, device=device)
+        self.mlp_act = nn.GELU(approximate='none')
+        self.mlp_down = nn.Linear(mlp_ratio * d_model, d_model, device=device)
+        self.mlp_down._is_residual = True  # type: ignore
+    def forward(self, x):
+        return self.mlp_down(self.mlp_act(self.mlp_up(x)))
+class GPTBlock(nn.Module):
+    """One pre-norm transformer block: layer norm + attention, then layer norm + MLP.
+    Both sub-layers are added back to the input through a residual connection
+    with dropout applied to the sub-layer output.
+    """
+    def __init__(self,
+                 attn_impl: str,
+                 d_model: int,
+                 n_heads: int,
+                 mlp_ratio: int,
+                 attn_clip_qkv: Optional[float] = None,
+                 attn_qk_ln: bool = False,
+                 softmax_scale: Optional[float] = None,
+                 attn_pdrop: float = 0.0,
+                 alibi: bool = False,
+                 resid_pdrop: float = 0.0,
+                 low_precision_layernorm: bool = False,
+                 device: Optional[str] = None,
+                 **kwargs):
+        """Build the block's sub-modules.
+        Args:
+            attn_impl (str): Attention implementation passed to ``MultiheadAttention``.
+            d_model (int): Model (embedding) dimension.
+            n_heads (int): Number of attention heads.
+            mlp_ratio (int): Hidden-size multiplier for the MLP.
+            attn_clip_qkv (Optional[float]): Passed to ``MultiheadAttention``.
+            attn_qk_ln (bool): Passed to ``MultiheadAttention``.
+            softmax_scale (Optional[float]): Passed to ``MultiheadAttention``.
+            attn_pdrop (float): Passed to ``MultiheadAttention``.
+            alibi (bool): Accepted but not used inside this block.
+            resid_pdrop (float): Dropout probability for both residual branches.
+            low_precision_layernorm (bool): Use ``LPLayerNorm`` for ``ln_1``/``ln_2``.
+            device (Optional[str]): Device on which the parameters are created.
+            **kwargs: Ignored.
+        """
+        del kwargs  # unused, just to capture any extra args from the config
+        super().__init__()
+        layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+        self.ln_1 = layernorm_class(d_model, device=device)
+        # NOTE(review): low_precision_layernorm is not forwarded here, so the
+        # attention module's optional q/k layer norms fall back to its own
+        # default for that argument - confirm this is intended.
+        self.attn = MultiheadAttention(
+            attn_impl=attn_impl,
+            attn_clip_qkv=attn_clip_qkv,
+            attn_qk_ln=attn_qk_ln,
+            softmax_scale=softmax_scale,
+            attn_pdrop=attn_pdrop,
+            d_model=d_model,
+            n_heads=n_heads,
+            device=device,
+        )
+        self.ln_2 = layernorm_class(d_model, device=device)
+        self.mlp = GPTMLP(
+            d_model=d_model,
+            mlp_ratio=mlp_ratio,
+            device=device,
+        )
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_mlp_dropout = nn.Dropout(resid_pdrop)
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attn_bias: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        is_causal: bool = True,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
+        """Run the block on ``x``.
+        Args:
+            x (torch.Tensor): Input hidden states.
+            past_key_value (Optional[Tuple[torch.Tensor]]): Key/value cache handed to the attention module.
+            attn_bias (Optional[torch.Tensor]): Additive attention bias.
+            attention_mask (Optional[torch.ByteTensor]): Mask handed to the attention module.
+            is_causal (bool): Whether causal attention is requested.
+        Returns:
+            Tuple of the updated hidden states and the ``past_key_value``
+            returned by the attention module.
+        """
+        # Attention branch (pre-norm); the returned attention weights are discarded.
+        a = self.ln_1(x)
+        b, _, past_key_value = self.attn(a,
+                                         past_key_value=past_key_value,
+                                         attn_bias=attn_bias,
+                                         attention_mask=attention_mask,
+                                         is_causal=is_causal)
+        x = x + self.resid_attn_dropout(b)
+        # MLP branch (pre-norm).
+        m = self.ln_2(x)
+        n = self.mlp(m)
+        x = x + self.resid_mlp_dropout(n)
+        return x, past_key_value

low_precision_layernorm.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+import torch.nn.functional as F
+class LPLayerNorm(torch.nn.LayerNorm):
+    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
+        super().__init__(
+            normalized_shape=normalized_shape,
+            eps=eps,
+            elementwise_affine=elementwise_affine,
+            device=device,
+            dtype=dtype,
+        )
+    def forward(self, x):
+        module_device = x.device
+        downcast_x = _cast_if_autocast_enabled(x)
+        downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
+        downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
+        with torch.autocast(enabled=False, device_type=module_device.type):
+            return F.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
+def _cast_if_autocast_enabled(tensor):
+    if torch.is_autocast_enabled():
+        if tensor.device.type == 'cuda':
+            dtype = torch.get_autocast_gpu_dtype()
+        elif tensor.device.type == 'cpu':
+            dtype = torch.get_autocast_cpu_dtype()
+        else:
+            raise NotImplementedError()
+        return tensor.to(dtype=dtype)
+    return tensor

mosaic_gpt.py ADDED Viewed

	@@ -0,0 +1,446 @@

+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+"""A simple, flexible implementation of a GPT model.
+Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+"""
+import math
+import warnings
+from typing import List, Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoTokenizer, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .attention import attn_bias as module_attn_bias, attn_bias_shape as module_attn_bias_shape
+from .gpt_blocks import GPTBlock
+from .configuration_mosaic_gpt import \
+    MosaicGPTConfig
+from .param_init_fns import MODEL_INIT_REGISTRY
+from .low_precision_layernorm import LPLayerNorm
+class MosaicGPT(PreTrainedModel):
+    config_class = MosaicGPTConfig
+    base_model_prefix = 'mosaic_gpt'
+    def __init__(self, config: MosaicGPTConfig):
+        """Build the model's modules from ``config``.
+        Creates the token embedding, an optional learned position embedding
+        (only when alibi is off), the embedding dropout, ``n_layers``
+        ``GPTBlock`` modules and the final layer norm, then resolves the logit
+        scale, applies parameter initialization unless ``init_device`` is
+        'meta', records the attention-bias shape, and optionally strips biases.
+        Args:
+            config (MosaicGPTConfig): The model configuration.
+        Raises:
+            ValueError: If ``config.logit_scale`` is an unrecognized string.
+        """
+        super().__init__(config)
+        self.attn_impl = config.attn_impl
+        self.prefix_lm = config.prefix_lm
+        self.attn_uses_sequence_id = config.attn_uses_sequence_id
+        self.alibi = config.alibi
+        self.alibi_bias_max = config.alibi_bias_max
+        layernorm_class = LPLayerNorm if config.low_precision_layernorm else nn.LayerNorm
+        # CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414)
+        # both report this helping with stabilizing training
+        self.embedding_fraction = config.embedding_fraction
+        # Token embedding table.
+        self.transformer = nn.ModuleDict({
+            'wte':
+                nn.Embedding(config.vocab_size,
+                             config.d_model,
+                             device=config.init_device)
+        })
+        # Learned position embeddings are only created when alibi is not used.
+        if not self.alibi:
+            self.transformer.update({
+                'wpe':
+                    nn.Embedding(config.max_seq_len,
+                                 config.d_model,
+                                 device=config.init_device)
+            })
+        self.transformer.update({'emb_drop': nn.Dropout(config.emb_pdrop)})
+        # Each block receives the whole config as keyword arguments; GPTBlock
+        # discards the keys it does not use.
+        self.transformer.update({
+            'blocks':
+                nn.ModuleList([
+                    GPTBlock(device=config.init_device,
+                                        **config.to_dict())
+                    for _ in range(config.n_layers)
+                ])
+        })
+        self.transformer.update({
+            'ln_f': layernorm_class(config.d_model, device=config.init_device)
+        })
+        # enables scaling output logits; similar to a softmax "temperature"
+        # PaLM paper uses scale 1/sqrt(config.d_model)
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == 'inv_sqrt_d_model':
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(
+                        f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+                    )
+            self.logit_scale = logit_scale
+        # Parameters created on the 'meta' device are not initialized here.
+        if config.init_device != 'meta':
+            print(
+                f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
+            )
+            # param_init_fn is defined outside this excerpt.
+            self.apply(self.param_init_fn)
+        self.is_causal = not self.prefix_lm
+        # define attn mask
+        # Only the shape is computed here; self.attn_bias itself stays None for now.
+        self._attn_bias_initialized = False
+        self.attn_bias = None
+        self.attn_bias_shape = module_attn_bias_shape(
+            self.attn_impl,
+            config.n_heads,
+            config.max_seq_len,
+            self.alibi,
+            prefix_lm=self.prefix_lm,
+            causal=self.is_causal,
+            use_sequence_id=self.attn_uses_sequence_id)
+        if config.no_bias:
+            # Replace every bias Parameter with None so the module has no bias.
+            for module in self.modules():
+                if hasattr(module, 'bias') and isinstance(
+                        module.bias, nn.Parameter):
+                    if config.verbose:
+                        print(f'Removing bias ({module.bias}) from {module}.')
+                    module.register_parameter('bias', None)
+        if config.verbose and config.verbose > 2:
+            print(self)
+    @torch.no_grad()
+    def _attn_bias(self,
+                   device,
+                   dtype,
+                   attention_mask: Optional[torch.ByteTensor] = None,
+                   prefix_mask: Optional[torch.ByteTensor] = None,
+                   sequence_id: Optional[torch.LongTensor] = None):
+        """Return the ``(attn_bias, attention_mask)`` pair handed to the blocks.
+        The base bias is built once, on the first call, with the ``device`` and
+        ``dtype`` given then, and is cached on ``self.attn_bias``. For
+        ``attn_impl == 'flash'`` the cached bias and the untouched
+        ``attention_mask`` are returned. Otherwise the prefix mask, sequence ids
+        and attention mask are folded into the bias and ``None`` is returned in
+        place of the mask.
+        Raises:
+            ValueError: if ``attention_mask`` and ``prefix_mask`` are both given
+                but differ in shape.
+        """
+        # One-time lazy construction of the cached base bias.
+        if not self._attn_bias_initialized:
+            # A falsy attn_bias_shape leaves self.attn_bias as None.
+            if self.attn_bias_shape:
+                self.attn_bias = torch.zeros(self.attn_bias_shape,
+                                             device=device,
+                                             dtype=dtype)
+                self.attn_bias = module_attn_bias(
+                    self.attn_impl,
+                    self.attn_bias,
+                    self.config.n_heads,
+                    self.config.max_seq_len,
+                    causal=self.is_causal,
+                    alibi=self.alibi,
+                    alibi_bias_max=self.alibi_bias_max)
+            self._attn_bias_initialized = True
+        # flash does not support prefix_lm and will incorporate any
+        # attention_mask inside the attention module
+        if self.attn_impl == 'flash':
+            return self.attn_bias, attention_mask
+        # From here on work on a local name; the helpers below return new
+        # tensors (slice + masked_fill), so the cached bias is not modified.
+        attn_bias = self.attn_bias
+        # If using torch or triton, we incorporate the prefix_mask (if appropriate)
+        if self.prefix_lm:
+            assert isinstance(attn_bias, torch.Tensor)  # pyright
+            assert isinstance(prefix_mask, torch.Tensor)  # pyright
+            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
+        # If using torch or triton, we incorporate sequence_id (if appropriate)
+        if self.attn_uses_sequence_id and sequence_id is not None:
+            assert isinstance(attn_bias, torch.Tensor)  # pyright
+            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
+        # If using torch or triton, we incorporate attention_mask. This will output
+        # None in place of attention_mask since it will not be further needed in the
+        # attention modules.
+        if attention_mask is not None:
+            s_k = attention_mask.shape[-1]
+            if attn_bias is None:
+                # No base bias exists: start from zeros that broadcast over
+                # batch, heads and query positions.
+                attn_bias = torch.zeros((1, 1, 1, s_k),
+                                        device=device,
+                                        dtype=dtype)
+            else:
+                # Keep only the last s_k entries of the final (key) dimension.
+                attn_bias = attn_bias[:, :, :, -s_k:]
+            if prefix_mask is not None and (attention_mask.shape !=
+                                            prefix_mask.shape):
+                raise ValueError(
+                    f'attention_mask shape={attention_mask.shape} ' +\
+                    f'and prefix_mask shape={prefix_mask.shape} are not equal.'
+                )
+            # Positions where the mask is False get the most negative value
+            # representable in the bias dtype.
+            # NOTE(review): `~` is only a logical NOT for bool masks — confirm
+            # callers pass a bool attention_mask.
+            min_val = torch.finfo(attn_bias.dtype).min
+            attn_bias = attn_bias.masked_fill(
+                ~attention_mask.view(-1, 1, 1, s_k), min_val)
+        return attn_bias, None
+    def _apply_prefix_mask(self, attn_bias: torch.Tensor,
+                           prefix_mask: torch.Tensor):
+        s_k, s_q = attn_bias.shape[-2:]
+        if (s_k != self.config.max_seq_len) or (s_q != self.config.max_seq_len):
+            raise ValueError(
+                'attn_bias does not match the expected shape. ' +\
+                f'The last two dimensions should both be {self.config.max_length} ' +\
+                f'but are {s_k} and {s_q}.'
+            )
+        seq_len = prefix_mask.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
+            )
+        # select seq_len subset of attn mask
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        # Mix the causal max and the bidirectional mask to get the full
+        # allowable attention (i.e. full = not accounting for padding yet)
+        causal = torch.tril(
+            torch.ones((seq_len, seq_len),
+                       dtype=torch.bool,
+                       device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
+        prefix = prefix_mask.view(-1, 1, 1, seq_len)
+        cannot_attend = ~torch.logical_or(causal, prefix.bool())
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def _apply_sequence_id(self, attn_bias: torch.Tensor,
+                           sequence_id: torch.LongTensor):
+        seq_len = sequence_id.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
+            )
+        # select seq_len subset of attn mask
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        # Restrict attention to tokens that share the same value
+        # in sequence_id
+        cannot_attend = torch.logical_not(
+            torch.eq(sequence_id.view(-1, seq_len, 1),
+                     sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def forward(
+            self,
+            input_ids: torch.LongTensor,
+            past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+            attention_mask: Optional[torch.ByteTensor] = None,
+            prefix_mask: Optional[torch.ByteTensor] = None,
+            sequence_id: Optional[torch.LongTensor] = None,
+            return_dict: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            use_cache: Optional[bool] = None):
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # These args are passed in by keyword in huggingface's generate function
+        # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
+        # but have not yet been fully implemented in MosaicGPT
+        if not return_dict:
+            raise NotImplementedError(
+                'return_dict False is not implemented yet for MosaicGPT')
+        if output_attentions:
+            raise NotImplementedError(
+                'output_attentions is not implemented yet for MosaicGPT')
+        if attention_mask is not None and attention_mask[:, 0].sum(
+        ) != attention_mask.shape[0] and self.training:
+            raise NotImplementedError(
+                'MosaicGPT does not support training with left padding.')
+        if self.prefix_lm and prefix_mask is None:
+            raise ValueError(
+                'prefix_mask is a required argument when MosaicGPT is configured with prefix_lm=True.'
+            )
+        if self.training:
+            if self.attn_uses_sequence_id and sequence_id is None:
+                raise ValueError(
+                    'sequence_id is a required argument when MosaicGPT is configured with attn_uses_sequence_id=True ' +\
+                    'and the model is in train mode.'
+                )
+            elif (self.attn_uses_sequence_id is False) and (sequence_id
+                                                            is not None):
+                warnings.warn(
+                    'MosaicGPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' +\
+                    'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
+                )
+        S = input_ids.size(1)
+        assert (
+            S <= self.config.max_seq_len
+        ), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
+        tok_emb = self.transformer.wte(input_ids)  # type: ignore
+        if self.alibi:
+            x = tok_emb
+        else:
+            past_position = 0
+            if past_key_values is not None:
+                if len(past_key_values) != self.config.n_layers:
+                    raise ValueError(
+                        f'past_key_values must provide a past_key_value for each attention ' +\
+                        f'layer in the network ({len(past_key_values)=}; {self.config.n_layers=}).'
+                    )
+                # get the key tensor whose spec should be (batch, seq, dim), and
+                # collect the `seq`, so that the position embedding is shifted
+                past_position = past_key_values[0][0].size(1)
+            if S + past_position > self.config.max_seq_len:
+                raise ValueError(
+                    f'Cannot forward input with past sequence length {past_position} and current sequence length '
+                    f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.'
+                )
+            pos = torch.arange(past_position,
+                               S + past_position,
+                               dtype=torch.long,
+                               device=input_ids.device).unsqueeze(0)
+            if attention_mask is not None:
+                # adjust the position indices to account for padding tokens
+                pos = torch.clamp(pos - torch.cumsum(
+                    (~attention_mask).to(torch.int32), dim=1)[:,
+                                                              past_position:],
+                                  min=0)
+            pos_emb = self.transformer.wpe(pos)  # type: ignore
+            x = tok_emb + pos_emb
+        if self.embedding_fraction == 1:
+            x = self.transformer.emb_drop(x)  # type: ignore
+        else:
+            # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
+            x_shrunk = (x * self.embedding_fraction) + (
+                x.detach() * (1 - self.embedding_fraction))
+            assert isinstance(self.transformer.emb_drop, nn.Module)  # pyright
+            x = self.transformer.emb_drop(x_shrunk)
+        attn_bias, attention_mask = self._attn_bias(
+            device=x.device,
+            dtype=x.dtype,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id)
+        # initialize the past key values cache if it should be used
+        if use_cache and past_key_values is None:
+            past_key_values = [() for _ in range(self.config.n_layers)
+                              ]  # type: ignore
+        all_hidden_states = () if output_hidden_states else None
+        for b_idx, block in enumerate(self.transformer.blocks):  # type: ignore
+            if output_hidden_states:
+                assert all_hidden_states is not None  # pyright
+                all_hidden_states = all_hidden_states + (x,)
+            past_key_value = past_key_values[
+                b_idx] if past_key_values is not None else None
+            x, past_key_value = block(x,
+                                      past_key_value=past_key_value,
+                                      attn_bias=attn_bias,
+                                      attention_mask=attention_mask,
+                                      is_causal=self.is_causal)
+            if past_key_values is not None:
+                past_key_values[b_idx] = past_key_value
+        x = self.transformer.ln_f(x)  # type: ignore
+        # output embedding weight tied to input embedding
+        assert isinstance(self.transformer.wte, nn.Module)  # pyright
+        assert isinstance(self.transformer.wte.weight, torch.Tensor)  # pyright
+        logits = F.linear(x, self.transformer.wte.weight, None)
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(
+                    f'Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.'
+                )
+            logits *= self.logit_scale
+        return CausalLMOutputWithPast(logits=logits,
+                                      past_key_values=past_key_values,
+                                      hidden_states=all_hidden_states)
+    # Param Initialization, needed for device='meta' fast initialization
+    def param_init_fn(self, module):
+        init_fn_name = self.config.param_init_fn
+        if self.config.verbose > 1:
+            warnings.warn(f'Using {init_fn_name} initialization.')
+        MODEL_INIT_REGISTRY[init_fn_name](module=module,
+                                          **self.config.to_dict())
+    # FSDP Wrap function
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, GPTBlock)
+    # Activation Checkpointing
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, GPTBlock)
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past_key_values=None,
+                                      inputs_embeds=None,
+                                      **kwargs):
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                'inputs_embeds is not implemented for MosaicGPT yet')
+        attention_mask = kwargs['attention_mask'].bool()
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError(
+                'MosaicGPT does not support generation with right padding.')
+        if self.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.prefix_lm:
+            # Leverage a convenience of sequential generation!
+            prefix_mask = torch.ones_like(attention_mask)
+            # This requires that we're using the cache
+            if kwargs.get('use_cache') == False:
+                raise NotImplementedError(
+                    'MosaicGPT with prefix_lm=True does not support use_cache=False.'
+                )
+        else:
+            prefix_mask = None
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'prefix_mask': prefix_mask,
+            'sequence_id': sequence_id,
+            'past_key_values': past_key_values,
+            'use_cache': kwargs.get('use_cache', True),
+        }
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Used by HuggingFace generate when using beam search with kv-caching.
+        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
+        for an example in transformers.
+        """
+        reordered_past = []
+        for layer_past in past_key_values:
+            reordered_past += [
+                tuple(
+                    past_state.index_select(0, beam_idx)
+                    for past_state in layer_past)
+            ]
+        return reordered_past

param_init_fns.py ADDED Viewed

	@@ -0,0 +1,464 @@

+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+import math
+import warnings
+from collections.abc import Sequence
+from functools import partial
+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+def torch_default_param_init_fn_(
+    module: nn.Module,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    if verbose > 1:
+        warnings.warn(
+            f"Initializing network using module's reset_parameters attribute")
+    if hasattr(module, 'reset_parameters'):
+        module.reset_parameters()  # type: ignore
+def fused_init_helper_(module: nn.Module, init_fn_):
+    # parameter initialization is often based on the parameters shape.
+    # If a layer is fused, initialization should be based on the shapes
+    # of the original tensor instead of the shape of the fused tensor.
+    # Layers which are fused should have the _fused attibute defined.
+    # The first element of _fused is the dimension along which the tensor is fused.
+    # This is followed by an iterable of split indices."
+    _fused = getattr(module, '_fused', None)
+    if _fused is None:
+        raise RuntimeError(f'Internal logic error')
+    dim, splits = _fused
+    splits = (0, *splits, module.weight.size(dim))  # type: ignore
+    for s, e in zip(splits[:-1], splits[1:]):
+        slice_indices = [slice(None)] * module.weight.ndim  # type: ignore
+        slice_indices[dim] = slice(s, e)
+        init_fn_(module.weight[slice_indices])  # type: ignore
+def generic_param_init_fn_(
+    module: nn.Module,
+    init_fn_,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    if verbose > 1:
+        warnings.warn(
+            f'If model has bias parameters they are initialized to 0.')
+    # enable user to divide _is_residual weights by
+    # a value which defaults to math.sqrt(2 * cfg.n_layers)
+    init_div_is_residual = init_div_is_residual
+    if init_div_is_residual is False:
+        # not used, for pyright
+        div_is_residual = 1.0
+    elif init_div_is_residual is True:
+        div_is_residual = math.sqrt(2 * n_layers)
+    elif isinstance(init_div_is_residual, float) or isinstance(
+            init_div_is_residual, int):
+        div_is_residual = init_div_is_residual
+    elif isinstance(init_div_is_residual,
+                    str) and init_div_is_residual.isnumeric():
+        # do not trust YAML parsing to always convert numbers to numbers
+        div_is_residual = float(init_div_is_residual)
+    else:
+        # not used, for pyright
+        div_is_residual = 1.0
+        raise ValueError(
+            f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}'
+        )
+    if init_div_is_residual is not False:
+        if verbose > 1:
+            warnings.warn(
+                f'Initializing _is_residual layers then dividing them by {div_is_residual}.' +\
+                f'set `init_div_is_residual: false` in model config to disable this.'
+            )
+    if isinstance(module, nn.Linear):
+        # Linear
+        if hasattr(module, '_fused'):
+            fused_init_helper_(module, init_fn_)
+        else:
+            init_fn_(module.weight)
+        if module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+        if init_div_is_residual is not False and getattr(
+                module, '_is_residual', False):
+            with torch.no_grad():
+                module.weight.div_(div_is_residual)
+    elif isinstance(module, nn.Embedding):
+        # Embedding
+        if emb_init_std is not None:
+            std = emb_init_std
+            if std == 0:
+                warnings.warn(f'Embedding layer initialized to 0.')
+            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
+            if verbose > 1:
+                warnings.warn(
+                    f'Embedding layer initialized using normal distribution with mean=0 and {std=}.'
+                )
+        elif emb_init_uniform_lim is not None:
+            lim = emb_init_uniform_lim
+            if isinstance(lim, Sequence):
+                if len(lim) > 2:
+                    raise ValueError(
+                        f'Uniform init requires a min and a max limit. User input: {lim}.'
+                    )
+                if lim[0] == lim[1]:
+                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
+            else:
+                if lim == 0:
+                    warnings.warn(f'Embedding layer initialized to 0.')
+                lim = [-lim, lim]
+            a, b = lim
+            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
+            if verbose > 1:
+                warnings.warn(
+                    f'Embedding layer initialized using uniform distribution in range {lim}.'
+                )
+        else:
+            emb_init_fn_ = init_fn_
+        emb_init_fn_(module.weight)
+    elif isinstance(module, nn.LayerNorm):
+        # LayerNorm
+        if verbose > 1:
+            warnings.warn(
+                f'LayerNorm gamma weights are set to 1. If the layer has a bias it is initialized to 0.'
+            )
+        torch.nn.init.ones_(module.weight)
+        if module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.MultiheadAttention):
+        # torch's MultiheadAttention
+        if module._qkv_same_embed_dim:
+            assert module.in_proj_weight is not None
+            assert module.q_proj_weight is None and module.k_proj_weight is None and module.v_proj_weight is None
+            assert d_model is not None
+            # in_proj_weight is actually 3 layers and should be split up for width based init
+            _d = d_model
+            splits = (0, _d, 2 * _d, 3 * _d)
+            for s, e in zip(splits[:-1], splits[1:]):
+                init_fn_(module.in_proj_weight[s:e])
+        else:
+            assert module.q_proj_weight is not None and module.k_proj_weight is not None and module.v_proj_weight is not None
+            assert module.in_proj_weight is None
+            init_fn_(module.q_proj_weight)
+            init_fn_(module.k_proj_weight)
+            init_fn_(module.v_proj_weight)
+        # bias
+        if module.in_proj_bias is not None:
+            torch.nn.init.zeros_(module.in_proj_bias)
+        if module.bias_k is not None:
+            torch.nn.init.zeros_(module.bias_k)
+        if module.bias_v is not None:
+            torch.nn.init.zeros_(module.bias_v)
+        # out proj
+        init_fn_(module.out_proj.weight)
+        if init_div_is_residual is not False and getattr(
+                module.out_proj, '_is_residual', False):
+            with torch.no_grad():
+                module.out_proj.weight.div_(div_is_residual)
+        if module.out_proj.bias is not None:
+            torch.nn.init.zeros_(module.out_proj.bias)
+    else:
+        for _ in module.parameters(recurse=False):
+            # raise error if uninitialized module has any parameters
+            raise NotImplementedError(
+                f'{module.__class__.__name__} parameters are not initialized by param_init_fn.'
+            )
+def _normal_init_(std, mean=0.0):
+    return partial(torch.nn.init.normal_, mean=mean, std=std)
+def _normal_param_init_fn_(
+    module: nn.Module,
+    std: float,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    init_fn_ = _normal_init_(std=std)
+    if verbose > 1:
+        warnings.warn(
+            f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=init_fn_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def baseline_param_init_fn_(
+    module: nn.Module,
+    init_std: float,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    if init_std is None:
+        raise ValueError(
+            'You must set model.init_std to a float value to use the default initialization scheme.'
+        )
+    _normal_param_init_fn_(
+        module=module,
+        std=init_std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def small_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: int,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    # very close to kaiming normal
+    # from Transformers without Tears (2019) - Nguyen & Salazar
+    std = math.sqrt(2 / (5 * d_model))
+    _normal_param_init_fn_(
+        module=module,
+        std=std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def neox_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: int,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    """From section 2.3.1 of GPT-NeoX-20B:
+    An Open-Source AutoregressiveLanguage Model — Black et. al. (2022)
+    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
+    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
+    """
+    del kwargs  # unused, just to capture any extra args from the config
+    residual_div = n_layers / math.sqrt(10)  # small std / wang std
+    if verbose > 1:
+        warnings.warn(f'setting init_div_is_residual to {residual_div}')
+    small_param_init_fn_(
+        module=module,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=residual_div,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def kaiming_uniform_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    fan_mode: str = 'fan_in',
+    init_nonlinearity: str = 'leaky_relu',
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    if verbose > 1:
+        warnings.warn(
+            f'Using nn.init.kaiming_uniform_ init fn with parameters: ' +\
+            f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
+        )
+    kaiming_uniform_ = partial(nn.init.kaiming_uniform_,
+                               a=init_gain,
+                               mode=fan_mode,
+                               nonlinearity=init_nonlinearity)
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=kaiming_uniform_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def kaiming_normal_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    fan_mode: str = 'fan_in',
+    init_nonlinearity: str = 'leaky_relu',
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    if verbose > 1:
+        warnings.warn(
+            f'Using nn.init.kaiming_normal_ init fn with parameters: ' +\
+            f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
+        )
+    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_,
+                              a=init_gain,
+                              mode=fan_mode,
+                              nonlinearity=init_nonlinearity)
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=kaiming_normal_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def xavier_uniform_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs  # unused, just to capture any extra args from the config
+    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' +\
+            f'gain={init_gain}'
+        )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=xavier_uniform_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def xavier_normal_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    verbose: int = 0,
+    **kwargs,
+):
+    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' +\
+            f'gain={init_gain}'
+        )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=xavier_normal_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+# Maps an initialization-scheme name to the function implementing it. Each
+# function takes the module to initialize as `module` and accepts further
+# keyword arguments.
+MODEL_INIT_REGISTRY = {
+    'default_': torch_default_param_init_fn_,
+    'baseline_': baseline_param_init_fn_,
+    'kaiming_uniform_': kaiming_uniform_param_init_fn_,
+    'kaiming_normal_': kaiming_normal_param_init_fn_,
+    'neox_init_': neox_param_init_fn_,
+    'small_init_': small_param_init_fn_,
+    'xavier_uniform_': xavier_uniform_param_init_fn_,
+    'xavier_normal_': xavier_normal_param_init_fn_,
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f195ac04c4300f0c0cf51f97d1e77580353699d0f56285072e38f555dbd68c1
+size 5245834073