# granite-3b-code-base-2k / modeling_granite.py
import math
import numbers
import warnings
from enum import Enum
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import DynamicCache, PreTrainedModel
from transformers.activations import get_activation as get_base_activation
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from transformers.utils import is_flash_attn_2_available
from .configuration_granite import GraniteConfig
class PositionEmbeddingType(Enum):
learned_absolute = "learned_absolute"
alibi = "alibi"
rope = "rope"
class AttentionHeadType(Enum):
mha = "mha"
mqa = "mqa"
gqa = "gqa"
if is_flash_attn_2_available():
from flash_attn.bert_padding import IndexFirstAxis, pad_input, unpad_input
from flash_attn.flash_attn_interface import flash_attn_varlen_func
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return indices, cu_seqlens, max_seqlen_in_batch
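# Editor's illustrative sketch (not part of the original modeling code): for a right-padded
# mask like [[1, 1, 1, 0], [1, 1, 0, 0]], `get_unpad_data` returns the flat indices of the
# non-padding tokens, the cumulative sequence lengths and the longest sequence length,
# which is the metadata `flash_attn_varlen_func` consumes.
def _example_get_unpad_data() -> None:
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    indices, cu_seqlens, max_seqlen = get_unpad_data(mask)
    assert indices.tolist() == [0, 1, 2, 4, 5]
    assert cu_seqlens.tolist() == [0, 3, 5]
    assert max_seqlen == 3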
def repeat_key_value(x: torch.Tensor, num_heads: int, num_key_value_heads: int) -> torch.Tensor:
num_groups = num_heads // num_key_value_heads
# mha
if num_groups == 1:
return x
# mqa
if num_key_value_heads == 1:
return x.expand(-1, num_heads, -1, -1)
# gqa
return x.repeat_interleave(num_groups, dim=1)
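# Editor's illustrative sketch (not part of the original modeling code): with 8 query heads
# and 2 key/value heads (GQA), each KV head is repeated 4 times so that keys/values line up
# with the query heads; with 1 KV head (MQA) the tensor is expanded instead of copied.
def _example_repeat_key_value() -> None:
    kv = torch.randn(2, 2, 5, 16)  # (batch_size, num_key_value_heads, seq_len, head_dim)
    out = repeat_key_value(kv, num_heads=8, num_key_value_heads=2)
    assert out.shape == (2, 8, 5, 16)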
##################################################
# activation functions
_GLU_BASE_MAPPING = {
"geglu": "gelu",
"miglu": "mish",
"mishglu": "mish",
"swiglu": "swish",
}
class GLUActivation(nn.Module):
def __init__(self, base_activation: nn.Module) -> None:
super().__init__()
self.base_activation = base_activation
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.chunk(2, dim=-1)
return x[0] * self.base_activation(x[1])
def is_glu(name: str) -> bool:
return name.endswith("glu")
def get_activation_function(name: str) -> nn.Module:
if is_glu(name):
        # for glu and sigmoid_glu, we directly return PyTorch's GLU
        if name in ["glu", "sigmoid_glu"]:
            activation_function = nn.GLU()
        else:
            if name in _GLU_BASE_MAPPING:
                name = _GLU_BASE_MAPPING[name]
            elif name.endswith("_glu"):
                # strip the "_glu" suffix; str.rstrip("_glu") would strip a character set and mangle names like "gelu_glu"
                name = name[: -len("_glu")]
            else:
                raise ValueError(f"invalid activation function {name}")
base_activation = get_base_activation(name)
activation_function = GLUActivation(base_activation)
else:
activation_function = get_base_activation(name)
return activation_function
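# Editor's illustrative sketch (not part of the original modeling code): "swiglu" resolves to
# a SiLU-gated GLU, so the input is chunked in half along the last dimension and the output
# has half the input's feature size.
def _example_swiglu() -> None:
    act = get_activation_function("swiglu")
    y = act(torch.randn(2, 4, 64))
    assert y.shape == (2, 4, 32)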
##################################################
# normalization functions
class RMSNorm(nn.Module):
def __init__(self, normalized_shape: int, eps: float = 1e-6) -> None:
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.eps = eps
if isinstance(normalized_shape, numbers.Integral):
normalized_shape = (normalized_shape,)
self.normalized_shape = normalized_shape
def forward(self, input: torch.Tensor) -> torch.Tensor:
input_dtype = input.dtype
input = input.to(torch.float32)
variance = input.pow(2).mean(-1, keepdim=True)
input = input * torch.rsqrt(variance + self.eps)
return self.weight * input.to(input_dtype)
def extra_repr(self) -> str:
return f"{self.normalized_shape}, eps={self.eps}"
def reset_parameters(self) -> None:
nn.init.ones_(self.weight)
_NORMALIZATION_FUNCTIONS = {
"layernorm": nn.LayerNorm,
"rmsnorm": RMSNorm,
}
def get_normalization_function(name: str, normalized_shape: int, eps: float = 1e-5) -> nn.Module:
if name in _NORMALIZATION_FUNCTIONS:
return _NORMALIZATION_FUNCTIONS[name](normalized_shape, eps=eps)
raise ValueError(f"unexpected `normalization_function` {name}")
##################################################
# attention modules
class GraniteAttention(nn.Module):
def __init__(self, config: GraniteConfig, causal: bool, layer_idx: Optional[int] = None) -> None:
super().__init__()
self.causal = causal
self.hidden_size = config.n_embd
self.num_heads = config.n_head
self.num_key_value_heads = config.num_key_value_heads
self.add_bias = config.add_bias
assert (
self.hidden_size % self.num_heads == 0
), f"`hidden_size` ({self.hidden_size}) must be divisible by `num_heads` ({self.num_heads})"
self.head_dim = self.hidden_size // self.num_heads
self.attention_head_type = AttentionHeadType(config.attention_head_type)
self.position_embedding_type = PositionEmbeddingType(config.position_embedding_type)
self.scale_attn_weights = config.scale_attn_weights
self.attention_multiplier = config.attention_multiplier
self.layer_idx = layer_idx
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = (
config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
)
if self.attention_head_type == AttentionHeadType.mha:
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_heads
assert (
self.num_heads == self.num_key_value_heads
), f"{self.__class__.__name__} should have same number of heads for query, keys and values"
elif self.attention_head_type == AttentionHeadType.gqa:
assert (
self.num_key_value_heads is not None
), "`num_key_value_heads` needs to be specified with GroupedQueryAttention"
assert self.num_heads % self.num_key_value_heads == 0, (
f"`num_heads` ({self.num_heads}) should be a multiple of `num_key_value_heads` "
f"({self.num_key_value_heads})"
)
elif self.attention_head_type == AttentionHeadType.mqa:
if self.num_key_value_heads is None:
self.num_key_value_heads = 1
assert self.num_key_value_heads == 1, f"{self.__class__.__name__} should have 1 head for keys and values"
else:
raise ValueError(f"unexpected attention_head_type ({self.attention_head_type})")
# note that the actual layout is different for the output and depends on whether we are using MHA, MQA or GQA
        # (self.hidden_size + 2 * self.num_key_value_heads * self.head_dim) is just the actual number of output features
self.c_attn = nn.Linear(
self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=self.add_bias
)
self.c_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.add_bias)
self.attn_pdrop = config.attn_pdrop
self.resid_pdrop = config.resid_pdrop
self.attn_dropout = nn.Identity() if self.attn_pdrop == 0 else nn.Dropout(self.attn_pdrop)
self.resid_dropout = nn.Identity() if self.resid_pdrop == 0 else nn.Dropout(self.resid_pdrop)
def _prepare_qkv_for_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# ==========================================================================================
# hidden_states -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
        # the output of the following is a tuple if using MQA with tensor parallel
hidden_states = self.c_attn(hidden_states)
# ==========================================================================================
# hidden_states -> (batch_size, query_length, [num_heads + num_key_value_heads * 2] * head_dim)
# ==========================================================================================
# for MHA, we can get away with doing just 1 transpose which is not true for GQA
if self.attention_head_type == AttentionHeadType.mha:
query, key, value = self._prepare_qkv_for_forward_mha(hidden_states)
elif self.attention_head_type == AttentionHeadType.gqa:
query, key, value = self._prepare_qkv_for_forward_gqa(hidden_states)
elif self.attention_head_type == AttentionHeadType.mqa:
query, key, value = self._prepare_qkv_for_forward_mqa(hidden_states)
else:
raise ValueError(f"unexpected attention_head_type ({self.attention_head_type})")
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, query_length, head_dim)
# value -> (batch_size, num_key_value_heads, query_length, head_dim)
# ==========================================================================================
return query, key, value
def _prepare_qkv_for_forward_mha(
self, hidden_states: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
batch_size, query_length = hidden_states.shape[:-1]
hidden_states = hidden_states.view(batch_size, query_length, self.num_heads, -1)
hidden_states = hidden_states.transpose(1, 2)
query, key, value = hidden_states.chunk(3, dim=-1)
return query, key, value
def _prepare_qkv_for_forward_gqa(
self, hidden_states: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
batch_size, query_length = hidden_states.shape[:-1]
hidden_states = hidden_states.view(batch_size, query_length, self.num_key_value_heads, -1)
query, key, value = hidden_states.split(
((self.num_heads // self.num_key_value_heads) * self.head_dim, self.head_dim, self.head_dim), dim=-1
)
# this needs to be a reshape instead of view sadly
query = query.reshape(batch_size, query_length, -1, self.head_dim)
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
return query, key, value
def _prepare_qkv_for_forward_mqa(
self, hidden_states: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
batch_size, query_length = hidden_states.shape[:-1]
query, key, value = hidden_states.split((self.hidden_size, self.head_dim, self.head_dim), dim=-1)
query = query.view(batch_size, query_length, self.num_heads, -1)
query = query.transpose(1, 2)
key = key.unsqueeze(1)
value = value.unsqueeze(1)
return query, key, value
def forward(
self,
hidden_states: torch.Tensor,
past_key_values: Optional[DynamicCache] = None,
attention_mask: Optional[torch.Tensor] = None,
rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
# ==========================================================================================
# hidden_states -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
query, key, value = self._prepare_qkv_for_forward(hidden_states)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, query_length, head_dim)
# value -> (batch_size, num_key_value_heads, query_length, head_dim)
# ==========================================================================================
if self.position_embedding_type == PositionEmbeddingType.rope:
query = apply_rotary_pos_emb(query, rope_cos_sin)
key = apply_rotary_pos_emb(key, rope_cos_sin)
if past_key_values is not None:
key, value = past_key_values.update(key, value, self.layer_idx)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, key_length, head_dim)
# value -> (batch_size, num_key_value_heads, key_length, head_dim)
# ==========================================================================================
key = key.transpose(-1, -2)
dtype = query.dtype
softmax_dtype = torch.float32 if self.attention_softmax_in_fp32 else dtype
if self.scale_attn_weights:
if self.attention_multiplier is None:
scale_factor = 1 / self.head_dim**0.5
else:
scale_factor = self.attention_multiplier
else:
scale_factor = 1
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, head_dim, key_length)
# value -> (batch_size, num_key_value_heads, key_length, head_dim)
# ==========================================================================================
batch_size = query.shape[0]
query_length = query.shape[2]
key_length = key.shape[-1]
key = repeat_key_value(key, self.num_heads, self.num_key_value_heads)
value = repeat_key_value(value, self.num_heads, self.num_key_value_heads)
# Always copies
query = query.reshape(batch_size * self.num_heads, query_length, self.head_dim)
        # No copy when past_key_values is provided.
key = key.reshape(batch_size * self.num_heads, self.head_dim, key_length)
# ==========================================================================================
# query -> (batch_size * num_heads, query_length, head_dim)
# key -> (batch_size * num_heads, head_dim, key_length)
# value -> (batch_size, num_heads, key_length, head_dim)
# ==========================================================================================
attn_weights = torch.empty(
(batch_size * self.num_heads, query_length, key_length), device=query.device, dtype=query.dtype
)
attn_weights = torch.baddbmm(attn_weights, query, key, beta=0, alpha=scale_factor).view(
batch_size, self.num_heads, query_length, key_length
)
# ==========================================================================================
# attn_weights -> (batch_size, num_heads, query_length, key_length)
# ==========================================================================================
attn_weights = attn_weights.to(softmax_dtype)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = F.softmax(attn_weights, dim=-1).to(dtype)
attn_weights = self.attn_dropout(attn_weights)
# ==========================================================================================
# value -> (batch_size, num_heads, key_length, head_dim)
# attn_weights -> (batch_size, num_heads, query_length, key_length)
# ==========================================================================================
attn_output = torch.matmul(attn_weights, value)
# ==========================================================================================
# attn_output -> (batch_size, num_heads, query_length, head_dim)
# ==========================================================================================
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)
# ==========================================================================================
# attn_output -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
return attn_output
class GraniteSDPA(GraniteAttention):
def forward(
self,
hidden_states: torch.Tensor,
past_key_values: Optional[DynamicCache] = None,
attention_mask: Optional[torch.Tensor] = None,
rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
# ==========================================================================================
# hidden_states -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
query, key, value = self._prepare_qkv_for_forward(hidden_states)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, query_length, head_dim)
# value -> (batch_size, num_key_value_heads, query_length, head_dim)
# ==========================================================================================
if self.position_embedding_type == PositionEmbeddingType.rope:
query = apply_rotary_pos_emb(query, rope_cos_sin)
key = apply_rotary_pos_emb(key, rope_cos_sin)
if past_key_values is not None:
key, value = past_key_values.update(key, value, self.layer_idx)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, key_length, head_dim)
# value -> (batch_size, num_key_value_heads, key_length, head_dim)
# ==========================================================================================
key = repeat_key_value(key, self.num_heads, self.num_key_value_heads)
value = repeat_key_value(value, self.num_heads, self.num_key_value_heads)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_heads, key_length, head_dim)
# value -> (batch_size, num_heads, key_length, head_dim)
# ==========================================================================================
attn_output = F.scaled_dot_product_attention(
query,
key,
value,
attn_mask=attention_mask,
dropout_p=self.attn_pdrop if self.training else 0,
is_causal=self.causal if attention_mask is None else False,
scale=self.attention_multiplier if self.scale_attn_weights else 1,
)
# ==========================================================================================
# attn_output -> (batch_size, num_heads, query_length, head_dim)
# ==========================================================================================
batch_size = attn_output.shape[0]
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)
# ==========================================================================================
# attn_output -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
return attn_output
class GraniteFlashAttention2(GraniteAttention):
def forward(
self,
hidden_states: torch.Tensor,
past_key_values: Optional[DynamicCache] = None,
attention_mask: Optional[torch.Tensor] = None,
rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
# ==========================================================================================
# hidden_states -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
query, key, value = self._prepare_qkv_for_forward(hidden_states)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, query_length, head_dim)
# value -> (batch_size, num_key_value_heads, query_length, head_dim)
# ==========================================================================================
if self.position_embedding_type == PositionEmbeddingType.rope:
query = apply_rotary_pos_emb(query, rope_cos_sin)
key = apply_rotary_pos_emb(key, rope_cos_sin)
if past_key_values is not None:
key, value = past_key_values.update(key, value, self.layer_idx)
# ==========================================================================================
# query -> (batch_size, num_heads, query_length, head_dim)
# key -> (batch_size, num_key_value_heads, key_length, head_dim)
# value -> (batch_size, num_key_value_heads, key_length, head_dim)
# ==========================================================================================
# TODO avoid this extra transpose
query = query.transpose(1, 2)
if self.attention_head_type == AttentionHeadType.mqa:
key = key.squeeze(1).unsqueeze(2)
value = value.squeeze(1).unsqueeze(2)
else:
key = key.transpose(1, 2)
value = value.transpose(1, 2)
# ==========================================================================================
# query -> (batch_size, query_length, num_heads, head_dim)
# key -> (batch_size, key_length, num_heads, head_dim)
# value -> (batch_size, key_length, num_heads, head_dim)
# ==========================================================================================
batch_size, query_length = query.shape[:2]
key_length = key.shape[1]
indices_k, cu_seqlens_k, max_seqlen_k = get_unpad_data(attention_mask)
key = IndexFirstAxis.apply(
key.reshape(batch_size * key_length, self.num_key_value_heads, self.head_dim), indices_k
)
value = IndexFirstAxis.apply(
value.reshape(batch_size * key_length, self.num_key_value_heads, self.head_dim), indices_k
)
if query_length == key_length:
query = IndexFirstAxis.apply(
query.reshape(batch_size * key_length, self.num_heads, self.head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_q = max_seqlen_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query.device
) # There is a memcpy here, that is very bad.
indices_q = cu_seqlens_q[:-1]
query = query.squeeze(1)
else:
            # The `-query_length:` slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(query, attention_mask)
# ==========================================================================================
# query -> (total_q, num_heads, head_dim)
# key -> (total_q, num_heads, head_dim)
# value -> (total_q, num_heads, head_dim)
# ==========================================================================================
attn_output = flash_attn_varlen_func(
query,
key,
value,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
dropout_p=self.attn_pdrop if self.training else 0,
softmax_scale=self.attention_multiplier if self.scale_attn_weights else 1,
causal=self.causal,
)
# ==========================================================================================
# attn_output -> (total_q, num_heads, head_dim)
# ==========================================================================================
attn_output = pad_input(attn_output, indices_q, batch_size, query_length)
attn_output = attn_output.view(batch_size, query_length, -1)
# ==========================================================================================
# attn_output -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
return attn_output
_ATTENTION_MODULES = {
"eager": GraniteAttention,
"sdpa": GraniteSDPA,
"flash_attention_2": GraniteFlashAttention2,
}
def get_attention_module(
config: GraniteConfig, causal: bool, attention_implementation: str, layer_idx: int
) -> GraniteAttention:
if attention_implementation in _ATTENTION_MODULES:
return _ATTENTION_MODULES[attention_implementation](config, causal=causal, layer_idx=layer_idx)
raise ValueError(f"unexpected `attention_implementation` {attention_implementation}")
##################################################
# position embeddings
class Alibi(nn.Module):
def __init__(self, num_heads: int) -> None:
super().__init__()
self.num_heads = num_heads
self.reset_parameters()
def forward(
self, attention_mask: torch.Tensor, batch_size: int, key_length: int, device: torch.device, dtype: torch.dtype
) -> torch.Tensor:
"""
        Link to paper: https://arxiv.org/abs/2108.12409. The alibi tensor is not causal as in the original paper; it
        relies on the translation invariance of softmax for a quick implementation: for a tensor `l` and a fixed value
        `a`, `softmax(l + a) = softmax(l)`. Based on
        https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
        TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.
Args:
attention_mask (torch.Tensor): attention_mask tensor of shape (`batch_size`, `key_length`)
batch_size (int): `batch_size`
key_length (int): `key_length`
device (torch.device): device for the tensors
dtype (torch.dtype): dtype to use for the tensors
Returns:
torch.Tensor: alibi tensor of shape (`batch_size`, `num_heads`, `key_length`)
"""
        # Note: alibi will be added to the attention bias that is applied to the query-key product of attention
# => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
# => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
# => the query_length dimension will then be broadcasted correctly
# This is more or less identical to T5's relative position bias:
# https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
if attention_mask is None:
arange_tensor = (
torch.arange(key_length, device=device).unsqueeze(0).unsqueeze(0).expand(batch_size, -1, -1)
)
else:
arange_tensor = (attention_mask.cumsum(dim=-1) - 1).masked_fill_(attention_mask == 0, 0).unsqueeze(1)
alibi = self.slopes.unsqueeze(1) * arange_tensor
return alibi.to(dtype)
def reset_parameters(self) -> None:
closest_power_of_2 = 2 ** math.floor(math.log2(self.num_heads))
base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32)
powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
slopes = torch.pow(base, powers)
if closest_power_of_2 != self.num_heads:
extra_base = torch.tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32)
num_remaining_heads = min(closest_power_of_2, self.num_heads - closest_power_of_2)
extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=torch.int32)
slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
self.register_buffer("slopes", slopes, persistent=False)
class RoPE(nn.Module):
def __init__(
self,
head_dim: int,
max_position_embeddings: int = 2048,
base: int = 10000,
) -> None:
super().__init__()
self.head_dim = head_dim
self.max_position_embeddings = max_position_embeddings
self.base = base
self.mscale = 1
self.reset_parameters()
def forward(self, seq_len: int, dtype: torch.dtype, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=device, dtype=dtype)
cos = self.cos_cached[:seq_len].to(dtype)
sin = self.sin_cached[:seq_len].to(dtype)
return cos, sin
def reset_parameters(self) -> None:
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.head_dim, 2).float() / self.head_dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=self.max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
@torch.no_grad()
def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> None:
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", (emb.cos() * self.mscale).to(dtype), persistent=False)
self.register_buffer("sin_cached", (emb.sin() * self.mscale).to(dtype), persistent=False)
def apply_rotary_pos_emb(x: torch.Tensor, cos_sin: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
cos, sin = cos_sin
x = (x * cos) + (_rotate_half(x) * sin)
return x
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
x1, x2 = torch.chunk(x, 2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
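# Editor's illustrative sketch (not part of the original modeling code): rotary position
# embeddings only rotate pairs of feature dimensions, so they preserve the norm of every
# query/key vector.
def _example_rope_preserves_norm() -> None:
    head_dim = 64
    rope = RoPE(head_dim, max_position_embeddings=128)
    cos, sin = rope(16, dtype=torch.float32, device=torch.device("cpu"))
    x = torch.randn(1, 2, 16, head_dim)  # (batch_size, num_heads, seq_len, head_dim)
    y = apply_rotary_pos_emb(x, (cos[None, None], sin[None, None]))
    assert torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-4)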
##################################################
# MLP
class GraniteMLP(nn.Module):
def __init__(self, config: GraniteConfig) -> None:
super().__init__()
hidden_size = config.n_embd
intermediate_size = config.n_inner
activation_function = config.activation_function
add_bias = config.add_bias
residual_dropout = config.resid_pdrop
self.c_fc = nn.Linear(
hidden_size,
2 * intermediate_size if is_glu(activation_function) else intermediate_size,
bias=add_bias,
)
self.act = get_activation_function(activation_function)
self.c_proj = nn.Linear(intermediate_size, hidden_size, bias=add_bias)
self.dropout = nn.Identity() if residual_dropout == 0 else nn.Dropout(residual_dropout)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
##################################################
# transformer layer
class GraniteBlock(nn.Module):
def __init__(
self,
config: GraniteConfig,
attention_implementation: str,
layer_idx: Optional[int] = None,
) -> None:
super().__init__()
hidden_size = config.hidden_size
self.inner_dim = config.n_inner
self.layer_idx = layer_idx
self.ln_1 = get_normalization_function(
config.normalization_function,
hidden_size,
eps=config.layer_norm_epsilon,
)
self.attn = get_attention_module(config, True, attention_implementation, layer_idx)
self.ln_2 = get_normalization_function(
config.normalization_function,
hidden_size,
eps=config.layer_norm_epsilon,
)
self.mlp = GraniteMLP(config)
def forward(
self,
hidden_states: torch.Tensor,
past_key_values: Optional[DynamicCache] = None,
attention_mask: Optional[torch.Tensor] = None,
rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_output = self.attn(
hidden_states,
past_key_values=past_key_values,
attention_mask=attention_mask,
rope_cos_sin=rope_cos_sin,
)
# residual connection
hidden_states = attn_output + residual
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
# residual connection
hidden_states = residual + feed_forward_hidden_states
return hidden_states
##################################################
# model classes
class GranitePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GraniteConfig
base_model_prefix = "transformer"
causal = True
_no_split_modules = ["GraniteBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_sdpa = True
_supports_flash_attn_2 = True
def __init__(self, config: GraniteConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.attention_implementation = self.config._attn_implementation
self._use_eager_attention = self.attention_implementation == "eager"
self._use_sdpa = self.attention_implementation == "sdpa"
self._use_flash_attention_2 = self.attention_implementation == "flash_attention_2"
self.initializer_range = config.initializer_range
def _init_weights(self, module: nn.Module) -> None:
if isinstance(module, (nn.LayerNorm, RMSNorm, Alibi, RoPE)):
module.reset_parameters()
elif isinstance(module, nn.Linear):
nn.init.normal_(module.weight, mean=0, std=self.initializer_range)
if module.bias is not None:
                module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, mean=0, std=self.initializer_range)
if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
class GraniteModel(GranitePreTrainedModel):
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
mask_value = None
def __init__(self, config: GraniteConfig, **kwargs) -> None:
super().__init__(config, **kwargs)
self.attention_head_type = AttentionHeadType(config.attention_head_type)
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.num_key_value_heads = config.num_key_value_heads
assert (
self.embed_dim % self.num_heads == 0
), f"`embed_dim` ({self.embed_dim}) must be divisible by `num_heads` ({self.num_heads})"
self.head_dim = self.embed_dim // self.num_heads
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.drop = nn.Identity() if config.embd_pdrop == 0 else nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList(
[GraniteBlock(config, self.attention_implementation, layer_idx=i) for i in range(config.num_hidden_layers)]
)
self.ln_f = get_normalization_function(
config.normalization_function,
self.embed_dim,
eps=config.layer_norm_epsilon,
)
self.position_embedding_type = PositionEmbeddingType(config.position_embedding_type)
if self.position_embedding_type == PositionEmbeddingType.learned_absolute:
self.wpe = nn.Embedding(config.n_positions, self.embed_dim)
elif self.position_embedding_type == PositionEmbeddingType.alibi:
assert not self._use_flash_attention_2, "alibi is not implemented with FlashAttention"
self.alibi = Alibi(self.num_heads)
elif self.position_embedding_type == PositionEmbeddingType.rope:
self.rope = RoPE(self.head_dim, max_position_embeddings=config.n_positions, base=config.rope_theta)
else:
raise NotImplementedError()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Embedding:
return self.wte
def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
self.wte = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[DynamicCache] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
(
output_hidden_states,
use_cache,
return_dict,
input_shape,
hidden_states,
attention_mask,
position_ids,
rope_cos_sin,
past_key_values,
) = self._prepare_a_bunch_of_stuff(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# ==========================================================================================
# flash:
# attention_mask -> (batch_size, key_length)
# else:
# attention_mask -> (batch_size, 1, query_length, key_length)
# ==========================================================================================
output_shape = input_shape + (hidden_states.size(-1),)
past_key_values = DynamicCache() if use_cache and past_key_values is None else past_key_values
all_hidden_states = () if output_hidden_states else None
for block in self.h:
if output_hidden_states:
all_hidden_states += (hidden_states,)
hidden_states = block(
hidden_states,
past_key_values=past_key_values,
attention_mask=attention_mask,
rope_cos_sin=rope_cos_sin,
)
hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states += (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, past_key_values, all_hidden_states] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=past_key_values,
hidden_states=all_hidden_states,
)
def _get_position_ids(
self, attention_mask: torch.Tensor, past_length: int, query_length: int, key_length: int, device: torch.device
) -> torch.Tensor:
if attention_mask is not None and len(attention_mask.shape) == 2:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 0)
if past_length > 0:
                position_ids = position_ids[:, past_length:key_length]
else:
position_ids = torch.arange(past_length, key_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).view(-1, query_length)
return position_ids
def _get_alibi_bias(
self,
attention_mask: torch.Tensor,
batch_size: int,
query_length: int,
key_length: int,
device: torch.device,
dtype: torch.dtype,
) -> torch.Tensor:
if self.position_embedding_type != PositionEmbeddingType.alibi:
return None
alibi_bias = self.alibi(attention_mask, batch_size, key_length, device, dtype)
# ==========================================================================================
# alibi_bias -> (batch_size, num_heads, key_length)
# ==========================================================================================
alibi_bias = alibi_bias.unsqueeze(2)
if query_length != 1:
alibi_bias = alibi_bias.expand(-1, -1, query_length, -1)
# ==========================================================================================
# alibi_bias -> (batch_size, num_heads, query_length, key_length)
# ==========================================================================================
return alibi_bias
def _get_rope_cos_sin(
self, key_length: int, position_ids: torch.Tensor, dtype: torch.dtype, device: torch.device
) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
if self.position_embedding_type == PositionEmbeddingType.rope:
cos, sin = self.rope(key_length, dtype=dtype, device=device)
cos = cos[position_ids].unsqueeze(1)
sin = sin[position_ids].unsqueeze(1)
return cos, sin
def _prepare_causal_attention_mask(
self, attention_mask: torch.Tensor, batch_size: int, query_length: int, key_length: int, device: torch.device
) -> torch.Tensor:
past_length = key_length - query_length
# ==========================================================================================
# attention_mask -> (batch_size, key_length)
# ==========================================================================================
if query_length > 1:
# (query_length, key_length)
causal_mask = torch.empty((query_length, key_length), dtype=torch.bool, device=device)
causal_mask[:, past_length:] = torch.tril(
torch.ones(query_length, query_length, dtype=torch.bool, device=device)
)
if past_length > 0:
causal_mask[:, :past_length] = True
# (query_length, key_length) -> (1, query_length, key_length)
causal_mask = causal_mask.unsqueeze(0)
if attention_mask is None:
# (1, query_length, key_length) -> (batch_size, query_length, key_length)
causal_mask = causal_mask.expand(batch_size, -1, -1)
else:
# (1, query_length, key_length) & (batch_size, 1, key_length) -> (batch_size, query_length, key_length)
causal_mask = causal_mask & attention_mask.unsqueeze(1).to(torch.bool)
else:
if attention_mask is None:
# (batch_size, query_length, key_length)
causal_mask = torch.ones(batch_size, query_length, key_length, dtype=torch.bool, device=device)
else:
# (batch_size, query_length, key_length)
causal_mask = attention_mask.unsqueeze(1).to(dtype=torch.bool, device=device)
# ==========================================================================================
# attention_mask -> (batch_size, query_length, key_length)
# ==========================================================================================
causal_mask = causal_mask.unsqueeze(1)
# ==========================================================================================
# attention_mask -> (batch_size, 1, query_length, key_length)
# ==========================================================================================
return causal_mask
def _get_initial_hidden_state(
self,
input_ids: torch.Tensor,
inputs_embeds: torch.Tensor,
position_ids: torch.Tensor,
token_type_ids: torch.Tensor,
) -> torch.Tensor:
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)
if self.position_embedding_type == PositionEmbeddingType.learned_absolute:
inputs_embeds = inputs_embeds + self.wpe(position_ids)
if token_type_ids is not None:
inputs_embeds = inputs_embeds + self.wte(token_type_ids)
inputs_embeds = self.drop(inputs_embeds)
return inputs_embeds
def _prepare_a_bunch_of_stuff(
self,
input_ids: torch.Tensor,
past_key_values: DynamicCache,
attention_mask: torch.Tensor,
token_type_ids: torch.Tensor,
position_ids: torch.Tensor,
inputs_embeds: torch.Tensor,
use_cache: bool,
output_hidden_states: bool,
return_dict: bool,
) -> Tuple[
bool,
bool,
bool,
torch.Size,
torch.Tensor,
torch.Tensor,
torch.Tensor,
Optional[Tuple[torch.Tensor, torch.Tensor]],
DynamicCache,
]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = self.config.use_cache if use_cache is None else use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
elif inputs_embeds is not None:
# TODO special handling for padding free transformer needed here if we support inputs_embeds argument
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
batch_size = input_shape[0]
device = input_ids.device if input_ids is not None else inputs_embeds.device
if self.position_embedding_type == PositionEmbeddingType.alibi:
if position_ids is not None:
warnings.warn("`position_ids` have no functionality with Alibi.", FutureWarning)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, input_shape[-1])
# ==========================================================================================
# input_ids -> (batch_size, query_length)
# attention_mask -> None or (batch_size, key_length)
# position_ids -> None or (batch_size, key_length)
# ==========================================================================================
past_length = 0 if past_key_values is None else past_key_values.get_seq_length()
query_length = input_shape[-1]
key_length = past_length + query_length
if position_ids is None:
position_ids = self._get_position_ids(attention_mask, past_length, query_length, key_length, device)
# ==========================================================================================
# input_ids -> (batch_size, query_length)
# attention_mask -> None or (batch_size, key_length)
# position_ids -> (batch_size, query_length)
# ==========================================================================================
hidden_states = self._get_initial_hidden_state(input_ids, inputs_embeds, position_ids, token_type_ids)
# ==========================================================================================
# hidden_states -> (batch_size, query_length, num_heads * head_dim)
# ==========================================================================================
alibi_bias = self._get_alibi_bias(
attention_mask, batch_size, query_length, key_length, device, hidden_states.dtype
)
# ==========================================================================================
# alibi_bias -> (batch_size, num_heads, query_length, key_length)
# ==========================================================================================
rope_cos_sin = self._get_rope_cos_sin(
key_length, position_ids, dtype=hidden_states.dtype, device=hidden_states.device
)
# ==========================================================================================
# rope_cos_sin -> 2 * (key_length, head_dim)
# ==========================================================================================
# prepare causal mask only if not using flash attention
if self._use_flash_attention_2:
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
elif self._use_sdpa:
# we use the causal/non-causal argument of SDPA for attention in this case
if attention_mask is not None:
attention_mask = self._prepare_causal_attention_mask(
attention_mask, batch_size, query_length, key_length, device
)
attention_mask = torch.where(
attention_mask,
~attention_mask if alibi_bias is None else alibi_bias,
self._get_mask_value(attention_mask.device, hidden_states.dtype),
)
else:
attention_mask = self._prepare_causal_attention_mask(
attention_mask, batch_size, query_length, key_length, device
)
attention_mask = torch.where(
attention_mask,
~attention_mask if alibi_bias is None else alibi_bias,
self._get_mask_value(attention_mask.device, hidden_states.dtype),
)
return (
output_hidden_states,
use_cache,
return_dict,
input_shape,
hidden_states,
attention_mask,
position_ids,
rope_cos_sin,
past_key_values,
)
def _get_mask_value(self, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
# torch.where expects a tensor. We use a cache to avoid recreating it every time.
if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
self.mask_value = torch.full([], torch.finfo(torch.float16).min, dtype=dtype, device=device)
return self.mask_value
class GraniteForCausalLM(GranitePreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
def __init__(self, config: GraniteConfig, **kwargs) -> None:
super().__init__(config, **kwargs)
self.transformer = GraniteModel(config, **kwargs)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Embedding:
return self.transformer.wte
def set_input_embeddings(self, value: nn.Embedding) -> None:
self.transformer.wte = value
def get_output_embeddings(self) -> nn.Linear:
return self.lm_head
def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
self.lm_head = new_embeddings
# FIXME typing
def prepare_inputs_for_generation(
self,
input_ids: torch.Tensor,
past_key_values: Optional[DynamicCache] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
token_type_ids = kwargs.get("token_type_ids", None)
# Omit tokens covered by past_key_values
if past_key_values:
past_length = past_key_values.get_seq_length()
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to old behavior: keep only final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 0)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
def forward(
self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[DynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# ==========================================================================================
# input_ids -> (batch_size, query_length)
# attention_mask -> None or (batch_size, key_length)
# position_ids -> None or (batch_size, key_length)
# ==========================================================================================
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
# Shift so that tokens < n predict n
if labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
# Flatten the tokens
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
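# Editor's illustrative usage sketch (not part of the original modeling code; the Hub repo id
# below is an assumption, adjust to the checkpoint you are actually using). Loading through
# the auto classes with trust_remote_code=True routes to the GraniteForCausalLM defined above.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "ibm-granite/granite-3b-code-base-2k"  # assumed repo id
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))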