# extended-mind-mpt-7b-chat / modeling_mpt.py
# Adapted from https://github.com/mosaicml/llm-foundry
# Classes changed: MPTModel, MPTForCausalLM
# SPDX-License-Identifier: Apache-2.0
"""A simple, flexible implementation of a GPT model.
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.linalg import vector_norm
import faiss
from einops import rearrange
from composer.utils import dist
from omegaconf import DictConfig
from transformers import (PreTrainedModel, PreTrainedTokenizer,
PreTrainedTokenizerFast)
from transformers.modeling_outputs import (BaseModelOutputWithPast,
CausalLMOutputWithPast)
from llmfoundry.models.layers.custom_embedding import SharedEmbedding
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
from llmfoundry.models.utils.param_init_fns import MODEL_INIT_REGISTRY
from extended_mind_transformers.mpt.configuration import ExtendedMPTConfig
from extended_mind_transformers.mpt.attention import attn_bias_shape, build_attn_bias
from extended_mind_transformers.mpt.blocks import MPTBlock
from extended_mind_transformers.utils import instantiate_from_config
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
class MPTPreTrainedModel(PreTrainedModel):
config_class = ExtendedMPTConfig
base_model_prefix = 'model'
_no_split_modules = ['MPTBlock']
class ExtendedMPTModel(MPTPreTrainedModel):
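    """MPT decoder stack extended for "active externalism": in addition to the
    standard MPT machinery, it builds a separate attention bias for retrieved
    external memories (`attn_bias_ae`) and threads per-layer long-range
    key/value pairs (or faiss indexes) through each block.
    """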
def __init__(self, config: ExtendedMPTConfig):
config._validate_config()
super().__init__(config)
self.attn_impl = config.attn_config['attn_impl']
self.prefix_lm = config.attn_config['prefix_lm']
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
self.alibi = config.attn_config['alibi']
self.alibi_bias_max = config.attn_config['alibi_bias_max']
self.mask_by_sim = config.attn_config['mask_by_sim']
self.sim_threshold = config.attn_config['sim_threshold']
self.topk = config.attn_config['topk']
self.use_active_externalism = config.attn_config['use_active_externalism']
self.use_active_externalism_by_layer = config.use_active_externalism_by_layer
if config.init_device == 'mixed':
if dist.get_local_rank() == 0:
config.init_device = 'cpu'
else:
config.init_device = 'meta'
if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
raise NotImplementedError(
f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).'
)
norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
# CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414)
# both report this helping with stabilizing training
self.embedding_fraction = config.embedding_fraction
self.wte = SharedEmbedding(config.vocab_size,
config.d_model,
device=config.init_device)
if not self.alibi:
self.wpe = torch.nn.Embedding(config.max_seq_len,
config.d_model,
device=config.init_device)
self.emb_drop = nn.Dropout(config.emb_pdrop)
self.blocks = nn.ModuleList([
MPTBlock(
device=config.init_device,
**config.to_dict(),
) for _ in range(config.n_layers)
])
self.norm_f = norm_class(config.d_model, device=config.init_device)
if config.init_device != 'meta':
print(
f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
)
self.apply(self.param_init_fn)
self.is_causal = not self.prefix_lm
# define attn mask
self._attn_bias_initialized = False
self.attn_bias = None
self.attn_bias_shape = attn_bias_shape(
self.attn_impl,
config.n_heads,
config.max_seq_len,
self.alibi,
prefix_lm=self.prefix_lm,
causal=self.is_causal,
use_sequence_id=self.attn_uses_sequence_id,
)
self._attn_bias_ae_initialized = False #for active externalism
self.attn_bias_ae = None
if self.config.no_bias:
for module in self.modules():
if hasattr(module, 'bias') and isinstance(
module.bias, nn.Parameter):
if self.config.verbose:
warnings.warn(
f'Removing bias ({module.bias}) from {module}.')
module.register_parameter('bias', None)
# Print verbose info
if config.verbose and config.verbose > 2:
print(self)
if 'verbose' not in self.config.init_config:
self.config.init_config['verbose'] = self.config.verbose
if self.config.init_config['verbose'] > 1:
init_fn_name = self.config.init_config['name']
warnings.warn(f'Using {init_fn_name} initialization.')
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, value: nn.Embedding):
self.wte = value
@torch.no_grad()
def _attn_bias(
self,
device,
dtype,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
sequence_id: Optional[torch.LongTensor] = None,
seq_len: Optional[int] = None,
        use_active_externalism: Optional[bool] = None,
        topk: Optional[int] = None,
):
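        """Build and cache the attention bias tensors.

        The standard bias (causal/ALiBi) is built once and reused; the active
        externalism bias is rebuilt whenever it is requested, since it depends
        on the current `seq_len` and `topk`. Returns a tuple of
        (attn_bias, attn_bias_ae, attention_mask), where the attention mask is
        folded into `attn_bias` (and returned as None) for the torch/triton
        implementations.
        """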
if not self._attn_bias_initialized:
if self.attn_bias_shape:
self.attn_bias = torch.zeros(self.attn_bias_shape,
device=device,
dtype=dtype)
self.attn_bias = build_attn_bias(
self.attn_impl,
self.config.n_heads,
self.config.max_seq_len,
device=device,
dtype=dtype,
attn_bias = self.attn_bias,
causal=self.is_causal,
alibi=self.alibi,
alibi_bias_max=self.alibi_bias_max
)
self._attn_bias_initialized = True
if use_active_externalism: #for active externalism, init every time since seq_len changes
self.attn_bias_ae = build_attn_bias(
self.attn_impl,
self.config.n_heads,
seq_len,
device=device,
dtype=dtype,
causal=self.is_causal,
alibi=self.alibi,
alibi_bias_max=self.alibi_bias_max,
for_ae=use_active_externalism,
topk=topk
)
self._attn_bias_ae_initialized = True
# flash does not support prefix_lm and will incorporate any
# attention_mask inside the attention module
        if self.attn_impl == 'flash':
            # keep the 3-tuple contract expected by forward()
            return self.attn_bias, None, attention_mask
        if self.attn_bias is not None:
            # .to(*args, **kwargs) is a no-op if the tensor is already on the
            # specified device and of the specified dtype
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
        attn_bias = self.attn_bias
        attn_bias_ae = None  # only set when active externalism is in use
        if self.attn_bias_ae is not None:  # for active externalism
            self.attn_bias_ae = self.attn_bias_ae.to(dtype=dtype, device=device)
            attn_bias_ae = self.attn_bias_ae
# If using torch or triton, we incorporate the prefix_mask (if appropriate)
if self.prefix_lm:
assert isinstance(attn_bias, torch.Tensor) # pyright
assert isinstance(prefix_mask, torch.Tensor) # pyright
attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
# If using torch or triton, we incorporate sequence_id (if appropriate)
if self.attn_uses_sequence_id and sequence_id is not None:
assert isinstance(attn_bias, torch.Tensor) # pyright
attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
# If using torch or triton, we incorporate attention_mask. This will output
# None in place of attention_mask since it will not be further needed in the
# attention modules.
if attention_mask is not None:
s_k = attention_mask.shape[-1]
if attn_bias is None:
attn_bias = torch.zeros((1, 1, 1, s_k),
device=device,
dtype=dtype)
else:
# clamp to 0 necessary for torch 2.0 compile()
_s_k = max(0, attn_bias.size(-1) - s_k)
attn_bias = attn_bias[:, :, :, _s_k:]
if prefix_mask is not None and (attention_mask.shape !=
prefix_mask.shape):
raise ValueError(
f'attention_mask shape={attention_mask.shape} ' +
f'and prefix_mask shape={prefix_mask.shape} are not equal.')
min_val = torch.finfo(attn_bias.dtype).min
attn_bias = attn_bias.masked_fill(
~attention_mask.view(-1, 1, 1, s_k), min_val)
return attn_bias, attn_bias_ae, None
def _apply_prefix_mask(self, attn_bias: torch.Tensor,
prefix_mask: torch.Tensor):
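        """Mask `attn_bias` so that prefix tokens attend bidirectionally while
        all remaining tokens attend causally."""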
s_k, s_q = attn_bias.shape[-2:]
if (s_k != self.config.max_seq_len) or (s_q != self.config.max_seq_len):
raise ValueError(
'attn_bias does not match the expected shape. ' +
                f'The last two dimensions should both be {self.config.max_seq_len} '
+ f'but are {s_k} and {s_q}.')
seq_len = prefix_mask.shape[-1]
if seq_len > self.config.max_seq_len:
raise ValueError(
f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
)
# select seq_len subset of attn mask
attn_bias = attn_bias[..., :seq_len, :seq_len]
        # Mix the causal mask and the bidirectional mask to get the full
# allowable attention (i.e. full = not accounting for padding yet)
causal = torch.tril(
torch.ones((seq_len, seq_len),
dtype=torch.bool,
device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
prefix = prefix_mask.view(-1, 1, 1, seq_len)
cannot_attend = ~torch.logical_or(causal, prefix.bool())
min_val = torch.finfo(attn_bias.dtype).min
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
return attn_bias
def _apply_sequence_id(self, attn_bias: torch.Tensor,
sequence_id: torch.LongTensor):
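        """Mask `attn_bias` so tokens only attend to positions that share the
        same `sequence_id` (used when multiple sequences are packed together)."""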
seq_len = sequence_id.shape[-1]
if seq_len > self.config.max_seq_len:
raise ValueError(
f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
)
# select seq_len subset of attn mask
attn_bias = attn_bias[..., :seq_len, :seq_len]
# Restrict attention to tokens that share the same value
# in sequence_id
cannot_attend = torch.logical_not(
torch.eq(
sequence_id.view(-1, seq_len, 1),
sequence_id.view(-1, 1, seq_len),
)).unsqueeze(1)
min_val = torch.finfo(attn_bias.dtype).min
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
return attn_bias
def forward(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
sequence_id: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
use_cache: Optional[bool] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_active_externalism:Optional[bool]=None,
long_range_past_key_values:Optional[List[Tuple[torch.FloatTensor]]] = None,
        faiss_indexes: Optional[Tuple] = None,
        topk: Optional[int] = None,
):
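        """Run the decoder stack.

        In addition to the standard MPT arguments, `long_range_past_key_values`
        or `faiss_indexes` provide external memories, `use_active_externalism`
        toggles retrieval over them, and `topk` sets how many memory tokens
        each query may attend to.
        """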
return_dict = (return_dict
if return_dict is not None else self.config.return_dict)
use_cache = (use_cache
if use_cache is not None else self.config.use_cache)
use_active_externalism = (use_active_externalism
if use_active_externalism is not None else self.use_active_externalism)
topk = (topk if topk is not None else self.topk)
if attention_mask is not None:
attention_mask = attention_mask.bool()
if prefix_mask is not None:
prefix_mask = prefix_mask.bool()
# These args are passed in by keyword in huggingface's generate function
# https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
# but have not yet been fully implemented in MPTModel
if not return_dict:
raise NotImplementedError(
'return_dict False is not implemented yet for MPT')
if output_attentions:
if self.attn_impl != 'torch':
raise NotImplementedError(
'output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.'
)
if (attention_mask is not None and
attention_mask[:, 0].sum() != attention_mask.shape[0] and
self.training):
raise NotImplementedError(
'MPT does not support training with left padding.')
if self.prefix_lm and prefix_mask is None:
raise ValueError(
'prefix_mask is a required argument when MPT is configured with prefix_lm=True.'
)
# Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT.')
if self.training:
if self.attn_uses_sequence_id and sequence_id is None:
raise ValueError(
'sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True '
+ 'and the model is in train mode.')
elif (self.attn_uses_sequence_id is False) and (sequence_id
is not None):
warnings.warn(
'MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. '
+
'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
)
S = input_ids.size(1)
assert (
S <= self.config.max_seq_len
), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
tok_emb = self.wte(input_ids) # type: ignore
if self.alibi:
x = tok_emb
else:
past_position = 0
if past_key_values is not None:
if len(past_key_values) != self.config.n_layers:
raise ValueError(
f'past_key_values must provide a past_key_value for each attention '
+
f'layer in the network ({len(past_key_values)=}; {self.config.n_layers=}).'
)
# For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim).
# For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq).
# Here we shift position embedding using the `seq` dim of the past key
past_position = past_key_values[0][0].size(1)
if self.attn_impl == 'torch':
past_position = past_key_values[0][0].size(3)
if S + past_position > self.config.max_seq_len:
raise ValueError(
f'Cannot forward input with past sequence length {past_position} and current sequence length '
                    f'{S}, this model only supports total sequence length <= {self.config.max_seq_len}.'
)
pos = torch.arange(
past_position,
S + past_position,
dtype=torch.long,
device=input_ids.device,
).unsqueeze(0)
if attention_mask is not None:
# adjust the position indices to account for padding tokens
pos = torch.clamp(
pos - torch.cumsum((~attention_mask).to(torch.int32),
dim=1)[:, past_position:],
min=0,
)
pos_emb = self.wpe(pos) # type: ignore
x = tok_emb + pos_emb
if self.embedding_fraction == 1:
x = self.emb_drop(x) # type: ignore
else:
# this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
x_shrunk = (x * self.embedding_fraction) + (
x.detach() * (1 - self.embedding_fraction))
assert isinstance(self.emb_drop, nn.Module) # pyright
x = self.emb_drop(x_shrunk)
seq_len = S #for active externalism
if past_key_values is not None:
past_position = past_key_values[0][0].size(-1)
seq_len += past_position
attn_bias, attn_bias_ae, attention_mask = self._attn_bias(
device=x.device,
dtype=torch.float32,
attention_mask=attention_mask,
prefix_mask=prefix_mask,
sequence_id=sequence_id,
seq_len = seq_len,
use_active_externalism=use_active_externalism,
topk=topk
)
# initialize the past key values cache if it should be used
if use_cache and past_key_values is None:
past_key_values = [() for _ in range(self.config.n_layers)
] # type: ignore
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_idx = () if output_attentions else None
for b_idx, block in enumerate(self.blocks): # type: ignore
if output_hidden_states:
assert all_hidden_states is not None # pyright
all_hidden_states = all_hidden_states + (x,)
past_key_value = (past_key_values[b_idx]
if past_key_values is not None else None)
long_range_past_key_value = (long_range_past_key_values[b_idx]
if (long_range_past_key_values is not None and self.use_active_externalism_by_layer[b_idx] and use_active_externalism is True) else None)
if long_range_past_key_value is not None and faiss_indexes is not None:
raise NotImplementedError(
'Using faiss and passing key value pairs manually are mutually exclusive right now.')
x, attn_weights, past_key_value, reshaped_idx = block(
x,
past_key_value=past_key_value,
long_range_past_key_value=long_range_past_key_value,
attn_bias=attn_bias,
attention_mask=attention_mask,
attn_bias_ae=attn_bias_ae,
is_causal=self.is_causal,
topk=topk,
needs_weights=output_attentions,
faiss_indexes=faiss_indexes,
n_layers=self.config.n_layers,
current_layer=b_idx,
mask_by_sim=self.mask_by_sim,
sim_threshold=self.sim_threshold,
)
if past_key_values is not None:
past_key_values[b_idx] = past_key_value
if output_attentions:
assert all_self_attns is not None # pyright
all_self_attns = all_self_attns + (attn_weights,)
assert all_idx is not None
all_idx = all_idx + (reshaped_idx,)
x = self.norm_f(x) # type: ignore
# add hidden states from the last decoder layer
if output_hidden_states:
assert all_hidden_states is not None # pyright
all_hidden_states = all_hidden_states + (x,)
return BaseModelOutputWithPast(
last_hidden_state=x,
past_key_values=past_key_values,
hidden_states=all_hidden_states,
attentions=(all_self_attns, all_idx), #return reshaped_idx for active externalism
)
# Param Initialization, needed for device='meta' fast initialization
def param_init_fn(self, module):
init_fn_name = self.config.init_config['name']
MODEL_INIT_REGISTRY[init_fn_name](
module=module,
n_layers=self.config.n_layers,
d_model=self.config.d_model,
**self.config.init_config,
)
# FSDP Wrap function
def fsdp_wrap_fn(self, module):
return isinstance(module, MPTBlock)
# Activation Checkpointing
def activation_checkpointing_fn(self, module):
return isinstance(module, MPTBlock)
class ExtendedMPTForCausalLM(MPTPreTrainedModel):
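    """MPT causal language model with external memories ("extended mind").

    Memories are generated lazily from `external_memories` token ids on the
    first forward call and stored either as raw key/value tensors ('manual')
    or in faiss inner-product indexes ('faiss'), per `memory_type`.
    """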
def __init__(self, config:ExtendedMPTConfig, external_memories=None):
if isinstance(config, DictConfig):
config = instantiate_from_config(config)
super().__init__(config)
if not config.tie_word_embeddings:
raise ValueError(
'MPTForCausalLM only supports tied word embeddings')
print(f'Instantiating an MPTForCausalLM model from {__file__}')
self.transformer: ExtendedMPTModel = ExtendedMPTModel(config)
self.use_active_externalism = config.attn_config['use_active_externalism']
self.memory_type = config.attn_config['memory_type']
self._memories = None
self.memory_device = config.memory_device
for child in self.transformer.children():
if isinstance(child, torch.nn.ModuleList):
continue
if isinstance(child, torch.nn.Module):
child._fsdp_wrap = True
# enables scaling output logits; similar to a softmax "temperature"
# PaLM paper uses scale 1/sqrt(config.d_model)
self.logit_scale = None
if config.logit_scale is not None:
logit_scale = config.logit_scale
if isinstance(logit_scale, str):
if logit_scale == 'inv_sqrt_d_model':
logit_scale = 1 / math.sqrt(config.d_model)
else:
raise ValueError(
f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
)
self.logit_scale = logit_scale
if external_memories is not None:
self._memories = external_memories
self.memories = None
def set_memories(self, memories):
self.memories = memories
def empty_memories(self):
self.memories = None
def get_input_embeddings(self):
return self.transformer.wte
def set_input_embeddings(self, value):
self.transformer.wte = value
def get_output_embeddings(self):
return self.transformer.wte
def set_output_embeddings(self, new_embeddings):
self.transformer.wte = new_embeddings
def set_decoder(self, decoder):
self.transformer = decoder
def get_decoder(self):
return self.transformer
def forward(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
sequence_id: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
use_cache: Optional[bool] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
        use_active_externalism: Optional[bool] = None,
        topk: Optional[int] = None
):
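        """Compute logits (and a shifted cross-entropy loss when `labels` are
        given), routing any cached external memories to the transformer as
        either `long_range_past_key_values` or `faiss_indexes`."""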
if self._memories is not None and self.memories is None: #init memories once on first call
self.memories = self.generate_cache(self._memories, cache_type=self.memory_type)
return_dict = (return_dict
if return_dict is not None else self.config.return_dict)
use_cache = (use_cache
if use_cache is not None else self.config.use_cache)
use_active_externalism = (use_active_externalism
if use_active_externalism is not None else self.use_active_externalism)
        # topk is resolved inside ExtendedMPTModel.forward (falls back to the config value)
# if input_embeds is not none, raise a not implemented error
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds has to be None (for hf/peft support).')
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
if hasattr(self, "memories") and type(self.memories)==list:
long_range_past_key_values = self.memories
faiss_indexes = None
elif hasattr(self, "memories"):
long_range_past_key_values = None
faiss_indexes = self.memories
else:
long_range_past_key_values = None
faiss_indexes = None
outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
long_range_past_key_values=long_range_past_key_values,
faiss_indexes=faiss_indexes,
attention_mask=attention_mask,
prefix_mask=prefix_mask,
sequence_id=sequence_id,
return_dict=return_dict,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
use_cache=use_cache,
use_active_externalism=use_active_externalism,
topk=topk
)
# move outputs to same device as weights for token embedding
# needed to support HF `device_map`
logits = self.transformer.wte(
outputs.last_hidden_state.to(self.transformer.wte.weight.device),
True,
)
if self.logit_scale is not None:
if self.logit_scale == 0:
warnings.warn(
f'Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.'
)
logits *= self.logit_scale
loss = None
if labels is not None:
_labels = torch.roll(labels, shifts=-1)
_labels[:, -1] = -100
loss = F.cross_entropy(
logits.view(-1, logits.size(-1)),
_labels.to(logits.device).view(-1),
)
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Param Initialization, needed for device='meta' fast initialization
def param_init_fn(self, module):
init_fn_name = self.config.init_config['name']
MODEL_INIT_REGISTRY[init_fn_name](
module=module,
n_layers=self.config.n_layers,
d_model=self.config.d_model,
**self.config.init_config,
)
# FSDP Wrap function
def fsdp_wrap_fn(self, module):
return isinstance(module, MPTBlock)
# Activation Checkpointing
def activation_checkpointing_fn(self, module):
return isinstance(module, MPTBlock)
def generate_cache(self,
input_ids:torch.LongTensor,
stride:int=512,
max_len:int=2048,
cache_type:str='manual'):
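        """Encode the external memories in strided windows of at most `max_len`
        tokens, keeping only the `trg_len` new positions of each window's
        key/value pairs, and accumulate them into a cache of type
        `cache_type` ('manual' tensors or 'faiss' indexes)."""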
if cache_type not in ['manual', 'faiss']:
raise NotImplementedError(f"Cache type {cache_type} not implemented.")
        prev_end_loc = 0
        long_range_past_key_values = None
        faiss_indexes = None
        for b_idx in range(0, input_ids.size(-1), stride):  # generate kv-pairs using a sliding stride
end_loc = min(b_idx + max_len, input_ids.size(-1))
trg_len = end_loc - prev_end_loc
subseq = input_ids[:, b_idx:end_loc].to(self.device)
with torch.no_grad():
outputs = self.transformer(subseq, use_cache=True, use_active_externalism=False)
            to_cache = [(
                kv[0][:, :, :, -trg_len:],   # keys: (batch, heads, head_dim, new_seq)
                kv[1][:, :, -trg_len:])      # values: (batch, heads, new_seq, head_dim)
                for kv in outputs.past_key_values
            ]
long_range_past_key_values, faiss_indexes = self.cache(to_cache, cache_type, long_range_past_key_values=long_range_past_key_values, faiss_indexes=faiss_indexes)
prev_end_loc = end_loc
if end_loc == input_ids.size(-1):
break
if long_range_past_key_values is not None:
return long_range_past_key_values
else:
return faiss_indexes
def cache(self,
to_cache:List,
cache_type:str='manual',
              long_range_past_key_values: Optional[List] = None,
              faiss_indexes: Optional[Tuple] = None,
max_length_cache=100000,
verbose=False):
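        """Append newly generated key/value pairs to the memory cache.

        For 'faiss', keys are L2-normalized and tagged with a scaled one-hot
        layer/head encoding before being added to the inner-product indexes;
        for 'manual', tensors are concatenated along the sequence dimension
        and truncated to the most recent `max_length_cache` positions."""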
if long_range_past_key_values is not None and faiss_indexes is not None:
raise NotImplementedError("Using faiss and passing key value pairs manually are mutually exclusive right now.")
        if cache_type == 'faiss':  # add one-hot encoding to match layer, head indices
            one_hot_encodings = F.one_hot(
                torch.arange(0, self.config.n_heads * self.config.n_layers)) * 10
            if faiss_indexes is None:
                faiss_indexes = (
                    faiss.IndexFlatIP(to_cache[0][0].size(-2) + one_hot_encodings.size(-1)),
                    faiss.IndexFlatIP(to_cache[0][1].size(-1) * 2))
            kn_index, kv_index = faiss_indexes
            for b_idx, (k, v) in enumerate(to_cache):
                # normalize keys so inner-product search behaves like cosine similarity
                k_n = (k / vector_norm(k, ord=2, dim=-2, keepdim=True)).to('cpu')
                # flatten heads into rows and append the scaled one-hot layer/head tag
                # so retrieval favors keys from the matching layer and head
                k_n = torch.concat([
                    rearrange(k_n, 'b h d s -> b (h s) d', h=self.config.n_heads),
                    one_hot_encodings[self.config.n_heads * b_idx:self.config.n_heads * (b_idx + 1)]
                    .unsqueeze(0).repeat_interleave(repeats=k.size(-1), dim=-2)
                ], dim=-1)
                kn_index.add(k_n.squeeze().numpy())
                # store values (and keys) row-aligned with the key index for lookup by id
                k = rearrange(k, 'b h d s -> b (h s) d', h=self.config.n_heads)
                v = rearrange(v, 'b h s d -> b (h s) d', h=self.config.n_heads)
                kv_index.add(torch.concat([v.squeeze(), k.squeeze()], dim=1).to('cpu').numpy())
else:
if long_range_past_key_values is None:
long_range_past_key_values = [(k.to(self.memory_device),v.to(self.memory_device)) for k,v in to_cache]
else:
long_range_past_key_values = [
(
torch.concat([kv[0], to_cache[ind][0].to(self.memory_device)], dim=3),
torch.concat([kv[1], to_cache[ind][1].to(self.memory_device)], dim=2)
)
for ind, kv in enumerate(long_range_past_key_values)
]
if long_range_past_key_values is not None: #set a limit on manual memory length
if long_range_past_key_values[0][0].size(-1) > max_length_cache:
long_range_past_key_values = [
(
kv[0][:, :, :, -max_length_cache:],
kv[1][:, :, -max_length_cache:]
)
for kv in long_range_past_key_values]
if verbose:
if cache_type == 'faiss':
print(f"{kn_index.ntotal} keys in faiss index")
else:
print(f"{long_range_past_key_values[0][0].size(-1)} cached kvs")
return long_range_past_key_values, (kn_index, kv_index) if cache_type == 'faiss' else None
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
inputs_embeds=None,
**kwargs,
):
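        """Assemble model inputs for HuggingFace `generate`: keep only the last
        token once a kv-cache exists, and forward the active-externalism
        arguments (`use_active_externalism`, `topk`) to `forward`."""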
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT yet')
attention_mask = kwargs['attention_mask'].bool()
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
raise NotImplementedError(
'MPT does not support generation with right padding.')
if self.transformer.attn_uses_sequence_id and self.training:
sequence_id = torch.zeros_like(input_ids[:1])
else:
sequence_id = None
if past_key_values is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
if self.transformer.prefix_lm:
# Leverage a convenience of sequential generation!
prefix_mask = torch.ones_like(attention_mask)
# This requires that we're using the cache
if kwargs.get('use_cache') == False:
raise NotImplementedError(
'MPT with prefix_lm=True does not support use_cache=False.')
else:
prefix_mask = None
return {
'input_ids': input_ids,
'attention_mask': attention_mask,
'prefix_mask': prefix_mask,
'sequence_id': sequence_id,
'past_key_values': past_key_values,
'use_cache': kwargs.get('use_cache', True),
'use_active_externalism': kwargs.get('use_active_externalism'), #add a few more kwargs for active externalism
'topk': kwargs.get('topk', None),
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
"""Used by HuggingFace generate when using beam search with kv-caching.
See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
for an example in transformers.
"""
reordered_past = []
for layer_past in past_key_values:
reordered_past += [
tuple(
past_state.index_select(0, beam_idx)
for past_state in layer_past)
]
return reordered_past
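# --------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). Assumes a tokenizer
# and a checkpoint for this architecture are available; "<checkpoint-dir>"
# and the example strings below are placeholders.
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("<checkpoint-dir>")
#   memory_ids = tokenizer("<long reference document>",
#                          return_tensors="pt").input_ids
#   model = ExtendedMPTForCausalLM.from_pretrained(
#       "<checkpoint-dir>", external_memories=memory_ids)
#
#   inputs = tokenizer("Question about the reference document:",
#                      return_tensors="pt")
#   out = model.generate(**inputs, max_new_tokens=64,
#                        use_active_externalism=True, topk=2)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))
# --------------------------------------------------------------------------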