test-flex-gpt / modeling_flexbert.py
oweller2
minor
e9e8f85
raw
history blame
81.3 kB
# Copyright 2024 **AUTHORS_TODO**
# License: Apache-2.0
# RMSNorm Implementation: Copyright Meta (from their Llama RMSNorm implementation)
# License: LLAMA 2 COMMUNITY LICENSE AGREEMENT
# Copyright 2022 Jonas Geiping
# License: MIT
# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2023, Tri Dao.
"""Implements Mosaic BERT, with an eye towards the Hugging Face API.
Mosaic BERT improves performance over Hugging Face BERT through the following:
1. ALiBi. This architectural change removes positional embeddings and instead encodes positional
information through attention biases based on query-key position distance. It improves the effectiveness
of training with shorter sequence lengths by enabling extrapolation to longer sequences.
2. Gated Linear Units (GLU). This architectural change replaces the FFN component of the BERT layer
to improve overall expressiveness, providing better convergence properties.
3. Flash Attention. The MosaicBERT's self-attention layer makes use of Flash Attention, which dramatically
improves the speed of self-attention. Our implementation utilizes a bleeding edge implementation that
supports attention biases, which allows us to use Flash Attention with ALiBi.
4. Unpadding. Padding is often used to simplify batching across sequences of different lengths. Standard BERT
implementations waste computation on padded tokens. MosaicBERT internally unpads to reduce unnecessary computation
and improve speed. It does this without changing how the user interfaces with the model, thereby
preserving the simple API of standard implementations.
Currently, MosaicBERT is available for masked language modeling :class:`BertForMaskedLM` and sequence
classification :class:`BertForSequenceClassification`. We aim to expand this catalogue in future releases.
See :file:`./mosaic_bert.py` for utilities to simplify working with MosaicBERT in Composer, and for example usage
of the core Mosaic BERT classes.
"""
import logging
import os
import sys
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
# Add folder root to path to allow us to use relative imports regardless of what directory the script is run from
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
import torch
import torch.nn as nn
from einops import rearrange
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
from transformers.modeling_outputs import (
MaskedLMOutput,
ModelOutput,
CausalLMOutput,
MultipleChoiceModelOutput,
SequenceClassifierOutput,
)
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
from .bert_padding import index_put_first_axis
from .activation import get_act_fn
from .attention import (
FlexBertPaddedAttention,
FlexBertPaddedParallelAttention,
FlexBertPaddedRopeAttention,
FlexBertPaddedRopeParallelAttention,
FlexBertUnpadAttention,
FlexBertUnpadParallelAttention,
FlexBertUnpadRopeAttention,
FlexBertUnpadRopeParallelAttention,
)
from .configuration_bert import FlexBertConfig
from .embeddings import (
BertAlibiEmbeddings,
FlexBertAbsoluteEmbeddings,
FlexBertCompiledSansPositionEmbeddings,
FlexBertSansPositionEmbeddings,
get_embedding_layer,
)
from .initialization import (
ModuleType,
TileLinear,
TileMode,
init_weights,
tile_embedding,
tile_linear,
tile_norm,
)
from .layers import (
BertAlibiEncoder,
BertPooler,
BertPredictionHeadTransform,
FlexBertCompileUnpadPreNormLayer,
FlexBertPaddedEncoder,
FlexBertPaddedParallelPreNormLayer,
FlexBertPaddedPostNormLayer,
FlexBertPaddedPreNormLayer,
FlexBertUnpadEncoder,
FlexBertUnpadParallelPreNormLayer,
FlexBertUnpadPostNormLayer,
FlexBertUnpadPreNormLayer,
get_encoder_layer,
)
from .mlp import FlexBertGLU, FlexBertMLP, FlexBertParallelGLU
from .normalization import get_norm_layer
from .padding import pad_input, unpad_input
from .loss import get_loss_fn
# TODO: This is not used here, but this is so these files are copied when saving the model in ST/PyLate
from .utils import StrEnum
from .rotary import UnpaddedRotaryEmbedding
logger = logging.getLogger(__name__)
def _count_parameters(model: nn.Module, trainable: bool = True) -> int:
    """Return the number of scalar elements held by the parameters of ``model``.

    Args:
        model: Module whose ``parameters()`` are walked.
        trainable: When true, parameters with ``requires_grad`` unset are left out of the count;
            when false, every parameter is counted.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters only matter when the caller asks for the full count.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
class BertModel(BertPreTrainedModel):
    """Mosaic BERT backbone: ALiBi embeddings, an ALiBi encoder and an optional pooler.

    Args:
        config: configuration object handed to the embeddings, the encoder and the pooler.
        add_pooling_layer: when ``False`` no pooler is created and the pooled output is always ``None``.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices
            in the vocabulary.
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the
            segment indices (0 for `sentence A`, 1 for `sentence B`). Defaults to all zeros.
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with values
            in [0, 1] marking real tokens versus padding. Defaults to all ones.
        `position_ids`: optional tensor forwarded unchanged to the embedding layer.
        `output_all_encoded_layers`: boolean which controls the content of the first output as described
            below. Default: `False`.
        `masked_tokens_mask`: optional boolean tensor; when given, the encoder is asked to keep only the
            masked positions plus the first column of every row, and the outputs are restricted accordingly.

    Outputs: Tuple of (encoded_layers, pooled_output)
        `encoded_layers`:
            - `output_all_encoded_layers=True`: whatever the encoder returned (one entry per layer).
            - `output_all_encoded_layers=False`: only the hidden states taken from the last entry.
        `pooled_output`: the pooler's output, or `None` when the model was built without a pooler.

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    model = BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(
        self,
        config,
        add_pooling_layer: bool = True,
    ):
        super(BertModel, self).__init__(config)
        self.embeddings = BertAlibiEmbeddings(config)
        self.encoder = BertAlibiEncoder(config)
        if add_pooling_layer:
            self.pooler = BertPooler(config)
        else:
            self.pooler = None
        self.post_init()

    def get_input_embeddings(self):
        """Return the word-embedding module of the embedding layer."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module of the embedding layer with ``value``."""
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_all_encoded_layers: Optional[bool] = False,
        masked_tokens_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
        """Embed, encode and (optionally) pool a batch; extra ``kwargs`` are accepted and ignored."""
        # Missing masks fall back to "attend to everything" / "single segment".
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        embedded = self.embeddings(input_ids, token_type_ids, position_ids)

        use_subset = masked_tokens_mask is not None
        first_col_mask = None
        subset_mask = None
        if use_subset:
            # Keep the first token of every row next to the masked tokens, so the pooler
            # input is still available after the encoder drops everything else.
            first_col_mask = torch.zeros_like(masked_tokens_mask)
            first_col_mask[:, 0] = True
            subset_mask = masked_tokens_mask | first_col_mask

        encoder_outputs = self.encoder(
            embedded,
            attention_mask,
            output_all_encoded_layers=output_all_encoded_layers,
            subset_mask=subset_mask,
        )
        last_layer = encoder_outputs[-1]

        pooled_output = None
        if not use_subset:
            sequence_output = last_layer
            if self.pooler is not None:
                pooled_output = self.pooler(sequence_output)
        else:
            # TD [2022-03-01]: the indexing here is very tricky.
            # NOTE(review): this presumably relies on the encoder returning one row per
            # non-padded token selected by subset_mask -- confirm against BertAlibiEncoder.
            keep = attention_mask.bool()
            subset_idx = subset_mask[keep]  # type: ignore
            sequence_output = last_layer[masked_tokens_mask[keep][subset_idx]]
            if self.pooler is not None:
                pool_input = last_layer[first_col_mask[keep][subset_idx]]
                # The rows are already the first-column states, so the pooler must not re-select.
                pooled_output = self.pooler(pool_input, pool=False)

        if not output_all_encoded_layers:
            encoder_outputs = sequence_output

        # pooled_output stays None whenever the model has no pooler.
        return encoder_outputs, pooled_output
###################
# Bert Heads
###################
class BertLMPredictionHead(nn.Module):
    """Prediction head that projects transformed hidden states back onto the vocabulary.

    Args:
        config: configuration passed to ``BertPredictionHeadTransform``.
        bert_model_embedding_weights: the input embedding matrix; its first dimension is used as
            the decoder's output size and its second as the input size, and the decoder's
            weight is replaced by this very tensor.
    """

    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        out_features = bert_model_embedding_weights.size(0)
        in_features = bert_model_embedding_weights.size(1)
        # The decoder shares its weight with the input embeddings; only its bias is
        # specific to the output side.
        self.decoder = nn.Linear(in_features, out_features)
        self.decoder.weight = bert_model_embedding_weights

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the transform, then the tied decoder."""
        return self.decoder(self.transform(hidden_states))
class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing a ``BertLMPredictionHead`` under the attribute name ``predictions``.

    Args:
        config: configuration forwarded to ``BertLMPredictionHead``.
        bert_model_embedding_weights: embedding matrix forwarded to ``BertLMPredictionHead``.
    """

    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """Return the prediction scores computed by the wrapped head."""
        return self.predictions(sequence_output)
class BertOnlyNSPHead(nn.Module):
    """Two-way linear classifier applied to the pooled output.

    Args:
        config: object whose ``hidden_size`` gives the classifier's input width.
    """

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(in_features=config.hidden_size, out_features=2)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        """Return the two relationship scores for each pooled vector."""
        return self.seq_relationship(pooled_output)
#####################
# Various Bert models
#####################
class BertForPreTraining(BertPreTrainedModel):
    """Placeholder for the pre-training model; it currently adds nothing to ``BertPreTrainedModel``."""

    # TBD: Coming in Future Commit
    pass
class BertLMHeadModel(BertPreTrainedModel):
    """Placeholder for the LM-head model; it currently adds nothing to ``BertPreTrainedModel``."""

    # TBD: Coming in Future Commit
    pass
class BertForMaskedLM(BertPreTrainedModel):
    """Mosaic BERT with a masked-language-modeling head.

    The backbone is built without a pooler and the head's decoder weight is the
    backbone's word-embedding weight.
    """

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            warnings.warn(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights from a checkpoint file.

        Args:
            pretrained_checkpoint: path (or file-like object) handed to ``torch.load``.
            state_dict: accepted for signature compatibility; the value is ignored and the
                state dict is always read from ``pretrained_checkpoint``.
            cache_dir: accepted for signature compatibility; unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration used to construct the model.
            *inputs: extra positional arguments for the constructor.
            **kwargs: extra keyword arguments for the constructor.

        Returns:
            The model with the checkpoint loaded non-strictly; missing and unexpected keys
            are reported through the module logger.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject the unsupported option before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Map to CPU so a checkpoint saved from an accelerator can be read on a host without
        # one; load_state_dict copies the values into the model's own parameters.
        state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def get_output_embeddings(self):
        """Return the decoder linear layer of the MLM head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder linear layer of the MLM head."""
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        """Run the backbone and the MLM head, computing the loss when ``labels`` is given.

        ``labels`` should be a ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``.
        Only positions whose label is strictly greater than zero are scored, so both the
        ignore value ``-100`` and label ``0`` are skipped. Prediction scores are computed for
        those positions only and then scattered back into a ``(batch, seqlen, vocab)`` tensor.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be provided; computing the loss
        additionally requires ``input_ids`` (its shape is used to restore the batch layout).

        Raises:
            ValueError: if both or neither of ``input_ids`` and ``inputs_embeds`` are given.
        """
        if (input_ids is not None) == (inputs_embeds is not None):
            raise ValueError("Must specify either input_ids or input_embeds!")

        if labels is None:
            masked_tokens_mask = None
        else:
            masked_tokens_mask = labels > 0

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            masked_tokens_mask=masked_tokens_mask,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        loss = None
        if labels is not None:
            # Compute loss over the selected (label > 0) positions only.
            loss_fct = nn.CrossEntropyLoss()
            masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
            loss = loss_fct(prediction_scores, labels.flatten()[masked_token_idx])

            assert input_ids is not None, "Coding error; please open an issue"
            batch, seqlen = input_ids.shape[:2]
            # Scatter the per-masked-token scores back to their flat positions, then
            # restore the (batch, seqlen) layout.
            prediction_scores = rearrange(
                index_put_first_axis(prediction_scores, masked_token_idx, batch * seqlen),
                "(b s) d -> b s d",
                b=batch,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=None,
            attentions=None,
        )

    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **model_kwargs):
        """Append one PAD token (with attention-mask value 0) to every sequence.

        Raises:
            ValueError: if the configuration has no ``pad_token_id``.
        """
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))],
            dim=-1,
        )
        dummy_token = torch.full(
            (effective_batch_size, 1),
            self.config.pad_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}
class BertForNextSentencePrediction(BertPreTrainedModel):
    """Placeholder for the NSP model; it currently adds nothing to ``BertPreTrainedModel``."""

    # TBD: Push in future commit
    pass
class BertForSequenceClassification(BertPreTrainedModel):
    """Bert Model transformer with a sequence classification/regression head.

    This head is just a linear layer on top of the pooled output. Used for,
    e.g., GLUE tasks.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        # Fall back to the hidden dropout rate when no classifier-specific rate is configured.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights from a checkpoint file.

        Args:
            pretrained_checkpoint: path (or file-like object) handed to ``torch.load``.
            state_dict: accepted for signature compatibility; the value is ignored and the
                state dict is always read from ``pretrained_checkpoint``.
            cache_dir: accepted for signature compatibility; unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration used to construct the model.
            *inputs: extra positional arguments for the constructor.
            **kwargs: extra keyword arguments for the constructor.

        Returns:
            The model with the checkpoint loaded non-strictly; missing and unexpected keys
            are reported through the module logger.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject the unsupported option before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Map to CPU so a checkpoint saved from an accelerator can be read on a host without
        # one; load_state_dict copies the values into the model's own parameters.
        state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """Classify (or regress on) the pooled output of the backbone.

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss.
            Indices should be in `[0, ..., config.num_labels - 1]`.
            If `config.num_labels == 1` a regression loss is computed
            (mean-square loss). If `config.num_labels > 1` a classification loss
            is computed (cross-entropy).

        When ``config.problem_type`` is unset it is inferred from ``num_labels`` and the dtype
        of ``labels`` on the first call that provides labels, and stored on the config.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Compute loss
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )
class BertForMultipleChoice(BertPreTrainedModel):
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        # Fall back to the hidden dropout rate when no classifier-specific rate is configured.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        # In multiple choice tasks, all choices are submitted in a batch, and
        # we compute a logit for each option independently. The logits are then
        # normalized in the forward pass to get a probability distribution over
        # the choices.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights from a checkpoint file.

        Args:
            pretrained_checkpoint: path (or file-like object) handed to ``torch.load``.
            state_dict: accepted for signature compatibility; the value is ignored and the
                state dict is always read from ``pretrained_checkpoint``.
            cache_dir: accepted for signature compatibility; unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration used to construct the model.
            *inputs: extra positional arguments for the constructor.
            **kwargs: extra keyword arguments for the constructor.

        Returns:
            The model with the checkpoint loaded non-strictly; missing and unexpected keys
            are reported through the module logger.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject the unsupported option before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Map to CPU so a checkpoint saved from an accelerator can be read on a host without
        # one; load_state_dict copies the values into the model's own parameters.
        state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)

        The choice dimension is folded into the batch dimension before the backbone runs, and
        the per-choice logits are reshaped to `(-1, num_choices)` afterwards.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten (batch, num_choices, ...) into (batch * num_choices, ...) for every provided input.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=None,
            attentions=None,
        )
class BertForTokenClassification(BertPreTrainedModel):
    """Placeholder for the token-classification model; it currently adds nothing to ``BertPreTrainedModel``."""

    # TBD: Push in future commit
    pass
class BertForQuestionAnswering(BertPreTrainedModel):
    """Bert Model with a span classification head.

    This is used for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden states' output to compute `span start logits`
    and `span end logits`).

    Not implemented yet: the class body currently holds nothing but this docstring.
    """

    # TBD: Push in future commit
###################
# FlexBert Heads
###################
class FlexBertPredictionHead(nn.Module):
    """Prediction head: a square linear layer followed by an activation and a norm.

    The activation and the norm are each replaced by ``nn.Identity`` when the matching
    config option (``head_pred_act`` / ``head_pred_norm``) is falsy.

    Args:
        config: supplies ``hidden_size``, ``head_pred_bias``, ``head_pred_act``,
            ``head_pred_norm`` and ``compile_model``.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.head_pred_bias)
        self.act = get_act_fn(config.head_pred_act) if config.head_pred_act else nn.Identity()
        self.norm = (
            get_norm_layer(config, compiled_norm=config.compile_model) if config.head_pred_norm else nn.Identity()
        )

    def _init_weights(self, reset_params: bool = False):
        """Initialise the dense layer; with ``reset_params`` also reset the norm's parameters."""
        # The norm is nn.Identity when head_pred_norm is disabled, and that module has no
        # reset_parameters(); guard the call the same way FlexBertPoolingHead does.
        if reset_params and hasattr(self.norm, "reset_parameters"):
            self.norm.reset_parameters()
        init_weights(self.config, self.dense, layer_dim=self.config.hidden_size, type_of_module=ModuleType.in_module)

    def reset_parameters(self):
        """Re-initialise every parameter of the head."""
        self._init_weights(reset_params=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply dense, then activation, then norm."""
        return self.norm(self.act(self.dense(hidden_states)))
class FlexBertPoolingHead(nn.Module):
    """Pooling head: optional pooling over dim 1, then dense, activation, norm and dropout.

    The activation, norm and dropout are each replaced by ``nn.Identity`` when the matching
    config option is falsy (or, for dropout, not greater than zero).

    Args:
        config: supplies ``hidden_size``, ``head_class_bias``, ``head_class_act``,
            ``head_class_norm``, ``head_class_dropout`` and ``pooling_type``.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.head_class_bias)
        self.act = get_act_fn(config.head_class_act) if config.head_class_act else nn.Identity()
        self.norm = get_norm_layer(config) if config.head_class_norm else nn.Identity()
        self.drop = torch.nn.Dropout(config.head_class_dropout) if config.head_class_dropout > 0 else nn.Identity()
        self.pooling_type = config.pooling_type

    def forward(self, hidden_states: torch.Tensor, pool: Optional[bool] = True) -> torch.Tensor:
        """Pool ``hidden_states`` (unless ``pool`` is falsy) and run the head.

        Args:
            hidden_states: input tensor; when pooling, dim 1 is the one reduced or indexed.
            pool: when falsy, ``hidden_states`` is used as-is (the caller already pooled).

        Raises:
            ValueError: if pooling is requested and ``pooling_type`` is not ``"cls"``,
                ``"mean"`` or ``"max"``.
        """
        if pool:
            if self.pooling_type == "cls":
                output = hidden_states[:, 0]
            elif self.pooling_type == "mean":
                output = hidden_states.mean(dim=1)
            elif self.pooling_type == "max":
                # Tensor.max(dim=...) returns (values, indices); only the values are needed.
                output = hidden_states.max(dim=1)[0]
            else:
                # Without this branch `output` would be unbound and the failure would
                # surface as an opaque UnboundLocalError further down.
                raise ValueError(
                    f"Unsupported pooling_type {self.pooling_type!r}; expected 'cls', 'mean' or 'max'."
                )
        else:
            output = hidden_states
        return self.drop(self.norm(self.act(self.dense(output))))

    def _init_weights(self, reset_params: bool = False):
        """Initialise the dense layer; with ``reset_params`` also reset the norm when it supports it."""
        init_weights(self.config, self.dense, self.config.hidden_size, type_of_module=ModuleType.out_module)
        if reset_params and hasattr(self.norm, "reset_parameters"):
            self.norm.reset_parameters()

    def reset_parameters(self):
        """Re-initialise every parameter of the head."""
        self._init_weights(reset_params=True)
###################
# FlexBert Models
###################
# NOTE: this definition shadows the `MaskedLMOutput` imported from
# `transformers.modeling_outputs` at the top of the file; every later use of the name in
# this module resolves to this class.
@dataclass
class MaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        indices, cu_seqlens, max_seqlen, batch_size, seq_len, labels (*optional*):
            Extra fields on top of the usual four. NOTE(review): presumably the unpadding
            metadata and labels echoed back by the FlexBert models -- confirm against the
            code that fills them in.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    indices: Optional[torch.LongTensor] = None
    cu_seqlens: Optional[torch.LongTensor] = None
    max_seqlen: Optional[int] = None
    batch_size: Optional[int] = None
    seq_len: Optional[int] = None
    labels: Optional[torch.LongTensor] = None
@dataclass
class MaskedLMOutputZLoss(ModelOutput):
    """
    Masked language model output that also carries the cross-entropy and z-loss terms.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        ce_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Cross entropy loss.
        z_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Z loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        indices (`torch.LongTensor` of shape `(batch_size,)`):
            Indices of the tokens to be masked.
            NOTE(review): shape and meaning are carried over from the original docstring and
            cannot be checked from this class alone -- confirm.
        cu_seqlens, max_seqlen, batch_size, seq_len, labels (*optional*):
            Extra fields. NOTE(review): presumably the unpadding metadata and labels echoed
            back by the FlexBert models -- confirm against the code that fills them in.
    """

    loss: Optional[torch.FloatTensor] = None
    ce_loss: Optional[torch.FloatTensor] = None
    z_loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    indices: Optional[torch.LongTensor] = None
    cu_seqlens: Optional[torch.LongTensor] = None
    max_seqlen: Optional[int] = None
    batch_size: Optional[int] = None
    seq_len: Optional[int] = None
    labels: Optional[torch.LongTensor] = None
class FlexBertPreTrainedModel(BertPreTrainedModel):
    """
    Abstract base that routes weight initialisation of individual modules through the
    project's ``init_weights`` helper.
    """

    def _init_module_weights(self, module: nn.Module):
        """
        Initialise the weights of a single module with ``init_weights``.

        Only ``nn.Embedding`` modules are handled; anything else raises
        ``NotImplementedError``.
        """
        assert isinstance(module, nn.Module)

        # Guard clause: embeddings are the only module kind with a custom init so far.
        if not isinstance(module, nn.Embedding):
            raise NotImplementedError("Custom weight init for the given module is not supported")

        init_weights(self.config, module, type_of_module=ModuleType.emb)
class FlexBertModel(FlexBertPreTrainedModel):
"""Overall BERT model.
Args:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config: FlexBertConfig):
    """Build the embedding layer, the encoder and the optional final norm from ``config``."""
    super().__init__(config)
    self.embeddings = get_embedding_layer(config)
    self.encoder = get_encoder_layer(config)
    # Per the original author's note: with pre-norm attention a final norm has to be
    # added after the encoder, so it is only built when the config requests it.
    self.final_norm = get_norm_layer(config) if config.final_norm else None
    self.unpad_embeddings = config.unpad_embeddings
    # Decoder mode mirrors the config's causal-mask switch.
    self.is_decoder = config.causal_mask
# Post-construction hook: runs this model's weight initialization with
# `reset_params=False`, then calls `_backward_compatibility_gradient_checkpointing()`.
def post_init(self):
self._init_weights(reset_params=False)
self._backward_compatibility_gradient_checkpointing()
# Return the token-embedding module stored on the embedding layer as `tok_embeddings`.
def get_input_embeddings(self):
return self.embeddings.tok_embeddings
# Replace the embedding layer's `tok_embeddings` attribute with `value`.
def set_input_embeddings(self, value):
self.embeddings.tok_embeddings = value
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    indices: Optional[torch.Tensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
    **kwargs,
) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
    """Embed ``input_ids``, run the encoder and apply the final norm if one is configured.

    Args:
        input_ids: token ids handed to the embedding layer.
        attention_mask: optional mask forwarded to the encoder. When omitted and the model is
            not a decoder, an all-ones mask shaped like ``input_ids`` is used.
        position_ids: optional position ids handed to the embedding layer.
        indices, cu_seqlens, max_seqlen: forwarded unchanged to the encoder.
        **kwargs: accepted and ignored.

    Returns:
        The encoder output, passed through ``final_norm`` when that layer exists.
        NOTE(review): the return annotation describes a tuple, but a single encoder
        output is what this method returns.
    """
    needs_default_mask = attention_mask is None and not self.is_decoder
    if needs_default_mask:
        attention_mask = torch.ones_like(input_ids)

    hidden = self.encoder(
        hidden_states=self.embeddings(input_ids, position_ids),
        attention_mask=attention_mask,
        indices=indices,
        cu_seqlens=cu_seqlens,
        max_seqlen=max_seqlen,
    )

    if self.final_norm is None:
        return hidden
    return self.final_norm(hidden)
def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
    """Initialize either one submodule or the whole model.

    Exactly one of the two arguments must be supplied.

    Args:
        module: a submodule to initialize through ``_init_module_weights``.
        reset_params: when supplied instead of ``module``, the embeddings and the encoder are
            initialized with this flag; if it is ``True`` and ``config.final_norm`` is set,
            the final norm's parameters are reset as well.

    Raises:
        AssertionError: if both or neither argument is given, or ``reset_params`` is not a bool.
    """
    assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"

    # Test identity against None rather than truthiness: container modules define __len__,
    # so an empty nn.ModuleList / nn.Sequential is falsy and would wrongly take the
    # whole-model branch and trip the isinstance assertion below.
    if module is not None:
        self._init_module_weights(module)
        return

    assert isinstance(reset_params, bool)
    self.embeddings._init_weights(reset_params=reset_params)
    self.encoder._init_weights(reset_params=reset_params)
    if reset_params and self.config.final_norm:
        self.final_norm.reset_parameters()
# Re-run whole-model weight initialization with `reset_params=True`.
def reset_parameters(self):
self._init_weights(reset_params=True)
def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
    """Returns the number of parameters in the model.

    Args:
        count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
        trainable: only count trainable parameters.
    """
    total = 0
    for layer in self.encoder.layers:
        total += _count_parameters(layer, trainable)

    if not count_embeddings:
        return total

    total += _count_parameters(self.embeddings, trainable)
    # NOTE(review): this looks for `position_embeddings`, while init_model_from_pretrained
    # reads the same layer as `pos_embeddings`. Confirm which attribute the embedding
    # classes expose; if the name here is wrong, position embeddings are never subtracted.
    if hasattr(self.embeddings, "position_embeddings"):
        total -= _count_parameters(self.embeddings.position_embeddings, trainable)
    return total
class FlexBertForMaskedLM(FlexBertPreTrainedModel):
    """FlexBERT backbone with a masked-language-modeling head.

    Backbone output passes through a ``FlexBertPredictionHead`` and then a linear ``decoder``
    sized from ``config.hidden_size`` / ``config.vocab_size``. When
    ``config.tie_word_embeddings`` is set, the decoder weight is the token-embedding weight.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.bert = FlexBertModel(config)
        self.head = FlexBertPredictionHead(config)

        if config.tie_word_embeddings:
            decoder_weights = self.bert.embeddings.tok_embeddings.weight
        else:
            decoder_weights = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
        # Size the projection from the weight itself, then install the weight so that a tied
        # decoder holds the very same parameter object as the token embeddings.
        self.decoder = nn.Linear(decoder_weights.size(1), decoder_weights.size(0), bias=config.decoder_bias)
        self.decoder.weight = decoder_weights

        self.loss_fn = nn.CrossEntropyLoss() if not hasattr(config, "loss_function") else get_loss_fn(config)
        self.fa_ce = getattr(config, "loss_function", "cross_entropy") == "fa_cross_entropy"
        self.return_z_loss = config.loss_kwargs.get("return_z_loss", False)
        self.unpad_embeddings = config.unpad_embeddings
        self.pad_logits = config.pad_logits
        self.compile_model = config.compile_model
        self.masked_prediction = config.masked_prediction

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        """Initialize one submodule (``module``) or the whole model (``reset_params``).

        Exactly one of the two arguments must be supplied.
        """
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"

        # Identity check, not truthiness: an empty container module (e.g. nn.ModuleList())
        # is falsy and would otherwise fall through to the whole-model branch.
        if module is not None:
            self._init_module_weights(module)
            return

        assert isinstance(reset_params, bool)
        self.bert._init_weights(reset_params=reset_params)
        self.head._init_weights(reset_params=reset_params)

        # Output weights. A tied decoder shares the embedding weight, which the backbone
        # initialization above has already handled.
        if not self.config.tie_word_embeddings:
            init_weights(self.config, self.decoder, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights non-strictly.

        Args:
            pretrained_checkpoint: path (or file object) read with ``torch.load`` when no
                ``state_dict`` is supplied.
            state_dict: optional already-loaded state dict. When given it is used instead of
                reading ``pretrained_checkpoint``; a leading ``model.`` prefix is stripped
                from its keys in place.
            cache_dir: unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration passed to the constructor.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject unsupported input before paying for model construction.
        if from_tf:
            raise ValueError("FlexBERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Previously a caller-supplied state_dict was silently overwritten by the checkpoint.
        if state_dict is None:
            # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(pretrained_checkpoint)

        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def get_output_embeddings(self):
        """Return the vocabulary projection layer."""
        return self.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer."""
        self.decoder = new_embeddings

    @torch.no_grad()
    def unpad_inputs(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor, labels: torch.Tensor
    ):
        """Thin no-grad wrapper around ``unpad_input``."""
        return unpad_input(input_ids, attention_mask, position_ids, labels)

    @torch.no_grad()
    def pad_inputs(
        self,
        inputs: torch.Tensor,
        indices: torch.Tensor,
        batch_size: int,
        seqlen: int,
        labels: Optional[torch.Tensor] = None,
        ignore_index: int = -100,
    ):
        """Thin no-grad wrapper around ``pad_input``."""
        return pad_input(
            inputs=inputs, indices=indices, batch=batch_size, seqlen=seqlen, labels=labels, ignore_index=ignore_index
        )

    @torch.compile(dynamic=True)
    def compiled_head(self, output: torch.Tensor) -> torch.Tensor:
        """``torch.compile``-wrapped prediction head followed by the decoder."""
        return self.decoder(self.head(output))

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        """Run the backbone and the MLM head, computing the loss when ``labels`` are given.

        If ``unpad_embeddings`` is enabled and none of ``indices`` / ``cu_seqlens`` /
        ``max_seqlen`` were supplied, the batch is unpadded here first. With
        ``masked_prediction`` and labels, only positions whose label differs from
        ``loss_fn.ignore_index`` are sent through the head.

        Returns:
            ``MaskedLMOutputZLoss`` when labels are given and ``return_z_loss`` is set,
            otherwise ``MaskedLMOutput``. With ``pad_logits`` the logits are re-padded via
            ``pad_inputs``; without it the unpadding metadata is returned alongside them.
        """
        # labels should be a `torch.LongTensor` of shape
        # `(batch_size, sequence_length)`. These are used for computing the
        # masked language modeling loss.
        #
        # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
        # `input_ids` docstring) Tokens with indices set to `-100` are ignored
        # (masked), the loss is only computed for the tokens with labels in `[0,
        # ..., config.vocab_size]`
        #
        # Prediction scores are only computed for masked tokens and the (bs,
        # seqlen) dimensions are flattened

        # NOTE: computed for API parity but not consulted below; an output object is
        # always returned.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.unpad_embeddings and (indices is None and cu_seqlens is None and max_seqlen is None):
            batch_size, seq_len = input_ids.shape[:2]
            input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = self.unpad_inputs(
                input_ids, attention_mask, position_ids, labels
            )

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        if self.masked_prediction and labels is not None:
            # flatten labels and output first
            labels = labels.view(-1)
            output = output.view(labels.shape[0], -1)

            # then filter out the non-masked tokens
            mask_tokens = labels != self.loss_fn.ignore_index
            output = output[mask_tokens]
            labels = labels[mask_tokens]

        if self.compile_model:
            logits = self.compiled_head(output)
        else:
            logits = self.decoder(self.head(output))

        loss = None
        if labels is not None:
            if not self.masked_prediction:
                labels = labels.view(-1)
                logits = logits.view(labels.shape[0], -1)

            if self.return_z_loss:
                loss, z_loss = self.loss_fn(logits, labels)
                ce_loss = loss.detach().clone() - z_loss
                if self.pad_logits:
                    return MaskedLMOutputZLoss(
                        loss=loss,
                        ce_loss=ce_loss,
                        z_loss=z_loss,
                        logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                        hidden_states=None,
                        attentions=None,
                    )
                return MaskedLMOutputZLoss(
                    loss=loss,
                    ce_loss=ce_loss,
                    z_loss=z_loss,
                    logits=logits,
                    hidden_states=None,
                    attentions=None,
                    indices=indices,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=max_seqlen,
                    batch_size=batch_size,
                    seq_len=seq_len,
                    labels=labels,
                )

            loss = self.loss_fn(logits, labels)

        if self.pad_logits:
            return MaskedLMOutput(
                loss=loss,
                logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                hidden_states=None,
                attentions=None,
            )
        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            labels=labels,
        )

    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **model_kwargs):
        """Append one PAD token (with a zero mask entry) to every sequence.

        Raises:
            ValueError: if ``config.pad_token_id`` is not defined.
        """
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))],
            dim=-1,
        )
        dummy_token = torch.full(
            (effective_batch_size, 1),
            self.config.pad_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    def get_number_parameters(
        self, count_embeddings: bool = True, count_decoder: bool = False, trainable: bool = True
    ) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            count_decoder: count the parameters in the decoder layer if weights are not tied.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        if count_decoder and not self.config.tie_word_embeddings:
            params += _count_parameters(self.decoder, trainable)
        return params
class FlexBertForSequenceClassification(FlexBertPreTrainedModel):
    """Bert Model transformer with a sequence classification/regression head.

    This head is just a linear layer on top of the pooled output. Used for,
    e.g., GLUE tasks.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = FlexBertModel(config)
        self.head = FlexBertPoolingHead(config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        """Initialize one submodule (``module``) or the whole model (``reset_params``).

        Exactly one of the two arguments must be supplied.
        """
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"

        # Identity check, not truthiness: an empty container module (e.g. nn.ModuleList())
        # is falsy and would otherwise fall through to the whole-model branch.
        if module is not None:
            self._init_module_weights(module)
            return

        assert isinstance(reset_params, bool)
        self.bert._init_weights(reset_params=reset_params)
        self.head._init_weights(reset_params=reset_params)
        init_weights(self.config, self.classifier, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights non-strictly.

        Args:
            pretrained_checkpoint: path (or file object) read with ``torch.load`` when no
                ``state_dict`` is supplied.
            state_dict: optional already-loaded state dict. When given it is used instead of
                reading ``pretrained_checkpoint``; a leading ``model.`` prefix is stripped
                from its keys in place.
            cache_dir: unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration passed to the constructor.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject unsupported input before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Previously a caller-supplied state_dict was silently overwritten by the checkpoint.
        if state_dict is None:
            # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(pretrained_checkpoint)

        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """Classify (or regress on) the pooled backbone output.

        Returns:
            A ``SequenceClassifierOutput`` when ``return_dict`` resolves to true; otherwise
            the tuple ``(logits,)``, or ``(loss, logits)`` when ``labels`` are given.
        """
        # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        # Labels for computing the sequence classification/regression loss.
        # Indices should be in `[0, ..., config.num_labels - 1]`.
        # If `config.num_labels == 1` a regression loss is computed
        # (mean-square loss). If `config.num_labels > 1` a classification loss
        # is computed (cross-entropy).

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = self.head(output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # The backbone returns a single tensor, not a tuple, so the former
            # `(logits,) + output` could not build the legacy tuple. The backbone exposes
            # no extra hidden states / attentions, so the tuple carries the logits only.
            outputs = (logits,)
            return ((loss,) + outputs) if loss is not None else outputs

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        params += _count_parameters(self.classifier, trainable)
        return params
class FlexBertForMultipleChoice(FlexBertPreTrainedModel):
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = FlexBertModel(config)
        self.head = FlexBertPoolingHead(config)

        # In multiple choice tasks, all choices are submitted in a batch, and
        # we compute a logit for each option independently. The logits are then
        # normalized in the forward pass to get a probability distribution over
        # the choices.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        """Initialize one submodule (``module``) or the whole model (``reset_params``).

        Exactly one of the two arguments must be supplied.
        """
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"

        # Identity check, not truthiness: an empty container module (e.g. nn.ModuleList())
        # is falsy and would otherwise fall through to the whole-model branch.
        if module is not None:
            self._init_module_weights(module)
            return

        assert isinstance(reset_params, bool)
        self.bert._init_weights(reset_params=reset_params)
        self.head._init_weights(reset_params=reset_params)
        init_weights(self.config, self.classifier, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights non-strictly.

        Args:
            pretrained_checkpoint: path (or file object) read with ``torch.load`` when no
                ``state_dict`` is supplied.
            state_dict: optional already-loaded state dict. When given it is used instead of
                reading ``pretrained_checkpoint``; a leading ``model.`` prefix is stripped
                from its keys in place.
            cache_dir: unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration passed to the constructor.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject unsupported input before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Previously a caller-supplied state_dict was silently overwritten by the checkpoint.
        if state_dict is None:
            # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(pretrained_checkpoint)

        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """Score every choice independently and compare the choices of each example.

        ``input_ids`` (and, when given, ``attention_mask`` / ``position_ids``) carry the choice
        axis at dimension 1; they are flattened to ``(-1, last_dim)`` before the backbone, and
        the one-logit-per-row classifier output is reshaped to ``(-1, num_choices)``.

        Args:
            labels: optional targets for a cross-entropy loss over the reshaped logits, i.e.
                the index of the correct choice for each example.

        Returns:
            A ``MultipleChoiceModelOutput`` when ``return_dict`` resolves to true; otherwise
            ``(reshaped_logits,)``, or ``(loss, reshaped_logits)`` when ``labels`` are given.

        Raises:
            ValueError: if ``input_ids`` is ``None``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The number of choices is read from input_ids, so it cannot be omitted; fail with a
        # clear message instead of an AttributeError on None.
        if input_ids is None:
            raise ValueError("input_ids must be provided")
        num_choices = input_ids.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1))
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = self.head(output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # The backbone returns a single tensor, not a tuple, so the former
            # `(reshaped_logits,) + output` could not build the legacy tuple.
            outputs = (reshaped_logits,)
            return ((loss,) + outputs) if loss is not None else outputs

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=None,
            attentions=None,
        )

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        params += _count_parameters(self.classifier, trainable)
        return params
class FlexBertForCausalLM(FlexBertPreTrainedModel):
    """Bert Model transformer with a LM head.

    This head is just a standard LM head module. Used for causal language modeling tasks.
    """

    # The docstring must be the first statement of the class body to be bound to __doc__;
    # it previously sat after this assignment and was discarded.
    config_class = FlexBertConfig

    # NOTE(review): hard-coded token ids used by forward(); the in-code messages call them
    # the PAD and MASK tokens. Confirm they match the tokenizer / config.pad_token_id in use.
    _PAD_TOKEN_ID = 50283
    _MASK_TOKEN_ID = 50284

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.bert = FlexBertModel(config)
        self.lm_head = FlexBertPredictionHead(config)

        if config.tie_word_embeddings:
            decoder_weights = self.bert.embeddings.tok_embeddings.weight
        else:
            decoder_weights = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
        # Size the projection from the weight itself, then install the weight so that a tied
        # decoder holds the very same parameter object as the token embeddings.
        self.decoder = nn.Linear(decoder_weights.size(1), decoder_weights.size(0), bias=config.decoder_bias)
        self.decoder.weight = decoder_weights

        self.loss_fn = nn.CrossEntropyLoss() if not hasattr(config, "loss_function") else get_loss_fn(config)
        self.fa_ce = getattr(config, "loss_function", "cross_entropy") == "fa_cross_entropy"
        self.return_z_loss = config.loss_kwargs.get("return_z_loss", False)
        self.unpad_embeddings = config.unpad_embeddings
        self.pad_logits = config.pad_logits
        self.compile_model = config.compile_model
        self.masked_prediction = config.masked_prediction

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        """Initialize one submodule (``module``) or the whole model (``reset_params``).

        Exactly one of the two arguments must be supplied. For a single module, Linear and
        Embedding weights are drawn from a normal with std ``config.initializer_range``,
        Linear biases are zeroed, and LayerNorm is set to weight 1 / bias 0.
        """
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"

        if module is not None:
            # Add basic initialization for common module types
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            return

        assert isinstance(reset_params, bool)
        self.bert._init_weights(reset_params=reset_params)
        self.lm_head._init_weights(reset_params=reset_params)

        if not self.config.tie_word_embeddings:
            init_weights(self.config, self.decoder, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Build a model from ``config`` and load weights non-strictly.

        Args:
            pretrained_checkpoint: path (or file object) read with ``torch.load`` when no
                ``state_dict`` is supplied.
            state_dict: optional already-loaded state dict. When given it is used instead of
                reading ``pretrained_checkpoint``; a leading ``model.`` prefix is stripped
                from its keys in place.
            cache_dir: unused.
            from_tf: must be falsy; TensorFlow checkpoints are not supported.
            config: configuration passed to the constructor.

        Raises:
            ValueError: if ``from_tf`` is truthy.
        """
        # Reject unsupported input before paying for model construction.
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        model = cls(config, *inputs, **kwargs)

        # Previously a caller-supplied state_dict was silently overwritten by the checkpoint.
        if state_dict is None:
            # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(pretrained_checkpoint)

        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def get_output_embeddings(self):
        """Return the vocabulary projection layer."""
        return self.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer."""
        self.decoder = new_embeddings

    @torch.no_grad()
    def unpad_inputs(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor, labels: torch.Tensor
    ):
        """Thin no-grad wrapper around ``unpad_input``."""
        return unpad_input(input_ids, attention_mask, position_ids, labels)

    @torch.no_grad()
    def pad_inputs(
        self,
        inputs: torch.Tensor,
        indices: torch.Tensor,
        batch_size: int,
        seqlen: int,
        labels: Optional[torch.Tensor] = None,
        ignore_index: int = -100,
    ):
        """Thin no-grad wrapper around ``pad_input``."""
        return pad_input(
            inputs=inputs, indices=indices, batch=batch_size, seqlen=seqlen, labels=labels, ignore_index=ignore_index
        )

    @torch.compile(dynamic=True)
    def compiled_lm_head(self, output: torch.Tensor) -> torch.Tensor:
        """``torch.compile``-wrapped prediction head followed by the decoder."""
        return self.decoder(self.lm_head(output))

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutput]:
        """Run the backbone and LM head, computing a next-token loss when ``labels`` is given.

        ``labels`` only switches loss computation on: the targets themselves are built from
        ``input_ids`` shifted by one position. The backbone is always called with
        ``attention_mask=None``.

        Returns:
            A ``CausalLMOutput``. With ``pad_logits`` the logits are re-padded via
            ``pad_inputs``; otherwise the raw logits and the backbone hidden states are
            returned. NOTE(review): in the padded branch with labels, the returned logits
            have their last position dropped by the shift.
        """
        # NOTE: computed for API parity but not consulted below; an output object is
        # always returned.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.unpad_embeddings and (indices is None and cu_seqlens is None and max_seqlen is None):
            batch_size, seq_len = input_ids.shape[:2]
            if attention_mask is None:
                # create all ones, except for padding (TODO?)
                attention_mask = torch.ones_like(input_ids)
            input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = self.unpad_inputs(
                input_ids, attention_mask, position_ids, labels
            )

        hidden_states = self.bert(
            input_ids,
            attention_mask=None,  # let FA do this
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        if self.compile_model:
            logits = self.compiled_lm_head(hidden_states)
        else:
            # Apply the vocabulary projection here too. The eager path used to return the
            # prediction head's output without the decoder, unlike the compiled path.
            logits = self.decoder(self.lm_head(hidden_states))

        loss = None
        if labels is not None:
            if cu_seqlens is not None:
                # Unpadded case: tokens are concatenated, so shift the whole stream by one.
                shift_labels = torch.full_like(input_ids, -100)
                shift_labels[:-1] = input_ids[1:]

                # Mask boundaries, so eos doesn't predict bos. cu_seqlens[1:] - 1 is the
                # last token of each sequence; one indexed assignment replaces the former
                # per-sequence Python loop.
                boundary_positions = (cu_seqlens[1:] - 1).to(device=shift_labels.device, dtype=torch.long)
                shift_labels[boundary_positions] = -100

                # NOTE: no padding or mask in there for now
                assert self._PAD_TOKEN_ID not in shift_labels, f"PAD token found in shift_labels: {shift_labels}"
                assert self._MASK_TOKEN_ID not in shift_labels, f"MASK token found in shift_labels: {shift_labels}"
                assert shift_labels.shape == logits.shape[:-1]  # Verify shapes align
            else:
                # Padded case: simple shift
                shift_labels = input_ids[..., 1:].contiguous()
                logits = logits[..., :-1, :].contiguous()

                # mask out PAD tokens in the shift_labels
                mask = shift_labels == self._PAD_TOKEN_ID
                shift_labels = torch.where(mask, torch.tensor(-100, device=shift_labels.device), shift_labels)
                assert shift_labels.shape == logits.shape[:-1]  # Verify shapes align

            # Flatten the tokens
            loss = self.loss_fn(
                logits.view(-1, logits.size(-1)),
                shift_labels.view(-1),
            )

        if self.pad_logits:
            return CausalLMOutput(
                loss=loss,
                logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                hidden_states=None,
                attentions=None,
            )
        return CausalLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=None,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.Tensor,
        past_key_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs
    ) -> dict:
        """Assemble the keyword arguments for one generation step.

        The incoming ``attention_mask`` is dropped: the returned dict always carries
        ``"attention_mask": None``.
        """
        # only last token for inputs if past is defined
        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache", True),
            "attention_mask": None,
        }

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        The ``decoder`` projection is not included in the count.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.lm_head, trainable)
        return params
# Register FlexBertForCausalLM under the "AutoModelForCausalLM" auto class via
# `register_for_auto_class` (presumably so the Hugging Face Auto* loaders can resolve
# this custom class; confirm against the transformers version in use).
FlexBertForCausalLM.register_for_auto_class("AutoModelForCausalLM")
def _map_new_layers_to_pretrained(pretrained_layers: int, new_layers: int) -> List[int]:
    """Return, for each new-model layer index, the pretrained layer index to copy from.

    Indices are spread proportionally with ``round(i * pretrained_layers / new_layers)`` and
    clamped to the last pretrained layer. The clamp matters because ``round`` rounds halves
    to the nearest even integer: e.g. 12 -> 24 layers gives ``round(11.5) == 12`` for the
    final layer, which is one past the end of the pretrained stack.
    """
    last_pretrained = pretrained_layers - 1
    return [min(round(i * pretrained_layers / new_layers), last_pretrained) for i in range(new_layers)]


def init_model_from_pretrained(
    pretrained_model: FlexBertModel,
    new_model: FlexBertModel,
    mode: Union[str, TileMode] = TileMode.tile_weights_from_middle,
):
    """
    Initialize the new model from the pretrained model.

    This method uses Gopher layer scaling and Phi-style weight tiling as selected by `mode`.
    The new model must have the same or more layers and the same or larger dimensions than the pretrained model.

    Args:
        pretrained_model (FlexBertModel): The smaller, pre-trained model
        new_model (FlexBertModel): The larger model to be initialized
        mode (Union[str, TileMode]): The Phi-style weight tiling mode to use

    This function assumes that the new_model has more layers and a larger hidden size
    than the pretrained_model, but the same vocabulary size.

    Raises:
        AssertionError: if corresponding embedding / encoder / layer / attention / MLP modules
            of the two models are of different or unsupported types.
        ValueError: if an attention or MLP layer type is unsupported.
    """

    # Tile embeddings
    assert isinstance(
        new_model.embeddings, type(pretrained_model.embeddings)
    ), f"Pretrained and new_model layers must be the same type, got {type(new_model.embeddings)} and {type(pretrained_model.embeddings)}"

    assert isinstance(
        new_model.embeddings,
        (FlexBertAbsoluteEmbeddings, FlexBertSansPositionEmbeddings, FlexBertCompiledSansPositionEmbeddings),
    ), f"Unsupported embedding layer type: {type(new_model.embeddings)}"

    tile_embedding(pretrained_model.embeddings.tok_embeddings, new_model.embeddings.tok_embeddings, mode=mode)
    if isinstance(pretrained_model.embeddings, FlexBertAbsoluteEmbeddings):
        tile_embedding(pretrained_model.embeddings.pos_embeddings, new_model.embeddings.pos_embeddings, mode=mode)

    if hasattr(pretrained_model.embeddings, "norm"):
        tile_norm(pretrained_model.embeddings.norm, new_model.embeddings.norm, mode=mode)

    # Tile encoder layers
    assert isinstance(
        pretrained_model.encoder, (FlexBertUnpadEncoder, FlexBertPaddedEncoder)
    ), f"Unsupported encoder layer type: {type(pretrained_model.encoder)}"
    assert isinstance(
        new_model.encoder, type(pretrained_model.encoder)
    ), f"Pretrained and new_model encoder layers must be the same type, got {type(new_model.encoder)} and {type(pretrained_model.encoder)}"

    # Calculate the layer mapping (clamped so it never indexes past the pretrained stack)
    layer_mapping = _map_new_layers_to_pretrained(
        pretrained_layers=len(pretrained_model.encoder.layers),
        new_layers=len(new_model.encoder.layers),
    )

    # Initialize layers
    for new_model_idx, pretrained_idx in enumerate(layer_mapping):
        new_model_layer = new_model.encoder.layers[new_model_idx]
        pretrained_layer = pretrained_model.encoder.layers[pretrained_idx]

        # first tile the PreNorm/PostNorm layers
        assert isinstance(
            new_model_layer, type(pretrained_layer)
        ), f"Pretrained and new_model prenorm/postnorm layers must be the same type, got {type(new_model_layer)} and {type(pretrained_layer)}"
        assert isinstance(
            new_model_layer,
            (
                FlexBertUnpadPreNormLayer,
                FlexBertCompileUnpadPreNormLayer,
                FlexBertUnpadParallelPreNormLayer,
                FlexBertUnpadPostNormLayer,
                FlexBertPaddedPreNormLayer,
                FlexBertPaddedParallelPreNormLayer,
                FlexBertPaddedPostNormLayer,
            ),
        ), f"Unsupported prenorm/postnorm layer type: {type(new_model_layer)}"

        # First tile the normalization layers (whichever of them this layer type has)
        if hasattr(pretrained_layer, "attn_norm"):
            tile_norm(pretrained_layer.attn_norm, new_model_layer.attn_norm, mode=mode)
        if hasattr(pretrained_layer, "norm"):
            tile_norm(pretrained_layer.norm, new_model_layer.norm, mode=mode)
        if hasattr(pretrained_layer, "mlp_norm"):
            tile_norm(pretrained_layer.mlp_norm, new_model_layer.mlp_norm, mode=mode)

        # Then tile the attention & mlp layers
        assert isinstance(
            new_model_layer.attn, type(pretrained_layer.attn)
        ), f"Pretrained and new_model attention layers must be the same type, got {type(new_model_layer.attn)} and {type(pretrained_layer.attn)}"

        # first try the parallel attention layers
        if isinstance(pretrained_layer, (FlexBertUnpadParallelPreNormLayer, FlexBertPaddedParallelPreNormLayer)):
            assert isinstance(
                pretrained_layer.attn,
                (
                    FlexBertUnpadParallelAttention,
                    FlexBertPaddedParallelAttention,
                    FlexBertUnpadRopeParallelAttention,
                    FlexBertPaddedRopeParallelAttention,
                ),
            ), f"Parallel prenorm layer must have parallel attention layer: {type(pretrained_layer.attn)}"
            if not isinstance(pretrained_layer.mlp, (FlexBertParallelGLU)):
                raise ValueError(f"Parallel prenorm layer must have parallel MLP layer: {type(pretrained_layer.mlp)}")
            # The parallel layer holds one fused projection (Wqkvff) on the layer itself.
            tile_linear(
                pretrained_layer.Wqkvff,
                new_model_layer.Wqkvff,
                linear_type=TileLinear.wqkvff,
                mode=mode,
                pretrained_attn_size=pretrained_layer.attn_size,
                pretrained_mlp_size=pretrained_layer.mlp_size,
                new_attn_size=new_model_layer.attn_size,
                new_mlp_size=new_model_layer.mlp_size,
                wqkvff_is_glu=True,
            )
        # then try the fused attention layers
        elif isinstance(
            pretrained_layer.attn,
            (
                FlexBertUnpadAttention,
                FlexBertPaddedAttention,
                FlexBertUnpadRopeAttention,
                FlexBertPaddedRopeAttention,
            ),
        ):
            tile_linear(pretrained_layer.attn.Wqkv, new_model_layer.attn.Wqkv, linear_type=TileLinear.wqkv, mode=mode)
        else:
            raise ValueError(f"Unsupported attention layer type: {type(pretrained_layer.attn)}")

        # finally, tile the attention output layer
        tile_linear(pretrained_layer.attn.Wo, new_model_layer.attn.Wo, linear_type=TileLinear.default, mode=mode)

        # tile the mlp layer if the model is not using parallel attention layers
        if not isinstance(pretrained_layer.mlp, (FlexBertMLP, FlexBertGLU, FlexBertParallelGLU)):
            raise ValueError(f"Unsupported MLP layer type: {type(pretrained_layer.mlp)}")
        assert isinstance(
            new_model_layer.mlp, type(pretrained_layer.mlp)
        ), f"Pretrained and new_model mlp layers must be the same type, got {type(new_model_layer.mlp)} and {type(pretrained_layer.mlp)}"

        # already tiled the parallel glu layer if it exists, so only need to handle mlp & glu Wi
        if isinstance(pretrained_layer.mlp, FlexBertGLU):
            tile_linear(pretrained_layer.mlp.Wi, new_model_layer.mlp.Wi, linear_type=TileLinear.glu, mode=mode)
        elif isinstance(pretrained_layer.mlp, FlexBertMLP):
            tile_linear(pretrained_layer.mlp.Wi, new_model_layer.mlp.Wi, linear_type=TileLinear.default, mode=mode)
        # tile the output for both ParallelGLU and MLP/GLU
        tile_linear(pretrained_layer.mlp.Wo, new_model_layer.mlp.Wo, linear_type=TileLinear.default, mode=mode)
def init_mlm_model_from_pretrained(
    config: FlexBertConfig,
    pretrained_model: FlexBertForMaskedLM,
    new_model: FlexBertForMaskedLM,
    mode: Union[str, TileMode] = TileMode.tile_weights_from_middle,
):
    """
    Initialize a masked-LM model (backbone, prediction head and decoder) from a pretrained one.

    The backbone is delegated to `init_model_from_pretrained`; the prediction head's dense and
    norm layers and the decoder are then tiled according to `mode`.

    Args:
        config (FlexBertConfig): The configuration of the new_model
        pretrained_model (FlexBertForMaskedLM): The smaller, pre-trained model
        new_model (FlexBertForMaskedLM): The larger model to be initialized from the pretrained model
        mode (Union[str, TileMode]): The Phi-style weight tiling mode to use

    This function assumes that the new_model has more layers and a larger hidden size
    than the pretrained_model, but the same vocabulary size.
    """
    # Backbone first.
    init_model_from_pretrained(pretrained_model.bert, new_model.bert, mode=mode)

    # TODO: once the repo is a pip installable package, validate that both heads are
    # FlexBertPredictionHead instances and raise ValueError otherwise.

    # Prediction head: dense projection, then its norm.
    source_head = pretrained_model.head
    target_head = new_model.head
    tile_linear(source_head.dense, target_head.dense, linear_type=TileLinear.default, mode=mode)
    tile_norm(source_head.norm, target_head.norm, mode=mode)

    # Decoder.
    if not config.tie_word_embeddings:
        # Untied: tile the decoder like any other linear layer.
        tile_linear(pretrained_model.decoder, new_model.decoder, linear_type=TileLinear.default, mode=mode)
        return

    # Tied: point the decoder at the new model's token-embedding weight, then tile with
    # bias_only=True.
    new_model.decoder.weight = new_model.bert.embeddings.tok_embeddings.weight
    tile_linear(
        pretrained_model.decoder, new_model.decoder, linear_type=TileLinear.default, mode=mode, bias_only=True
    )