# Copyright 2024 **AUTHORS_TODO**
# License: Apache-2.0

# RMSNorm Implementation: Copyright Meta (from their Llama RMSNorm implementation)
# License: LLAMA 2 COMMUNITY LICENSE AGREEMENT

# Copyright 2022 Jonas Geiping
# License: MIT

# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0

# Copyright 2023 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0

# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.  All rights reserved.
# Copyright (c) 2023, Tri Dao.

"""Implements Mosaic BERT, with an eye towards the Hugging Face API.

Mosaic BERT improves performance over Hugging Face BERT through the following:

1. ALiBi. This architectural change removes positional embeddings and instead encodes positional
information through attention biases based on query-key position distance. It improves the effectiveness
of training with shorter sequence lengths by enabling extrapolation to longer sequences.

2. Gated Linear Units (GLU). This architectural change replaces the FFN component of the BERT layer
to improve overall expressiveness, providing better convergence properties.

3. Flash Attention. The MosaicBERT's self-attention layer makes use of Flash Attention, which dramatically
improves the speed of self-attention. Our implementation utilizes a bleeding edge implementation that
supports attention biases, which allows us to use Flash Attention with ALiBi.

4. Unpadding. Padding is often used to simplify batching across sequences of different lengths. Standard BERT
implementations waste computation on padded tokens. MosaicBERT internally unpads to reduce unnecessary computation
and improve speed. It does this without changing how the user interfaces with the model, thereby
preserving the simple API of standard implementations.


Currently, MosaicBERT is available for masked language modeling :class:`BertForMaskedLM` and sequence
classification :class:`BertForSequenceClassification`. We aim to expand this catalogue in future releases.

See :file:`./mosaic_bert.py` for utilities to simplify working with MosaicBERT in Composer, and for example usage
of the core Mosaic BERT classes.
"""

import logging
import os
import sys
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

# Add folder root to path to allow us to use relative imports regardless of what directory the script is run from
sys.path.append(os.path.dirname(os.path.realpath(__file__)))

import torch
import torch.nn as nn
from einops import rearrange
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
from transformers.modeling_outputs import (
    MaskedLMOutput,
    ModelOutput,
    CausalLMOutput,
    MultipleChoiceModelOutput,
    SequenceClassifierOutput,
)
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
from .bert_padding import index_put_first_axis

from .activation import get_act_fn
from .attention import (
    FlexBertPaddedAttention,
    FlexBertPaddedParallelAttention,
    FlexBertPaddedRopeAttention,
    FlexBertPaddedRopeParallelAttention,
    FlexBertUnpadAttention,
    FlexBertUnpadParallelAttention,
    FlexBertUnpadRopeAttention,
    FlexBertUnpadRopeParallelAttention,
)
from .configuration_bert import FlexBertConfig
from .embeddings import (
    BertAlibiEmbeddings,
    FlexBertAbsoluteEmbeddings,
    FlexBertCompiledSansPositionEmbeddings,
    FlexBertSansPositionEmbeddings,
    get_embedding_layer,
)
from .initialization import (
    ModuleType,
    TileLinear,
    TileMode,
    init_weights,
    tile_embedding,
    tile_linear,
    tile_norm,
)
from .layers import (
    BertAlibiEncoder,
    BertPooler,
    BertPredictionHeadTransform,
    FlexBertCompileUnpadPreNormLayer,
    FlexBertPaddedEncoder,
    FlexBertPaddedParallelPreNormLayer,
    FlexBertPaddedPostNormLayer,
    FlexBertPaddedPreNormLayer,
    FlexBertUnpadEncoder,
    FlexBertUnpadParallelPreNormLayer,
    FlexBertUnpadPostNormLayer,
    FlexBertUnpadPreNormLayer,
    get_encoder_layer,
)
from .mlp import FlexBertGLU, FlexBertMLP, FlexBertParallelGLU
from .normalization import get_norm_layer
from .padding import pad_input, unpad_input
from .loss import get_loss_fn

# TODO: This is not used here, but this is so these files are copied when saving the model in ST/PyLate
from .utils import StrEnum
from .rotary import UnpaddedRotaryEmbedding


logger = logging.getLogger(__name__)

def _count_parameters(model: nn.Module, trainable: bool = True) -> int:
    if trainable:
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in model.parameters())


class BertModel(BertPreTrainedModel):
    """Overall BERT model.

    Args:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.

    Outputs: Tuple of (encoded_layers, pooled_output)
        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                to the last attention block of shape [batch_size, sequence_length, hidden_size],
        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
            classifier pretrained on top of the hidden state associated to the first character of the
            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    model = BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(
        self,
        config,
        add_pooling_layer: bool = True,
    ):
        super(BertModel, self).__init__(config)
        self.embeddings = BertAlibiEmbeddings(config)
        self.encoder = BertAlibiEncoder(config)
        self.pooler = BertPooler(config) if add_pooling_layer else None
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_all_encoded_layers: Optional[bool] = False,
        masked_tokens_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        embedding_output = self.embeddings(input_ids, token_type_ids, position_ids)

        subset_mask = []
        first_col_mask = []

        if masked_tokens_mask is None:
            subset_mask = None
        else:
            first_col_mask = torch.zeros_like(masked_tokens_mask)
            first_col_mask[:, 0] = True
            subset_mask = masked_tokens_mask | first_col_mask

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask,
            output_all_encoded_layers=output_all_encoded_layers,
            subset_mask=subset_mask,
        )

        if masked_tokens_mask is None:
            sequence_output = encoder_outputs[-1]
            pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
        else:
            # TD [2022-03-01]: the indexing here is very tricky.
            attention_mask_bool = attention_mask.bool()
            subset_idx = subset_mask[attention_mask_bool]  # type: ignore
            sequence_output = encoder_outputs[-1][masked_tokens_mask[attention_mask_bool][subset_idx]]
            if self.pooler is not None:
                pool_input = encoder_outputs[-1][first_col_mask[attention_mask_bool][subset_idx]]
                pooled_output = self.pooler(pool_input, pool=False)
            else:
                pooled_output = None

        if not output_all_encoded_layers:
            encoder_outputs = sequence_output

        if self.pooler is not None:
            return encoder_outputs, pooled_output

        return encoder_outputs, None


###################
# Bert Heads
###################
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1), bert_model_embedding_weights.size(0))
        self.decoder.weight = bert_model_embedding_weights

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


#####################
# Various Bert models
#####################


class BertForPreTraining(BertPreTrainedModel):
    # TBD: Coming in Future Commit
    pass


class BertLMHeadModel(BertPreTrainedModel):
    # TBD: Coming in Future Commit
    pass


class BertForMaskedLM(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            warnings.warn(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        # labels should be a `torch.LongTensor` of shape
        # `(batch_size, sequence_length)`. These are used for computing the
        #  masked language modeling loss.
        #
        # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
        # `input_ids` docstring) Tokens with indices set to `-100` are ignored
        # (masked), the loss is only computed for the tokens with labels in `[0,
        # ..., config.vocab_size]`
        #
        # Prediction scores are only computed for masked tokens and the (bs,
        # seqlen) dimensions are flattened
        if (input_ids is not None) == (inputs_embeds is not None):
            raise ValueError("Must specify either input_ids or input_embeds!")

        if labels is None:
            masked_tokens_mask = None
        else:
            masked_tokens_mask = labels > 0

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            masked_tokens_mask=masked_tokens_mask,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        loss = None
        if labels is not None:
            # Compute loss
            loss_fct = nn.CrossEntropyLoss()
            masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
            loss = loss_fct(prediction_scores, labels.flatten()[masked_token_idx])

            assert input_ids is not None, "Coding error; please open an issue"
            batch, seqlen = input_ids.shape[:2]
            prediction_scores = rearrange(
                index_put_first_axis(prediction_scores, masked_token_idx, batch * seqlen),
                "(b s) d -> b s d",
                b=batch,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=None,
            attentions=None,
        )

    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))],
            dim=-1,
        )
        dummy_token = torch.full(
            (effective_batch_size, 1),
            self.config.pad_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}


class BertForNextSentencePrediction(BertPreTrainedModel):
    # TBD: Push in future commit
    pass


class BertForSequenceClassification(BertPreTrainedModel):
    """Bert Model transformer with a sequence classification/regression head.

    This head is just a linear layer on top of the pooled output. Used for,
    e.g., GLUE tasks.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        # Labels for computing the sequence classification/regression loss.
        # Indices should be in `[0, ..., config.num_labels - 1]`.
        # If `config.num_labels == 1` a regression loss is computed
        # (mean-square loss). If `config.num_labels > 1` a classification loss
        # is computed (cross-entropy).

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Compute loss
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )


class BertForMultipleChoice(BertPreTrainedModel):
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        # In multiple choice tasks, all choices are submitted in a batch, and
        # we compute a logit for each option independently. The logits are then
        # normalized in the forward pass to get a probability distribution over
        # the choices.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=None,
            attentions=None,
        )


class BertForTokenClassification(BertPreTrainedModel):
    # TBD: Push in future commit
    pass


class BertForQuestionAnswering(BertPreTrainedModel):
    """Bert Model with a span classification head.

    This is used for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden states' output to compute `span start logits`
    and `span end logits`).
    """

    # TBD: Push in future commit


###################
# FlexBert Heads
###################


class FlexBertPredictionHead(nn.Module):
    def __init__(self, config: FlexBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.head_pred_bias)
        self.act = get_act_fn(config.head_pred_act) if config.head_pred_act else nn.Identity()
        self.norm = (
            get_norm_layer(config, compiled_norm=config.compile_model) if config.head_pred_norm else nn.Identity()
        )

    def _init_weights(self, reset_params: bool = False):
        if reset_params:
            self.norm.reset_parameters()
        init_weights(self.config, self.dense, layer_dim=self.config.hidden_size, type_of_module=ModuleType.in_module)

    def reset_parameters(self):
        self._init_weights(reset_params=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))


class FlexBertPoolingHead(nn.Module):
    def __init__(self, config: FlexBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.head_class_bias)
        self.act = get_act_fn(config.head_class_act) if config.head_class_act else nn.Identity()
        self.norm = get_norm_layer(config) if config.head_class_norm else nn.Identity()
        self.drop = torch.nn.Dropout(config.head_class_dropout) if config.head_class_dropout > 0 else nn.Identity()
        self.pooling_type = config.pooling_type

    def forward(self, hidden_states: torch.Tensor, pool: Optional[bool] = True) -> torch.Tensor:
        if pool:
            if self.pooling_type == "cls":
                output = hidden_states[:, 0]
            elif self.pooling_type == "mean":
                output = hidden_states.mean(dim=1)
            elif self.pooling_type == "max":
                output = hidden_states.max(dim=1)[0]
        else:
            output = hidden_states

        return self.drop(self.norm(self.act(self.dense(output))))

    def _init_weights(self, reset_params: bool = False):
        init_weights(self.config, self.dense, self.config.hidden_size, type_of_module=ModuleType.out_module)
        if reset_params and hasattr(self.norm, "reset_parameters"):
            self.norm.reset_parameters()

    def reset_parameters(self):
        self._init_weights(reset_params=True)


###################
# FlexBert Models
###################


@dataclass
class MaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    indices: Optional[torch.LongTensor] = None
    cu_seqlens: Optional[torch.LongTensor] = None
    max_seqlen: Optional[int] = None
    batch_size: Optional[int] = None
    seq_len: Optional[int] = None
    labels: Optional[torch.LongTensor] = None


@dataclass
class MaskedLMOutputZLoss(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        ce_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Cross entropy loss.
        z_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Z loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        indices (`torch.LongTensor` of shape `(batch_size,)`):
            Indices of the tokens to be masked.
    """

    loss: Optional[torch.FloatTensor] = None
    ce_loss: Optional[torch.FloatTensor] = None
    z_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    indices: Optional[torch.LongTensor] = None
    cu_seqlens: Optional[torch.LongTensor] = None
    max_seqlen: Optional[int] = None
    batch_size: Optional[int] = None
    seq_len: Optional[int] = None
    labels: Optional[torch.LongTensor] = None


class FlexBertPreTrainedModel(BertPreTrainedModel):
    """
    An abstract class to handle custom weights initialization of modules
    """

    def _init_module_weights(self, module: nn.Module):
        """
        Custom weight init of modules using src.bert_layers.initialization.init_weights
        Currently only supports init of embedding modules
        """
        assert isinstance(module, nn.Module)
        if isinstance(module, nn.Embedding):
            init_weights(self.config, module, type_of_module=ModuleType.emb)
        else:
            raise NotImplementedError("Custom weight init for the given module is not supported")


class FlexBertModel(FlexBertPreTrainedModel):
    """Overall BERT model.

    Args:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.

    Outputs: Tuple of (encoded_layers, pooled_output)
        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                to the last attention block of shape [batch_size, sequence_length, hidden_size],
        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
            classifier pretrained on top of the hidden state associated to the first character of the
            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    model = BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.embeddings = get_embedding_layer(config)
        self.encoder = get_encoder_layer(config)
        if config.final_norm:
            # if we use prenorm attention we need to add a final norm
            self.final_norm = get_norm_layer(config)
        else:
            self.final_norm = None
        self.unpad_embeddings = config.unpad_embeddings
        self.is_decoder = False

    def post_init(self):
        self._init_weights(reset_params=False)
        self._backward_compatibility_gradient_checkpointing()

    def get_input_embeddings(self):
        return self.embeddings.tok_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.tok_embeddings = value

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        **kwargs,
    ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
        if attention_mask is None and not self.is_decoder:
            attention_mask = torch.ones_like(input_ids)

        embedding_output = self.embeddings(input_ids, position_ids)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        if self.final_norm is not None:
            encoder_outputs = self.final_norm(encoder_outputs)
        return encoder_outputs

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"
        if module:
            self._init_module_weights(module)
        else:
            assert isinstance(reset_params, bool)
            self.embeddings._init_weights(reset_params=reset_params)
            self.encoder._init_weights(reset_params=reset_params)

            if reset_params and self.config.final_norm:
                self.final_norm.reset_parameters()

    def reset_parameters(self):
        self._init_weights(reset_params=True)

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = sum([_count_parameters(layer, trainable) for layer in self.encoder.layers])
        if count_embeddings:
            params += _count_parameters(self.embeddings, trainable)
            if hasattr(self.embeddings, "position_embeddings"):
                params -= _count_parameters(self.embeddings.position_embeddings, trainable)
        return params


class FlexBertForMaskedLM(FlexBertPreTrainedModel):
    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.bert = FlexBertModel(config)
        self.head = FlexBertPredictionHead(config)

        if config.tie_word_embeddings:
            decoder_weights = self.bert.embeddings.tok_embeddings.weight
        else:
            decoder_weights = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
        self.decoder = nn.Linear(decoder_weights.size(1), decoder_weights.size(0), bias=config.decoder_bias)
        self.decoder.weight = decoder_weights

        self.loss_fn = nn.CrossEntropyLoss() if not hasattr(config, "loss_function") else get_loss_fn(config)
        self.fa_ce = getattr(config, "loss_function", "cross_entropy") == "fa_cross_entropy"
        self.return_z_loss = config.loss_kwargs.get("return_z_loss", False)
        self.unpad_embeddings = config.unpad_embeddings
        self.pad_logits = config.pad_logits
        self.compile_model = config.compile_model
        self.masked_prediction = config.masked_prediction

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"
        if module:
            self._init_module_weights(module)
        else:
            assert isinstance(reset_params, bool)
            self.bert._init_weights(reset_params=reset_params)
            self.head._init_weights(reset_params=reset_params)

            # Output weights.
            if not self.config.tie_word_embeddings:
                init_weights(self.config, self.decoder, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("FlexBERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def get_output_embeddings(self):
        return self.decoder

    def set_output_embeddings(self, new_embeddings):
        self.decoder = new_embeddings

    @torch.no_grad()
    def unpad_inputs(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor, labels: torch.Tensor
    ):
        return unpad_input(input_ids, attention_mask, position_ids, labels)

    @torch.no_grad()
    def pad_inputs(
        self,
        inputs: torch.Tensor,
        indices: torch.Tensor,
        batch_size: int,
        seqlen: int,
        labels: Optional[torch.Tensor] = None,
        ignore_index: int = -100,
    ):
        return pad_input(
            inputs=inputs, indices=indices, batch=batch_size, seqlen=seqlen, labels=labels, ignore_index=ignore_index
        )

    @torch.compile(dynamic=True)
    def compiled_head(self, output: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.head(output))

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        # labels should be a `torch.LongTensor` of shape
        # `(batch_size, sequence_length)`. These are used for computing the
        #  masked language modeling loss.
        #
        # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
        # `input_ids` docstring) Tokens with indices set to `-100` are ignored
        # (masked), the loss is only computed for the tokens with labels in `[0,
        # ..., config.vocab_size]`
        #
        # Prediction scores are only computed for masked tokens and the (bs,
        # seqlen) dimensions are flattened

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.unpad_embeddings and (indices is None and cu_seqlens is None and max_seqlen is None):
            batch_size, seq_len = input_ids.shape[:2]
            input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = self.unpad_inputs(
                input_ids, attention_mask, position_ids, labels
            )
        

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        if self.masked_prediction and labels is not None:
            # flatten labels and output first
            labels = labels.view(-1)
            output = output.view(labels.shape[0], -1)

            # then filter out the non-masked tokens
            mask_tokens = labels != self.loss_fn.ignore_index
            output = output[mask_tokens]
            labels = labels[mask_tokens]

        if self.compile_model:
            logits = self.compiled_head(output)
        else:
            logits = self.decoder(self.head(output))

        loss = None
        if labels is not None:
            if not self.masked_prediction:
                labels = labels.view(-1)
                logits = logits.view(labels.shape[0], -1)

            if self.return_z_loss:
                loss, z_loss = self.loss_fn(logits, labels)
                if self.pad_logits:
                    return MaskedLMOutputZLoss(
                        loss=loss,
                        ce_loss=loss.detach().clone() - z_loss,
                        z_loss=z_loss,
                        logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                        hidden_states=None,
                        attentions=None,
                    )
                else:
                    return MaskedLMOutputZLoss(
                        loss=loss,
                        ce_loss=loss.detach().clone() - z_loss,
                        z_loss=z_loss,
                        logits=logits,
                        hidden_states=None,
                        attentions=None,
                        indices=indices,
                        cu_seqlens=cu_seqlens,
                        max_seqlen=max_seqlen,
                        batch_size=batch_size,
                        seq_len=seq_len,
                        labels=labels,
                    )
            else:
                loss = self.loss_fn(logits, labels)

        if self.pad_logits:
            return MaskedLMOutput(
                loss=loss,
                logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                hidden_states=None,
                attentions=None,
            )
        else:
            return MaskedLMOutput(
                loss=loss,
                logits=logits,
                hidden_states=None,
                attentions=None,
                indices=indices,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
                batch_size=batch_size,
                seq_len=seq_len,
                labels=labels,
            )

    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))],
            dim=-1,
        )
        dummy_token = torch.full(
            (effective_batch_size, 1),
            self.config.pad_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    def get_number_parameters(
        self, count_embeddings: bool = True, count_decoder: bool = False, trainable: bool = True
    ) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            count_decoder: count the parameters in the decoder layer if weights are not tied.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        if count_decoder and not self.config.tie_word_embeddings:
            params += _count_parameters(self.decoder, trainable)
        return params


class FlexBertForSequenceClassification(FlexBertPreTrainedModel):
    """Bert Model transformer with a sequence classification/regression head.

    This head is just a linear layer on top of the pooled output. Used for,
    e.g., GLUE tasks.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = FlexBertModel(config)
        self.head = FlexBertPoolingHead(config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"
        if module:
            self._init_module_weights(module)
        else:
            assert isinstance(reset_params, bool)
            self.bert._init_weights(reset_params=reset_params)
            self.head._init_weights(reset_params=reset_params)
            init_weights(self.config, self.classifier, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        # Labels for computing the sequence classification/regression loss.
        # Indices should be in `[0, ..., config.num_labels - 1]`.
        # If `config.num_labels == 1` a regression loss is computed
        # (mean-square loss). If `config.num_labels > 1` a classification loss
        # is computed (cross-entropy).

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = self.head(output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Compute loss
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + output
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        params += _count_parameters(self.classifier, trainable)
        return params


class FlexBertForMultipleChoice(FlexBertPreTrainedModel):
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = FlexBertModel(config)
        self.head = FlexBertPoolingHead(config)

        # In multiple choice tasks, all choices are submitted in a batch, and
        # we compute a logit for each option independently. The logits are then
        # normalized in the forward pass to get a probability distribution over
        # the choices.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"
        if module:
            self._init_module_weights(module)
        else:
            assert isinstance(reset_params, bool)
            self.bert._init_weights(reset_params=reset_params)
            self.head._init_weights(reset_params=reset_params)
            init_weights(self.config, self.classifier, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        # Labels for computing the sequence classification/regression loss.
        # Indices should be in `[0, ..., config.num_labels - 1]`.
        # If `config.num_labels == 1` a regression loss is computed
        # (mean-square loss). If `config.num_labels > 1` a classification loss
        # is computed (cross-entropy).

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None

        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = self.head(output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + output
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=None,
            attentions=None,
        )

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.head, trainable)
        params += _count_parameters(self.classifier, trainable)
        return params


class FlexBertForCausalLM(FlexBertPreTrainedModel):
    config_class = FlexBertConfig
    """Bert Model transformer with a LM head.

    This head is just a standard LM head module. Used for causal language modeling tasks.
    """

    def __init__(self, config: FlexBertConfig):
        super().__init__(config)
        self.bert = FlexBertModel(config)
        self.bert.is_decoder = True
        self.lm_head = FlexBertPredictionHead(config)

        if config.tie_word_embeddings:
            decoder_weights = self.bert.embeddings.tok_embeddings.weight
        else:
            decoder_weights = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
        self.decoder = nn.Linear(decoder_weights.size(1), decoder_weights.size(0), bias=config.decoder_bias)
        self.decoder.weight = decoder_weights

        self.loss_fn = nn.CrossEntropyLoss() if not hasattr(config, "loss_function") else get_loss_fn(config)
        self.fa_ce = getattr(config, "loss_function", "cross_entropy") == "fa_cross_entropy"
        self.return_z_loss = config.loss_kwargs.get("return_z_loss", False)
        self.unpad_embeddings = config.unpad_embeddings
        self.pad_logits = config.pad_logits
        self.compile_model = config.compile_model
        self.masked_prediction = config.masked_prediction

        # Initialize weights and apply final processing
        self._init_weights(reset_params=False)

    def _init_weights(self, module: Optional[nn.Module] = None, reset_params: Optional[bool] = None):
        assert (module is None) != (reset_params is None), "arg module xor reset_params must be specified"
        if module is not None:
            # Add basic initialization for common module types
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
        else:
            assert isinstance(reset_params, bool)
            self.bert._init_weights(reset_params=reset_params)
            self.lm_head._init_weights(reset_params=reset_params)

            if not self.config.tie_word_embeddings:
                init_weights(self.config, self.decoder, self.config.hidden_size, type_of_module=ModuleType.final_out)

    @classmethod
    def from_composer(
        cls,
        pretrained_checkpoint,
        state_dict=None,
        cache_dir=None,
        from_tf=False,
        config=None,
        *inputs,
        **kwargs,
    ):
        """Load from pre-trained."""
        model = cls(config, *inputs, **kwargs)
        if from_tf:
            raise ValueError("Mosaic BERT does not support loading TensorFlow weights.")

        state_dict = torch.load(pretrained_checkpoint)
        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
        consume_prefix_in_state_dict_if_present(state_dict, prefix="model.")
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if len(missing_keys) > 0:
            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
        if len(unexpected_keys) > 0:
            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")

        return model


    def get_output_embeddings(self):
        return self.decoder

    def set_output_embeddings(self, new_embeddings):
        self.decoder = new_embeddings

    @torch.no_grad()
    def unpad_inputs(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor, labels: torch.Tensor
    ):
        return unpad_input(input_ids, attention_mask, position_ids, labels)

    @torch.no_grad()
    def pad_inputs(
        self,
        inputs: torch.Tensor,
        indices: torch.Tensor,
        batch_size: int,
        seqlen: int,
        labels: Optional[torch.Tensor] = None,
        ignore_index: int = -100,
    ):
        return pad_input(
            inputs=inputs, indices=indices, batch=batch_size, seqlen=seqlen, labels=labels, ignore_index=ignore_index
        )

    @torch.compile(dynamic=True)
    def compiled_lm_head(self, output: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.lm_head(output))

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutput]:
        # labels should be a `torch.LongTensor` of shape
        # `(batch_size, sequence_length)`. These are used for computing the
        #  masked language modeling loss.
        #
        # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
        # `input_ids` docstring) Tokens with indices set to `-100` are ignored
        # (masked), the loss is only computed for the tokens with labels in `[0,
        # ..., config.vocab_size]`
        #
        # Prediction scores are only computed for masked tokens and the (bs,
        # seqlen) dimensions are flattened
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.unpad_embeddings and (indices is None and cu_seqlens is None and max_seqlen is None):
            batch_size, seq_len = input_ids.shape[:2]
            if attention_mask is None:
                # create all ones, except for padding (TODO?)
                attention_mask = torch.ones_like(input_ids)
            input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = self.unpad_inputs(
                input_ids, attention_mask, position_ids, labels
            )

        hidden_states = self.bert(
            input_ids,
            attention_mask=None, # let FA do this
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        if self.compile_model:
            logits = self.compiled_lm_head(hidden_states)
        else:
            logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            if cu_seqlens is not None:                
                shift_labels = input_ids[1:].clone()    
                loss_logits = logits[:-1]  # Only shift for loss

                # Mask boundaries, so eos doesn't predict bos
                for i in range(len(cu_seqlens) - 1):
                    boundary_pos = cu_seqlens[i+1] - 1
                    if boundary_pos < len(shift_labels):
                        shift_labels[boundary_pos] = -100

                # NOTE: no padding or mask in there for now
                assert 50283 not in shift_labels, f"PAD token found in shift_labels: {shift_labels}"
                assert 50284 not in shift_labels, f"MASK token found in shift_labels: {shift_labels}"
                assert shift_labels.shape[0] == loss_logits.shape[0] # Verify shapes align                    
            else:
                # Padded case: simple shift
                shift_labels = input_ids[..., 1:].contiguous()
                loss_logits = logits[..., :-1, :].contiguous()
                # mask out PAD tokens in the shift_labels
                mask = (shift_labels == 50283)
                shift_labels = torch.where(mask, torch.tensor(-100, device=shift_labels.device), shift_labels)
                assert shift_labels.shape[0] == loss_logits.shape[0] # Verify shapes align

            # For both cases, we'll use the shifted input_ids as our labels
            labels = shift_labels
            
            # Flatten the tokens
            loss = self.loss_fn(loss_logits.view(-1, loss_logits.size(-1)), shift_labels.view(-1))

        if self.pad_logits:
            return CausalLMOutput(
                loss=loss,
                logits=self.pad_inputs(logits, indices, batch_size, seq_len)[0],
                hidden_states=hidden_states,
                attentions=None,
            )
        else:
            return CausalLMOutput(
                loss=loss,
                logits=logits,
                hidden_states=hidden_states,
                attentions=None,
            )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs
    ) -> dict:
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

    def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
        """Returns the number of parameters in the model.

        Args:
            count_embeddings: count the parameters in the embeddings layer, excluding position embeddings.
            trainable: only count trainable parameters.
        """
        params = self.bert.get_number_parameters(count_embeddings, trainable)
        params += _count_parameters(self.lm_head, trainable)
        return params

FlexBertForCausalLM.register_for_auto_class("AutoModelForCausalLM")

def init_model_from_pretrained(
    pretrained_model: FlexBertModel,
    new_model: FlexBertModel,
    mode: Union[str, TileMode] = TileMode.tile_weights_from_middle,
):
    """
    Initialize the new model from the pretrained model.

    This method uses Gopher layer scaling and Phi-style weight tiling as selected by `mode`.
    The new model must have the same or more layers and the same or larger dimensions than the pretrained model.

    Args:
        pretrained_model (FlexBertModel): The smaller, pre-trained model
        new_model (FlexBertModel): The larger model to be initialized
        mode (Union[str, TileMode]): The Phi-style weight tiling mode to use

    This function assumes that the new_model has more layers and a larger hidden size
    than the pretrained_model, but the same vocabulary size.
    """

    # Tile embeddings
    assert isinstance(
        new_model.embeddings, type(pretrained_model.embeddings)
    ), f"Pretrained and new_model layers must be the same type, got {type(new_model.embeddings)} and {type(pretrained_model.embeddings)}"
    assert isinstance(
        new_model.embeddings,
        (FlexBertAbsoluteEmbeddings, FlexBertSansPositionEmbeddings, FlexBertCompiledSansPositionEmbeddings),
    ), f"Unsupported embedding layer type: {type(new_model.embeddings)}"

    tile_embedding(pretrained_model.embeddings.tok_embeddings, new_model.embeddings.tok_embeddings, mode=mode)
    if isinstance(pretrained_model.embeddings, FlexBertAbsoluteEmbeddings):
        tile_embedding(pretrained_model.embeddings.pos_embeddings, new_model.embeddings.pos_embeddings, mode=mode)

    if hasattr(pretrained_model.embeddings, "norm"):
        tile_norm(pretrained_model.embeddings.norm, new_model.embeddings.norm, mode=mode)

    # Tile encoder layers
    assert isinstance(
        pretrained_model.encoder, (FlexBertUnpadEncoder, FlexBertPaddedEncoder)
    ), f"Unsupported encoder layer type: {type(pretrained_model.encoder)}"
    assert isinstance(
        new_model.encoder, type(pretrained_model.encoder)
    ), f"Pretrained and new_model encoder layers must be the same type, got {type(new_model.encoder)} and {type(pretrained_model.encoder)}"

    # Calculate the layer mapping
    pretrained_layers = len(pretrained_model.encoder.layers)
    new_layers = len(new_model.encoder.layers)
    layer_mapping = [round(i * pretrained_layers / new_layers) for i in range(new_layers)]

    # Initialize layers
    for new_model_idx, pretrained_idx in enumerate(layer_mapping):
        new_model_layer = new_model.encoder.layers[new_model_idx]
        pretrained_layer = pretrained_model.encoder.layers[pretrained_idx]

        # first tile the PreNorm/PostNorm layers
        assert isinstance(
            new_model_layer, type(pretrained_layer)
        ), f"Pretrained and new_model prenorm/postnorm layers must be the same type, got {type(new_model_layer)} and {type(pretrained_layer)}"
        assert isinstance(
            new_model_layer,
            (
                FlexBertUnpadPreNormLayer,
                FlexBertCompileUnpadPreNormLayer,
                FlexBertUnpadParallelPreNormLayer,
                FlexBertUnpadPostNormLayer,
                FlexBertPaddedPreNormLayer,
                FlexBertPaddedParallelPreNormLayer,
                FlexBertPaddedPostNormLayer,
            ),
        ), f"Unsupported prenorm/postnorm layer type: {type(new_model_layer)}"

        # First tile the normalization layers
        if hasattr(pretrained_layer, "attn_norm"):
            tile_norm(pretrained_layer.attn_norm, new_model_layer.attn_norm, mode=mode)
        if hasattr(pretrained_layer, "norm"):
            tile_norm(pretrained_layer.norm, new_model_layer.norm, mode=mode)
        if hasattr(pretrained_layer, "mlp_norm"):
            tile_norm(pretrained_layer.mlp_norm, new_model_layer.mlp_norm, mode=mode)

        # Then tile the attention & mlp layers
        assert isinstance(
            new_model_layer.attn, type(pretrained_layer.attn)
        ), f"Pretrained and new_model attention layers must be the same type, got {type(new_model_layer.attn)} and {type(pretrained_layer.attn)}"

        # first try the parallel attention layers
        if isinstance(pretrained_layer, (FlexBertUnpadParallelPreNormLayer, FlexBertPaddedParallelPreNormLayer)):
            assert isinstance(
                pretrained_layer.attn,
                (
                    FlexBertUnpadParallelAttention,
                    FlexBertPaddedParallelAttention,
                    FlexBertUnpadRopeParallelAttention,
                    FlexBertPaddedRopeParallelAttention,
                ),
            ), f"Parallel prenorm layer must have parallel attention layer: {type(pretrained_layer.attn)}"
            if not isinstance(pretrained_layer.mlp, (FlexBertParallelGLU)):
                raise ValueError(f"Parallel prenorm layer must have parallel MLP layer: {type(pretrained_layer.mlp)}")
            tile_linear(
                pretrained_layer.Wqkvff,
                new_model_layer.Wqkvff,
                linear_type=TileLinear.wqkvff,
                mode=mode,
                pretrained_attn_size=pretrained_layer.attn_size,
                pretrained_mlp_size=pretrained_layer.mlp_size,
                new_attn_size=new_model_layer.attn_size,
                new_mlp_size=new_model_layer.mlp_size,
                wqkvff_is_glu=True,
            )

        # then try the fused attention layers
        elif isinstance(
            pretrained_layer.attn,
            (
                FlexBertUnpadAttention,
                FlexBertPaddedAttention,
                FlexBertUnpadRopeAttention,
                FlexBertPaddedRopeAttention,
            ),
        ):
            tile_linear(pretrained_layer.attn.Wqkv, new_model_layer.attn.Wqkv, linear_type=TileLinear.wqkv, mode=mode)
        else:
            raise ValueError(f"Unsupported attention layer type: {type(pretrained_layer.attn)}")

        # finally, tile the attention output layer
        tile_linear(pretrained_layer.attn.Wo, new_model_layer.attn.Wo, linear_type=TileLinear.default, mode=mode)

        # tile the mlp layer if the model is not using parallel attention layers
        if not isinstance(pretrained_layer.mlp, (FlexBertMLP, FlexBertGLU, FlexBertParallelGLU)):
            raise ValueError(f"Unsupported MLP layer type: {type(pretrained_layer.mlp)}")
        assert isinstance(
            new_model_layer.mlp, type(pretrained_layer.mlp)
        ), f"Pretrained and new_model mlp layers must be the same type, got {type(new_model_layer.mlp)} and {type(pretrained_layer.mlp)}"

        # already tiled the parallel glu layer if it exists, so only need to handle mlp & glu Wi
        if isinstance(pretrained_layer.mlp, FlexBertGLU):
            tile_linear(pretrained_layer.mlp.Wi, new_model_layer.mlp.Wi, linear_type=TileLinear.glu, mode=mode)
        elif isinstance(pretrained_layer.mlp, FlexBertMLP):
            tile_linear(pretrained_layer.mlp.Wi, new_model_layer.mlp.Wi, linear_type=TileLinear.default, mode=mode)
        # tile the output for both ParallelGLU and MLP/GLU
        tile_linear(pretrained_layer.mlp.Wo, new_model_layer.mlp.Wo, linear_type=TileLinear.default, mode=mode)


def init_mlm_model_from_pretrained(
    config: FlexBertConfig,
    pretrained_model: FlexBertForMaskedLM,
    new_model: FlexBertForMaskedLM,
    mode: Union[str, TileMode] = TileMode.tile_weights_from_middle,
):
    """
    Initialize the new model from the pretrained model.

    This method uses Gopher layer scaling and Phi-style weight tiling as selected by `mode`.
    The new model must have the same or more layers and the same or larger dimensions than the pretrained model.

    Args:
        config (FlexBertConfig): The configuration of the new_model
        pretrained_model (FlexBertForMaskedLM): The smaller, pre-trained model
        new_model (FlexBertForMaskedLM): The larger model to be initialized from the pretrained model
        mode (Union[str, TileMode]): The Phi-style weight tiling mode to use

    This function assumes that the new_model has more layers and a larger hidden size
    than the pretrained_model, but the same vocabulary size.
    """
    init_model_from_pretrained(pretrained_model.bert, new_model.bert, mode=mode)

    # TODO: uncomment this when the repo is turned into a pip installable package
    # if not isinstance(pretrained_model.head, FlexBertPredictionHead):
    #     raise ValueError(f"Pretrained model must have a prediction head: {type(pretrained_model.head)}")
    # if not isinstance(new_model.head, FlexBertPredictionHead):
    #     raise ValueError(f"New model must have a prediction head: {type(new_model.head)}")

    # tile the prediction head
    tile_linear(pretrained_model.head.dense, new_model.head.dense, linear_type=TileLinear.default, mode=mode)
    tile_norm(pretrained_model.head.norm, new_model.head.norm, mode=mode)

    # setup weight tying
    if config.tie_word_embeddings:
        new_model.decoder.weight = new_model.bert.embeddings.tok_embeddings.weight
        tile_linear(
            pretrained_model.decoder, new_model.decoder, linear_type=TileLinear.default, mode=mode, bias_only=True
        )
    else:
        tile_linear(pretrained_model.decoder, new_model.decoder, linear_type=TileLinear.default, mode=mode)