yairschiff committed
Commit
9bc99b5
1 Parent(s): aacd2e0

Upload BiMambaForMaskedLM

Files changed (5)
  1. README.md +201 -0
  2. config.json +44 -0
  3. configuration_bimamba.py +51 -0
  4. model.safetensors +3 -0
  5. modeling_bimamba.py +417 -0
README.md ADDED
@@ -0,0 +1,201 @@
+ ---
+ library_name: transformers
+ tags: []
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+
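The README above is the stock auto-generated model-card template, and its "How to Get Started with the Model" section is left as a stub. Based on the `auto_map` in the `config.json` below, the checkpoint is meant to be loaded through `AutoModelForMaskedLM` with `trust_remote_code=True`. A minimal sketch follows; the repo id `<user>/<repo>` is a placeholder, no tokenizer ships with this commit, and the custom code needs the `mamba_ssm` package and a CUDA device to import and run.

```python
# Minimal quick-start sketch (assumptions: placeholder repo id, mamba_ssm installed, CUDA available).
import torch
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("<user>/<repo>", trust_remote_code=True).to("cuda")

# No tokenizer is included, so pass raw token ids directly (vocab_size is 8 for this checkpoint).
input_ids = torch.randint(0, model.config.vocab_size, (1, 16), device="cuda")
with torch.no_grad():
    out = model(input_ids=input_ids)
print(out.logits.shape)  # (1, 16, vocab_size)
```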
config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "architectures": [
+     "BiMambaForMaskedLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_bimamba.BiMambaConfig",
+     "AutoModel": "modeling_bimamba.BiMamba",
+     "AutoModelForMaskedLM": "modeling_bimamba.BiMambaForMaskedLM"
+   },
+   "bidirectional": true,
+   "bidirectional_strategy": "add",
+   "bidirectional_weight_tie": true,
+   "d_model": 2,
+   "fused_add_norm": true,
+   "initializer_cfg": {
+     "initializer_range": 0.02,
+     "n_residuals_per_layer": 1,
+     "rescale_prenorm_residual": true
+   },
+   "model_type": "bimamba",
+   "n_layer": 1,
+   "norm_epsilon": 1e-05,
+   "pad_token_id": -100,
+   "pad_vocab_size_multiple": 8,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "ssm_cfg": {
+     "bias": false,
+     "conv_bias": true,
+     "d_conv": 4,
+     "d_state": 16,
+     "dt_init": "random",
+     "dt_init_floor": 0.0001,
+     "dt_max": 0.1,
+     "dt_min": 0.001,
+     "dt_rank": "auto",
+     "dt_scale": 1.0,
+     "expand": 2,
+     "use_fast_path": true
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.1",
+   "vocab_size": 8
+ }
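The `auto_map` block is what lets the `Auto*` classes resolve to the custom code shipped in this repo, and the very small sizes (`d_model` 2, `n_layer` 1, `vocab_size` 8, ~4 KB of weights) suggest a minimal test checkpoint. A short sketch of inspecting the configuration; `<user>/<repo>` is again a placeholder, and note that `pad_token_id` (-100) is reused as the `ignore_index` of the masked-LM loss in `modeling_bimamba.py` below.

```python
# Sketch: loading this configuration through AutoConfig (trust_remote_code is needed because
# auto_map points AutoConfig at configuration_bimamba.BiMambaConfig).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("<user>/<repo>", trust_remote_code=True)
print(config.model_type)                                  # "bimamba"
print(config.d_model, config.n_layer, config.vocab_size)  # 2, 1, 8 for this checkpoint
print(config.pad_token_id)                                # -100, reused as ignore_index in the LM loss
```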
configuration_bimamba.py ADDED
@@ -0,0 +1,51 @@
+ """BiMamba config for Hugging Face.
+
+ """
+
+ from typing import Optional, Union
+
+ from transformers import PretrainedConfig
+
+
+ class BiMambaConfig(PretrainedConfig):
+     """Config that extends the original MambaConfig with params relevant to bi-directionality."""
+     model_type = "bimamba"
+
+     def __init__(
+             self,
+             # From original MambaConfig
+             d_model: int = 2560,
+             n_layer: int = 64,
+             vocab_size: int = 50277,
+             ssm_cfg: Optional[dict] = None,
+             rms_norm: bool = True,
+             residual_in_fp32: bool = True,
+             fused_add_norm: bool = True,
+             pad_vocab_size_multiple: int = 8,
+
+             # Not in original MambaConfig, but default arg in create_block in mamba_ssm repo; used in layer norm
+             norm_epsilon: float = 1e-5,
+
+             # Used in init_weights
+             initializer_cfg: Optional[dict] = None,
+
+             # Caduceus-specific params
+             bidirectional: bool = True,
+             bidirectional_strategy: Union[str, None] = "add",
+             bidirectional_weight_tie: bool = True,
+             **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.d_model = d_model
+         self.n_layer = n_layer
+         self.vocab_size = vocab_size
+         self.ssm_cfg = ssm_cfg
+         self.rms_norm = rms_norm
+         self.residual_in_fp32 = residual_in_fp32
+         self.fused_add_norm = fused_add_norm
+         self.pad_vocab_size_multiple = pad_vocab_size_multiple
+         self.norm_epsilon = norm_epsilon
+         self.initializer_cfg = initializer_cfg
+         self.bidirectional = bidirectional
+         self.bidirectional_strategy = bidirectional_strategy
+         self.bidirectional_weight_tie = bidirectional_weight_tie
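A sketch of building the config in code with the same hyperparameters as `config.json` above; unspecified arguments keep the Mamba-style defaults in the signature, `pad_token_id` is forwarded through `**kwargs` to `PretrainedConfig`, and the `ssm_cfg` dict shown is just a subset of the keys stored in the JSON. This file only depends on `transformers`, so the snippet runs on CPU (assuming the file is importable as a module).

```python
# Sketch: constructing BiMambaConfig directly, mirroring config.json.
from configuration_bimamba import BiMambaConfig  # assumes this file is on the Python path

config = BiMambaConfig(
    d_model=2,
    n_layer=1,
    vocab_size=8,
    ssm_cfg={"d_state": 16, "d_conv": 4, "expand": 2},  # subset of the keys in config.json
    bidirectional=True,
    bidirectional_strategy="add",
    bidirectional_weight_tie=True,
    pad_token_id=-100,  # forwarded to PretrainedConfig via **kwargs
)
print(config.to_json_string())
```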
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e27efb874440812a910cf613e84d73caa4d133cd1e4dc33d8dbae5eae60de1f
+ size 4112
modeling_bimamba.py ADDED
@@ -0,0 +1,417 @@
+ """BiMamba model for Hugging Face.
+
+ """
+
+ import math
+ from functools import partial
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from mamba_ssm.modules.mamba_simple import Mamba, Block
+ from torch import nn
+ from torch.nn import functional as F
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention, MaskedLMOutput
+
+ try:
+     from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
+ except ImportError:
+     RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
+
+ from .configuration_bimamba import BiMambaConfig
+
+
+ def create_block(
+         d_model,
+         ssm_cfg=None,
+         norm_epsilon=1e-5,
+         rms_norm=False,
+         residual_in_fp32=False,
+         fused_add_norm=False,
+         layer_idx=None,
+         bidirectional=True,
+         bidirectional_strategy="add",
+         bidirectional_weight_tie=True,
+         device=None,
+         dtype=None,
+ ):
+     """Create BiMamba block.
+
+     Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
+     """
+     if ssm_cfg is None:
+         ssm_cfg = {}
+     factory_kwargs = {"device": device, "dtype": dtype}
+     bidirectional_kwargs = {
+         "bidirectional": bidirectional,
+         "bidirectional_strategy": bidirectional_strategy,
+         "bidirectional_weight_tie": bidirectional_weight_tie,
+     }
+     mixer_cls = partial(BiMambaWrapper, layer_idx=layer_idx, **ssm_cfg, **bidirectional_kwargs, **factory_kwargs)
+     norm_cls = partial(
+         nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
+     )
+     block_cls = Block
+     block = block_cls(
+         d_model,
+         mixer_cls,
+         norm_cls=norm_cls,
+         fused_add_norm=fused_add_norm,
+         residual_in_fp32=residual_in_fp32,
+     )
+     block.layer_idx = layer_idx
+     return block
+
+
+ class BiMambaWrapper(nn.Module):
+     """Thin wrapper around Mamba to support bi-directionality."""
+
+     def __init__(
+             self,
+             d_model: int,
+             bidirectional: bool = True,
+             bidirectional_strategy: Optional[str] = "add",
+             bidirectional_weight_tie: bool = True,
+             **mamba_kwargs,
+     ):
+         super().__init__()
+         if bidirectional and bidirectional_strategy is None:
+             bidirectional_strategy = "add"  # Default strategy: `add`
+         if bidirectional and bidirectional_strategy not in ["add", "ew_multiply"]:
+             raise NotImplementedError(f"`{bidirectional_strategy}` strategy for bi-directionality is not implemented!")
+         self.bidirectional = bidirectional
+         self.bidirectional_strategy = bidirectional_strategy
+         self.mamba_fwd = Mamba(
+             d_model=d_model,
+             **mamba_kwargs
+         )
+         if bidirectional:
+             self.mamba_rev = Mamba(
+                 d_model=d_model,
+                 **mamba_kwargs
+             )
+             if bidirectional_weight_tie:  # Tie in and out projections (where most of param count lies)
+                 self.mamba_rev.in_proj.weight = self.mamba_fwd.in_proj.weight
+                 self.mamba_rev.in_proj.bias = self.mamba_fwd.in_proj.bias
+                 self.mamba_rev.out_proj.weight = self.mamba_fwd.out_proj.weight
+                 self.mamba_rev.out_proj.bias = self.mamba_fwd.out_proj.bias
+         else:
+             self.mamba_rev = None
+
+     def forward(self, hidden_states, inference_params=None):
+         """Bidirectional-enabled forward pass
+
+         hidden_states: (B, L, D)
+         Returns: same shape as hidden_states
+         """
+         out = self.mamba_fwd(hidden_states, inference_params=inference_params)
+         if self.bidirectional:
+             out_rev = self.mamba_rev(
+                 hidden_states.flip(dims=(1,)),  # Flip along the sequence length dimension
+                 inference_params=inference_params
+             ).flip(dims=(1,))  # Flip back for combining with forward hidden states
+             if self.bidirectional_strategy == "add":
+                 out = out + out_rev
+             elif self.bidirectional_strategy == "ew_multiply":
+                 out = out * out_rev
+             else:
+                 raise NotImplementedError(f"`{self.bidirectional_strategy}` for bi-directionality not implemented!")
+         return out
+
+
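The flip-and-combine logic in `BiMambaWrapper.forward` is the core of the bi-directionality: a forward Mamba pass plus a second pass over the reversed sequence, flipped back and merged. The toy sketch below reproduces that pattern with a cumulative sum standing in for the causal Mamba mixer, purely to illustrate the `add` and `ew_multiply` strategies; it is not the actual Mamba block and runs on CPU.

```python
# Illustration only: the same flip-and-combine pattern as BiMambaWrapper.forward.
import torch

def toy_mixer(x):                    # causal stand-in: position i only sees positions <= i
    return torch.cumsum(x, dim=1)

x = torch.randn(2, 5, 3)             # (B, L, D)
out_fwd = toy_mixer(x)                                   # forward pass
out_rev = toy_mixer(x.flip(dims=(1,))).flip(dims=(1,))   # reverse pass, flipped back
out_add = out_fwd + out_rev          # "add" strategy
out_mul = out_fwd * out_rev          # "ew_multiply" strategy
print(out_add.shape, out_mul.shape)  # both (2, 5, 3): same shape as the input
```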
+ class BiMambaEmbeddings(nn.Module):
+     def __init__(
+             self,
+             config: BiMambaConfig,
+             device=None,
+             dtype=None,
+     ):
+         super().__init__()
+         factory_kwargs = {"device": device, "dtype": dtype}
+         self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, **factory_kwargs)
+
+     def forward(self, input_ids):
+         """
+         input_ids: (batch, seqlen)
+         """
+         return self.word_embeddings(input_ids)
+
+
+ class BiMambaMixerModel(nn.Module):
+     def __init__(
+             self,
+             config: BiMambaConfig,
+             device=None,
+             dtype=None,
+     ) -> None:
+         super().__init__()
+         factory_kwargs = {"device": device, "dtype": dtype}
+
+         self.fused_add_norm = config.fused_add_norm
+         self.residual_in_fp32 = config.residual_in_fp32
+
+         self.embeddings = BiMambaEmbeddings(config, **factory_kwargs)
+
+         # Mamba changes the order of residual and layer norm:
+         # Instead of LN -> Attn / MLP -> Add, we do:
+         # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
+         # the main branch (output of MLP / Mixer). The model definition is unchanged.
+         # This is for performance reason: we can fuse add + layer_norm.
+         if config.fused_add_norm:
+             if layer_norm_fn is None or rms_norm_fn is None:
+                 raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
+
+         self.layers = nn.ModuleList(
+             [
+                 create_block(
+                     config.d_model,
+                     ssm_cfg=config.ssm_cfg,
+                     norm_epsilon=config.norm_epsilon,
+                     rms_norm=config.rms_norm,
+                     residual_in_fp32=config.residual_in_fp32,
+                     fused_add_norm=config.fused_add_norm,
+                     layer_idx=i,
+                     bidirectional=config.bidirectional,
+                     bidirectional_strategy=config.bidirectional_strategy,
+                     bidirectional_weight_tie=config.bidirectional_weight_tie,
+                     **factory_kwargs,
+                 )
+                 for i in range(config.n_layer)
+             ]
+         )
+
+         norm_f = (nn.LayerNorm if not config.rms_norm else RMSNorm)(
+             config.d_model, eps=config.norm_epsilon, **factory_kwargs
+         )
+         self.norm_f = norm_f
+
+     def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
+         """Mixer forward."""
+         all_hidden_states = []
+         if inputs_embeds is not None:
+             hidden_states = inputs_embeds
+         else:
+             hidden_states = self.embeddings(input_ids)
+
+         residual = None
+         for layer in self.layers:
+             if output_hidden_states:
+                 all_hidden_states.append(hidden_states)
+             # TODO: Add support for gradient checkpointing
+             hidden_states, residual = layer(
+                 hidden_states, residual, inference_params=None
+             )
+
+         if not self.fused_add_norm:
+             residual = (hidden_states + residual) if residual is not None else hidden_states
+             hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
+         else:
+             fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
+             # Set prenorm=False here since we don't need the residual
+             hidden_states = fused_add_norm_fn(
+                 hidden_states,
+                 self.norm_f.weight,
+                 self.norm_f.bias,
+                 eps=self.norm_f.eps,
+                 residual=residual,
+                 prenorm=False,
+                 residual_in_fp32=self.residual_in_fp32,
+             )
+         if output_hidden_states:
+             all_hidden_states.append(hidden_states)
+         return hidden_states, all_hidden_states
+
+
+ def cross_entropy(logits, y, ignore_index=-100):
+     """Cross entropy loss."""
+     logits = logits.view(-1, logits.shape[-1])
+     y = y.view(-1)
+     return F.cross_entropy(logits, y, ignore_index=ignore_index)
+
+
+ def weighted_cross_entropy(logits, y, loss_weights, ignore_index=-100):
+     """Weighted cross entropy loss (discounts certain tokens)."""
+     logits = logits.view(-1, logits.shape[-1])
+     y = y.view(-1)
+     ce = F.cross_entropy(logits, y, ignore_index=ignore_index, reduction="none")
+     loss_weights = loss_weights.view(-1)
+     loss_weights[y == ignore_index] = 0.0
+     # TODO: Follows GPN implementation, but should we remove weight normalization?
+     return (ce * (loss_weights / loss_weights.sum())).sum()
+
+
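The two loss helpers above are what `BiMambaForMaskedLM`, defined later in this file, calls with `ignore_index=config.pad_token_id` (-100 in this repo's `config.json`). The sketch below reproduces the same arithmetic inline with plain PyTorch, so it runs without importing this module (which would pull in `mamba_ssm`).

```python
# Sketch of what cross_entropy and weighted_cross_entropy compute, written inline.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 4, 8)        # (batch, seqlen, vocab_size)
labels = torch.randint(0, 8, (2, 4))
labels[:, 0] = -100                  # ignored positions (ignore_index / pad_token_id)

# cross_entropy: flatten, let ignore_index drop the masked positions.
plain = F.cross_entropy(logits.view(-1, 8), labels.view(-1), ignore_index=-100)

# weighted_cross_entropy: per-token CE, zero the weights of ignored tokens,
# normalize the weights to sum to 1, then take the weighted sum.
weights = torch.ones(2, 4).view(-1)
weights[labels.view(-1) == -100] = 0.0
ce = F.cross_entropy(logits.view(-1, 8), labels.view(-1), ignore_index=-100, reduction="none")
weighted = (ce * (weights / weights.sum())).sum()
print(plain.item(), weighted.item())
```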
+ class BiMambaPreTrainedModel(PreTrainedModel):
+     """PreTrainedModel wrapper for BiMamba backbone."""
+     config_class = BiMambaConfig
+     base_model_prefix = "bimamba"
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["BiMambaWrapper"]
+
+     def _init_weights(
+             self,
+             module,
+             initializer_range=0.02,  # Now only used for embedding layer.
+             **kwargs,
+     ):
+         """Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py"""
+
+         n_layer = self.config.n_layer
+         initialized_cfg = self.config.initializer_cfg if self.config.initializer_cfg is not None else {}
+         rescale_prenorm_residual = initialized_cfg.get("rescale_prenorm_residual", True)
+         initializer_range = initialized_cfg.get("initializer_range", initializer_range)
+         n_residuals_per_layer = initialized_cfg.get("n_residuals_per_layer", 1)
+
+         if isinstance(module, nn.Linear):
+             if module.bias is not None:
+                 if not getattr(module.bias, "_no_reinit", False):
+                     nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             nn.init.normal_(module.weight, std=initializer_range)
+
+         if rescale_prenorm_residual:
+             # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+             #   > A modified initialization which accounts for the accumulation on the residual path with model depth.
+             #   > Scale the weights of residual layers at initialization by a factor of 1/√N where N is the # of
+             #   residual layers.
+             #   > -- GPT-2 :: https://openai.com/blog/better-language-models/
+             #
+             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+             for name, p in module.named_parameters():
+                 if name in ["out_proj.weight", "fc2.weight"]:
+                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+                     # We need to reinit p since this code could be called multiple times
+                     # Having just p *= scale would repeatedly scale it down
+                     nn.init.kaiming_uniform_(p, a=math.sqrt(5))
+                     with torch.no_grad():
+                         p /= math.sqrt(n_residuals_per_layer * n_layer)
+
+
+ class BiMamba(BiMambaPreTrainedModel):
+     """BiMamba model that can be instantiated using HF patterns."""
+     def __init__(self, config: BiMambaConfig, device=None, dtype=None, **kwargs):
+         super().__init__(config)
+
+         # Adjust vocab size if vocab padding is set.
+         if config.vocab_size % config.pad_vocab_size_multiple != 0:
+             config.vocab_size += config.pad_vocab_size_multiple - (config.vocab_size % config.pad_vocab_size_multiple)
+
+         self.config = config
+         factory_kwargs = {"device": device, "dtype": dtype}
+         self.backbone = BiMambaMixerModel(config, **factory_kwargs, **kwargs)
+
+     def forward(
+             self,
+             input_ids: torch.LongTensor = None,
+             inputs_embeds: Optional[torch.FloatTensor] = None,
+             output_hidden_states: Optional[bool] = None,
+             return_dict: Optional[bool] = None,
+     ) -> Union[torch.Tensor, Tuple, BaseModelOutputWithNoAttention]:
+         """HF-compatible forward method."""
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         hidden_states, all_hidden_states = self.backbone(
+             input_ids,
+             inputs_embeds=inputs_embeds,
+             output_hidden_states=output_hidden_states
+         )
+         if return_dict:
+             return BaseModelOutputWithNoAttention(
+                 last_hidden_state=hidden_states,
+                 hidden_states=all_hidden_states if output_hidden_states else None
+             )
+         elif output_hidden_states:
+             return hidden_states, all_hidden_states
+         else:
+             return hidden_states
+
+
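`BiMamba` is the class that `config.json` maps to `AutoModel`, so the backbone can also be used on its own as a feature extractor. A hedged sketch, under the same assumptions as the earlier quick start (placeholder repo id, `mamba_ssm` installed, CUDA available):

```python
# Sketch: extracting hidden states with the BiMamba backbone via AutoModel.
import torch
from transformers import AutoModel

backbone = AutoModel.from_pretrained("<user>/<repo>", trust_remote_code=True).to("cuda")
input_ids = torch.randint(0, backbone.config.vocab_size, (1, 32), device="cuda")
with torch.no_grad():
    out = backbone(input_ids=input_ids, output_hidden_states=True)
print(out.last_hidden_state.shape)  # (1, 32, d_model)
print(len(out.hidden_states))       # n_layer + 1: inputs to each block plus the final normed output
```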
+ class BiMambaForMaskedLM(BiMambaPreTrainedModel):
+     """HF-compatible BiMamba model for masked language modeling."""
+
+     def __init__(self, config: BiMambaConfig, device=None, dtype=None, **kwargs):
+         super().__init__(config, **kwargs)
+         factory_kwargs = {"device": device, "dtype": dtype}
+         self.bimamba = BiMamba(config, **factory_kwargs, **kwargs)
+         self.lm_head = nn.Linear(
+             config.d_model,
+             self.config.vocab_size,  # Use BiMamba config as it might have been updated
+             bias=False,
+             **factory_kwargs
+         )
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.bimamba.backbone.embeddings.word_embeddings
+
+     def set_input_embeddings(self, value):
+         self.bimamba.backbone.embeddings.word_embeddings = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         """Overrides output embeddings."""
+         self.lm_head = new_embeddings
+
+     def tie_weights(self):
+         """Tie weights."""
+         super().tie_weights()
+
+     def get_decoder(self):
+         """Get decoder (backbone) for the model."""
+         return self.bimamba
+
+     def set_decoder(self, decoder):
+         """Set decoder (backbone) for the model."""
+         self.bimamba = decoder
+
+     def forward(
+             self,
+             input_ids: torch.LongTensor = None,
+             inputs_embeds: Optional[torch.FloatTensor] = None,
+             labels: Optional[torch.LongTensor] = None,
+             loss_weights: Optional[torch.FloatTensor] = None,
+             output_hidden_states: Optional[bool] = None,
+             return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, MaskedLMOutput]:
+         """HF-compatible forward method."""
+
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.bimamba(
+             input_ids=input_ids,
+             inputs_embeds=inputs_embeds,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         logits = self.lm_head(hidden_states)
+         logits = logits.float()
+
+         loss = None
+         if labels is not None:
+             if loss_weights is not None:
+                 loss = weighted_cross_entropy(logits, labels, loss_weights, ignore_index=self.config.pad_token_id)
+             else:
+                 loss = cross_entropy(logits, labels, ignore_index=self.config.pad_token_id)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return MaskedLMOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+         )
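Continuing the quick-start sketch from after the README: computing the masked-LM loss with the `forward` signature above. `model` is the `BiMambaForMaskedLM` loaded there, the shapes are illustrative, and -100 matches `pad_token_id`, which doubles as the loss `ignore_index`.

```python
# Sketch: masked-LM loss with forward() ("model" as loaded in the earlier quick start).
import torch

input_ids = torch.randint(0, model.config.vocab_size, (1, 16), device="cuda")
labels = input_ids.clone()
labels[:, ::2] = -100                 # positions set to -100 are ignored by the loss

out = model(input_ids=input_ids, labels=labels)
print(out.loss, out.logits.shape)

# Optionally down-weight individual positions via loss_weights (uses weighted_cross_entropy).
loss_weights = torch.ones_like(input_ids, dtype=torch.float)
out = model(input_ids=input_ids, labels=labels, loss_weights=loss_weights)
```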