Support `use_flash_attn` in `from_pretrained`

#18
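The change is aimed at letting callers pass `use_flash_attn` through `from_pretrained` so the FlashAttention code path can be toggled at load time. A minimal sketch of that usage, assuming a checkpoint that ships this implementation via `trust_remote_code` (the repository id below is illustrative):

```python
from transformers import AutoModel

# Illustrative checkpoint; any model whose auto_map points at this implementation works the same way.
model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
    use_flash_attn=False,  # forwarded into XLMRobertaFlashConfig, e.g. for CPU-only environments
)
```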
README.md CHANGED
@@ -1,114 +1,5 @@
1
- ---
2
- tags:
3
- - transformers
4
- - xlm-roberta
5
- library_name: transformers
6
- license: cc-by-nc-4.0
7
- language:
8
- - multilingual
9
- - af
10
- - am
11
- - ar
12
- - as
13
- - az
14
- - be
15
- - bg
16
- - bn
17
- - br
18
- - bs
19
- - ca
20
- - cs
21
- - cy
22
- - da
23
- - de
24
- - el
25
- - en
26
- - eo
27
- - es
28
- - et
29
- - eu
30
- - fa
31
- - fi
32
- - fr
33
- - fy
34
- - ga
35
- - gd
36
- - gl
37
- - gu
38
- - ha
39
- - he
40
- - hi
41
- - hr
42
- - hu
43
- - hy
44
- - id
45
- - is
46
- - it
47
- - ja
48
- - jv
49
- - ka
50
- - kk
51
- - km
52
- - kn
53
- - ko
54
- - ku
55
- - ky
56
- - la
57
- - lo
58
- - lt
59
- - lv
60
- - mg
61
- - mk
62
- - ml
63
- - mn
64
- - mr
65
- - ms
66
- - my
67
- - ne
68
- - nl
69
- - 'no'
70
- - om
71
- - or
72
- - pa
73
- - pl
74
- - ps
75
- - pt
76
- - ro
77
- - ru
78
- - sa
79
- - sd
80
- - si
81
- - sk
82
- - sl
83
- - so
84
- - sq
85
- - sr
86
- - su
87
- - sv
88
- - sw
89
- - ta
90
- - te
91
- - th
92
- - tl
93
- - tr
94
- - ug
95
- - uk
96
- - ur
97
- - uz
98
- - vi
99
- - xh
100
- - yi
101
- - zh
102
- ---
103
- Core implementation of Jina XLM-RoBERTa
104
 
105
- This implementation is adapted from [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta). In contrast to the original implementation, this model uses rotary position embeddings and supports FlashAttention 2.
106
-
107
- ### Models that use this implementation
108
-
109
- - [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)
110
- - [jinaai/jina-colbert-v2](https://huggingface.co/jinaai/jina-colbert-v2)
111
-
112
- ### Converting weights
113
-
114
- Weights from an [original XLM-RoBERTa model](https://huggingface.co/FacebookAI/xlm-roberta-large) can be converted using the `convert_roberta_weights_to_flash.py` script in the model repository.
 
1
+ # Converting Weights
 
2
 
3
+ ```
4
+ python3 -m "xlm-roberta-flash-implementation".convert_roberta_weights_to_flash --output pytorch_model_xlmr_flash.bin
5
+ ```
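After conversion, the resulting state dict can be loaded into this implementation. A hedged sketch, assuming the repository files (including the `config.json` added below) sit in the current directory and the command above produced `pytorch_model_xlmr_flash.bin`:

```python
import torch
from transformers import AutoConfig, AutoModel

# Resolve the custom config/model classes from the local modeling files via auto_map.
config = AutoConfig.from_pretrained(".", trust_remote_code=True)
model = AutoModel.from_config(config, trust_remote_code=True)

state_dict = torch.load("pytorch_model_xlmr_flash.bin", map_location="cpu")
missing, unexpected = model.load_state_dict(state_dict, strict=False)  # heads/poolers may differ
print(len(missing), len(unexpected))
```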
 
 
 
 
 
 
 
block.py CHANGED
@@ -8,14 +8,15 @@ from typing import Optional
8
 
9
  import torch
10
  import torch.nn as nn
 
11
  from torch import Tensor
12
 
 
13
  from .mha import MHA
14
  from .mlp import Mlp
15
- from .stochastic_depth import StochasticDepth
16
 
17
  try:
18
- from flash_attn.ops.triton.layer_norm import RMSNorm, layer_norm_fn
19
  except ImportError:
20
  layer_norm_fn, RMSNorm = None, None
21
 
@@ -232,9 +233,7 @@ class Block(nn.Module):
232
  is_rms_norm=isinstance(self.norm1, RMSNorm),
233
  )
234
  if not isinstance(self.mlp, nn.Identity):
235
- mlp_out = self.mlp(
236
- hidden_states, adapter_mask=mixer_kwargs.get("adapter_mask")
237
- )
238
  if self.return_residual: # mlp out is actually a pair here
239
  mlp_out, hidden_states = mlp_out
240
  if not self.fused_dropout_add_ln:
 
8
 
9
  import torch
10
  import torch.nn as nn
11
+ import torch.nn.functional as F
12
  from torch import Tensor
13
 
14
+ from .stochastic_depth import StochasticDepth
15
  from .mha import MHA
16
  from .mlp import Mlp
 
17
 
18
  try:
19
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
20
  except ImportError:
21
  layer_norm_fn, RMSNorm = None, None
22
 
 
233
  is_rms_norm=isinstance(self.norm1, RMSNorm),
234
  )
235
  if not isinstance(self.mlp, nn.Identity):
236
+ mlp_out = self.mlp(hidden_states)
 
 
237
  if self.return_residual: # mlp out is actually a pair here
238
  mlp_out, hidden_states = mlp_out
239
  if not self.fused_dropout_add_ln:
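block.py, like mha.py and mlp.py below, wraps every flash-attn import in try/except so the module still imports when the package is missing; callers are then expected to check for `None` before taking the fused path. A minimal sketch of that pattern in isolation:

```python
try:
    # Fused Triton RMSNorm/layer-norm kernels are only present when flash-attn is installed.
    from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
except ImportError:
    layer_norm_fn, RMSNorm = None, None


def fused_dropout_add_ln_available() -> bool:
    # Code using fused_dropout_add_ln relies on this kind of check before calling the kernel.
    return layer_norm_fn is not None


print(fused_dropout_add_ln_available())
```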
config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoConfig": "configuration_xlm_roberta.XLMRobertaFlashConfig",
4
+ "AutoModel": "modeling_xlm_roberta.XLMRobertaModel",
5
+ "AutoModelForPreTraining": "modeling_xlm_roberta.XLMRobertaForPreTraining",
6
+ "AutoModelForMaskedLM": "modeling_xlm_roberta.XLMRobertaForMaskedLM",
7
+ "AutoModelForSequenceClassification":"modeling_xlm_roberta.XLMRobertaForSequenceClassification"
8
+ },
9
+ "architectures": [
10
+ "XLMRobertaModel"
11
+ ],
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "bos_token_id": 0,
14
+ "eos_token_id": 2,
15
+ "hidden_act": "gelu",
16
+ "hidden_dropout_prob": 0.1,
17
+ "hidden_size": 768,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "layer_norm_eps": 1e-05,
21
+ "max_position_embeddings": 8194,
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 1,
26
+ "position_embedding_type": "absolute",
27
+ "transformers_version": "4.17.0.dev0",
28
+ "type_vocab_size": 1,
29
+ "use_cache": false,
30
+ "vocab_size": 250002
31
+ }
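The `auto_map` section is what ties the `Auto*` classes to the custom code in this repository: with `trust_remote_code=True`, transformers imports the named classes from `configuration_xlm_roberta.py` / `modeling_xlm_roberta.py` instead of its built-in XLM-R implementation. A hedged sketch (the repository id is illustrative):

```python
from transformers import AutoConfig, AutoModel

repo = "jinaai/xlm-roberta-flash-implementation"  # illustrative; any repo shipping this config.json works

# auto_map -> "configuration_xlm_roberta.XLMRobertaFlashConfig" / "modeling_xlm_roberta.XLMRobertaModel"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_config(config, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)
```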
configuration_xlm_roberta.py CHANGED
@@ -1,94 +1,42 @@
1
- from typing import Any, Dict, List, Optional, Union
2
-
3
- import torch
4
  from transformers import PretrainedConfig
5
-
6
 
7
  class XLMRobertaFlashConfig(PretrainedConfig):
8
-
9
- model_type = "xlm-roberta"
10
-
11
  def __init__(
12
- self,
13
- vocab_size: int = 250002,
14
- hidden_size: int = 1024,
15
- num_hidden_layers: int = 24,
16
- num_attention_heads: int = 16,
17
- intermediate_size: int = 4096,
18
- hidden_act: str = "gelu",
19
- hidden_dropout_prob: float = 0.1,
20
- attention_probs_dropout_prob: float = 0.1,
21
- max_position_embeddings: int = 8194,
22
- type_vocab_size: int = 1,
23
- initializer_range: float = 0.02,
24
- layer_norm_eps: float = 1e-05,
25
- pad_token_id: int = 1,
26
- bos_token_id: int = 0,
27
- eos_token_id: int = 2,
28
- position_embedding_type: str = "rotary",
29
- rotary_emb_base: float = 10000.0,
30
- use_cache: bool = True,
31
- use_reentrant: bool = False,
32
- classifier_dropout: Optional[float] = None,
33
- lora_adaptations: Optional[List[str]] = None,
34
- task_instructions: Optional[Dict[str, str]] = None,
35
- lora_rank: int = 4,
36
- lora_dropout_p: float = 0.0,
37
- lora_alpha: int = 1,
38
- lora_main_params_trainable: bool = False,
39
- load_trained_adapters: bool = False,
40
- use_flash_attn: bool = True,
41
- torch_dtype: Optional[Union[str, torch.dtype]] = None,
42
- emb_pooler: Optional[str] = None,
43
- matryoshka_dimensions: Optional[List[int]] = None,
44
- truncate_dim: Optional[int] = None,
45
- **kwargs: Dict[str, Any],
46
  ):
47
- """
48
- Initialize the XLMRobertaFlashConfig configuration.
49
 
50
- Args:
51
- vocab_size (int): Size of the vocabulary.
52
- hidden_size (int): Dimensionality of the encoder layers and the pooler layer.
53
- num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
54
- num_attention_heads (int): Number of attention heads for each attention layer in the Transformer encoder.
55
- intermediate_size (int): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer.
56
- hidden_act (str): The activation function to use.
57
- hidden_dropout_prob (float): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
58
- attention_probs_dropout_prob (float): The dropout ratio for the attention probabilities.
59
- max_position_embeddings (int): The maximum length of the position embeddings.
60
- type_vocab_size (int): The vocabulary size of the token type ids.
61
- initializer_range (float): The standard deviation for initializing all weight matrices.
62
- layer_norm_eps (float): The epsilon used by the layer normalization layers.
63
- pad_token_id (int): The ID of the padding token.
64
- bos_token_id (int): The ID of the beginning-of-sequence token.
65
- eos_token_id (int): The ID of the end-of-sequence token.
66
- position_embedding_type (str): Type of position embeddings. Options are 'absolute', 'alibi', or 'rotary'.
67
- rotary_emb_base (float): Base for rotary embeddings.
68
- use_cache (bool): Whether or not the model should return the last key/values attentions (not used by all models).
69
- use_reentrant (bool): Whether or not the model should enable the 'use_reentrant' flag in gradient checkpointing.
70
- classifier_dropout (Optional[float]): The dropout ratio for the classification head.
71
- lora_adaptations (Optional[List[str]]): LoRA adaptations configuration.
72
- lora_prompts (Optional[Dict[str, str]]): LoRA prompts configuration.
73
- lora_rank (int): Rank for LoRA adaptations.
74
- lora_dropout_p (float): Dropout probability for LoRA adaptations.
75
- lora_alpha (int): Alpha parameter for LoRA.
76
- lora_main_params_trainable (bool): Whether to make the main model parameters trainable when using LoRA.
77
- load_trained_adapters (bool): Whether to load trained adapters.
78
- use_flash_attn (bool): Whether to use FlashAttention.
79
- torch_dtype (Optional[Union[str, torch.dtype]]): Data type for the tensors.
80
- emb_pooler (Optional[str]): Pooling layer configuration.
81
- matryoshka_dimensions (Optional[List[int]]): Configuration for matryoshka dimension reduction.
82
- truncate_dim (Optional[int]): Dimension to truncate embeddings to, if any.
83
- **kwargs (Dict[str, Any]): Additional keyword arguments passed to the configuration.
84
- """
85
-
86
- super().__init__(
87
- pad_token_id=pad_token_id,
88
- bos_token_id=bos_token_id,
89
- eos_token_id=eos_token_id,
90
- **kwargs,
91
- )
92
 
93
  self.vocab_size = vocab_size
94
  self.hidden_size = hidden_size
@@ -103,13 +51,10 @@ class XLMRobertaFlashConfig(PretrainedConfig):
103
  self.initializer_range = initializer_range
104
  self.layer_norm_eps = layer_norm_eps
105
  self.position_embedding_type = position_embedding_type
106
- self.rotary_emb_base = rotary_emb_base
107
  self.use_cache = use_cache
108
- self.use_reentrant = use_reentrant
109
  self.classifier_dropout = classifier_dropout
110
  self.load_trained_adapters = load_trained_adapters
111
  self.lora_adaptations = lora_adaptations
112
- self.task_instructions = task_instructions
113
  self.lora_rank = lora_rank
114
  self.lora_dropout_p = lora_dropout_p
115
  self.lora_alpha = lora_alpha
@@ -118,13 +63,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
118
  self.emb_pooler = emb_pooler
119
  self.matryoshka_dimensions = matryoshka_dimensions
120
  self.truncate_dim = truncate_dim
121
- if (
122
- torch_dtype
123
- and hasattr(torch, torch_dtype)
124
- and type(getattr(torch, torch_dtype)) is torch.dtype
125
- ):
126
  self.torch_dtype = getattr(torch, torch_dtype)
127
  else:
128
  self.torch_dtype = torch_dtype
129
- if not self.use_flash_attn or not torch.cuda.is_available():
130
- self.torch_dtype = torch.float32
 
 
 
 
1
  from transformers import PretrainedConfig
2
+ import torch
3
 
4
  class XLMRobertaFlashConfig(PretrainedConfig):
 
 
 
5
  def __init__(
6
+ self,
7
+ vocab_size=30522,
8
+ hidden_size=768,
9
+ num_hidden_layers=12,
10
+ num_attention_heads=12,
11
+ intermediate_size=3072,
12
+ hidden_act="gelu",
13
+ hidden_dropout_prob=0.1,
14
+ attention_probs_dropout_prob=0.1,
15
+ max_position_embeddings=512,
16
+ type_vocab_size=2,
17
+ initializer_range=0.02,
18
+ layer_norm_eps=1e-12,
19
+ pad_token_id=1,
20
+ bos_token_id=0,
21
+ eos_token_id=2,
22
+ position_embedding_type="absolute",
23
+ use_cache=True,
24
+ classifier_dropout=None,
25
+ lora_adaptations=None,
26
+ lora_rank=4,
27
+ lora_dropout_p=0.0,
28
+ lora_alpha=1,
29
+ lora_main_params_trainable=False,
30
+ load_trained_adapters=False,
31
+ use_flash_attn=True,
32
+ torch_dtype=None,
33
+ emb_pooler=None,
34
+ matryoshka_dimensions=None,
35
+ truncate_dim=None,
36
+ **kwargs,
 
 
 
37
  ):
38
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
39
 
40
 
41
  self.vocab_size = vocab_size
42
  self.hidden_size = hidden_size
 
51
  self.initializer_range = initializer_range
52
  self.layer_norm_eps = layer_norm_eps
53
  self.position_embedding_type = position_embedding_type
 
54
  self.use_cache = use_cache
 
55
  self.classifier_dropout = classifier_dropout
56
  self.load_trained_adapters = load_trained_adapters
57
  self.lora_adaptations = lora_adaptations
 
58
  self.lora_rank = lora_rank
59
  self.lora_dropout_p = lora_dropout_p
60
  self.lora_alpha = lora_alpha
 
63
  self.emb_pooler = emb_pooler
64
  self.matryoshka_dimensions = matryoshka_dimensions
65
  self.truncate_dim = truncate_dim
66
+ if torch_dtype and hasattr(torch, torch_dtype) and type(getattr(torch, torch_dtype)) is torch.dtype:
 
 
 
 
67
  self.torch_dtype = getattr(torch, torch_dtype)
68
  else:
69
  self.torch_dtype = torch_dtype
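The `torch_dtype` handling above resolves dtype names such as `"bfloat16"` to real `torch.dtype` objects via `getattr(torch, ...)` and passes anything else through. A standalone sketch of that resolution (an explicit string check is added here for safety; the function name is illustrative):

```python
import torch


def resolve_torch_dtype(torch_dtype):
    # Mirrors the config logic: turn a dtype name into a torch.dtype, otherwise pass it through.
    if (
        isinstance(torch_dtype, str)
        and hasattr(torch, torch_dtype)
        and type(getattr(torch, torch_dtype)) is torch.dtype
    ):
        return getattr(torch, torch_dtype)
    return torch_dtype


print(resolve_torch_dtype("bfloat16"))     # torch.bfloat16
print(resolve_torch_dtype(None))           # None
print(resolve_torch_dtype(torch.float16))  # torch.float16 (already a dtype)
```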
 
 
embedding.py CHANGED
@@ -5,8 +5,10 @@
5
 
6
  import torch
7
  import torch.nn as nn
8
- from transformers.models.xlm_roberta.modeling_xlm_roberta import \
9
- create_position_ids_from_input_ids
 
 
10
 
11
 
12
  class XLMRobertaEmbeddings(nn.Module):
@@ -36,60 +38,25 @@ class XLMRobertaEmbeddings(nn.Module):
36
  max_position_embeddings, embed_dim, **factory_kwargs
37
  )
38
  if self.type_vocab_size > 0:
39
- self.token_type_embeddings = nn.Embedding(
40
- type_vocab_size, embed_dim, **factory_kwargs
41
- )
42
 
43
- def forward(
44
- self, input_ids, position_ids=None, token_type_ids=None, adapter_mask=None
45
- ):
46
  """
47
  input_ids: (batch, seqlen)
48
  position_ids: (batch, seqlen)
49
  token_type_ids: (batch, seqlen)
50
- adapter_mask: (batch, 1)
51
  """
52
  batch_size, seqlen = input_ids.shape
53
- if adapter_mask is not None:
54
- unique_tasks = torch.unique(adapter_mask)
55
- embedding_dtype = next(self.word_embeddings.parameters()).dtype
56
- embeddings = torch.empty(
57
- *input_ids.shape,
58
- self.word_embeddings.embedding_dim,
59
- dtype=embedding_dtype,
60
- device=input_ids.device
61
- )
62
- for task_id in unique_tasks:
63
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
64
- task_input_ids = input_ids[task_indices]
65
- task_embeddings = self.word_embeddings(task_input_ids, task_id=task_id)
66
- embeddings[task_indices] = task_embeddings
67
- else:
68
- embeddings = self.word_embeddings(input_ids)
69
  if self.max_position_embeddings > 0:
70
  if position_ids is None:
71
- position_ids = create_position_ids_from_input_ids(
72
- input_ids, padding_idx=self.word_embeddings.padding_idx
73
- ).to(input_ids.device)
74
  position_embeddings = self.position_embeddings(position_ids)
75
  embeddings = embeddings + position_embeddings
76
  if self.type_vocab_size > 0:
77
  if token_type_ids is None:
78
- token_type_ids = torch.zeros(
79
- seqlen, dtype=torch.long, device=input_ids.device
80
- )
81
-
82
- if adapter_mask is not None:
83
- unique_tasks = torch.unique(adapter_mask)
84
- for task_id in unique_tasks:
85
- task_token_type_embeddings = self.token_type_embeddings(
86
- token_type_ids, task_id=task_id
87
- )
88
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
89
- embeddings[task_indices] = (
90
- embeddings[task_indices] + task_token_type_embeddings
91
- )
92
- else:
93
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
94
- embeddings = embeddings + token_type_embeddings
95
  return embeddings
 
5
 
6
  import torch
7
  import torch.nn as nn
8
+ from einops import rearrange
9
+ from torch import Tensor
10
+
11
+ from transformers.models.xlm_roberta.modeling_xlm_roberta import create_position_ids_from_input_ids
12
 
13
 
14
  class XLMRobertaEmbeddings(nn.Module):
 
38
  max_position_embeddings, embed_dim, **factory_kwargs
39
  )
40
  if self.type_vocab_size > 0:
41
+ self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_dim, **factory_kwargs)
 
 
42
 
43
+ def forward(self, input_ids, position_ids=None, token_type_ids=None):
 
 
44
  """
45
  input_ids: (batch, seqlen)
46
  position_ids: (batch, seqlen)
47
  token_type_ids: (batch, seqlen)
 
48
  """
49
  batch_size, seqlen = input_ids.shape
50
+ embeddings = self.word_embeddings(input_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if self.max_position_embeddings > 0:
52
  if position_ids is None:
53
+ position_ids =create_position_ids_from_input_ids(input_ids, padding_idx=self.word_embeddings.padding_idx).to(input_ids.device)
54
+ # position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
 
55
  position_embeddings = self.position_embeddings(position_ids)
56
  embeddings = embeddings + position_embeddings
57
  if self.type_vocab_size > 0:
58
  if token_type_ids is None:
59
+ token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
60
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
61
+ embeddings = embeddings + token_type_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  return embeddings
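When `position_ids` is not supplied, the embeddings fall back to `create_position_ids_from_input_ids`, which keeps `padding_idx` for pad tokens and numbers real tokens upward from `padding_idx + 1` (RoBERTa's convention, which is also why `max_position_embeddings` is 8194 rather than 8192). A small illustration:

```python
import torch
from transformers.models.xlm_roberta.modeling_xlm_roberta import create_position_ids_from_input_ids

input_ids = torch.tensor([[0, 581, 3293, 2, 1, 1]])  # 1 is XLM-R's pad token id
position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)
print(position_ids)  # tensor([[2, 3, 4, 5, 1, 1]]): pads stay at padding_idx, others count up from it
```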
mha.py CHANGED
@@ -1,6 +1,5 @@
1
  # This implementation was adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py
2
  # Commit id: 6bbc532388e61185a92e2a563126739967b4c8c5
3
- # Rotary varlen support from https://github.com/Dao-AILab/flash-attention/pull/556
4
 
5
  # Copyright (c) 2023, Tri Dao.
6
 
@@ -12,23 +11,27 @@ import torch.nn as nn
12
  from einops import rearrange, repeat
13
 
14
  try:
15
- from flash_attn import (flash_attn_kvpacked_func,
16
- flash_attn_qkvpacked_func,
17
- flash_attn_varlen_kvpacked_func,
18
- flash_attn_varlen_qkvpacked_func,
19
- flash_attn_with_kvcache)
 
 
20
  except ImportError:
21
  flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None
22
  flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None
23
  flash_attn_with_kvcache = None
24
 
25
  try:
26
- from flash_attn.ops.fused_dense import (ColumnParallelLinear, FusedDense,
27
- RowParallelLinear)
28
  except ImportError:
29
  FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
30
 
31
- from .rotary import RotaryEmbedding
 
 
 
32
 
33
 
34
  # From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742
@@ -44,9 +47,7 @@ def get_alibi_slopes(nheads):
44
  closest_power_of_2 = 2 ** math.floor(math.log2(nheads))
45
  return (
46
  get_slopes_power_of_2(closest_power_of_2)
47
- + get_alibi_slopes(2 * closest_power_of_2)[0::2][
48
- : nheads - closest_power_of_2
49
- ]
50
  )
51
 
52
 
@@ -71,9 +72,7 @@ class FlashSelfAttention(nn.Module):
71
  deterministic=False,
72
  ):
73
  super().__init__()
74
- assert (
75
- flash_attn_varlen_qkvpacked_func is not None
76
- ), "FlashAttention is not installed"
77
  assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed"
78
  self.causal = causal
79
  self.softmax_scale = softmax_scale
@@ -153,9 +152,7 @@ class FlashCrossAttention(nn.Module):
153
  deterministic=False,
154
  ):
155
  super().__init__()
156
- assert (
157
- flash_attn_varlen_kvpacked_func is not None
158
- ), "FlashAttention is not installed"
159
  assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed"
160
  self.causal = causal
161
  self.softmax_scale = softmax_scale
@@ -321,10 +318,7 @@ class CrossAttention(nn.Module):
321
  scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
322
  if key_padding_mask is not None:
323
  padding_mask = torch.full(
324
- (batch_size, seqlen_k),
325
- -10000.0,
326
- dtype=scores.dtype,
327
- device=scores.device,
328
  )
329
  padding_mask.masked_fill_(key_padding_mask, 0.0)
330
  # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
@@ -436,26 +430,20 @@ class MHA(nn.Module):
436
  else:
437
  alibi_slopes = None
438
  if window_size != (-1, -1):
439
- assert (
440
- use_flash_attn
441
- ), "Local (sliding window) attention code path requires flash_attn"
442
 
443
  self.num_heads = num_heads
444
  self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
445
  assert (
446
  self.num_heads % self.num_heads_kv == 0
447
  ), "num_heads must be divisible by num_heads_kv"
448
- assert (
449
- self.embed_dim % num_heads == 0
450
- ), "embed_dim must be divisible by num_heads"
451
  self.head_dim = self.embed_dim // num_heads
452
  qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
453
  kv_dim = 2 * self.head_dim * self.num_heads_kv
454
 
455
  if self.rotary_emb_dim > 0:
456
- assert (
457
- not cross_attn
458
- ), "MHA with rotary embedding does not support cross-attention yet"
459
  assert RotaryEmbedding is not None, "rotary_emb is not installed"
460
  self.rotary_emb = RotaryEmbedding(
461
  self.rotary_emb_dim,
@@ -463,41 +451,29 @@ class MHA(nn.Module):
463
  scale_base=rotary_emb_scale_base,
464
  interleaved=rotary_emb_interleaved,
465
  device=device,
466
- use_flash_attn=use_flash_attn,
467
  )
468
 
469
  if fused_bias_fc and FusedDense is None:
470
  raise ImportError("fused_dense is not installed")
471
-
472
  linear_cls = nn.Linear if not fused_bias_fc else FusedDense
473
  linear_resid_cls = (
474
- LinearResidual
475
- if not fused_bias_fc
476
- else partial(FusedDense, return_residual=True)
477
  )
478
  wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
479
  inner_attn_cls = (
480
- partial(
481
- FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size
482
- )
483
  if use_flash_attn
484
  else SelfAttention
485
  )
486
  inner_cross_attn_cls = (
487
- partial(
488
- FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size
489
- )
490
  if use_flash_attn
491
  else CrossAttention
492
  )
493
  if not self.cross_attn:
494
- self.Wqkv = wqkv_cls(
495
- embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs
496
- )
497
  else:
498
- self.Wq = linear_cls(
499
- embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs
500
- )
501
  self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
502
  if self.dwconv:
503
  if self.num_heads_kv == self.num_heads:
@@ -508,9 +484,7 @@ class MHA(nn.Module):
508
  self.dwconv_q = nn.Conv1d(
509
  embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim
510
  )
511
- self.dwconv_kv = nn.Conv1d(
512
- kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim
513
- )
514
  self.inner_attn = inner_attn_cls(
515
  causal=causal,
516
  softmax_scale=softmax_scale,
@@ -519,9 +493,7 @@ class MHA(nn.Module):
519
  self.inner_cross_attn = inner_cross_attn_cls(
520
  causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
521
  )
522
- self.out_proj = linear_cls(
523
- embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs
524
- )
525
 
526
  def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
527
  dtype = self.out_proj.weight.dtype if dtype is None else dtype
@@ -539,9 +511,7 @@ class MHA(nn.Module):
539
  def _update_kv_cache(self, kv, inference_params):
540
  """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
541
  assert not self.dwconv, "Generation does not support dwconv yet"
542
- assert (
543
- self.layer_idx is not None
544
- ), "Generation requires layer_idx in the constructor"
545
  return _update_kv_cache(kv, inference_params, self.layer_idx)
546
 
547
  def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
@@ -557,10 +527,7 @@ class MHA(nn.Module):
557
  self.rotary_emb._update_cos_sin_cache(
558
  inference_params.max_seqlen, device=q.device, dtype=q.dtype
559
  )
560
- rotary_cos, rotary_sin = (
561
- self.rotary_emb._cos_cached,
562
- self.rotary_emb._sin_cached,
563
- )
564
  else:
565
  rotary_cos, rotary_sin = None, None
566
  batch = q.shape[0]
@@ -582,9 +549,7 @@ class MHA(nn.Module):
582
  cache_seqlens=cache_seqlens,
583
  softmax_scale=self.inner_cross_attn.softmax_scale,
584
  causal=self.inner_cross_attn.causal,
585
- rotary_interleaved=(
586
- self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False
587
- ),
588
  alibi_slopes=alibi_slopes,
589
  )
590
  return context
@@ -629,7 +594,6 @@ class MHA(nn.Module):
629
  max_seqlen=None,
630
  mixer_subset=None,
631
  inference_params=None,
632
- adapter_mask=None,
633
  **kwargs,
634
  ):
635
  """
@@ -655,6 +619,7 @@ class MHA(nn.Module):
655
  assert key_padding_mask is None
656
  assert self.use_flash_attn
657
  assert not self.dwconv
 
658
  if key_padding_mask is not None:
659
  assert cu_seqlens is None
660
  assert max_seqlen is None
@@ -678,50 +643,19 @@ class MHA(nn.Module):
678
  else inference_params.seqlen_offset
679
  )
680
  )
681
- rotary_max_seqlen = (
682
- inference_params.max_sequence_len
683
- if inference_params is not None
684
- else max_seqlen
685
- )
686
  if not self.cross_attn and self.num_heads_kv == self.num_heads:
687
  assert x_kv is None and mixer_subset is None
688
-
689
- if adapter_mask is not None:
690
- unique_tasks = torch.unique(adapter_mask)
691
- qkv_dtype = next(self.Wqkv.parameters()).dtype
692
- qkv = torch.empty(
693
- *x.shape[:-1],
694
- self.Wqkv.out_features,
695
- dtype=qkv_dtype,
696
- device=x.device,
697
- )
698
- for task_id in unique_tasks:
699
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
700
- task_tensor = x[task_indices]
701
- if not self.return_residual:
702
- task_qkv = self.Wqkv(task_tensor, task_id=task_id)
703
- else:
704
- task_qkv, _ = self.Wqkv(
705
- task_tensor, task_id=task_id, residual=True
706
- )
707
- qkv[task_indices] = task_qkv
708
  else:
709
- if not self.return_residual:
710
- qkv = self.Wqkv(x)
711
- else:
712
- if hasattr(self.Wqkv, "parametrizations"):
713
- qkv, x = self.Wqkv(x, residual=True)
714
- else:
715
- qkv, x = self.Wqkv(x)
716
-
717
  if self.dwconv:
718
  qkv = rearrange(
719
- self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2],
720
- "b d s -> b s d",
721
  ).contiguous()
722
- qkv = rearrange(
723
- qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim
724
- )
725
  if (
726
  inference_params is None
727
  or inference_params.seqlen_offset == 0
@@ -730,18 +664,13 @@ class MHA(nn.Module):
730
  ):
731
  if self.rotary_emb_dim > 0:
732
  qkv = self.rotary_emb(
733
- qkv,
734
- seqlen_offset=seqlen_offset,
735
- cu_seqlens=cu_seqlens,
736
- max_seqlen=rotary_max_seqlen,
737
  )
738
  if inference_params is None:
739
  if not self.checkpointing:
740
  context = self.inner_attn(qkv, **kwargs)
741
  else:
742
- context = torch.utils.checkpoint.checkpoint(
743
- self.inner_attn, qkv, **kwargs
744
- )
745
  else:
746
  context = self._update_kvcache_attention(
747
  qkv[:, :, 0], qkv[:, :, 1:], inference_params
@@ -770,17 +699,13 @@ class MHA(nn.Module):
770
  q = qkv[..., : self.num_heads * self.head_dim]
771
  kv = qkv[..., self.num_heads * self.head_dim :]
772
  q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
773
- kv = rearrange(
774
- kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim
775
- )
776
  if self.dwconv:
777
  q = rearrange(
778
- self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2],
779
- "b d s -> b s d",
780
  ).contiguous()
781
  kv = rearrange(
782
- self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2],
783
- "b d s -> b s d",
784
  ).contiguous()
785
  if (
786
  inference_params is None
@@ -790,11 +715,7 @@ class MHA(nn.Module):
790
  ):
791
  if self.rotary_emb_dim > 0:
792
  q, kv = self.rotary_emb(
793
- q,
794
- kv,
795
- seqlen_offset=seqlen_offset,
796
- cu_seqlens=cu_seqlens,
797
- max_seqlen=rotary_max_seqlen,
798
  )
799
  if inference_params is None:
800
  if not self.checkpointing:
@@ -806,25 +727,7 @@ class MHA(nn.Module):
806
  else:
807
  context = self._update_kvcache_attention(q, kv, inference_params)
808
  else:
809
- context = self._apply_rotary_update_kvcache_attention(
810
- q, kv, inference_params
811
- )
812
-
813
- inp = rearrange(context, "... h d -> ... (h d)")
814
- if adapter_mask is not None:
815
- unique_tasks = torch.unique(adapter_mask)
816
- out_dtype = next(self.out_proj.parameters()).dtype
817
- out = torch.empty(
818
- *inp.shape[:-1],
819
- self.out_proj.out_features,
820
- dtype=out_dtype,
821
- device=inp.device,
822
- )
823
- for task_id in unique_tasks:
824
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
825
- task_tensor = inp[task_indices]
826
- task_out = self.out_proj(task_tensor, task_id=task_id)
827
- out[task_indices] = task_out
828
- else:
829
- out = self.out_proj(inp)
830
  return out if not self.return_residual else (out, x)
 
 
1
  # This implementation was adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py
2
  # Commit id: 6bbc532388e61185a92e2a563126739967b4c8c5
 
3
 
4
  # Copyright (c) 2023, Tri Dao.
5
 
 
11
  from einops import rearrange, repeat
12
 
13
  try:
14
+ from flash_attn import (
15
+ flash_attn_kvpacked_func,
16
+ flash_attn_qkvpacked_func,
17
+ flash_attn_varlen_kvpacked_func,
18
+ flash_attn_varlen_qkvpacked_func,
19
+ flash_attn_with_kvcache,
20
+ )
21
  except ImportError:
22
  flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None
23
  flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None
24
  flash_attn_with_kvcache = None
25
 
26
  try:
27
+ from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, RowParallelLinear
 
28
  except ImportError:
29
  FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
30
 
31
+ try:
32
+ from flash_attn.layers.rotary import RotaryEmbedding
33
+ except ImportError:
34
+ RotaryEmbedding = None
35
 
36
 
37
  # From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742
 
47
  closest_power_of_2 = 2 ** math.floor(math.log2(nheads))
48
  return (
49
  get_slopes_power_of_2(closest_power_of_2)
50
+ + get_alibi_slopes(2 * closest_power_of_2)[0::2][: nheads - closest_power_of_2]
 
 
51
  )
52
 
53
 
 
72
  deterministic=False,
73
  ):
74
  super().__init__()
75
+ assert flash_attn_varlen_qkvpacked_func is not None, "FlashAttention is not installed"
 
 
76
  assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed"
77
  self.causal = causal
78
  self.softmax_scale = softmax_scale
 
152
  deterministic=False,
153
  ):
154
  super().__init__()
155
+ assert flash_attn_varlen_kvpacked_func is not None, "FlashAttention is not installed"
 
 
156
  assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed"
157
  self.causal = causal
158
  self.softmax_scale = softmax_scale
 
318
  scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
319
  if key_padding_mask is not None:
320
  padding_mask = torch.full(
321
+ (batch_size, seqlen_k), -10000.0, dtype=scores.dtype, device=scores.device
 
 
 
322
  )
323
  padding_mask.masked_fill_(key_padding_mask, 0.0)
324
  # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
 
430
  else:
431
  alibi_slopes = None
432
  if window_size != (-1, -1):
433
+ assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"
 
 
434
 
435
  self.num_heads = num_heads
436
  self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
437
  assert (
438
  self.num_heads % self.num_heads_kv == 0
439
  ), "num_heads must be divisible by num_heads_kv"
440
+ assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
 
 
441
  self.head_dim = self.embed_dim // num_heads
442
  qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
443
  kv_dim = 2 * self.head_dim * self.num_heads_kv
444
 
445
  if self.rotary_emb_dim > 0:
446
+ assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet"
 
 
447
  assert RotaryEmbedding is not None, "rotary_emb is not installed"
448
  self.rotary_emb = RotaryEmbedding(
449
  self.rotary_emb_dim,
 
451
  scale_base=rotary_emb_scale_base,
452
  interleaved=rotary_emb_interleaved,
453
  device=device,
 
454
  )
455
 
456
  if fused_bias_fc and FusedDense is None:
457
  raise ImportError("fused_dense is not installed")
 
458
  linear_cls = nn.Linear if not fused_bias_fc else FusedDense
459
  linear_resid_cls = (
460
+ LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
 
 
461
  )
462
  wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
463
  inner_attn_cls = (
464
+ partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
 
 
465
  if use_flash_attn
466
  else SelfAttention
467
  )
468
  inner_cross_attn_cls = (
469
+ partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
 
 
470
  if use_flash_attn
471
  else CrossAttention
472
  )
473
  if not self.cross_attn:
474
+ self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)
 
 
475
  else:
476
+ self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs)
 
 
477
  self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
478
  if self.dwconv:
479
  if self.num_heads_kv == self.num_heads:
 
484
  self.dwconv_q = nn.Conv1d(
485
  embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim
486
  )
487
+ self.dwconv_kv = nn.Conv1d(kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim)
 
 
488
  self.inner_attn = inner_attn_cls(
489
  causal=causal,
490
  softmax_scale=softmax_scale,
 
493
  self.inner_cross_attn = inner_cross_attn_cls(
494
  causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
495
  )
496
+ self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)
 
 
497
 
498
  def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
499
  dtype = self.out_proj.weight.dtype if dtype is None else dtype
 
511
  def _update_kv_cache(self, kv, inference_params):
512
  """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
513
  assert not self.dwconv, "Generation does not support dwconv yet"
514
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
 
 
515
  return _update_kv_cache(kv, inference_params, self.layer_idx)
516
 
517
  def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
 
527
  self.rotary_emb._update_cos_sin_cache(
528
  inference_params.max_seqlen, device=q.device, dtype=q.dtype
529
  )
530
+ rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached
 
 
 
531
  else:
532
  rotary_cos, rotary_sin = None, None
533
  batch = q.shape[0]
 
549
  cache_seqlens=cache_seqlens,
550
  softmax_scale=self.inner_cross_attn.softmax_scale,
551
  causal=self.inner_cross_attn.causal,
552
+ rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
 
 
553
  alibi_slopes=alibi_slopes,
554
  )
555
  return context
 
594
  max_seqlen=None,
595
  mixer_subset=None,
596
  inference_params=None,
 
597
  **kwargs,
598
  ):
599
  """
 
619
  assert key_padding_mask is None
620
  assert self.use_flash_attn
621
  assert not self.dwconv
622
+ assert self.rotary_emb_dim == 0
623
  if key_padding_mask is not None:
624
  assert cu_seqlens is None
625
  assert max_seqlen is None
 
643
  else inference_params.seqlen_offset
644
  )
645
  )
646
+ rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
647
+ batch, seqlen = x.shape[:2]
 
 
 
648
  if not self.cross_attn and self.num_heads_kv == self.num_heads:
649
  assert x_kv is None and mixer_subset is None
650
+ if not self.return_residual:
651
+ qkv = self.Wqkv(x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  else:
653
+ qkv, x = self.Wqkv(x)
 
 
 
 
 
 
 
654
  if self.dwconv:
655
  qkv = rearrange(
656
+ self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
 
657
  ).contiguous()
658
+ qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
 
 
659
  if (
660
  inference_params is None
661
  or inference_params.seqlen_offset == 0
 
664
  ):
665
  if self.rotary_emb_dim > 0:
666
  qkv = self.rotary_emb(
667
+ qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
 
 
 
668
  )
669
  if inference_params is None:
670
  if not self.checkpointing:
671
  context = self.inner_attn(qkv, **kwargs)
672
  else:
673
+ context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs)
 
 
674
  else:
675
  context = self._update_kvcache_attention(
676
  qkv[:, :, 0], qkv[:, :, 1:], inference_params
 
699
  q = qkv[..., : self.num_heads * self.head_dim]
700
  kv = qkv[..., self.num_heads * self.head_dim :]
701
  q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
702
+ kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
 
 
703
  if self.dwconv:
704
  q = rearrange(
705
+ self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
 
706
  ).contiguous()
707
  kv = rearrange(
708
+ self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
 
709
  ).contiguous()
710
  if (
711
  inference_params is None
 
715
  ):
716
  if self.rotary_emb_dim > 0:
717
  q, kv = self.rotary_emb(
718
+ q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
 
 
 
 
719
  )
720
  if inference_params is None:
721
  if not self.checkpointing:
 
727
  else:
728
  context = self._update_kvcache_attention(q, kv, inference_params)
729
  else:
730
+ context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
731
+ out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732
  return out if not self.return_residual else (out, x)
733
+
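For reference, the ALiBi slope helper that mha.py keeps from the upstream flash-attention/fairseq code is self-contained; when the head count is not a power of two it borrows every other slope from the next power of two, as the recursion above shows. A standalone restatement:

```python
import math


def get_alibi_slopes(nheads):
    # Geometric per-head slopes; see the fairseq link referenced in mha.py.
    def get_slopes_power_of_2(n):
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * start**i for i in range(n)]

    if math.log2(nheads).is_integer():
        return get_slopes_power_of_2(nheads)
    closest_power_of_2 = 2 ** math.floor(math.log2(nheads))
    return (
        get_slopes_power_of_2(closest_power_of_2)
        + get_alibi_slopes(2 * closest_power_of_2)[0::2][: nheads - closest_power_of_2]
    )


print(get_alibi_slopes(12)[:4])  # 12 heads: 8 power-of-two slopes plus 4 interleaved ones
```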
mlp.py CHANGED
@@ -8,14 +8,14 @@ import torch.nn as nn
8
  import torch.nn.functional as F
9
  from torch.distributed import ProcessGroup
10
 
 
11
  try:
12
  from flash_attn.ops.activations import swiglu
13
  except ImportError:
14
  swiglu = None
15
 
16
  try:
17
- from flash_attn.ops.fused_dense import (ColumnParallelLinear,
18
- RowParallelLinear)
19
  except ImportError:
20
  ColumnParallelLinear, RowParallelLinear = None, None
21
 
@@ -41,48 +41,17 @@ class Mlp(nn.Module):
41
  factory_kwargs = {"device": device, "dtype": dtype}
42
  super().__init__()
43
  out_features = out_features if out_features is not None else in_features
44
- hidden_features = (
45
- hidden_features if hidden_features is not None else in_features * 4
46
- )
47
  self.return_residual = return_residual
48
  self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
49
  self.activation = activation
50
- self.fc2 = nn.Linear(
51
- hidden_features, out_features, bias=bias2, **factory_kwargs
52
- )
53
-
54
- def forward(self, x, adapter_mask=None):
55
- if adapter_mask is not None:
56
- unique_tasks = torch.unique(adapter_mask)
57
- fc1_dtype = next(self.fc1.parameters()).dtype
58
- y = torch.empty(
59
- *x.shape[:-1], self.fc1.out_features, dtype=fc1_dtype, device=x.device
60
- )
61
- for task_id in unique_tasks:
62
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
63
- task_tensor = x[task_indices]
64
- task_y = self.fc1(task_tensor, task_id=task_id)
65
- y[task_indices] = task_y
66
- else:
67
- y = self.fc1(x)
68
 
 
 
69
  y = self.activation(y)
70
-
71
- if adapter_mask is not None:
72
- unique_tasks = torch.unique(adapter_mask)
73
- fc2_dtype = next(self.fc2.parameters()).dtype
74
- out = torch.empty(
75
- *y.shape[:-1], self.fc2.out_features, dtype=fc2_dtype, device=y.device
76
- )
77
- for task_id in unique_tasks:
78
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
79
- task_tensor = y[task_indices]
80
- task_out = self.fc2(task_tensor, task_id=task_id)
81
- out[task_indices] = task_out
82
- else:
83
- out = self.fc2(y)
84
-
85
- return out if not self.return_residual else (out, x)
86
 
87
 
88
  class ParallelMLP(nn.Module):
@@ -104,9 +73,7 @@ class ParallelMLP(nn.Module):
104
  assert ColumnParallelLinear is not None, "Need to install fused_dense"
105
  assert RowParallelLinear is not None, "Need to install fused_dense"
106
  out_features = out_features if out_features is not None else in_features
107
- hidden_features = (
108
- hidden_features if hidden_features is not None else in_features * 4
109
- )
110
  self.fc1 = ColumnParallelLinear(
111
  in_features,
112
  hidden_features,
@@ -152,25 +119,17 @@ class GatedMlp(nn.Module):
152
  hidden_features = (
153
  hidden_features if hidden_features is not None else int(8 * in_features / 3)
154
  )
155
- hidden_features = (
156
- (hidden_features + multiple_of - 1) // multiple_of * multiple_of
157
- )
158
  self.return_residual = return_residual
159
- self.fc1 = nn.Linear(
160
- in_features, 2 * hidden_features, bias=bias1, **factory_kwargs
161
- )
162
  self.activation = activation
163
- self.fc2 = nn.Linear(
164
- hidden_features, out_features, bias=bias2, **factory_kwargs
165
- )
166
 
167
  def forward(self, x):
168
  y = self.fc1(x)
169
  if self.activation == F.sigmoid: # Special case for GLU
170
  y = F.glu(y, dim=-1)
171
- elif (
172
- self.activation == F.silu and swiglu is not None
173
- ): # Special case for SwiGLU
174
  y, gate = y.chunk(2, dim=-1)
175
  y = swiglu(gate, y)
176
  else:
@@ -203,9 +162,7 @@ class ParallelGatedMlp(nn.Module):
203
  hidden_features = (
204
  hidden_features if hidden_features is not None else int(8 * in_features / 3)
205
  )
206
- hidden_features = (
207
- (hidden_features + multiple_of - 1) // multiple_of * multiple_of
208
- )
209
  if ColumnParallelLinear is None or RowParallelLinear is None:
210
  raise ImportError("fused_dense is not installed")
211
  self.fc1 = ColumnParallelLinear(
@@ -234,4 +191,4 @@ class ParallelGatedMlp(nn.Module):
234
  y, gate = y.chunk(2, dim=-1)
235
  y = y * self.activation(gate)
236
  y = self.fc2(y)
237
- return y
 
8
  import torch.nn.functional as F
9
  from torch.distributed import ProcessGroup
10
 
11
+
12
  try:
13
  from flash_attn.ops.activations import swiglu
14
  except ImportError:
15
  swiglu = None
16
 
17
  try:
18
+ from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
 
19
  except ImportError:
20
  ColumnParallelLinear, RowParallelLinear = None, None
21
 
 
41
  factory_kwargs = {"device": device, "dtype": dtype}
42
  super().__init__()
43
  out_features = out_features if out_features is not None else in_features
44
+ hidden_features = hidden_features if hidden_features is not None else in_features * 4
 
 
45
  self.return_residual = return_residual
46
  self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
47
  self.activation = activation
48
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ def forward(self, x):
51
+ y = self.fc1(x)
52
  y = self.activation(y)
53
+ y = self.fc2(y)
54
+ return y if not self.return_residual else (y, x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  class ParallelMLP(nn.Module):
 
73
  assert ColumnParallelLinear is not None, "Need to install fused_dense"
74
  assert RowParallelLinear is not None, "Need to install fused_dense"
75
  out_features = out_features if out_features is not None else in_features
76
+ hidden_features = hidden_features if hidden_features is not None else in_features * 4
 
 
77
  self.fc1 = ColumnParallelLinear(
78
  in_features,
79
  hidden_features,
 
119
  hidden_features = (
120
  hidden_features if hidden_features is not None else int(8 * in_features / 3)
121
  )
122
+ hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
 
 
123
  self.return_residual = return_residual
124
+ self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
 
 
125
  self.activation = activation
126
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
 
 
127
 
128
  def forward(self, x):
129
  y = self.fc1(x)
130
  if self.activation == F.sigmoid: # Special case for GLU
131
  y = F.glu(y, dim=-1)
132
+ elif self.activation == F.silu and swiglu is not None: # Special case for SwiGLU
 
 
133
  y, gate = y.chunk(2, dim=-1)
134
  y = swiglu(gate, y)
135
  else:
 
162
  hidden_features = (
163
  hidden_features if hidden_features is not None else int(8 * in_features / 3)
164
  )
165
+ hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
 
 
166
  if ColumnParallelLinear is None or RowParallelLinear is None:
167
  raise ImportError("fused_dense is not installed")
168
  self.fc1 = ColumnParallelLinear(
 
191
  y, gate = y.chunk(2, dim=-1)
192
  y = y * self.activation(gate)
193
  y = self.fc2(y)
194
+ return y
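GatedMlp projects to `2 * hidden_features`, splits the result, and applies a gated activation; when flash-attn's fused `swiglu` op is unavailable, the same math can be written in plain PyTorch (`swiglu(gate, y)` computes `silu(gate) * y`). A minimal equivalence sketch with illustrative sizes:

```python
import torch
import torch.nn.functional as F

in_features, hidden_features = 8, 16
fc1 = torch.nn.Linear(in_features, 2 * hidden_features)
fc2 = torch.nn.Linear(hidden_features, in_features)

x = torch.randn(2, in_features)
y, gate = fc1(x).chunk(2, dim=-1)  # same split as GatedMlp.forward
out = fc2(F.silu(gate) * y)        # plain-PyTorch fallback for swiglu(gate, y)
print(out.shape)                   # torch.Size([2, 8])
```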
modeling_lora.py CHANGED
@@ -1,5 +1,6 @@
1
  import math
2
  import os
 
3
  from functools import partial
4
  from typing import Iterator, List, Optional, Tuple, Union
5
 
@@ -8,15 +9,12 @@ import torch
8
  import torch.nn.utils.parametrize as parametrize
9
  from torch import nn
10
  from torch.nn import Parameter
11
- from torch.nn import functional as F
12
  from transformers import PretrainedConfig
13
 
14
- from .configuration_xlm_roberta import XLMRobertaFlashConfig
15
- from .modeling_xlm_roberta import (
16
- XLMRobertaFlashConfig,
17
- XLMRobertaModel,
18
- XLMRobertaPreTrainedModel,
19
- )
20
 
21
 
22
  def initialized_weights(
@@ -93,19 +91,22 @@ class LoRAParametrization(nn.Module):
93
  torch.ones(self.swap((1, fan_in)), dtype=self.lora_A.dtype),
94
  persistent=False,
95
  )
 
 
96
 
97
  def _dropout(self, A):
98
  # to mimic the original implementation: A @ dropout(x), we do (A * dropout(ones)) @ x
99
  return A * self.lora_dropout(self.lora_dropout_mask)
100
 
101
- def lora_forward(self, X, current_task):
 
102
  return (
103
  X
104
  + torch.matmul(
105
  *self.swap(
106
  (
107
- self.lora_B[current_task],
108
- self.dropout_fn(self.lora_A[current_task]),
109
  )
110
  )
111
  ).view(X.shape)
@@ -113,7 +114,19 @@ class LoRAParametrization(nn.Module):
113
  )
114
 
115
  def forward(self, X):
116
- return X
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  @classmethod
119
  def from_linear(
@@ -166,15 +179,6 @@ class LoRAParametrization(nn.Module):
166
  dropout_p: float,
167
  alpha: float,
168
  ):
169
- """
170
- Registering LoRA adapters to all embedding and linear layers.
171
- Additionally, we implement a custom forward function for LoRA parametrization.
172
- This function modifies the layer's forward pass to optionally use task-specific
173
- parameters. When a `task_id` is provided, it employs a LoRA parametrization
174
- to modify the original weights according to the specific task. This allows
175
- the layer to adapt dynamically to different tasks at runtime. If no `task_id`
176
- is specified, the layer uses its original weights.
177
- """
178
  if isinstance(layer, nn.Linear):
179
  parametrize.register_parametrization(
180
  layer,
@@ -187,23 +191,6 @@ class LoRAParametrization(nn.Module):
187
  alpha=alpha,
188
  ),
189
  )
190
-
191
- def new_forward(self, input, task_id=None, residual=False):
192
- if task_id is not None:
193
- weights = self.parametrizations.weight[0].lora_forward(
194
- self.weight, current_task=task_id
195
- )
196
- else:
197
- weights = self.weight
198
-
199
- out = F.linear(input, weights, self.bias)
200
-
201
- if residual:
202
- return out, input
203
- return out
204
-
205
- layer.forward = new_forward.__get__(layer, layer.__class__)
206
-
207
  elif isinstance(layer, nn.Embedding):
208
  parametrize.register_parametrization(
209
  layer,
@@ -217,43 +204,22 @@ class LoRAParametrization(nn.Module):
217
  ),
218
  )
219
 
220
- def new_forward(self, input, task_id=None):
221
- if task_id is not None:
222
- weights = self.parametrizations.weight[0].lora_forward(
223
- self.weight, current_task=task_id
224
- )
225
- else:
226
- weights = self.weight
227
-
228
- out = F.embedding(
229
- input,
230
- weights,
231
- self.padding_idx,
232
- self.max_norm,
233
- self.norm_type,
234
- self.scale_grad_by_freq,
235
- self.sparse,
236
- )
237
-
238
- return out
239
-
240
- layer.forward = new_forward.__get__(layer, layer.__class__)
241
 
242
 
243
  class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
244
- """
245
- A wrapper class around the Jina XLM-RoBERTa model that integrates LoRA (Low-Rank Adaptation) adapters.
246
- """
247
-
248
  def __init__(
249
  self,
250
  config: XLMRobertaFlashConfig,
251
- roberta: Optional[XLMRobertaModel] = None,
252
- add_pooling_layer: bool = True,
253
  ):
254
  super().__init__(config)
 
255
  if roberta is None:
256
- self.roberta = XLMRobertaModel(config, add_pooling_layer=add_pooling_layer)
257
  else:
258
  self.roberta = roberta
259
 
@@ -263,19 +229,7 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
263
  or len(self._lora_adaptations) < 1
264
  ):
265
  raise ValueError(
266
- f"`lora_adaptations` must be a list and contain at least one element"
267
- )
268
- self._task_instructions = config.task_instructions
269
- if (
270
- not isinstance(self._task_instructions, dict)
271
- or len(self._task_instructions) != len(self._lora_adaptations)
272
- or not all(
273
- [v in self._lora_adaptations for v in self._task_instructions.keys()]
274
- )
275
- ):
276
- raise ValueError(
277
- f"`task_instructions` must be a dict and contain the same number of elements "
278
- f"as `lora_adaptations` with all keys in `task_instructions` present in `lora_adaptations`."
279
  )
280
  self._adaptation_map = {
281
  name: idx for idx, name in enumerate(self._lora_adaptations)
@@ -290,14 +244,9 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
290
  alpha=self._alpha,
291
  )
292
  self.main_params_trainable = config.lora_main_params_trainable
293
-
294
- @property
295
- def rotary_emb_base(self):
296
- return self.roberta.rotary_emb_base
297
-
298
- @rotary_emb_base.setter
299
- def rotary_emb_base(self, base):
300
- self.roberta.rotary_emb_base = base
301
 
302
  @property
303
  def main_params_trainable(self):
@@ -331,30 +280,16 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
331
  use_safetensors: bool = None,
332
  **kwargs,
333
  ):
334
- for key in list(kwargs.keys()):
335
- if key in config.to_dict():
336
- config.update({key: kwargs.pop(key)})
337
- if config.load_trained_adapters: # checkpoint already contains LoRA adapters
 
338
  return super().from_pretrained(
339
- pretrained_model_name_or_path,
340
- *model_args,
341
- config=config,
342
- cache_dir=cache_dir,
343
- ignore_mismatched_sizes=ignore_mismatched_sizes,
344
- force_download=force_download,
345
- local_files_only=local_files_only,
346
- token=token,
347
- revision=revision,
348
- use_safetensors=use_safetensors,
349
- **kwargs,
350
- )
351
- else: # initializing new adapters
352
- roberta = XLMRobertaModel.from_pretrained(
353
- pretrained_model_name_or_path,
354
- *model_args,
355
- use_flash_attn=config.use_flash_attn,
356
- **kwargs,
357
  )
 
 
358
  return cls(config, roberta=roberta)
359
 
360
  def _register_lora(self, num_adaptations, rank, dropout_p, alpha):
@@ -368,7 +303,39 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
368
  )
369
  )
370
 
371
- def forward(self, *args, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  return self.roberta(*args, **kwargs)
373
 
374
  def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
@@ -387,40 +354,28 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
387
  @torch.inference_mode()
388
  def encode(
389
  self,
390
- sentences: Union[str, List[str]],
391
  *args,
392
- task: Optional[str] = None,
393
  **kwargs,
394
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
395
  """
396
- Computes sentence embeddings.
397
- sentences(`str` or `List[str]`):
398
- Sentence or sentences to be encoded
399
- task(`str`, *optional*, defaults to `None`):
400
- Specifies the task for which the encoding is intended. If `task` is not provided,
401
- all LoRA adapters are disabled, and the model reverts to its original,
402
- general-purpose weights.
 
 
403
  """
404
- if task and task not in self._lora_adaptations:
405
- raise ValueError(
406
- f"Unsupported task '{task}'. "
407
- f"Supported tasks are: {', '.join(self.config.lora_adaptations)}."
408
- f"Alternatively, don't pass the `task` argument to disable LoRA."
409
- )
410
- adapter_mask = None
411
- if task:
412
- task_id = self._adaptation_map[task]
413
- num_examples = 1 if isinstance(sentences, str) else len(sentences)
414
- adapter_mask = torch.full(
415
- (num_examples,), task_id, dtype=torch.int32, device=self.device
416
- )
417
- if isinstance(sentences, str):
418
- sentences = self._task_instructions[task] + sentences
419
- else:
420
- sentences = [
421
- self._task_instructions[task] + sentence for sentence in sentences
422
- ]
423
- return self.roberta.encode(
424
- sentences, *args, adapter_mask=adapter_mask, **kwargs
425
- )
426
 
 
 
1
  import math
2
  import os
3
+ import warnings
4
  from functools import partial
5
  from typing import Iterator, List, Optional, Tuple, Union
6
 
 
9
  import torch.nn.utils.parametrize as parametrize
10
  from torch import nn
11
  from torch.nn import Parameter
 
12
  from transformers import PretrainedConfig
13
 
14
+ from .modeling_xlm_roberta import XLMRobertaFlashConfig, XLMRobertaModel, XLMRobertaPreTrainedModel
15
+
16
+
17
+ LORA_NO_UPDATE = '__lora_no_update__'
 
 
18
 
19
 
20
  def initialized_weights(
 
91
  torch.ones(self.swap((1, fan_in)), dtype=self.lora_A.dtype),
92
  persistent=False,
93
  )
94
+ self.forward_fn = lambda x: x
95
+ self.current_task = None
96
 
97
  def _dropout(self, A):
98
  # to mimic the original implementation: A @ dropout(x), we do (A * dropout(ones)) @ x
99
  return A * self.lora_dropout(self.lora_dropout_mask)
100
 
101
+ def lora_forward(self, X):
102
+ assert self.current_task is not None
103
  return (
104
  X
105
  + torch.matmul(
106
  *self.swap(
107
  (
108
+ self.lora_B[self.current_task],
109
+ self.dropout_fn(self.lora_A[self.current_task]),
110
  )
111
  )
112
  ).view(X.shape)
 
114
  )
115
 
116
  def forward(self, X):
117
+ return self.forward_fn(X)
118
+
119
+ @property
120
+ def current_task(self):
121
+ return self._current_task
122
+
123
+ @current_task.setter
124
+ def current_task(self, task: Union[None, int]):
125
+ self._current_task = task
126
+ if task is None:
127
+ self.forward_fn = lambda x: x
128
+ else:
129
+ self.forward_fn = self.lora_forward
130
 
131
  @classmethod
132
  def from_linear(
 
179
  dropout_p: float,
180
  alpha: float,
181
  ):
 
 
 
 
 
 
 
 
 
182
  if isinstance(layer, nn.Linear):
183
  parametrize.register_parametrization(
184
  layer,
 
191
  alpha=alpha,
192
  ),
193
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  elif isinstance(layer, nn.Embedding):
195
  parametrize.register_parametrization(
196
  layer,
 
204
  ),
205
  )
206
 
207
+ @staticmethod
208
+ def select_task_for_layer(layer: nn.Module, task_idx: Optional[int] = None):
209
+ if isinstance(layer, LoRAParametrization):
210
+ layer.current_task = task_idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
 
213
  class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
 
 
 
 
214
  def __init__(
215
  self,
216
  config: XLMRobertaFlashConfig,
217
+ roberta: Optional[XLMRobertaModel] = None
 
218
  ):
219
  super().__init__(config)
220
+
221
  if roberta is None:
222
+ self.roberta = XLMRobertaModel(config)
223
  else:
224
  self.roberta = roberta
225
 
 
229
  or len(self._lora_adaptations) < 1
230
  ):
231
  raise ValueError(
232
+ f'`lora_adaptations` must be a list and contain at least one element'
 
 
 
 
 
 
 
 
 
 
 
 
233
  )
234
  self._adaptation_map = {
235
  name: idx for idx, name in enumerate(self._lora_adaptations)
 
244
  alpha=self._alpha,
245
  )
246
  self.main_params_trainable = config.lora_main_params_trainable
247
+ self._task_idx = None
248
+ # By default, disable LoRA until it's specified which adapter/task to use
249
+ self.current_task = None
 
 
 
 
 
250
 
251
  @property
252
  def main_params_trainable(self):
 
280
  use_safetensors: bool = None,
281
  **kwargs,
282
  ):
283
+ config = XLMRobertaFlashConfig.from_pretrained(
284
+ pretrained_model_name_or_path, *model_args, **kwargs
285
+ )
286
+
287
+ if config.load_trained_adapters:
288
  return super().from_pretrained(
289
+ pretrained_model_name_or_path, *model_args, **kwargs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  )
291
+ else:
292
+ roberta = XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
293
  return cls(config, roberta=roberta)
294
 
295
  def _register_lora(self, num_adaptations, rank, dropout_p, alpha):
 
303
  )
304
  )
305
 
306
+ @property
307
+ def current_task(self):
308
+ """Which LoRA is currently selected
309
+ :return: Integer or None (when LoRA is disabled)
310
+ """
311
+ return self._task_idx
312
+
313
+ @current_task.setter
314
+ def current_task(self, task_name: Union[None, str]):
315
+ """Set the LoRA that is to be used.
316
+ The LoRA is specified by `task_name`, which must be one of the configured
317
+ adaptation names (mapped internally to an adapter index). If it is None, no LoRA is used.
318
+ :param task_name: Name of the LoRA adaptation to use, or None to disable LoRA
319
+ :return:
320
+ """
321
+ if task_name and task_name not in self._lora_adaptations:
322
+ raise ValueError(
323
+ f"Unsupported task '{task_name}'. "
324
+ f"Supported tasks are: {', '.join(self.config.lora_adaptations)}."
325
+ f"Alternatively, set `task` to `None` if you want to disable LoRA."
326
+ )
327
+ task_idx = self._adaptation_map[task_name] if task_name else None
328
+ if self._task_idx != task_idx:
329
+ # In this case, we need to update the LoRAs everywhere
330
+ self._task_idx = task_idx
331
+ self.apply(
332
+ partial(LoRAParametrization.select_task_for_layer, task_idx=task_idx)
333
+ )
334
+
335
+ def forward(self, *args, task: Union[str, None] = LORA_NO_UPDATE, **kwargs):
336
+ if task != LORA_NO_UPDATE:
337
+ self.current_task = task
338
+
339
  return self.roberta(*args, **kwargs)
340
 
341
  def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
 
354
  @torch.inference_mode()
355
  def encode(
356
  self,
 
357
  *args,
358
+ task: Union[str, None] = LORA_NO_UPDATE,
359
  **kwargs,
360
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
361
  """
362
+ Computes sentence embeddings
363
+
364
+ task(`str`, *optional*, defaults to `LORA_NO_UPDATE`):
365
+ Specifies the task for which the encoding is intended. This parameter controls the
366
+ use of specialized LoRA adapters that are tuned for specific tasks. If `task` is set
367
+ to `LORA_NO_UPDATE`, there will be no update to the current task, retaining the
368
+ existing adapter configuration. If `task` is explicitly set to `None`, all LoRA
369
+ adapters are disabled, and the model reverts to its original, general-purpose weights.
370
+ If `task` is set to a specific LoRA adaptation, that adaptation is activated.
371
  """
372
+ if task != LORA_NO_UPDATE:
373
+ if not task:
374
+ warnings.warn(
375
+ f"Task-specific embeddings are disabled. To enable, specify the `task` "
376
+ f"argument with one of the supported tasks: {', '.join(self.config.lora_adaptations)}",
377
+ category=UserWarning,
378
+ )
379
+ self.current_task = task
 
380
 
381
+ return self.roberta.encode(*args, **kwargs)
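Taken together, a minimal usage sketch of the adapter selection implemented above. The repository id and the adapter name `"retrieval"` are placeholders; any checkpoint that ships this `XLMRobertaLoRA` implementation and lists the adapter in `config.lora_adaptations` would work:

```python
from transformers import AutoModel

# Placeholder repo id; trust_remote_code is required so the classes above are used.
model = AutoModel.from_pretrained("org/xlmr-flash-lora", trust_remote_code=True)

# `task` is forwarded to the current_task setter, which applies
# LoRAParametrization.select_task_for_layer to every parametrized module.
emb = model.encode(["A sample sentence"], task="retrieval")

# task=None disables all LoRA adapters and falls back to the base weights.
emb_base = model.encode(["A sample sentence"], task=None)
```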
modeling_xlm_roberta.py CHANGED
@@ -13,30 +13,39 @@ import re
13
  from collections import OrderedDict
14
  from collections.abc import Sequence
15
  from functools import partial
16
- from typing import List, Optional, Tuple, Union
17
-
18
  import numpy as np
 
19
  import torch
20
  import torch.nn as nn
21
  import torch.nn.functional as F
22
  import torch.utils.checkpoint
23
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
24
- from transformers import AutoTokenizer, PretrainedConfig
25
- from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput
26
  from transformers.modeling_utils import PreTrainedModel
 
 
 
27
  from transformers.models.bert.modeling_bert import (
28
  BaseModelOutputWithPoolingAndCrossAttentions,
29
  BertForPreTrainingOutput,
30
  )
31
- from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaLMHead
32
 
33
- from .rotary import RotaryEmbedding
34
- from .block import Block
 
 
 
 
 
 
35
  from .configuration_xlm_roberta import XLMRobertaFlashConfig
 
36
  from .embedding import XLMRobertaEmbeddings
37
  from .mha import MHA
38
  from .mlp import FusedMLP, Mlp
39
- from .xlm_padding import index_first_axis_residual, pad_input, unpad_input
 
40
 
41
  try:
42
  from flash_attn.ops.fused_dense import FusedDense
@@ -64,11 +73,13 @@ logger = logging.getLogger(__name__)
64
 
65
 
66
  def get_use_flash_attn(config: XLMRobertaFlashConfig):
67
- if not getattr(config, "use_flash_attn", False) or not torch.cuda.is_available():
 
 
68
  return False
69
  if importlib.util.find_spec("flash_attn") is None:
70
  logger.warning(
71
- "flash_attn is not installed. Using PyTorch native attention implementation."
72
  )
73
  return False
74
  return True
@@ -80,9 +91,9 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
80
  rotary_kwargs = {}
81
  if config.position_embedding_type == "rotary":
82
  rotary_kwargs["rotary_emb_dim"] = getattr(
83
- config, "rotary_emb_dim", config.hidden_size / config.num_attention_heads
84
  )
85
- rotary_kwargs["rotary_emb_base"] = config.rotary_emb_base
86
  rotary_kwargs["rotary_emb_scale_base"] = getattr(
87
  config, "rotary_emb_scale_base", None
88
  )
@@ -98,7 +109,6 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
98
  fused_bias_fc=fused_bias_fc,
99
  use_flash_attn=use_flash_attn,
100
  return_residual=return_residual,
101
- use_alibi=config.position_embedding_type == "alibi",
102
  **rotary_kwargs,
103
  )
104
  return mixer_cls
@@ -180,7 +190,6 @@ class XLMRobertaEncoder(nn.Module):
180
  def __init__(self, config: XLMRobertaFlashConfig):
181
  super().__init__()
182
  self.use_flash_attn = get_use_flash_attn(config)
183
- self.use_reentrant = config.use_reentrant
184
  self.layers = nn.ModuleList(
185
  [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
186
  )
@@ -194,72 +203,46 @@ class XLMRobertaEncoder(nn.Module):
194
  def gradient_checkpointing(self, value):
195
  self._grad_checkpointing = value
196
 
197
- def forward(
198
- self,
199
- hidden_states,
200
- key_padding_mask=None,
201
- subset_mask=None,
202
- adapter_mask=None,
203
- output_hidden_states: Optional[bool] = None,
204
- ):
205
  """If subset_mask is not None, we only want output for the subset of the sequence.
206
  This means that we only compute the last layer output for these tokens.
207
  subset_mask: (batch, seqlen), dtype=torch.bool
208
  """
209
-
210
- all_hidden_states = () if output_hidden_states else None
211
-
212
- if output_hidden_states and subset_mask:
213
- raise ValueError('output_hidden_states is not supported for subset_masks')
214
-
215
  if key_padding_mask is None or not self.use_flash_attn:
216
- mixer_kwargs = {"adapter_mask": adapter_mask}
217
- if key_padding_mask is not None:
218
- mixer_kwargs["key_padding_mask"] = key_padding_mask.bool()
 
 
219
  for layer in self.layers:
220
- if output_hidden_states:
221
- all_hidden_states = all_hidden_states + (hidden_states,)
222
  if self._grad_checkpointing:
223
  hidden_states = torch.utils.checkpoint.checkpoint(
224
  layer,
225
  hidden_states,
226
- use_reentrant=self.use_reentrant,
227
  mixer_kwargs=mixer_kwargs,
228
  )
229
  else:
230
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
231
- if output_hidden_states:
232
- all_hidden_states = all_hidden_states + (hidden_states,)
233
  if subset_mask is not None:
234
  hidden_states = hidden_states[subset_mask]
235
  else:
236
  batch, seqlen = hidden_states.shape[:2]
237
- if output_hidden_states:
238
- all_hidden_states = all_hidden_states + (hidden_states,)
239
- hidden_states, indices, cu_seqlens, max_seqlen_in_batch, cu_adapter_mask = (
240
- unpad_input(hidden_states, key_padding_mask, adapter_mask)
241
  )
242
- mixer_kwargs = {
243
- "cu_seqlens": cu_seqlens,
244
- "max_seqlen": max_seqlen_in_batch,
245
- "adapter_mask": cu_adapter_mask,
246
- }
247
-
248
  if subset_mask is None:
249
  for layer in self.layers:
250
  if self._grad_checkpointing:
251
  hidden_states = torch.utils.checkpoint.checkpoint(
252
  layer,
253
  hidden_states,
254
- use_reentrant=self.use_reentrant,
255
  mixer_kwargs=mixer_kwargs,
256
  )
257
  else:
258
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
259
- if output_hidden_states:
260
- all_hidden_states = all_hidden_states + (
261
- pad_input(hidden_states, indices, batch, seqlen),
262
- )
263
  hidden_states = pad_input(hidden_states, indices, batch, seqlen)
264
  else:
265
  for layer in self.layers[:-1]:
@@ -267,7 +250,7 @@ class XLMRobertaEncoder(nn.Module):
267
  hidden_states = torch.utils.checkpoint.checkpoint(
268
  layer,
269
  hidden_states,
270
- use_reentrant=self.use_reentrant,
271
  mixer_kwargs=mixer_kwargs,
272
  )
273
  else:
@@ -305,14 +288,14 @@ class XLMRobertaEncoder(nn.Module):
305
  torch.utils.checkpoint.checkpoint(
306
  self.layers[-1],
307
  hidden_states_subset,
308
- use_reentrant=self.use_reentrant,
309
  mixer_kwargs=mixer_kwargs,
310
  )
311
  else:
312
  hidden_states = self.layers[-1](
313
  hidden_states_subset, mixer_kwargs=mixer_kwargs
314
  )
315
- return all_hidden_states if output_hidden_states else hidden_states
316
 
317
 
318
  class XLMRobertaPooler(nn.Module):
@@ -325,28 +308,11 @@ class XLMRobertaPooler(nn.Module):
325
  self.dense = linear_cls(config.hidden_size, config.hidden_size)
326
  self.activation = nn.Tanh()
327
 
328
- def forward(self, hidden_states, pool=True, adapter_mask=None):
329
  # We "pool" the model by simply taking the hidden state corresponding
330
  # to the first token.
331
  first_token_tensor = hidden_states[:, 0] if pool else hidden_states
332
- if adapter_mask is not None:
333
- unique_tasks = torch.unique(adapter_mask)
334
- pool_dtype = next(self.dense.parameters()).dtype
335
- pooled_output = torch.empty(
336
- first_token_tensor.shape[0],
337
- self.dense.out_features,
338
- dtype=pool_dtype,
339
- device=first_token_tensor.device,
340
- )
341
- for task_id in unique_tasks:
342
- task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
343
- task_first_token_tensor = first_token_tensor[task_indices]
344
- task_pooled_output = self.dense(
345
- task_first_token_tensor, task_id=task_id
346
- )
347
- pooled_output[task_indices] = task_pooled_output
348
- else:
349
- pooled_output = self.dense(first_token_tensor)
350
  pooled_output = self.activation(pooled_output)
351
  return pooled_output
352
 
@@ -425,7 +391,6 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
425
  config_class = XLMRobertaFlashConfig
426
  base_model_prefix = "roberta"
427
  supports_gradient_checkpointing = True
428
- _supports_param_buffer_assignment = False
429
 
430
  def _set_gradient_checkpointing(self, module, value=False):
431
  if isinstance(module, XLMRobertaEncoder):
@@ -437,11 +402,12 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
437
  *args,
438
  **kwargs,
439
  ):
440
- if not "torch_dtype" in kwargs:
441
- kwargs["torch_dtype"] = "auto"
442
  return super().from_pretrained(*args, **kwargs)
443
 
444
 
 
445
  class XLMRobertaModel(XLMRobertaPreTrainedModel):
446
  def __init__(self, config: XLMRobertaFlashConfig, add_pooling_layer=True):
447
  super().__init__(config)
@@ -459,14 +425,11 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
459
  "gelu_fast",
460
  "gelu_pytorch_tanh",
461
  ]
 
462
  self.embeddings = XLMRobertaEmbeddings(
463
  config.hidden_size,
464
  config.vocab_size,
465
- (
466
- config.max_position_embeddings
467
- if config.position_embedding_type == "absolute"
468
- else -1
469
- ),
470
  config.type_vocab_size,
471
  padding_idx=config.pad_token_id,
472
  )
@@ -476,25 +439,20 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
476
  self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
477
 
478
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
479
- self.tokenizer = AutoTokenizer.from_pretrained(
480
- self.name_or_path, trust_remote_code=True
481
- )
482
- self._rotary_emb_base = config.rotary_emb_base
483
 
484
  @torch.inference_mode()
485
  def encode(
486
- self: "XLMRobertaModel",
487
  sentences: Union[str, List[str]],
488
  batch_size: int = 32,
489
  show_progress_bar: Optional[bool] = None,
490
- output_value: str = "sentence_embedding",
491
  convert_to_numpy: bool = True,
492
  convert_to_tensor: bool = False,
493
  device: Optional[torch.device] = None,
494
- normalize_embeddings: bool = True,
495
  truncate_dim: Optional[int] = None,
496
- adapter_mask: Optional[torch.Tensor] = None,
497
- task: Optional[str] = None,
498
  **tokenizer_kwargs,
499
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
500
  """
@@ -520,7 +478,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
520
  Overwrites any setting from convert_to_numpy
521
  device(`torch.device`, *optional*, defaults to None):
522
  Which torch.device to use for the computation
523
- normalize_embeddings(`bool`, *optional*, defaults to True):
524
  If set to true, returned vectors will have length 1. In that case, the
525
  faster dot-product (util.dot_score) instead of cosine similarity can
526
  be used.
@@ -533,6 +491,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
533
  If convert_to_tensor, a stacked tensor is returned.
534
  If convert_to_numpy, a numpy matrix is returned.
535
  """
 
 
 
 
 
 
536
  is_training = self.training
537
  self.eval()
538
 
@@ -545,12 +509,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
545
  if convert_to_tensor:
546
  convert_to_numpy = False
547
 
548
- if output_value != "sentence_embedding":
549
  convert_to_tensor = False
550
  convert_to_numpy = False
551
 
552
  input_was_string = False
553
- if isinstance(sentences, str) or not hasattr(sentences, "__len__"):
554
  sentences = [sentences]
555
  input_was_string = True
556
 
@@ -561,11 +525,11 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
561
  inverse_permutation = np.argsort(permutation)
562
  sentences = [sentences[idx] for idx in permutation]
563
 
564
- tokenizer_kwargs["padding"] = tokenizer_kwargs.get("padding", True)
565
- tokenizer_kwargs["max_length"] = tokenizer_kwargs.get(
566
- "max_length", self.tokenizer.init_kwargs.get("model_max_length", 8192)
567
  )
568
- tokenizer_kwargs["truncation"] = tokenizer_kwargs.get("truncation", True)
569
 
570
  all_embeddings = []
571
 
@@ -583,33 +547,33 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
583
  for i in range_iter:
584
  encoded_input = self.tokenizer(
585
  sentences[i : i + batch_size],
586
- return_tensors="pt",
587
  **tokenizer_kwargs,
588
  ).to(self.device)
589
- lora_arguments = (
590
- {"adapter_mask": adapter_mask[i : i + batch_size]}
591
- if adapter_mask is not None
592
- else {}
593
- )
594
- token_embs = self.forward(**encoded_input, **lora_arguments)[0]
595
 
596
  # Accumulate in fp32 to avoid overflow
597
  token_embs = token_embs.float()
598
 
599
- if output_value == "token_embeddings":
600
  raise NotImplementedError
601
  elif output_value is None:
602
  raise NotImplementedError
603
  else:
604
- if self.config.emb_pooler == "cls":
605
  embeddings = self.cls_pooling(
606
- token_embs, encoded_input["attention_mask"]
607
  )
608
  else:
609
  embeddings = self.mean_pooling(
610
- token_embs, encoded_input["attention_mask"]
611
  )
612
 
 
 
 
 
 
613
  all_embeddings.extend(embeddings)
614
 
615
  all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
@@ -618,16 +582,10 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
618
  if truncate_dim:
619
  all_embeddings = self.truncate_embeddings(all_embeddings, truncate_dim)
620
 
621
- if normalize_embeddings:
622
- all_embeddings = [
623
- torch.nn.functional.normalize(embedding, p=2, dim=0)
624
- for embedding in all_embeddings
625
- ]
626
-
627
  if convert_to_tensor:
628
  all_embeddings = torch.stack(all_embeddings)
629
  elif convert_to_numpy:
630
- all_embeddings = np.asarray([emb.cpu().numpy() for emb in all_embeddings])
631
 
632
  if input_was_string:
633
  all_embeddings = all_embeddings[0]
@@ -635,19 +593,18 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
635
  self.train(is_training)
636
  return all_embeddings
637
 
 
638
  def truncate_embeddings(self, embeddings, truncate_dim):
639
  if not self.config.matryoshka_dimensions:
640
  logger.warning(
641
- "Matryoshka embeddings are not supported, so dimension truncation will not be performed."
642
  )
643
  return embeddings
644
  elif truncate_dim in self.config.matryoshka_dimensions:
645
  return [tensor[:truncate_dim] for tensor in embeddings]
646
  else:
647
- raise ValueError(
648
- f"The provided `truncate_dim` value of {truncate_dim} is not supported. "
649
- f"Supported dimensions are {self.config.matryoshka_dimensions}."
650
- )
651
 
652
  def mean_pooling(
653
  self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
@@ -659,21 +616,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
659
  input_mask_expanded.sum(1), min=1e-9
660
  )
661
 
662
- def cls_pooling(self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor):
663
- return token_embeddings[:, 0]
664
 
665
- @property
666
- def rotary_emb_base(self):
667
- return self._rotary_emb_base
668
-
669
- @rotary_emb_base.setter
670
- def rotary_emb_base(self, base):
671
- if not isinstance(base, (int, float)):
672
- raise TypeError("Base must be an integer or float")
673
- logger.info(f"Changing RoPE base value to {base}")
674
- for layer in self.encoder.layers:
675
- layer.mixer.rotary_emb.base = base
676
- self._rotary_emb_base = base
677
 
678
  def forward(
679
  self,
@@ -683,7 +631,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
683
  attention_mask=None,
684
  masked_tokens_mask=None,
685
  return_dict=None,
686
- output_hidden_states=None,
687
  **kwargs,
688
  ):
689
  """If masked_tokens_mask is not None (i.e. last_layer_subset == True in XLMForPreTraining),
@@ -691,12 +638,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
691
  layer output for these tokens.
692
  masked_tokens_mask: (batch, seqlen), dtype=torch.bool
693
  """
694
- adapter_mask = kwargs.pop("adapter_mask", None)
695
  if kwargs:
696
  for key, value in kwargs.items():
697
  if value is not None:
698
  logger.warning(
699
- "Flash attention implementation does not support kwargs: %s",
700
  key,
701
  )
702
 
@@ -705,10 +652,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
705
  )
706
 
707
  hidden_states = self.embeddings(
708
- input_ids,
709
- position_ids=position_ids,
710
- token_type_ids=token_type_ids,
711
- adapter_mask=adapter_mask,
712
  )
713
  # TD [2022-12:18]: Don't need to force residual in fp32
714
  # BERT puts embedding LayerNorm before embedding dropout.
@@ -732,24 +676,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
732
  subset_mask = None
733
 
734
  sequence_output = self.encoder(
735
- hidden_states,
736
- key_padding_mask=attention_mask,
737
- subset_mask=subset_mask,
738
- adapter_mask=adapter_mask,
739
- output_hidden_states=output_hidden_states,
740
  )
741
 
742
- if output_hidden_states:
743
- all_hidden_states = sequence_output
744
- sequence_output = sequence_output[-1]
745
- else:
746
- all_hidden_states = None
747
-
748
  if masked_tokens_mask is None:
749
  pooled_output = (
750
- self.pooler(sequence_output, adapter_mask=adapter_mask)
751
- if self.pooler is not None
752
- else None
753
  )
754
  else:
755
  # TD [2022-03-01]: the indexing here is very tricky.
@@ -763,9 +695,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
763
  pool_input = sequence_output[first_col_mask[subset_mask]]
764
  sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
765
  pooled_output = (
766
- self.pooler(pool_input, pool=False, adapter_mask=adapter_mask)
767
- if self.pooler is not None
768
- else None
769
  )
770
 
771
  if not return_dict:
@@ -774,7 +704,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
774
  return BaseModelOutputWithPoolingAndCrossAttentions(
775
  last_hidden_state=sequence_output,
776
  pooler_output=pooled_output,
777
- hidden_states=all_hidden_states,
778
  )
779
 
780
 
@@ -871,6 +800,103 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
871
  )
872
 
873
 
874
  def remap_state_dict(state_dict, config: PretrainedConfig):
875
  """
876
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
@@ -1022,47 +1048,47 @@ def inv_remap_state_dict(state_dict, config: PretrainedConfig):
1022
  if not last_layer_subset or d != (config.num_hidden_layers - 1):
1023
  Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
1024
  Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
1025
- state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = (
1026
- Wqkv_weights[: Wqkv_weights.shape[0] // 3, :]
1027
- )
1028
- state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = (
1029
- Wqkv_weights[
1030
- Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
1031
- ]
1032
- )
1033
- state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = (
1034
- Wqkv_weights[2 * Wqkv_weights.shape[0] // 3 :, :]
1035
- )
1036
- state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = (
1037
- Wqkv_biases[: Wqkv_biases.shape[0] // 3]
1038
- )
1039
- state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = (
1040
- Wqkv_biases[Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3]
1041
- )
1042
- state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = (
1043
- Wqkv_biases[2 * Wqkv_biases.shape[0] // 3 :]
1044
- )
1045
  else:
1046
  Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
1047
  Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
1048
  Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
1049
  Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
1050
- state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = (
1051
- Wq_weight
1052
- )
1053
- state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = (
1054
- Wkv_weights[: Wkv_weights.shape[0] // 2, :]
1055
- )
1056
- state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = (
1057
- Wkv_weights[Wkv_weights.shape[0] // 2 :, :]
1058
- )
1059
  state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
1060
  state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
1061
  : Wkv_biases.shape[0] // 2
1062
  ]
1063
- state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = (
1064
- Wkv_biases[Wkv_biases.shape[0] // 2 :]
1065
- )
1066
 
1067
  def inv_key_mapping_ln(key):
1068
  key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
 
13
  from collections import OrderedDict
14
  from collections.abc import Sequence
15
  from functools import partial
 
 
16
  import numpy as np
17
+
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
  import torch.utils.checkpoint
22
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
23
+ from einops import rearrange
24
+ from transformers import PretrainedConfig
25
  from transformers.modeling_utils import PreTrainedModel
26
+ from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput
27
+ from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaLMHead
28
+
29
  from transformers.models.bert.modeling_bert import (
30
  BaseModelOutputWithPoolingAndCrossAttentions,
31
  BertForPreTrainingOutput,
32
  )
 
33
 
34
+ from typing import List, Optional, Tuple, Union
35
+
36
+ from .xlm_padding import (
37
+ index_first_axis,
38
+ index_first_axis_residual,
39
+ pad_input,
40
+ unpad_input,
41
+ )
42
  from .configuration_xlm_roberta import XLMRobertaFlashConfig
43
+ from .block import Block
44
  from .embedding import XLMRobertaEmbeddings
45
  from .mha import MHA
46
  from .mlp import FusedMLP, Mlp
47
+ from .stochastic_depth import StochasticDepth
48
+
49
 
50
  try:
51
  from flash_attn.ops.fused_dense import FusedDense
 
73
 
74
 
75
  def get_use_flash_attn(config: XLMRobertaFlashConfig):
76
+ if not getattr(config, "use_flash_attn", False):
77
+ return False
78
+ if not torch.cuda.is_available():
79
  return False
80
  if importlib.util.find_spec("flash_attn") is None:
81
  logger.warning(
82
+ 'flash_attn is not installed. Using PyTorch native attention implementation.'
83
  )
84
  return False
85
  return True
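Since this guard is the switch the PR exposes, a hedged loading sketch. The repository id is a placeholder, and it assumes `use_flash_attn` is accepted as a config override through `from_pretrained`, which is what the check above consumes:

```python
from transformers import AutoModel

# Force the PyTorch-native attention path even when flash_attn is installed;
# on CPU, or when flash_attn is missing, the guard above falls back automatically.
model = AutoModel.from_pretrained(
    "org/xlmr-flash-checkpoint",
    trust_remote_code=True,
    use_flash_attn=False,
)
```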
 
91
  rotary_kwargs = {}
92
  if config.position_embedding_type == "rotary":
93
  rotary_kwargs["rotary_emb_dim"] = getattr(
94
+ config, "rotary_emb_dim", config.hidden_size
95
  )
96
+ rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
97
  rotary_kwargs["rotary_emb_scale_base"] = getattr(
98
  config, "rotary_emb_scale_base", None
99
  )
 
109
  fused_bias_fc=fused_bias_fc,
110
  use_flash_attn=use_flash_attn,
111
  return_residual=return_residual,
 
112
  **rotary_kwargs,
113
  )
114
  return mixer_cls
 
190
  def __init__(self, config: XLMRobertaFlashConfig):
191
  super().__init__()
192
  self.use_flash_attn = get_use_flash_attn(config)
 
193
  self.layers = nn.ModuleList(
194
  [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
195
  )
 
203
  def gradient_checkpointing(self, value):
204
  self._grad_checkpointing = value
205
 
206
+ def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
 
 
 
 
 
 
 
207
  """If subset_mask is not None, we only want output for the subset of the sequence.
208
  This means that we only compute the last layer output for these tokens.
209
  subset_mask: (batch, seqlen), dtype=torch.bool
210
  """
 
 
 
 
 
 
211
  if key_padding_mask is None or not self.use_flash_attn:
212
+ mixer_kwargs = (
213
+ {"key_padding_mask": key_padding_mask.bool()}
214
+ if key_padding_mask is not None
215
+ else None
216
+ )
217
  for layer in self.layers:
 
 
218
  if self._grad_checkpointing:
219
  hidden_states = torch.utils.checkpoint.checkpoint(
220
  layer,
221
  hidden_states,
222
+ use_reentrant=False,
223
  mixer_kwargs=mixer_kwargs,
224
  )
225
  else:
226
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
 
 
227
  if subset_mask is not None:
228
  hidden_states = hidden_states[subset_mask]
229
  else:
230
  batch, seqlen = hidden_states.shape[:2]
231
+ hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
232
+ hidden_states, key_padding_mask
 
 
233
  )
234
+ mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
 
 
 
 
 
235
  if subset_mask is None:
236
  for layer in self.layers:
237
  if self._grad_checkpointing:
238
  hidden_states = torch.utils.checkpoint.checkpoint(
239
  layer,
240
  hidden_states,
241
+ use_reentrant=False,
242
  mixer_kwargs=mixer_kwargs,
243
  )
244
  else:
245
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
 
 
 
 
246
  hidden_states = pad_input(hidden_states, indices, batch, seqlen)
247
  else:
248
  for layer in self.layers[:-1]:
 
250
  hidden_states = torch.utils.checkpoint.checkpoint(
251
  layer,
252
  hidden_states,
253
+ use_reentrant=False,
254
  mixer_kwargs=mixer_kwargs,
255
  )
256
  else:
 
288
  torch.utils.checkpoint.checkpoint(
289
  self.layers[-1],
290
  hidden_states_subset,
291
+ use_reentrant=False,
292
  mixer_kwargs=mixer_kwargs,
293
  )
294
  else:
295
  hidden_states = self.layers[-1](
296
  hidden_states_subset, mixer_kwargs=mixer_kwargs
297
  )
298
+ return hidden_states
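As a side note, the flash-attention branch above packs all non-padded tokens into one long sequence; `unpad_input` describes the batch boundaries with cumulative sequence lengths. A small illustrative sketch of that bookkeeping (not the actual `unpad_input` implementation):

```python
import torch
import torch.nn.functional as F

key_padding_mask = torch.tensor([[1, 1, 1, 0],
                                 [1, 1, 0, 0]])           # sequence lengths 3 and 2
seqlens = key_padding_mask.sum(dim=-1)
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0), (1, 0))  # offsets into the packed tokens
print(cu_seqlens)            # tensor([0, 3, 5])
print(int(seqlens.max()))    # 3 == max_seqlen_in_batch
```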
299
 
300
 
301
  class XLMRobertaPooler(nn.Module):
 
308
  self.dense = linear_cls(config.hidden_size, config.hidden_size)
309
  self.activation = nn.Tanh()
310
 
311
+ def forward(self, hidden_states, pool=True):
312
  # We "pool" the model by simply taking the hidden state corresponding
313
  # to the first token.
314
  first_token_tensor = hidden_states[:, 0] if pool else hidden_states
315
+ pooled_output = self.dense(first_token_tensor)
 
316
  pooled_output = self.activation(pooled_output)
317
  return pooled_output
318
 
 
391
  config_class = XLMRobertaFlashConfig
392
  base_model_prefix = "roberta"
393
  supports_gradient_checkpointing = True
 
394
 
395
  def _set_gradient_checkpointing(self, module, value=False):
396
  if isinstance(module, XLMRobertaEncoder):
 
402
  *args,
403
  **kwargs,
404
  ):
405
+ if not 'torch_dtype' in kwargs:
406
+ kwargs['torch_dtype'] = 'auto'
407
  return super().from_pretrained(*args, **kwargs)
408
 
409
 
410
+
411
  class XLMRobertaModel(XLMRobertaPreTrainedModel):
412
  def __init__(self, config: XLMRobertaFlashConfig, add_pooling_layer=True):
413
  super().__init__(config)
 
425
  "gelu_fast",
426
  "gelu_pytorch_tanh",
427
  ]
428
+
429
  self.embeddings = XLMRobertaEmbeddings(
430
  config.hidden_size,
431
  config.vocab_size,
432
+ config.max_position_embeddings,
 
 
 
 
433
  config.type_vocab_size,
434
  padding_idx=config.pad_token_id,
435
  )
 
439
  self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
440
 
441
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
442
+
 
 
 
443
 
444
  @torch.inference_mode()
445
  def encode(
446
+ self: 'XLMRobertaModel',
447
  sentences: Union[str, List[str]],
448
  batch_size: int = 32,
449
  show_progress_bar: Optional[bool] = None,
450
+ output_value: str = 'sentence_embedding',
451
  convert_to_numpy: bool = True,
452
  convert_to_tensor: bool = False,
453
  device: Optional[torch.device] = None,
454
+ normalize_embeddings: bool = False,
455
  truncate_dim: Optional[int] = None,
 
 
456
  **tokenizer_kwargs,
457
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
458
  """
 
478
  Overwrites any setting from convert_to_numpy
479
  device(`torch.device`, *optional*, defaults to None):
480
  Which torch.device to use for the computation
481
+ normalize_embeddings(`bool`, *optional*, defaults to False):
482
  If set to true, returned vectors will have length 1. In that case, the
483
  faster dot-product (util.dot_score) instead of cosine similarity can
484
  be used.
 
491
  If convert_to_tensor, a stacked tensor is returned.
492
  If convert_to_numpy, a numpy matrix is returned.
493
  """
494
+ from transformers import AutoTokenizer
495
+
496
+ self.tokenizer = AutoTokenizer.from_pretrained(
497
+ self.name_or_path, trust_remote_code=True
498
+ )
499
+
500
  is_training = self.training
501
  self.eval()
502
 
 
509
  if convert_to_tensor:
510
  convert_to_numpy = False
511
 
512
+ if output_value != 'sentence_embedding':
513
  convert_to_tensor = False
514
  convert_to_numpy = False
515
 
516
  input_was_string = False
517
+ if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
518
  sentences = [sentences]
519
  input_was_string = True
520
 
 
525
  inverse_permutation = np.argsort(permutation)
526
  sentences = [sentences[idx] for idx in permutation]
527
 
528
+ tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
529
+ tokenizer_kwargs['max_length'] = tokenizer_kwargs.get(
530
+ 'max_length', self.tokenizer.init_kwargs.get('model_max_length', 8192)
531
  )
532
+ tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
533
 
534
  all_embeddings = []
535
 
 
547
  for i in range_iter:
548
  encoded_input = self.tokenizer(
549
  sentences[i : i + batch_size],
550
+ return_tensors='pt',
551
  **tokenizer_kwargs,
552
  ).to(self.device)
553
+ token_embs = self.forward(**encoded_input)[0]
 
 
 
 
 
554
 
555
  # Accumulate in fp32 to avoid overflow
556
  token_embs = token_embs.float()
557
 
558
+ if output_value == 'token_embeddings':
559
  raise NotImplementedError
560
  elif output_value is None:
561
  raise NotImplementedError
562
  else:
563
+ if self.config.emb_pooler == 'cls':
564
  embeddings = self.cls_pooling(
565
+ token_embs, encoded_input['attention_mask']
566
  )
567
  else:
568
  embeddings = self.mean_pooling(
569
+ token_embs, encoded_input['attention_mask']
570
  )
571
 
572
+ if normalize_embeddings:
573
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
574
+
575
+ if convert_to_numpy:
576
+ embeddings = embeddings.cpu()
577
  all_embeddings.extend(embeddings)
578
 
579
  all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
 
582
  if truncate_dim:
583
  all_embeddings = self.truncate_embeddings(all_embeddings, truncate_dim)
584
 
 
 
 
 
 
 
585
  if convert_to_tensor:
586
  all_embeddings = torch.stack(all_embeddings)
587
  elif convert_to_numpy:
588
+ all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
589
 
590
  if input_was_string:
591
  all_embeddings = all_embeddings[0]
 
593
  self.train(is_training)
594
  return all_embeddings
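A short usage sketch of this `encode` signature. The repository id is a placeholder, and `truncate_dim=256` assumes 256 is listed in `config.matryoshka_dimensions`:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("org/xlmr-flash-checkpoint", trust_remote_code=True)

emb = model.encode(
    ["first sentence", "second sentence"],
    batch_size=16,
    convert_to_tensor=True,      # return one stacked torch.Tensor instead of a numpy array
    normalize_embeddings=True,   # unit-length vectors, so dot product equals cosine similarity
    truncate_dim=256,            # only valid if 256 is in config.matryoshka_dimensions
)
```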
595
 
596
+
597
  def truncate_embeddings(self, embeddings, truncate_dim):
598
  if not self.config.matryoshka_dimensions:
599
  logger.warning(
600
+ 'Matryoshka embeddings are not supported, so dimension truncation will not be performed.'
601
  )
602
  return embeddings
603
  elif truncate_dim in self.config.matryoshka_dimensions:
604
  return [tensor[:truncate_dim] for tensor in embeddings]
605
  else:
606
+ raise ValueError(f'The provided `truncate_dim` value of {truncate_dim} is not supported. '
607
+ f'Supported dimensions are {self.config.matryoshka_dimensions}.')
 
 
608
 
609
  def mean_pooling(
610
  self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
 
616
  input_mask_expanded.sum(1), min=1e-9
617
  )
618
 
 
 
619
 
620
+ def cls_pooling(
621
+ self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
622
+ ):
623
+ return token_embeddings[:, 0]
624
+
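As a quick sanity check of the masked mean pooling defined above (a toy tensor, not model output):

```python
import torch

token_embeddings = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [9.0, 9.0]]])  # (batch=1, seq=3, dim=2)
attention_mask = torch.tensor([[1, 1, 0]])                                # last position is padding

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
mean = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(mean)  # tensor([[2., 2.]]) -- the padded [9., 9.] token does not contribute
```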
 
 
 
 
 
 
 
625
 
626
  def forward(
627
  self,
 
631
  attention_mask=None,
632
  masked_tokens_mask=None,
633
  return_dict=None,
 
634
  **kwargs,
635
  ):
636
  """If masked_tokens_mask is not None (i.e. last_layer_subset == True in XLMForPreTraining),
 
638
  layer output for these tokens.
639
  masked_tokens_mask: (batch, seqlen), dtype=torch.bool
640
  """
641
+
642
  if kwargs:
643
  for key, value in kwargs.items():
644
  if value is not None:
645
  logger.warning(
646
+ 'Flash attention implementation does not support kwargs: %s',
647
  key,
648
  )
649
 
 
652
  )
653
 
654
  hidden_states = self.embeddings(
655
+ input_ids, position_ids=position_ids, token_type_ids=token_type_ids
 
 
 
656
  )
657
  # TD [2022-12:18]: Don't need to force residual in fp32
658
  # BERT puts embedding LayerNorm before embedding dropout.
 
676
  subset_mask = None
677
 
678
  sequence_output = self.encoder(
679
+ hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask
 
 
 
 
680
  )
681
 
 
 
 
 
 
 
682
  if masked_tokens_mask is None:
683
  pooled_output = (
684
+ self.pooler(sequence_output) if self.pooler is not None else None
 
 
685
  )
686
  else:
687
  # TD [2022-03-01]: the indexing here is very tricky.
 
695
  pool_input = sequence_output[first_col_mask[subset_mask]]
696
  sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
697
  pooled_output = (
698
+ self.pooler(pool_input, pool=False) if self.pooler is not None else None
 
 
699
  )
700
 
701
  if not return_dict:
 
704
  return BaseModelOutputWithPoolingAndCrossAttentions(
705
  last_hidden_state=sequence_output,
706
  pooler_output=pooled_output,
 
707
  )
708
 
709
 
 
800
  )
801
 
802
 
803
+ # class XLMRobertaForPreTraining(XLMRobertaPreTrainedModel):
804
+ # def __init__(self, config: XLMRobertaFlashConfig):
805
+ # super().__init__(config)
806
+ # # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
807
+ # # (around 15%) to the classifier heads.
808
+ # self.dense_seq_output = getattr(config, "dense_seq_output", False)
809
+ # # If last_layer_subset, we only need the compute the last layer for a subset of tokens
810
+ # # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
811
+ # self.last_layer_subset = getattr(config, "last_layer_subset", False)
812
+ # if self.last_layer_subset:
813
+ # assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
814
+ # use_xentropy = getattr(config, "use_xentropy", False)
815
+ # if use_xentropy and CrossEntropyLoss is None:
816
+ # raise ImportError("xentropy_cuda is not installed")
817
+ # loss_cls = (
818
+ # nn.CrossEntropyLoss
819
+ # if not use_xentropy
820
+ # else partial(CrossEntropyLoss, inplace_backward=True)
821
+ # )
822
+ #
823
+ # self.xlm = XLMRobertaModel(config)
824
+ # self.cls = XLMRobertaPreTrainingHeads(config)
825
+ # self.mlm_loss = loss_cls(ignore_index=0)
826
+ # self.nsp_loss = loss_cls(ignore_index=-1)
827
+ #
828
+ # # Initialize weights and apply final processing
829
+ # self.apply(partial(_init_weights, initializer_range=config.initializer_range))
830
+ # self.tie_weights()
831
+ #
832
+ # def tie_weights(self):
833
+ # self.cls.predictions.decoder.weight = self.xlm.embeddings.word_embeddings.weight
834
+ #
835
+ # def forward(
836
+ # self,
837
+ # input_ids,
838
+ # position_ids=None,
839
+ # token_type_ids=None,
840
+ # attention_mask=None,
841
+ # labels=None,
842
+ # next_sentence_label=None,
843
+ # ):
844
+ # """
845
+ # If labels are provided, they must be 0 for masked out tokens (as specified in the attention
846
+ # mask).
847
+ # Outputs:
848
+ # if `labels` and `next_sentence_label` are not `None`:
849
+ # Outputs the total_loss which is the sum of the masked language modeling loss and the next
850
+ # sentence classification loss.
851
+ # if `labels` or `next_sentence_label` is `None`:
852
+ # Outputs a tuple comprising
853
+ # - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
854
+ # - the next sentence classification logits of shape [batch_size, 2].
855
+ #
856
+ # """
857
+ # masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
858
+ # outputs = self.xlm(
859
+ # input_ids,
860
+ # position_ids=position_ids,
861
+ # token_type_ids=token_type_ids,
862
+ # attention_mask=attention_mask.bool() if attention_mask is not None else None,
863
+ # masked_tokens_mask=masked_tokens_mask,
864
+ # )
865
+ # sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
866
+ # if self.dense_seq_output and labels is not None:
867
+ # masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
868
+ # if not self.last_layer_subset:
869
+ # sequence_output = index_first_axis(
870
+ # rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
871
+ # )
872
+ # prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
873
+ #
874
+ # total_loss = None
875
+ # if labels is not None and next_sentence_label is not None:
876
+ # if (
877
+ # self.dense_seq_output and labels is not None
878
+ # ): # prediction_scores are already flattened
879
+ # masked_lm_loss = self.mlm_loss(
880
+ # prediction_scores, labels.flatten()[masked_token_idx]
881
+ # )
882
+ # else:
883
+ # masked_lm_loss = self.mlm_loss(
884
+ # rearrange(prediction_scores, "... v -> (...) v"),
885
+ # rearrange(labels, "... -> (...)"),
886
+ # )
887
+ # next_sentence_loss = self.nsp_loss(
888
+ # rearrange(seq_relationship_score, "... t -> (...) t"),
889
+ # rearrange(next_sentence_label, "... -> (...)"),
890
+ # )
891
+ # total_loss = masked_lm_loss.float() + next_sentence_loss.float()
892
+ #
893
+ # return BertForPreTrainingOutput(
894
+ # loss=total_loss,
895
+ # prediction_logits=prediction_scores,
896
+ # seq_relationship_logits=seq_relationship_score,
897
+ # )
898
+
899
+
900
  def remap_state_dict(state_dict, config: PretrainedConfig):
901
  """
902
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
 
1048
  if not last_layer_subset or d != (config.num_hidden_layers - 1):
1049
  Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
1050
  Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
1051
+ state_dict[
1052
+ f"bert.encoder.layers.{d}.attention.self.query.weight"
1053
+ ] = Wqkv_weights[: Wqkv_weights.shape[0] // 3, :]
1054
+ state_dict[
1055
+ f"bert.encoder.layers.{d}.attention.self.key.weight"
1056
+ ] = Wqkv_weights[
1057
+ Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
1058
+ ]
1059
+ state_dict[
1060
+ f"bert.encoder.layers.{d}.attention.self.value.weight"
1061
+ ] = Wqkv_weights[2 * Wqkv_weights.shape[0] // 3 :, :]
1062
+ state_dict[
1063
+ f"bert.encoder.layers.{d}.attention.self.query.bias"
1064
+ ] = Wqkv_biases[: Wqkv_biases.shape[0] // 3]
1065
+ state_dict[
1066
+ f"bert.encoder.layers.{d}.attention.self.key.bias"
1067
+ ] = Wqkv_biases[Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3]
1068
+ state_dict[
1069
+ f"bert.encoder.layers.{d}.attention.self.value.bias"
1070
+ ] = Wqkv_biases[2 * Wqkv_biases.shape[0] // 3 :]
1071
  else:
1072
  Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
1073
  Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
1074
  Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
1075
  Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
1076
+ state_dict[
1077
+ f"bert.encoder.layers.{d}.attention.self.query.weight"
1078
+ ] = Wq_weight
1079
+ state_dict[
1080
+ f"bert.encoder.layers.{d}.attention.self.key.weight"
1081
+ ] = Wkv_weights[: Wkv_weights.shape[0] // 2, :]
1082
+ state_dict[
1083
+ f"bert.encoder.layers.{d}.attention.self.value.weight"
1084
+ ] = Wkv_weights[Wkv_weights.shape[0] // 2 :, :]
1085
  state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
1086
  state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
1087
  : Wkv_biases.shape[0] // 2
1088
  ]
1089
+ state_dict[
1090
+ f"bert.encoder.layers.{d}.attention.self.value.bias"
1091
+ ] = Wkv_biases[Wkv_biases.shape[0] // 2 :]
1092
 
1093
  def inv_key_mapping_ln(key):
1094
  key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
modeling_xlm_roberta_for_glue.py ADDED
@@ -0,0 +1,109 @@
 
1
+ from typing import Optional, Union, Tuple
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
6
+ from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, TokenClassifierOutput
7
+
8
+ from .modeling_xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
9
+ from .configuration_xlm_roberta import XLMRobertaFlashConfig
10
+
11
+
12
+ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
13
+ def __init__(self, config: XLMRobertaFlashConfig):
14
+ super().__init__(config)
15
+ self.num_labels = config.num_labels
16
+ self.config = config
17
+
18
+ self.roberta = XLMRobertaModel(config)
19
+ classifier_dropout = (
20
+ config.classifier_dropout
21
+ if config.classifier_dropout is not None
22
+ else config.hidden_dropout_prob
23
+ )
24
+ self.dropout = nn.Dropout(classifier_dropout)
25
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
26
+
27
+ # Initialize weights and apply final processing
28
+ self.post_init()
29
+
30
+
31
+ def forward(
32
+ self,
33
+ input_ids: Optional[torch.Tensor] = None,
34
+ attention_mask: Optional[torch.Tensor] = None,
35
+ token_type_ids: Optional[torch.Tensor] = None,
36
+ position_ids: Optional[torch.Tensor] = None,
37
+ head_mask: Optional[torch.Tensor] = None,
38
+ inputs_embeds: Optional[torch.Tensor] = None,
39
+ labels: Optional[torch.Tensor] = None,
40
+ output_attentions: Optional[bool] = None,
41
+ output_hidden_states: Optional[bool] = None,
42
+ return_dict: Optional[bool] = None,
43
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
44
+ r"""
45
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
46
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
47
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
48
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
49
+ """
50
+ return_dict = (
51
+ return_dict if return_dict is not None else self.config.use_return_dict
52
+ )
53
+
54
+ assert head_mask is None
55
+ assert inputs_embeds is None
56
+ assert output_attentions is None
57
+ assert output_hidden_states is None
58
+ assert return_dict
59
+ outputs = self.roberta(
60
+ input_ids,
61
+ attention_mask=attention_mask,
62
+ token_type_ids=token_type_ids,
63
+ position_ids=position_ids,
64
+ head_mask=head_mask,
65
+ inputs_embeds=inputs_embeds,
66
+ output_attentions=output_attentions,
67
+ output_hidden_states=output_hidden_states,
68
+ return_dict=return_dict,
69
+ )
70
+
71
+ pooled_output = outputs[1]
72
+
73
+ pooled_output = self.dropout(pooled_output)
74
+ logits = self.classifier(pooled_output)
75
+
76
+ loss = None
77
+ if labels is not None:
78
+ if self.config.problem_type is None:
79
+ if self.num_labels == 1:
80
+ self.config.problem_type = "regression"
81
+ elif self.num_labels > 1 and (
82
+ labels.dtype == torch.long or labels.dtype == torch.int
83
+ ):
84
+ self.config.problem_type = "single_label_classification"
85
+ else:
86
+ self.config.problem_type = "multi_label_classification"
87
+
88
+ if self.config.problem_type == "regression":
89
+ loss_fct = MSELoss()
90
+ if self.num_labels == 1:
91
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
92
+ else:
93
+ loss = loss_fct(logits, labels)
94
+ elif self.config.problem_type == "single_label_classification":
95
+ loss_fct = CrossEntropyLoss()
96
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
97
+ elif self.config.problem_type == "multi_label_classification":
98
+ loss_fct = BCEWithLogitsLoss()
99
+ loss = loss_fct(logits, labels)
100
+ if not return_dict:
101
+ output = (logits,) + outputs[2:]
102
+ return ((loss,) + output) if loss is not None else output
103
+
104
+ return SequenceClassifierOutput(
105
+ loss=loss,
106
+ logits=logits,
107
+ hidden_states=outputs.hidden_states,
108
+ attentions=outputs.attentions,
109
+ )
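A hedged usage sketch for the classification head added in this file. The repository id is a placeholder, and it assumes the repo's `auto_map` routes `AutoModelForSequenceClassification` to the `XLMRobertaForSequenceClassification` class above:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo = "org/xlmr-flash-checkpoint"  # placeholder id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForSequenceClassification.from_pretrained(
    repo, trust_remote_code=True, num_labels=2
)

inputs = tokenizer("A sentence to classify", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1))
```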
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfa8fa7c7e120199548fe7149512c0adfe58f6bc13ce19f09b895aa25e8af910
3
+ size 1113232188
rotary.py DELETED
@@ -1,659 +0,0 @@
1
- # This implementation was adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py
2
- # Commit id: 3566596ad867ee415dd3c12616dd50c610176f6c
3
- # Rotary varlen support from https://github.com/Dao-AILab/flash-attention/pull/556
4
-
5
- # Copyright (c) 2023, Tri Dao.
6
-
7
- from typing import Optional, Tuple, Union
8
-
9
- import torch
10
- from einops import rearrange, repeat
11
-
12
- if torch.cuda.is_available():
13
- try:
14
- from flash_attn.ops.triton.rotary import apply_rotary
15
- except ImportError:
16
-
17
- def apply_rotary(*args, **kwargs):
18
- raise RuntimeError(
19
- "FlashAttention is not installed. To proceed with training, please install FlashAttention. "
20
- "For inference, you have two options: either install FlashAttention or disable it by setting use_flash_attn=False when loading the model."
21
- )
22
-
23
-
24
- def rotate_half(x, interleaved=False):
25
- if not interleaved:
26
- x1, x2 = x.chunk(2, dim=-1)
27
- return torch.cat((-x2, x1), dim=-1)
28
- else:
29
- x1, x2 = x[..., ::2], x[..., 1::2]
30
- return rearrange(
31
- torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
32
- )
33
-
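Although this file is being deleted, a one-line check makes the non-interleaved `rotate_half` convention above concrete:

```python
import torch

x = torch.tensor([1.0, 2.0, 3.0, 4.0])   # x1 = [1, 2], x2 = [3, 4]
x1, x2 = x.chunk(2, dim=-1)
print(torch.cat((-x2, x1), dim=-1))       # tensor([-3., -4.,  1.,  2.])
```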
34
-
35
- def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
36
- """
37
- x: (batch_size, seqlen, nheads, headdim)
38
- cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
39
- """
40
- ro_dim = cos.shape[-1] * 2
41
- assert ro_dim <= x.shape[-1]
42
- cos, sin = (
43
- cos[: x.shape[1]],
44
- sin[: x.shape[1]],
45
- )
46
- cos = repeat(
47
- cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
48
- )
49
- sin = repeat(
50
- sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
51
- )
52
- return torch.cat(
53
- [
54
- x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
55
- x[..., ro_dim:],
56
- ],
57
- dim=-1,
58
- )
59
-
60
-
61
- class ApplyRotaryEmb(torch.autograd.Function):
62
- @staticmethod
63
- def forward(
64
- ctx,
65
- x,
66
- cos,
67
- sin,
68
- interleaved=False,
69
- inplace=False,
70
- seqlen_offsets: Union[int, torch.Tensor] = 0,
71
- cu_seqlens: Optional[torch.Tensor] = None,
72
- max_seqlen: Optional[int] = None,
73
- ):
74
- out = apply_rotary(
75
- x,
76
- cos,
77
- sin,
78
- seqlen_offsets=seqlen_offsets,
79
- cu_seqlens=cu_seqlens,
80
- max_seqlen=max_seqlen,
81
- interleaved=interleaved,
82
- inplace=inplace,
83
- )
84
-
85
- if isinstance(seqlen_offsets, int):
86
- ctx.save_for_backward(
87
- cos, sin, cu_seqlens
88
- ) # Can't save int with save_for_backward
89
- ctx.seqlen_offsets = seqlen_offsets
90
- else:
91
- ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
92
- ctx.seqlen_offsets = None
93
- ctx.interleaved = interleaved
94
- ctx.inplace = inplace
95
- ctx.max_seqlen = max_seqlen
96
- return out if not inplace else x
97
-
98
- @staticmethod
99
- def backward(ctx, do):
100
- seqlen_offsets = ctx.seqlen_offsets
101
- if seqlen_offsets is None:
102
- cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
103
- else:
104
- cos, sin, cu_seqlens = ctx.saved_tensors
105
- # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
106
- # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
107
- if not ctx.interleaved and not ctx.inplace:
108
- do = do.clone()
109
-
110
- dx = apply_rotary(
111
- do,
112
- cos,
113
- sin,
114
- seqlen_offsets=seqlen_offsets,
115
- cu_seqlens=cu_seqlens,
116
- max_seqlen=ctx.max_seqlen,
117
- interleaved=ctx.interleaved,
118
- inplace=ctx.inplace,
119
- conjugate=True,
120
- )
121
- return dx, None, None, None, None, None, None, None
122
-
123
-
124
- def apply_rotary_emb(
125
- x,
126
- cos,
127
- sin,
128
- interleaved=False,
129
- inplace=False,
130
- seqlen_offsets: Union[int, torch.Tensor] = 0,
131
- cu_seqlens: Optional[torch.Tensor] = None,
132
- max_seqlen: Optional[int] = None,
133
- ):
134
- """
135
- Arguments:
136
- x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
137
- else (total_seqlen, nheads, headdim)
138
- cos, sin: (seqlen_rotary, rotary_dim / 2)
139
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
140
- of 1st half and 2nd half (GPT-NeoX style).
141
- inplace: if True, apply rotary embedding in-place.
142
- seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
143
- Most commonly used in inference when we have KV cache.
144
- cu_seqlens: (batch + 1,) or None
145
- max_seqlen: int
146
- Return:
147
- out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
148
- else (total_seqlen, nheads, headdim)
149
- rotary_dim must be <= headdim
150
- Apply rotary embedding to the first rotary_dim of x.
151
- """
152
- return ApplyRotaryEmb.apply(
153
- x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
154
- )
155
-
156
-
157
- # For backward compatibility
158
- apply_rotary_emb_func = apply_rotary_emb
159
-
160
-
161
- class ApplyRotaryEmbQKV_(torch.autograd.Function):
162
- @staticmethod
163
- def forward(
164
- ctx,
165
- qkv,
166
- cos,
167
- sin,
168
- cos_k=None,
169
- sin_k=None,
170
- interleaved=False,
171
- seqlen_offsets: Union[int, torch.Tensor] = 0,
172
- cu_seqlens: Optional[torch.Tensor] = None,
173
- max_seqlen: Optional[int] = None,
174
- use_flash_attn: bool = True,
175
- ):
176
- # batch, seqlen, three, nheads, headdim = qkv.shape
177
- assert qkv.shape[-3] == 3
178
- if cos_k is None and sin_k is None and qkv.is_contiguous():
179
-
180
- if use_flash_attn:
181
- # Call 1 kernel instead of 2 kernels
182
- # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
183
- # dimensions, we get the same tensor
184
- qk = rearrange(qkv[..., :2, :, :], "... t h d -> ... (t h) d")
185
- # qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
186
- apply_rotary(
187
- qk,
188
- cos,
189
- sin,
190
- seqlen_offsets=seqlen_offsets,
191
- interleaved=interleaved,
192
- inplace=True,
193
- cu_seqlens=cu_seqlens,
194
- max_seqlen=max_seqlen,
195
- )
196
- else:
197
- q_rot = apply_rotary_emb_torch(
198
- qkv[:, :, 0],
199
- cos,
200
- sin,
201
- interleaved=interleaved,
202
- )
203
- k_rot = apply_rotary_emb_torch(
204
- qkv[:, :, 1],
205
- cos,
206
- sin,
207
- interleaved=interleaved,
208
- )
209
- qkv = torch.stack((q_rot, k_rot, qkv[:, :, 2]), dim=2)
210
- else:
211
- cos_k = cos if cos_k is None else cos_k
212
- sin_k = sin if sin_k is None else sin_k
213
- q, k = qkv[..., 0, :, :], qkv[..., 1, :, :]
214
- apply_rotary(
215
- q,
216
- cos,
217
- sin,
218
- seqlen_offsets,
219
- interleaved=interleaved,
220
- inplace=True,
221
- cu_seqlens=cu_seqlens,
222
- max_seqlen=max_seqlen,
223
- )
224
- apply_rotary(
225
- k,
226
- cos_k,
227
- sin_k,
228
- seqlen_offsets,
229
- interleaved=interleaved,
230
- inplace=True,
231
- cu_seqlens=cu_seqlens,
232
- max_seqlen=max_seqlen,
233
- )
234
- ctx.save_for_backward(cos, sin, cos_k, sin_k)
235
- if isinstance(seqlen_offsets, int):
236
- ctx.save_for_backward(cos, sin, cos_k, sin_k, cu_seqlens)
237
- ctx.seqlen_offsets = seqlen_offsets
238
- else:
239
- ctx.save_for_backward(cos, sin, cos_k, sin_k, cu_seqlens, seqlen_offsets)
240
- ctx.seqlen_offsets = None
241
- ctx.max_seqlen = max_seqlen
242
- ctx.interleaved = interleaved
243
- return qkv
244
-
245
- @staticmethod
246
- def backward(ctx, dqkv):
247
- seqlen_offsets = ctx.seqlen_offsets
248
- if seqlen_offsets is None:
249
- cos, sin, cos_k, sin_k, cu_seqlens, seqlen_offsets = ctx.saved_tensors
250
- else:
251
- cos, sin, cos_k, sin_k, cu_seqlens = ctx.saved_tensors
252
- if cos_k is None and sin_k is None and dqkv.is_contiguous():
253
- # Call 1 kernel instead of 2 kernels
254
- # We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
255
- # dimensions, we get the same tensor
256
- dqk = rearrange(dqkv[..., :2, :, :], "... t h d -> ... (t h) d")
257
- apply_rotary(
258
- dqk,
259
- cos,
260
- sin,
261
- seqlen_offsets=seqlen_offsets,
262
- interleaved=ctx.interleaved,
263
- inplace=True,
264
- conjugate=True,
265
- cu_seqlens=cu_seqlens,
266
- max_seqlen=ctx.max_seqlen,
267
- )
268
- else:
269
- cos_k = cos if cos_k is None else cos_k
270
- sin_k = sin if sin_k is None else sin_k
271
- dq, dk = dqkv[..., 0, :, :], dqkv[..., 1, :, :]
272
- apply_rotary(
273
- dq,
274
- cos,
275
- sin,
276
- seqlen_offsets,
277
- interleaved=ctx.interleaved,
278
- inplace=True,
279
- conjugate=True,
280
- cu_seqlens=cu_seqlens,
281
- max_seqlen=ctx.max_seqlen,
282
- )
283
- apply_rotary(
284
- dk,
285
- cos_k,
286
- sin_k,
287
- seqlen_offsets,
288
- interleaved=ctx.interleaved,
289
- inplace=True,
290
- conjugate=True,
291
- cu_seqlens=cu_seqlens,
292
- max_seqlen=ctx.max_seqlen,
293
- )
294
- return dqkv, None, None, None, None, None, None, None, None, None
295
-
296
-
297
- def apply_rotary_emb_qkv_(
298
- qkv,
299
- cos,
300
- sin,
301
- cos_k=None,
302
- sin_k=None,
303
- interleaved=False,
304
- seqlen_offsets: Union[int, torch.Tensor] = 0,
305
- cu_seqlens: Optional[torch.Tensor] = None,
306
- max_seqlen: Optional[int] = None,
307
- use_flash_attn=True,
308
- ):
309
- """
310
- Arguments:
311
- qkv: (batch_size, seqlen, 3, nheads, headdim) if cu_seqlens is None
312
- else (total_seqlen, 3, nheads, headdim)
313
- cos, sin: (seqlen, rotary_dim / 2)
314
- cos_k, sin_k: (seqlen, rotary_dim / 2), optional
315
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
316
- 1st half and 2nd half (GPT-NeoX style).
317
- seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
318
- Most commonly used in inference when we have KV cache.
319
- cu_seqlens: (batch + 1,) or None
320
- max_seqlen: int
321
- Return:
322
- qkv: (batch_size, seqlen, 3, nheads, headdim) if cu_seqlens is None
323
- else (total_seqlen, 3, nheads, headdim)
324
- rotary_dim must be <= headdim
325
- Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
326
- """
327
- return ApplyRotaryEmbQKV_.apply(
328
- qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets, cu_seqlens, max_seqlen, use_flash_attn,
329
- )
330
-
331
-
332
- class ApplyRotaryEmbKV_(torch.autograd.Function):
333
- @staticmethod
334
- def forward(
335
- ctx,
336
- kv,
337
- cos,
338
- sin,
339
- interleaved=False,
340
- seqlen_offsets: Union[int, torch.Tensor] = 0,
341
- cu_seqlens: Optional[torch.Tensor] = None,
342
- max_seqlen: Optional[int] = None,
343
- ):
344
- # batch, seqlen, two, nheads, headdim = kv.shape
345
- assert kv.shape[-3] == 2
346
- k = kv[..., 0, :, :]
347
- apply_rotary(
348
- k,
349
- cos,
350
- sin,
351
- seqlen_offsets=seqlen_offsets,
352
- interleaved=interleaved,
353
- inplace=True,
354
- cu_seqlens=cu_seqlens,
355
- max_seqlen=max_seqlen,
356
- )
357
- if isinstance(seqlen_offsets, int):
358
- ctx.save_for_backward(
359
- cos, sin, cu_seqlens
360
- ) # Can't save int with save_for_backward
361
- ctx.seqlen_offsets = seqlen_offsets
362
- else:
363
- ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
364
- ctx.seqlen_offsets = None
365
- ctx.max_seqlen = max_seqlen
366
- ctx.interleaved = interleaved
367
- return kv
368
-
369
- @staticmethod
370
- def backward(ctx, dkv):
371
- seqlen_offsets = ctx.seqlen_offsets
372
- if seqlen_offsets is None:
373
- cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
374
- else:
375
- cos, sin, cu_seqlens = ctx.saved_tensors
376
- apply_rotary(
377
- dkv[..., 0, :, :],
378
- cos,
379
- sin,
380
- seqlen_offsets=seqlen_offsets,
381
- interleaved=ctx.interleaved,
382
- inplace=True,
383
- conjugate=True,
384
- cu_seqlens=cu_seqlens,
385
- max_seqlen=ctx.max_seqlen,
386
- )
387
- return dkv, None, None, None, None, None, None
388
-
389
-
390
- apply_rotary_emb_kv_ = ApplyRotaryEmbKV_.apply
391
-
392
-
393
- def apply_rotary_emb_kv_(
394
- kv,
395
- cos,
396
- sin,
397
- interleaved=False,
398
- seqlen_offsets: Union[int, torch.Tensor] = 0,
399
- cu_seqlens: Optional[torch.Tensor] = None,
400
- max_seqlen: Optional[int] = None,
401
- ):
402
- """
403
- Arguments:
404
- kv: (batch_size, seqlen, 2, nheads, headdim) if cu_seqlens is None
405
- else (total_seqlen, 2, nheads, headdim)
406
- cos, sin: (seqlen, rotary_dim / 2)
407
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
408
- 1st half and 2nd half (GPT-NeoX style).
409
- seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
410
- Most commonly used in inference when we have KV cache.
411
- cu_seqlens: (batch + 1,) or None
412
- max_seqlen: int
413
- Return:
414
- kv: (batch_size, seqlen, 2, nheads, headdim) if cu_seqlens is None
415
- else (total_seqlen, 2, nheads, headdim)
416
- rotary_dim must be <= headdim
417
- Apply rotary embedding *inplace* to the first rotary_dim of K.
418
- """
419
- return ApplyRotaryEmbKV_.apply(
420
- kv, cos, sin, interleaved, seqlen_offsets, cu_seqlens, max_seqlen
421
- )
422
-
423
-
424
- class RotaryEmbedding(torch.nn.Module):
425
- """
426
- The rotary position embeddings from RoFormer_ (Su et al.).
427
- A crucial insight from the method is that the query and keys are
428
- transformed by rotation matrices which depend on the relative positions.
429
-
430
- Other implementations are available in the Rotary Transformer repo_ and in
431
- GPT-NeoX_; GPT-NeoX was an inspiration.
432
-
433
- .. _RoFormer: https://arxiv.org/abs/2104.09864
434
- .. _repo: https://github.com/ZhuiyiTechnology/roformer
435
- .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
436
-
437
- If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
438
- A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
439
- Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
440
- """
441
-
442
- def __init__(
443
- self,
444
- dim: int,
445
- base=10000.0,
446
- interleaved=False,
447
- scale_base=None,
448
- pos_idx_in_fp32=True,
449
- device=None,
450
- use_flash_attn=True,
451
- ):
452
- """
453
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
454
- of 1st half and 2nd half (GPT-NeoX style).
455
- pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
456
- otherwise they might be in lower precision.
457
- This option was added because previously (before 2023-07-02), when we construct
458
- the position indices, we use the dtype of self.inv_freq. In most cases this would
459
- be fp32, but if the model is trained in pure bf16 (not mixed precision), then
460
- self.inv_freq would be bf16, and the position indices are also in bf16.
461
- Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
462
- embeddings for some positions will coincide.
463
- To maintain compatibility with models previously trained in pure bf16,
464
- we add this option.
465
- """
466
- super().__init__()
467
- self.dim = dim
468
- self._base = float(base)
469
- self.pos_idx_in_fp32 = pos_idx_in_fp32
470
- self.use_flash_attn = use_flash_attn
471
- # Generate and save the inverse frequency buffer (non trainable)
472
- inv_freq = self._compute_inv_freq(device)
473
- self.register_buffer("inv_freq", inv_freq, persistent=False)
474
- self.interleaved = interleaved
475
- self.scale_base = scale_base
476
- scale = (
477
- (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
478
- / (1.4 * dim)
479
- if scale_base is not None
480
- else None
481
- )
482
- self.register_buffer("scale", scale, persistent=False)
483
-
484
- self._seq_len_cached = 0
485
- self._cos_cached = None
486
- self._sin_cached = None
487
- self._cos_k_cached = None
488
- self._sin_k_cached = None
489
-
490
- @property
491
- def base(self):
492
- return self._base
493
-
494
- @base.setter
495
- def base(self, new_base):
496
- new_base = float(new_base)
497
- if new_base > 0:
498
- if self._base != new_base: # only update if the base value has changed
499
- self._base = new_base
500
- self._update_cos_sin_cache(
501
- self._seq_len_cached,
502
- device=self.inv_freq.device,
503
- dtype=self._cos_cached.dtype if self._cos_cached is not None else None,
504
- rotary_base_changed=True,
505
- )
506
- else:
507
- raise ValueError("Rotary base value must be positive")
508
-
509
- def _compute_inv_freq(self, device=None):
510
- return 1.0 / (
511
- self.base
512
- ** (
513
- torch.arange(0, self.dim, 2, device=device, dtype=torch.float32)
514
- / self.dim
515
- )
516
- )
517
-
518
- def _update_cos_sin_cache(
519
- self, seqlen, device=None, dtype=None, rotary_base_changed=False
520
- ):
521
- # Reset the tables if the sequence length has changed,
522
- # if we're on a new device (possibly due to tracing for instance),
523
- # or if we're switching from inference mode to training
524
- # or if the rotary base value was changed
525
- if (
526
- seqlen > self._seq_len_cached
527
- or self._cos_cached is None
528
- or self._cos_cached.device != device
529
- or self._cos_cached.dtype != dtype
530
- or (self.training and self._cos_cached.is_inference())
531
- or rotary_base_changed
532
- ):
533
- self._seq_len_cached = seqlen
534
- # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
535
- # And the output of arange can be quite large, so bf16 would lose a lot of precision.
536
- # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
537
- if rotary_base_changed:
538
- self.inv_freq = self._compute_inv_freq(device=device)
539
- if self.pos_idx_in_fp32:
540
- t = torch.arange(seqlen, device=device, dtype=torch.float32)
541
- # We want fp32 here as well since inv_freq will be multiplied with t, and the output
542
- # will be large. Having it in bf16 will lose a lot of precision and cause the
543
- # cos & sin output to change significantly.
544
- # We want to recompute self.inv_freq if it was not loaded in fp32
545
- if self.inv_freq.dtype != torch.float32:
546
- inv_freq = self._compute_inv_freq(device=device)
547
- else:
548
- inv_freq = self.inv_freq
549
- else:
550
- t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
551
- inv_freq = self.inv_freq
552
-
553
- # Don't do einsum, it converts fp32 to fp16 under AMP
554
- # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
555
- freqs = torch.outer(t, inv_freq)
556
- if self.scale is None:
557
- self._cos_cached = torch.cos(freqs).to(dtype)
558
- self._sin_cached = torch.sin(freqs).to(dtype)
559
- else:
560
- power = (
561
- torch.arange(
562
- seqlen, dtype=self.scale.dtype, device=self.scale.device
563
- )
564
- - seqlen // 2
565
- ) / self.scale_base
566
- scale = self.scale.to(device=power.device) ** rearrange(
567
- power, "s -> s 1"
568
- )
569
- # We want the multiplication by scale to happen in fp32
570
- self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
571
- self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
572
- self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
573
- self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
574
-
575
- def forward(
576
- self,
577
- qkv: torch.Tensor,
578
- kv: Optional[torch.Tensor] = None,
579
- seqlen_offset: Union[int, torch.Tensor] = 0,
580
- cu_seqlens: Optional[torch.Tensor] = None,
581
- max_seqlen: Optional[int] = None,
582
- ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
583
- """
584
- qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
585
- else it's just q of shape (batch, seqlen, nheads, headdim)
586
- kv: (batch, seqlen, 2, nheads, headdim)
587
- seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
588
- Most commonly used in inference when we have KV cache.
589
- If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
590
- should pass in max_seqlen, which will update the cos / sin cache up to that length.
591
- Apply rotary embedding *inplace* to qkv and / or kv.
592
- """
593
- if cu_seqlens is not None:
594
- assert max_seqlen is not None
595
- seqlen = qkv.shape[1] if max_seqlen is None else max_seqlen
596
- if max_seqlen is not None:
597
- self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
598
- elif isinstance(seqlen_offset, int):
599
- self._update_cos_sin_cache(
600
- seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype
601
- )
602
- if kv is None:
603
- if self.scale is None:
604
- return apply_rotary_emb_qkv_(
605
- qkv,
606
- self._cos_cached,
607
- self._sin_cached,
608
- interleaved=self.interleaved,
609
- seqlen_offsets=seqlen_offset,
610
- cu_seqlens=cu_seqlens,
611
- max_seqlen=max_seqlen,
612
- use_flash_attn=self.use_flash_attn,
613
- )
614
- else:
615
- return apply_rotary_emb_qkv_(
616
- qkv,
617
- self._cos_cached,
618
- self._sin_cached,
619
- self._cos_k_cached,
620
- self._sin_k_cached,
621
- interleaved=self.interleaved,
622
- seqlen_offsets=seqlen_offset,
623
- cu_seqlens=cu_seqlens,
624
- max_seqlen=max_seqlen,
625
- use_flash_attn=self.use_flash_attn,
626
- )
627
- else:
628
- q = qkv
629
- q = apply_rotary_emb_func(
630
- q,
631
- self._cos_cached,
632
- self._sin_cached,
633
- interleaved=self.interleaved,
634
- inplace=True,
635
- seqlen_offsets=seqlen_offset,
636
- cu_seqlens=cu_seqlens,
637
- max_seqlen=max_seqlen,
638
- )
639
- if self.scale is None:
640
- kv = apply_rotary_emb_kv_(
641
- kv,
642
- self._cos_cached,
643
- self._sin_cached,
644
- interleaved=self.interleaved,
645
- seqlen_offsets=seqlen_offset,
646
- cu_seqlens=cu_seqlens,
647
- max_seqlen=max_seqlen,
648
- )
649
- else:
650
- kv = apply_rotary_emb_kv_(
651
- kv,
652
- self._cos_k_cached,
653
- self._sin_k_cached,
654
- interleaved=self.interleaved,
655
- seqlen_offsets=seqlen_offset,
656
- cu_seqlens=cu_seqlens,
657
- max_seqlen=max_seqlen,
658
- )
659
- return q, kv
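The rotary-embedding diff above implements fused rotary position embeddings (with optional XPos scaling, sequence offsets, and varlen `cu_seqlens` packing) on top of flash-attention's Triton kernel. As a reading aid, here is a minimal pure-PyTorch sketch of the underlying non-interleaved (GPT-NeoX-style) rotation applied to the first `rotary_dim` channels; the helper name `rotary_reference` is hypothetical, and the sketch ignores XPos scaling, offsets, and packed sequences, so it illustrates the math rather than the repository's code path.

```python
import torch

def rotary_reference(q: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """q: (batch, seqlen, nheads, headdim); cos, sin: (seqlen, rotary_dim / 2)."""
    rotary_dim = cos.shape[-1] * 2
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    q1, q2 = q_rot.chunk(2, dim=-1)            # 1st half / 2nd half (GPT-NeoX style)
    cos = cos[None, :, None, :]                # broadcast over batch and heads
    sin = sin[None, :, None, :]
    rotated = torch.cat((q1 * cos - q2 * sin, q1 * sin + q2 * cos), dim=-1)
    return torch.cat((rotated, q_pass), dim=-1)

# Toy inputs mirroring the cos/sin cache construction shown in the diff.
batch, seqlen, nheads, headdim = 2, 16, 4, 64
rotary_dim = headdim
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
freqs = torch.outer(torch.arange(seqlen, dtype=torch.float32), inv_freq)  # (seqlen, rotary_dim / 2)
q = torch.randn(batch, seqlen, nheads, headdim)
q_rotated = rotary_reference(q, torch.cos(freqs), torch.sin(freqs))
```

The interleaved (GPT-J-style) variant documented above rotates even/odd channel pairs instead of the two halves.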
 
stochastic_depth.py CHANGED
@@ -34,7 +34,7 @@
34
 
35
  import torch
36
  import torch.fx
37
- from torch import Tensor, nn
38
 
39
 
40
  def stochastic_depth(
 
34
 
35
  import torch
36
  import torch.fx
37
+ from torch import nn, Tensor
38
 
39
 
40
  def stochastic_depth(
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_max_length": 8194,
3
+ "tokenizer_class": "XLMRobertaTokenizer"
4
+ }
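The new `tokenizer_config.json` caps `model_max_length` at 8194 tokens and selects `XLMRobertaTokenizer`. A minimal usage sketch, assuming the files are hosted in a Hugging Face model repository (the repo id below is a placeholder) and that `transformers` and `sentencepiece` are installed:

```python
from transformers import AutoTokenizer

# Placeholder repo id -- substitute the actual model repository.
tokenizer = AutoTokenizer.from_pretrained("jinaai/xlm-roberta-flash-implementation")
print(tokenizer.model_max_length)  # 8194, from tokenizer_config.json

batch = tokenizer(["Hello world", "Bonjour le monde"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```

With `padding=True` the tokenizer also returns the `attention_mask` that `unpad_input` below consumes.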
xlm_padding.py CHANGED
@@ -18,9 +18,7 @@ class IndexFirstAxis(torch.autograd.Function):
18
  # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
19
  # return input[indices]
20
  return torch.gather(
21
- rearrange(input, "b ... -> b (...)"),
22
- 0,
23
- repeat(indices, "z -> z d", d=second_dim),
24
  ).reshape(-1, *other_shape)
25
 
26
  @staticmethod
@@ -36,9 +34,7 @@ class IndexFirstAxis(torch.autograd.Function):
36
  )
37
  # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
38
  # grad_input[indices] = grad_output
39
- grad_input.scatter_(
40
- 0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output
41
- )
42
  return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
43
 
44
 
@@ -102,7 +98,7 @@ class IndexFirstAxisResidual(torch.autograd.Function):
102
  index_first_axis_residual = IndexFirstAxisResidual.apply
103
 
104
 
105
- def unpad_input(hidden_states, attention_mask, adapter_mask=None):
106
  """
107
  Arguments:
108
  hidden_states: (batch, seqlen, ...)
@@ -116,16 +112,7 @@ def unpad_input(hidden_states, attention_mask, adapter_mask=None):
116
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
117
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
118
  max_seqlen_in_batch = seqlens_in_batch.max().item()
119
- cu_seqlens = F.pad(
120
- torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
121
- )
122
-
123
- cu_adapter_mask = (
124
- torch.repeat_interleave(adapter_mask, cu_seqlens[1:] - cu_seqlens[:-1])
125
- if adapter_mask is not None
126
- else None
127
- )
128
-
129
  # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
130
  # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
131
  # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
@@ -136,7 +123,6 @@ def unpad_input(hidden_states, attention_mask, adapter_mask=None):
136
  indices,
137
  cu_seqlens,
138
  max_seqlen_in_batch,
139
- cu_adapter_mask,
140
  )
141
 
142
 
@@ -194,18 +180,14 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng
194
  """
195
  length = attention_mask_in_length.sum(dim=-1)
196
  seqlen = attention_mask_in_length.size(-1)
197
- attention_mask_2d = torch.arange(
198
- seqlen, device=length.device, dtype=length.dtype
199
- ).expand(len(length), seqlen) < length.unsqueeze(1)
200
- real_indices_idx = torch.nonzero(
201
- attention_mask_in_length.flatten(), as_tuple=False
202
- ).flatten()
203
  seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
204
  indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
205
  max_seqlen_in_batch = seqlens_in_batch.max().item()
206
- cu_seqlens = F.pad(
207
- torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
208
- )
209
  # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
210
  # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
211
  # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
@@ -233,4 +215,4 @@ def pad_input(hidden_states, indices, batch, seqlen):
233
  # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
234
  # output[indices] = hidden_states
235
  output = index_put_first_axis(hidden_states, indices, batch * seqlen)
236
- return rearrange(output, "(b s) ... -> b s ...", b=batch)
 
18
  # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
19
  # return input[indices]
20
  return torch.gather(
21
+ rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
 
 
22
  ).reshape(-1, *other_shape)
23
 
24
  @staticmethod
 
34
  )
35
  # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
36
  # grad_input[indices] = grad_output
37
+ grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
 
 
38
  return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
39
 
40
 
 
98
  index_first_axis_residual = IndexFirstAxisResidual.apply
99
 
100
 
101
+ def unpad_input(hidden_states, attention_mask):
102
  """
103
  Arguments:
104
  hidden_states: (batch, seqlen, ...)
 
112
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
113
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
114
  max_seqlen_in_batch = seqlens_in_batch.max().item()
115
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
 
 
 
 
 
 
 
 
 
116
  # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
117
  # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
118
  # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
 
123
  indices,
124
  cu_seqlens,
125
  max_seqlen_in_batch,
 
126
  )
127
 
128
 
 
180
  """
181
  length = attention_mask_in_length.sum(dim=-1)
182
  seqlen = attention_mask_in_length.size(-1)
183
+ attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length),
184
+ seqlen) < length.unsqueeze(
185
+ 1)
186
+ real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
 
 
187
  seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
188
  indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
189
  max_seqlen_in_batch = seqlens_in_batch.max().item()
190
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
 
 
191
  # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
192
  # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
193
  # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
 
215
  # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
216
  # output[indices] = hidden_states
217
  output = index_put_first_axis(hidden_states, indices, batch * seqlen)
218
+ return rearrange(output, "(b s) ... -> b s ...", b=batch)
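The `xlm_padding.py` changes above drop the `adapter_mask` plumbing from `unpad_input`, but the unpad/pad round trip itself is unchanged: real tokens are packed into a single `(total_tokens, ...)` tensor together with `indices`, `cu_seqlens`, and the batch maximum sequence length, and `pad_input` scatters them back. Below is a self-contained sketch of that data flow using plain boolean indexing; the file itself uses `torch.gather`/`scatter_`-based autograd functions, so this illustrates the semantics rather than the optimized path.

```python
import torch
import torch.nn.functional as F

batch, seqlen, dim = 3, 8, 16
hidden_states = torch.randn(batch, seqlen, dim)
# Per-sequence lengths 8, 5 and 2 -> a standard right-padded attention mask.
attention_mask = (torch.arange(seqlen)[None, :] < torch.tensor([[8], [5], [2]])).int()

# Same bookkeeping as unpad_input: token indices, cumulative and maximum lengths.
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # fed to varlen attention
max_seqlen_in_batch = int(seqlens_in_batch.max())

# "unpad": keep only the real tokens, packed into one (total_tokens, dim) tensor.
packed = hidden_states.reshape(batch * seqlen, dim)[indices]

# "pad": scatter the packed tokens back into a zero-initialised (batch, seqlen, dim) tensor.
restored = torch.zeros(batch * seqlen, dim)
restored[indices] = packed
restored = restored.reshape(batch, seqlen, dim)
assert torch.equal(restored, hidden_states * attention_mask[..., None])
```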