Porting v2 models to flash attention (#15)
- Added GLUMLP, changed config accordingly, added code to convert state_dict (0211324e8c38d72ef847d46db9a9f389c864a5de)
- fixed GLU implementation, added conversion of layer norms (9587227ceebcbf4e7335c0938838e9a2eb0b5d6b)
Co-authored-by: Markus Krimmel <Markus28@users.noreply.huggingface.co>
- configuration_bert.py +3 -3
- convert_v2_weights.py +144 -0
- mlp.py +41 -0
- modeling_bert.py +18 -5
configuration_bert.py
CHANGED
@@ -75,7 +75,7 @@ class JinaBertConfig(PretrainedConfig):
         pad_token_id=0,
         window_size=(-1, -1),
         dense_seq_output=False,
-        fused_mlp=False,
+        mlp_type='mlp',
         mlp_checkpoint_lvl=0,
         last_layer_subset=False,
         fused_dropout_add_ln=False,
@@ -92,7 +92,7 @@ class JinaBertConfig(PretrainedConfig):
         assert 'max_position_embeddings' not in kwargs
         super().__init__(pad_token_id=pad_token_id, **kwargs)
 
-        if fused_mlp and hidden_act not in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]:
+        if mlp_type == 'fused_mlp' and hidden_act not in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]:
             raise ValueError('Fused MLP only supports approximate gelu')
 
         self.vocab_size = vocab_size
@@ -108,7 +108,7 @@ class JinaBertConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.window_size = window_size
         self.dense_seq_output = dense_seq_output
-        self.fused_mlp = fused_mlp
+        self.mlp_type = mlp_type
         self.mlp_checkpoint_lvl = mlp_checkpoint_lvl
         self.last_layer_subset = last_layer_subset
         self.fused_dropout_add_ln = fused_dropout_add_ln
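For reference, a minimal sketch of how the new field is meant to be used (only mlp_type and hidden_act are set explicitly; all other values rely on the class defaults and the snippet is illustrative, not part of the commit):

from .configuration_bert import JinaBertConfig

# GLU feed-forward as used by the v2 models; 'glu' expects 'relu' or 'gelu'.
glu_config = JinaBertConfig(mlp_type='glu', hidden_act='gelu')

# The former boolean fused_mlp flag is now spelled as a value of mlp_type;
# 'fused_mlp' still requires an approximate gelu, as the check above enforces.
fused_config = JinaBertConfig(mlp_type='fused_mlp', hidden_act='gelu_pytorch_tanh')

# The default keeps the plain MLP path.
plain_config = JinaBertConfig()  # mlp_type='mlp'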
convert_v2_weights.py
ADDED
@@ -0,0 +1,144 @@
+import re
+from collections import OrderedDict
+from transformers import AutoModel, AutoTokenizer
+from .configuration_bert import JinaBertConfig
+import torch
+from .modeling_bert import BertModel
+
+def remap_state_dict(state_dict, config: JinaBertConfig):
+    """
+    Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
+    """
+
+    # LayerNorm
+    def key_mapping_ln_gamma_beta(key):
+        key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
+        key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
+        return key
+
+    state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
+
+    # Layers
+    def key_mapping_layers(key):
+        return re.sub(r"^encoder.layer.", "encoder.layers.", key)
+
+    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
+
+    # LayerNorm
+    def key_mapping_ln(key):
+        key = re.sub(r"^embeddings.LayerNorm.", "emb_ln.", key)
+        key = re.sub(
+            r"^encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
+            r"encoder.layers.\1.norm1.\2",
+            key,
+        )
+        key = re.sub(
+            r"^encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
+            r"encoder.layers.\1.norm2.\2",
+            key,
+        )
+        key = re.sub(
+            r"^cls.predictions.transform.LayerNorm.(weight|bias)",
+            r"cls.predictions.transform.layer_norm.\1",
+            key,
+        )
+        return key
+
+    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
+
+    # MLP
+    def key_mapping_mlp(key):
+        key = re.sub(
+            r"^encoder.layers.(\d+).intermediate.dense.(weight|bias)",
+            r"encoder.layers.\1.mlp.fc1.\2",
+            key,
+        )
+        key = re.sub(
+            r"^encoder.layers.(\d+).output.dense.(weight|bias)",
+            r"encoder.layers.\1.mlp.fc2.\2",
+            key,
+        )
+        return key
+
+    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
+
+    # Attention
+    last_layer_subset = getattr(config, "last_layer_subset", False)
+    for d in range(config.num_hidden_layers):
+        Wq = state_dict.pop(f"encoder.layers.{d}.attention.self.query.weight")
+        Wk = state_dict.pop(f"encoder.layers.{d}.attention.self.key.weight")
+        Wv = state_dict.pop(f"encoder.layers.{d}.attention.self.value.weight")
+        bq = state_dict.pop(f"encoder.layers.{d}.attention.self.query.bias")
+        bk = state_dict.pop(f"encoder.layers.{d}.attention.self.key.bias")
+        bv = state_dict.pop(f"encoder.layers.{d}.attention.self.value.bias")
+        if not (last_layer_subset and d == config.num_hidden_layers - 1):
+            state_dict[f"encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
+                [Wq, Wk, Wv], dim=0
+            )
+            state_dict[f"encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
+        else:
+            state_dict[f"encoder.layers.{d}.mixer.Wq.weight"] = Wq
+            state_dict[f"encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
+            state_dict[f"encoder.layers.{d}.mixer.Wq.bias"] = bq
+            state_dict[f"encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
+
+    def key_mapping_attn(key):
+        return re.sub(
+            r"^encoder.layers.(\d+).attention.output.dense.(weight|bias)",
+            r"encoder.layers.\1.mixer.out_proj.\2",
+            key,
+        )
+
+    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
+
+    def key_mapping_decoder_bias(key):
+        return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
+
+    state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
+
+    # Word embedding
+    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
+    if pad_vocab_size_multiple > 1:
+        word_embeddings = state_dict["embeddings.word_embeddings.weight"]
+        state_dict["embeddings.word_embeddings.weight"] = F.pad(
+            word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
+        )
+        decoder_weight = state_dict["cls.predictions.decoder.weight"]
+        state_dict["cls.predictions.decoder.weight"] = F.pad(
+            decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
+        )
+        # If the vocab was padded, we want to set the decoder bias for those padded indices to be
+        # strongly negative (i.e. the decoder shouldn't predict those indices).
+        # TD [2022-05-09]: I don't think it affects the MLPerf training.
+        decoder_bias = state_dict["cls.predictions.decoder.bias"]
+        state_dict["cls.predictions.decoder.bias"] = F.pad(
+            decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
+        )
+
+    # LayerNorm
+    def key_mapping_layernorm(key):
+        return re.sub(r'^encoder.layers.(\d+).mlp.layernorm.(weight|bias)', r"encoder.layers.\1.norm2.\2", key)
+
+    state_dict = OrderedDict((key_mapping_layernorm(k), v) for k, v in state_dict.items())
+
+    return state_dict
+
+
+v2_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
+config = JinaBertConfig(vocab_size=30528, use_qk_norm=False, mlp_type='glu', hidden_act='gelu')
+state_dict = v2_model.state_dict()
+new_state_dict = remap_state_dict(state_dict, config)
+flash_model = BertModel(config)
+flash_model.load_state_dict(new_state_dict)
+
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')
+inp = tokenizer.batch_encode_plus(['Hello world', 'How is the weather today?', 'It is raining a lot in Berlin'], return_tensors='pt', padding=True).to('cuda')
+v2_model.eval()
+flash_model.eval()
+v2_model = v2_model.to('cuda', torch.float16)
+flash_model = flash_model.to('cuda', torch.float16)
+output_v2 = v2_model(**inp)
+output_flash = flash_model(**inp)
+x = output_v2.last_hidden_state
+y = output_flash.last_hidden_state
+print(torch.abs(x - y))
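Two hedged follow-ups to the script above: the vocab-padding branch uses F.pad, so it additionally needs torch.nn.functional imported as F, and the final print can be turned into an explicit tolerance check (the 1e-2 atol is an assumed float16 tolerance, not a value from this commit):

import torch.nn.functional as F  # needed by the pad_vocab_size_multiple branch of remap_state_dict

# Optional sanity check appended after the script above:
max_diff = torch.abs(x - y).max().item()
print(f"max abs difference: {max_diff:.4e}")
if not torch.allclose(x, y, atol=1e-2):
    print("warning: flash port deviates from the v2 reference beyond the assumed tolerance")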
mlp.py
CHANGED
@@ -27,6 +27,47 @@ except ImportError:
     FusedMLP, ParallelFusedMLP = None, None
 
 
+class GLUMLP(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features,
+        activation,
+        return_residual=False,
+        hidden_dropout_prob=0.1
+    ):
+        super().__init__()
+        self.hidden_features = hidden_features
+        self.gated_layers = nn.Linear(
+            in_features, hidden_features * 2, bias=False
+        )
+        if activation == 'relu':
+            self.act = nn.ReLU()
+        elif activation == 'gelu':
+            self.act = nn.GELU()
+        else:
+            raise ValueError(
+                f"activation {activation} not supported"
+            )
+        self.wo = nn.Linear(hidden_features, in_features)
+        self.dropout = nn.Dropout(hidden_dropout_prob)
+        self.return_residual = return_residual
+        # self.layernorm = nn.LayerNorm(in_features, eps=layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual_connection = hidden_states
+        # compute the activation
+        hidden_states = self.gated_layers(hidden_states)
+        gated = hidden_states[:, : self.hidden_features]
+        non_gated = hidden_states[:, self.hidden_features :]
+        hidden_states = self.act(gated) * non_gated
+        hidden_states = self.dropout(hidden_states)
+        # multiply by the second matrix
+        hidden_states = self.wo(hidden_states)
+        # add the residual connection and post-LN
+        # hidden_states = self.layernorm(hidden_states + residual_connection)
+        return hidden_states if not self.return_residual else (hidden_states, residual_connection)
+
 class Mlp(nn.Module):
     def __init__(
         self,
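To make the new module concrete, a self-contained sketch of exercising GLUMLP on its own (sizes and dropout are illustrative assumptions; the feature-dimension slicing in forward assumes the 2D, un-padded (total_tokens, hidden) layout that the flash-attention blocks operate on):

import torch
from .mlp import GLUMLP

# in_features/hidden_features mirror hidden_size/intermediate_size of a BERT-base-like model.
mlp = GLUMLP(in_features=768, hidden_features=3072, activation='gelu', hidden_dropout_prob=0.0)
x = torch.randn(32, 768)    # (total_tokens, hidden), i.e. already un-padded
out = mlp(x)                # Linear -> split into gated/non-gated halves -> gelu(gated) * non_gated -> Linear
assert out.shape == x.shape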
modeling_bert.py
CHANGED
@@ -39,7 +39,7 @@ from .bert_padding import (
 from .block import Block
 from .embedding import BertEmbeddings
 from .mha import MHA
-from .mlp import FusedMLP, Mlp
+from .mlp import FusedMLP, Mlp, GLUMLP
 
 try:
     from flash_attn.ops.fused_dense import FusedDense
@@ -89,12 +89,15 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
 
 def create_mlp_cls(config, layer_idx=None, return_residual=False):
     inner_dim = config.intermediate_size
-    fused_mlp = getattr(config, "fused_mlp", False)
-    if fused_mlp:
+    mlp_type = config.mlp_type
+    assert mlp_type in ('mlp', 'fused_mlp', 'glu')
+    if mlp_type == 'fused_mlp':
         assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
             "fused_mlp only " "supports approximate gelu"
         )
-    if not fused_mlp:
+    if mlp_type == 'glu':
+        assert config.hidden_act in ('relu', 'gelu')
+    if mlp_type == 'mlp':
         approximate = (
             "tanh"
             if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
@@ -106,7 +109,15 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
             activation=partial(F.gelu, approximate=approximate),
             return_residual=return_residual,
         )
-    else:
+    elif mlp_type == 'glu':
+        mlp_cls = partial(
+            GLUMLP,
+            hidden_features=inner_dim,
+            activation=config.hidden_act,
+            hidden_dropout_prob=config.hidden_dropout_prob,
+            return_residual=return_residual,
+        )
+    elif mlp_type == 'fused_mlp':
         if FusedMLP is None:
             raise ImportError("fused_dense is not installed")
         mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
@@ -120,6 +131,8 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
             checkpoint_lvl=mlp_checkpoint_lvl,
             return_residual=return_residual,
         )
+    else:
+        raise NotImplementedError
     return mlp_cls
 
 
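Finally, a hedged sketch of how the extended factory is expected to be consumed (it assumes, in flash-attn Block style, that the returned partial is later called with the layer width as its first positional argument, i.e. in_features):

from .configuration_bert import JinaBertConfig

config = JinaBertConfig(mlp_type='glu', hidden_act='gelu')
mlp_cls = create_mlp_cls(config, layer_idx=0)  # returns partial(GLUMLP, hidden_features=intermediate_size, ...)
mlp = mlp_cls(config.hidden_size)              # the Block is assumed to supply the layer width (in_features)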