feat-swiglu (#17), opened by bwang0911

Files changed:
- activation.py (+23 -0)
- config.json (+1 -1)
- mlp.py (+4 -2)
- modeling_xlm_roberta.py (+7 -5)
- modeling_xlm_roberta_for_glue.py (+0 -109)
activation.py
ADDED
@@ -0,0 +1,23 @@
+import torch.nn as nn
+import torch.nn.functional as F
+
+try:
+    from flash_attn.ops.activations import swiglu as flash_swiglu
+except ImportError:
+    flash_swiglu = None
+
+if flash_swiglu is None:
+    # PyTorch implementation of SwiGLU
+    class SwiGLU(nn.Module):
+        def forward(self, x):
+            x, gate = x.chunk(2, dim=-1)
+            return F.silu(gate) * x
+
+    def swiglu(x):
+        layer = SwiGLU()
+        return layer(x)
+
+else:
+    # Use Flash Attention's built-in swiglu
+    def swiglu(x):
+        return flash_swiglu(x)
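Worth noting for review: the PyTorch fallback splits its input in half along the last dimension and gates one half with SiLU of the other, so the output has half the input's feature width. A minimal standalone sketch of that contract (swiglu_reference is illustrative, not part of the diff):

import torch
import torch.nn.functional as F

def swiglu_reference(x: torch.Tensor) -> torch.Tensor:
    # Mirror of the PR's fallback: split the last dimension into a
    # value half and a gate half, then gate the value with SiLU(gate).
    x, gate = x.chunk(2, dim=-1)
    return F.silu(gate) * x

h = torch.randn(4, 16, 1536)      # (batch, seq, 2 * d_ff)
out = swiglu_reference(h)
assert out.shape == (4, 16, 768)  # the activation halves the last dimension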
config.json
CHANGED
@@ -12,7 +12,7 @@
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
   "eos_token_id": 2,
-  "hidden_act": "gelu",
+  "hidden_act": "swiglu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "initializer_range": 0.02,
mlp.py
CHANGED
@@ -24,6 +24,8 @@ try:
 except ImportError:
     FusedMLP, ParallelFusedMLP = None, None
 
+from .activation import swiglu
+
 
 class Mlp(nn.Module):
     def __init__(
@@ -31,7 +33,7 @@ class Mlp(nn.Module):
         in_features,
         hidden_features=None,
         out_features=None,
-        activation=F.gelu,
+        activation=swiglu,
         bias1=True,
         bias2=True,
         return_residual=False,
@@ -60,7 +62,7 @@ class ParallelMLP(nn.Module):
         in_features,
         hidden_features=None,
         out_features=None,
-        activation=F.gelu,
+        activation=swiglu,
         process_group: ProcessGroup = None,
         sequence_parallel=True,
         bias1=True,
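Because swiglu halves the feature dimension of whatever it receives, an fc1 -> activation -> fc2 stack only lines up if fc1 projects to twice the width that fc2 consumes. A minimal sketch of that wiring (SwigluMlp is illustrative, not the PR's Mlp class, whose internals are unchanged here):

import torch
import torch.nn as nn
import torch.nn.functional as F

def swiglu(x):
    # Same fallback definition as activation.py in this PR.
    x, gate = x.chunk(2, dim=-1)
    return F.silu(gate) * x

class SwigluMlp(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        # fc1 projects to 2 * d_ff so swiglu's chunk leaves d_ff features.
        self.fc1 = nn.Linear(d_model, 2 * d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.fc2(swiglu(self.fc1(x)))

x = torch.randn(2, 8, 768)
print(SwigluMlp(768, 3072)(x).shape)  # torch.Size([2, 8, 768])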
modeling_xlm_roberta.py
CHANGED
@@ -45,6 +45,7 @@ from .embedding import XLMRobertaEmbeddings
 from .mha import MHA
 from .mlp import FusedMLP, Mlp
 from .stochastic_depth import StochasticDepth
+from .activation import swiglu
 
 
 try:
@@ -118,19 +119,19 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
     inner_dim = config.intermediate_size
     fused_mlp = getattr(config, "fused_mlp", False)
     if fused_mlp:
-        assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
+        assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu"], (
            "fused_mlp only " "supports approximate gelu"
         )
     if not fused_mlp:
         approximate = (
            "tanh"
-            if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
+            if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu"]
             else "none"
         )
         mlp_cls = partial(
             Mlp,
             hidden_features=inner_dim,
-            activation=partial(F.gelu, approximate=approximate),
+            activation=swiglu,
             return_residual=return_residual,
         )
     else:
@@ -330,10 +331,10 @@ class XLMRobertaPredictionHeadTransform(nn.Module):
         self.dense = linear_cls(config.hidden_size, config.hidden_size)
         approximate = (
            "tanh"
-            if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
+            if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu"]
             else "none"
         )
-        self.transform_act_fn = nn.GELU(approximate=approximate)
+        self.transform_act_fn = swiglu
         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -424,6 +425,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             "gelu_new",
             "gelu_fast",
             "gelu_pytorch_tanh",
+            "swiglu",
         ]
 
         self.embeddings = XLMRobertaEmbeddings(
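End to end, setting hidden_act to "swiglu" in config.json routes every non-fused Mlp through the new activation via create_mlp_cls. A hedged loading sketch (the repo id is a placeholder, and trust_remote_code is assumed since the modeling code ships with the checkpoint):

from transformers import AutoConfig, AutoModel

repo = "org/xlm-roberta-flash"  # placeholder checkpoint id
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
assert config.hidden_act == "swiglu"

model = AutoModel.from_pretrained(repo, trust_remote_code=True)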
modeling_xlm_roberta_for_glue.py
DELETED
@@ -1,109 +0,0 @@
-from typing import Optional, Union, Tuple
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
-from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, TokenClassifierOutput
-
-from .modeling_xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
-from .configuration_xlm_roberta import XLMRobertaFlashConfig
-
-
-class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
-    def __init__(self, config: XLMRobertaFlashConfig):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-
-        self.roberta = XLMRobertaModel(config)
-        classifier_dropout = (
-            config.classifier_dropout
-            if config.classifier_dropout is not None
-            else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        assert head_mask is None
-        assert inputs_embeds is None
-        assert output_attentions is None
-        assert output_hidden_states is None
-        assert return_dict
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (
-                    labels.dtype == torch.long or labels.dtype == torch.int
-                ):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )