bourdoiscatie committed
Commit b3fe4f0
1 Parent(s): 44182a0

Upload 10 files

attn_ref.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+
+ def attn_ref(q, k, v, b, sm_scale, dropout_p=0.0, causal=False, upcast=False):
+     if upcast:
+         q, k, v = q.float(), k.float(), v.float()
+         if b is not None:
+             b = b.float()
+
+     if b is not None:
+         if (b.shape[0] != q.shape[0]) or (b.shape[1] != q.shape[1]):
+             b = b.expand(q.shape[0], q.shape[1], q.shape[2], k.shape[2])
+
+     ms = torch.arange(q.shape[2], device=q.device).unsqueeze(-1)
+     ns = torch.arange(k.shape[2], device=q.device)
+
+     p = torch.matmul(q, k.transpose(2, 3))
+     p *= sm_scale
+     if b is not None:
+         p += b
+
+     if causal:
+         p = torch.where(ms + k.shape[2] - q.shape[2] >= ns, p, float("-inf"))
+
+     p = torch.softmax(p.float(), dim=-1).to(q.dtype)
+     if dropout_p > 0.0:
+         p = torch.dropout(p, dropout_p, train=True)
+
+     ref_out = torch.matmul(p, v)
+     return ref_out
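attn_ref.py is the eager-mode reference attention used to validate the fused kernels in this upload. As a quick sanity check it can be compared against PyTorch's scaled_dot_product_attention; the snippet below is only an illustrative sketch (shapes, the broadcastable bias, and the scale= argument of a recent PyTorch are assumptions, and it is not part of the repository):

```python
import torch
import torch.nn.functional as F

from attn_ref import attn_ref  # assumes attn_ref.py is on the Python path

# Illustrative shapes: batch=2, heads=4, seqlen_q=seqlen_k=128, head_dim=64
B, H, M, N, D = 2, 4, 128, 128, 64
q = torch.randn(B, H, M, D)
k = torch.randn(B, H, N, D)
v = torch.randn(B, H, N, D)
bias = torch.randn(1, 1, M, N)   # broadcastable additive bias, as the kernels below expect
sm_scale = D ** -0.5

ref = attn_ref(q, k, v, bias, sm_scale, dropout_p=0.0, causal=False)
sdpa = F.scaled_dot_product_attention(q, k, v, attn_mask=bias, scale=sm_scale)
print((ref - sdpa).abs().max())  # should be tiny (float32 rounding only)
```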
configuration_flash_t5.py CHANGED
@@ -6,7 +6,7 @@ import logging
  from transformers import T5Config
 
  AUTO_MAP = {
-     "AutoModel": "modeling_flash_t5.FlashT5ForConditionalGeneration",
+     "AutoModel": "modeling_flash_t5.FlashT5EncoderModel",
      "AutoModelForSeq2SeqLM": "modeling_flash_t5.FlashT5ForConditionalGeneration",
      "AutoModelForTokenClassification": "custom_heads_flash_t5.FlashT5ForTokenClassification",
      "AutoModelForQuestionAnswering": "custom_heads_flash_t5.FlashT5ForQuestionAnswering",
@@ -26,7 +26,7 @@ class FlashT5Config(T5Config):
      use_randomized_position_encoding=False,
      label_smoothing=0.0,
      z_loss=None,
-     attention_type="ref",
+     use_flash_attention=None,
      max_sequence_length=1024,
      attention_dropout_rate=0.0,
      alibi_mode="symetric",
@@ -39,9 +39,6 @@ class FlashT5Config(T5Config):
      rotary_base=10000,
      rotary_interleaved=False,
      rotary_scale_base=None,
-     fire_mlp_width=32,
-     use_masking=False,
-     attention_scale=None,
      **kwargs,
  ):
      super().__init__(**kwargs)
@@ -53,7 +50,7 @@ class FlashT5Config(T5Config):
      self.use_randomized_position_encoding = use_randomized_position_encoding
      self.label_smoothing = label_smoothing
      self.z_loss = z_loss
-     self.attention_type = attention_type
+     self.use_flash_attention = use_flash_attention
      self.max_sequence_length = max_sequence_length
      self.alibi_mode = alibi_mode
      self.attention_dropout_rate = attention_dropout_rate
@@ -66,9 +63,6 @@ class FlashT5Config(T5Config):
      self.rotary_interleaved = rotary_interleaved
      self.rotary_scale_base = rotary_scale_base
      self.rotary_emb_fraction = rotary_emb_fraction
-     self.fire_mlp_width = fire_mlp_width
-     self.use_masking = use_masking
-     self.attention_scale = attention_scale
 
      self.auto_map = AUTO_MAP
 
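This configuration change repoints the bare AutoModel mapping at the encoder-only class and replaces attention_type (plus the FIRE/masking options) with a single use_flash_attention flag. A hedged sketch of how the auto classes resolve once trust_remote_code is enabled; the repository id below is a placeholder:

```python
from transformers import AutoConfig, AutoModel, AutoModelForSeq2SeqLM

repo = "CATIE-AQ/FAT5-small"   # placeholder id, replace with the actual checkpoint

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.use_flash_attention)   # the new flag (None by default)

encoder = AutoModel.from_pretrained(repo, trust_remote_code=True)              # -> FlashT5EncoderModel
seq2seq = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)  # -> FlashT5ForConditionalGeneration
```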
cross_entropy_loss.py ADDED
@@ -0,0 +1,277 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ # Copyright 2024 CATIE. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Modification to the original version from Unsloth:
17
+ # - return the z-loss
18
+ # - support for torch.compile
19
+
20
+ import triton
21
+ import triton.language as tl
22
+ import torch
23
+
24
+ MAX_FUSED_SIZE = 65536
25
+ next_power_of_2 = triton.next_power_of_2
26
+
27
+ def calculate_settings(n):
28
+ BLOCK_SIZE = next_power_of_2(n)
29
+ if BLOCK_SIZE > MAX_FUSED_SIZE:
30
+ raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\
31
+ f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.")
32
+ num_warps = 4
33
+ if BLOCK_SIZE >= 32768: num_warps = 32
34
+ elif BLOCK_SIZE >= 8192: num_warps = 16
35
+ elif BLOCK_SIZE >= 2048: num_warps = 8
36
+ return BLOCK_SIZE, num_warps
37
+
38
+ @triton.jit
39
+ def _cross_entropy_forward(logits_ptr, logits_row_stride,
40
+ loss_ptr,
41
+ lse_ptr,
42
+ labels_ptr,
43
+ n_cols,
44
+ BLOCK_SIZE: tl.constexpr,
45
+ IS_EVEN: tl.constexpr):
46
+ """
47
+ Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
48
+ Pi = exp(xi) / sum(exp(xi))
49
+ CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
50
+ = -y [ x - log[sum(exp(x))] ]
51
+ = y * (log[sum(exp(x))] - x)
52
+ If y == 0: CE_i = 0
53
+ If y == 1: CE_i = logsumexp - x
54
+ """
55
+ row_idx = tl.program_id(0)
56
+ logits_ptr += row_idx * logits_row_stride
57
+ loss_ptr += row_idx
58
+ lse_ptr += row_idx
59
+ labels_ptr += row_idx
60
+
61
+ col_offsets = tl.arange(0, BLOCK_SIZE)
62
+ mask = col_offsets < n_cols
63
+
64
+ # TODO: Fixup int32 locations to int64
65
+ label_idx = tl.load(labels_ptr).to(tl.int32)
66
+ if IS_EVEN:
67
+ logits = tl.load(logits_ptr + col_offsets).to(tl.float32)
68
+ else:
69
+ logits = tl.load(logits_ptr + col_offsets, mask=mask, other=-float("inf")).to(tl.float32)
70
+
71
+ max_logits = tl.max(logits, 0)
72
+
73
+ # Maximum stops overflow
74
+ lse = tl.log(tl.sum(tl.exp(logits - max_logits), 0)) + max_logits
75
+ tl.store(lse_ptr, lse)
76
+
77
+ if label_idx != -100:
78
+ logits_label = tl.load(logits_ptr + label_idx).to(tl.float32)
79
+ loss = lse - logits_label
80
+ else:
81
+ loss = 0.0
82
+
83
+ tl.store(loss_ptr, loss)
84
+
85
+ @triton.jit
86
+ def _cross_entropy_backward(logits_ptr, logits_row_stride,
87
+ dinputs_ptr, dinputs_row_stride,
88
+ dloss_ptr, dloss_row_stride,
89
+ dzloss_ptr, dzloss_row_stride,
90
+ lse_ptr,
91
+ labels_ptr,
92
+ n_cols,
93
+ BLOCK_SIZE: tl.constexpr,
94
+ USE_Z_LOSS: tl.constexpr,
95
+ IS_EVEN: tl.constexpr):
96
+ """
97
+ CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
98
+ dC/dx = d/dx (y * log[sum(exp(x))] - x * y)
99
+
100
+ From https://en.wikipedia.org/wiki/LogSumExp
101
+ d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)
102
+
103
+ dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
104
+ dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
105
+ dC/dx = y * exp[x - logsumexp] - d/dx (x * y)
106
+
107
+ If y == 0: dC/dx = 0
108
+ If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
109
+ If y == 1 and x != label: dC/dx = exp[x - logsumexp]
110
+ """
111
+
112
+ row_idx = tl.program_id(0)
113
+
114
+ logits_ptr += row_idx * logits_row_stride
115
+ dinputs_ptr += row_idx * dinputs_row_stride
116
+ dloss_ptr += row_idx * dloss_row_stride
117
+ dzloss_ptr += row_idx * dzloss_row_stride
118
+ col_offsets = tl.arange(0, BLOCK_SIZE)
119
+ mask = col_offsets < n_cols
120
+ # TODO: Fixup int32 locations to int64
121
+ label_idx = tl.load(labels_ptr + row_idx).to(tl.int32)
122
+
123
+ if label_idx != -100:
124
+ dloss = tl.load(dloss_ptr)
125
+ dzloss = tl.load(dzloss_ptr)
126
+ else:
127
+ dloss = 0.0
128
+ dzloss = 0.0
129
+
130
+ if IS_EVEN:
131
+ logits = tl.load(logits_ptr + col_offsets).to(tl.float32)
132
+ else:
133
+ logits = tl.load(logits_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
134
+
135
+ lse = tl.load(lse_ptr + row_idx)
136
+ probs = tl.exp(logits - lse)
137
+
138
+ probs = tl.where(col_offsets == label_idx, probs - 1.0, probs)
139
+ din = dloss * probs
140
+
141
+ # Z_loss
142
+ if USE_Z_LOSS:
143
+ if label_idx != -100:
144
+ dzloss = tl.load(dzloss_ptr)
145
+ else:
146
+ dzloss = 0.0
147
+
148
+ row_minus_max = logits
149
+ numerator = tl.exp(row_minus_max)
150
+ denominator = tl.sum(numerator, axis=0)
151
+ softmax_output = numerator / denominator
152
+ din += softmax_output * dzloss
153
+
154
+ if IS_EVEN:
155
+ tl.store(dinputs_ptr + col_offsets, din)
156
+ else:
157
+ tl.store(dinputs_ptr + col_offsets, din, mask=mask)
158
+
159
+
160
+ # Wrapper for triton kernel for torch.compile - should be unecessary for PyTorch 2.3 ?
161
+ torch.library.define("flasht5::cross_entropy_triton_fwd", "(Tensor logits, Tensor labels, int n_cols, int n_rows, int BLOCK_SIZE, int num_warps) -> (Tensor, Tensor)")
162
+
163
+ @torch.library.impl("flasht5::cross_entropy_triton_fwd", "default")
164
+ def cross_entropy_triton_fwd(logits, labels, n_cols, n_rows, BLOCK_SIZE, num_warps):
165
+ losses = torch.empty(n_rows, dtype=torch.float32, device=logits.device)
166
+ logsumexp = torch.empty(n_rows, dtype=torch.float32, device=logits.device)
167
+
168
+ _cross_entropy_forward[(n_rows,)](
169
+ logits, logits.stride(0),
170
+ losses,
171
+ logsumexp,
172
+ labels,
173
+ n_cols,
174
+ BLOCK_SIZE = BLOCK_SIZE,
175
+ IS_EVEN=((n_cols % BLOCK_SIZE) == 0),
176
+ num_warps = num_warps,
177
+ )
178
+
179
+ return losses, logsumexp
180
+
181
+
182
+ @torch.library.impl_abstract("flasht5::cross_entropy_triton_fwd", cross_entropy_triton_fwd)
183
+ def cross_entropy_triton_fwd_abstract(logits, labels, n_cols, n_rows, BLOCK_SIZE, num_warps):
184
+ losses = torch.empty(n_rows, dtype=torch.float32, device=logits.device)
185
+ logsumexp = torch.empty(n_rows, dtype=torch.float32, device=logits.device)
186
+
187
+ return losses, logsumexp
188
+
189
+ torch.library.define("flasht5::cross_entropy_triton_bwd", "(Tensor dlosses, Tensor dlogsumexp, Tensor logits, Tensor logsumexp, Tensor labels, float z_loss_factor, int n_cols, int n_rows, int BLOCK_SIZE, int num_warps) -> Tensor")
190
+
191
+ @torch.library.impl("flasht5::cross_entropy_triton_bwd", "default")
192
+ def cross_entropy_triton_bwd(dlosses, dlogsumexp, logits, logsumexp, labels, z_loss_factor, n_cols, n_rows, BLOCK_SIZE, num_warps):
193
+
194
+ dinputs = torch.empty_like(logits)
195
+
196
+ _cross_entropy_backward[(n_rows,)](
197
+ logits, logits.stride(0),
198
+ dinputs, dinputs.stride(0),
199
+ dlosses, dlosses.stride(0),
200
+ dlogsumexp, dlogsumexp.stride(0),
201
+ logsumexp,
202
+ labels,
203
+ n_cols,
204
+ BLOCK_SIZE = BLOCK_SIZE,
205
+ USE_Z_LOSS = (z_loss_factor != 0.0),
206
+ IS_EVEN=((n_cols % BLOCK_SIZE) == 0),
207
+ num_warps = num_warps,
208
+ )
209
+
210
+ return dinputs
211
+
212
+
213
+ @torch.library.impl_abstract("flasht5::cross_entropy_triton_bwd", cross_entropy_triton_bwd)
214
+ def cross_entropy_triton_bwd_abstract(dlosses, dlogsumexp, logits, logsumexp, labels, z_loss_factor, n_cols, n_rows, BLOCK_SIZE, num_warps):
215
+ return torch.empty_like(logits)
216
+
217
+ class Fast_CrossEntropyLoss(torch.autograd.Function):
218
+ @staticmethod
219
+ def forward(ctx, logits, labels, z_loss_factor):
220
+ n_rows, n_cols = logits.shape
221
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
222
+
223
+ losses, logsumexp = torch.ops.flasht5.cross_entropy_triton_fwd(
224
+ logits,
225
+ labels,
226
+ n_cols,
227
+ n_rows,
228
+ BLOCK_SIZE = BLOCK_SIZE,
229
+ num_warps = num_warps
230
+ )
231
+
232
+ ctx.BLOCK_SIZE = BLOCK_SIZE
233
+ ctx.num_warps = num_warps
234
+ ctx.z_loss_factor = z_loss_factor
235
+ ctx.save_for_backward(logits, logsumexp, labels)
236
+ return losses, logsumexp
237
+
238
+ @staticmethod
239
+ def backward(ctx, dlosses, dlogsumexp):
240
+ logits, logsumexp, labels = ctx.saved_tensors
241
+ n_rows, n_cols = logits.shape
242
+
243
+ dinputs = torch.ops.flasht5.cross_entropy_triton_bwd(
244
+ dlosses,
245
+ dlogsumexp,
246
+ logits,
247
+ logsumexp,
248
+ labels,
249
+ ctx.z_loss_factor,
250
+ n_cols,
251
+ n_rows,
252
+ ctx.BLOCK_SIZE,
253
+ ctx.num_warps
254
+ )
255
+ return dinputs, None, None
256
+
257
+ def fast_cross_entropy_loss(logits, labels, z_loss_factor=0.0):
258
+ """
259
+ Arguments:
260
+ logits: (batch, seq_len, vocab_size)
261
+ labels: (batch, seq_len,)
262
+ Returns:
263
+ losses: float
264
+ """
265
+ batch, seq_len, d = logits.shape
266
+ assert(labels.shape == (batch, seq_len))
267
+ assert (d <= MAX_FUSED_SIZE)
268
+
269
+ loss, lse = Fast_CrossEntropyLoss.apply(
270
+ logits.view(batch*seq_len, d),
271
+ labels.view(-1),
272
+ z_loss_factor
273
+ )
274
+
275
+ n_items = torch.count_nonzero(labels != -100)
276
+
277
+ return loss.sum() / n_items, (z_loss_factor * torch.square(lse).sum()) / n_items
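cross_entropy_loss.py fuses the softmax cross entropy and exposes the per-row logsumexp so that the z-loss term z * mean(lse^2) can be returned alongside the token-averaged loss (labels equal to -100 are ignored). A hedged usage sketch cross-checking the fused loss against torch.nn.functional.cross_entropy; it assumes a recent PyTorch on a CUDA device with Triton installed and that the file is importable:

```python
import torch
import torch.nn.functional as F

from cross_entropy_loss import fast_cross_entropy_loss  # assumes this file is on the path

batch, seq_len, vocab = 2, 16, 32128
logits = torch.randn(batch, seq_len, vocab, device="cuda", requires_grad=True)
labels = torch.randint(0, vocab, (batch, seq_len), device="cuda")
labels[:, -2:] = -100                      # padded positions are ignored

loss, z_loss = fast_cross_entropy_loss(logits, labels, z_loss_factor=1e-4)

# Reference: same ignore_index and mean over the non-ignored tokens
ref = F.cross_entropy(logits.view(-1, vocab), labels.view(-1), ignore_index=-100)
print(loss.item(), ref.item())             # expected to agree closely

(loss + z_loss).backward()                 # gradients flow through the fused Triton kernels
```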
custom_heads_flash_t5.py ADDED
@@ -0,0 +1,312 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
4
+ import copy
5
+ from typing import Optional, Union, Tuple, List
6
+ from transformers.modeling_outputs import (
7
+ Seq2SeqQuestionAnsweringModelOutput,
8
+ QuestionAnsweringModelOutput,
9
+ TokenClassifierOutput,
10
+ BaseModelOutput,
11
+ Seq2SeqSequenceClassifierOutput,
12
+ SequenceClassifierOutput
13
+ )
14
+
15
+ from .modeling_flash_t5 import FlashT5PreTrainedModel, FlashT5Stack, FlashT5Model, FlashT5EncoderModel
16
+ from .configuration_flash_t5 import FlashT5Config
17
+
18
+
19
+ ################## Encoder only head ##################
20
+ class FlashT5ForTokenClassification(FlashT5PreTrainedModel):
21
+
22
+ def __init__(self, config: FlashT5Config):
23
+ super().__init__(config)
24
+ self.num_labels = config.num_labels
25
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
26
+
27
+ self.encoder = FlashT5Stack(config, self.shared)
28
+ self.dropout = nn.Dropout(config.classifier_dropout)
29
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
30
+
31
+ # Initialize weights and apply final processing
32
+ self.post_init()
33
+
34
+ # Initialize classifier
35
+ self.classifier.weight.data.normal_(mean=0.0, std=config.initializer_factor * 1.0)
36
+ self.classifier.bias.data.zero_()
37
+
38
+ self.model_parallel = False
39
+
40
+ def forward(
41
+ self,
42
+ input_ids: Optional[torch.Tensor] = None,
43
+ attention_mask: Optional[torch.Tensor] = None,
44
+ head_mask: Optional[torch.Tensor] = None,
45
+ inputs_embeds: Optional[torch.Tensor] = None,
46
+ labels: Optional[torch.Tensor] = None,
47
+ output_attentions: Optional[bool] = None,
48
+ output_hidden_states: Optional[bool] = None,
49
+ return_dict: Optional[bool] = None,
50
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
51
+ r"""
52
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
53
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
54
+ Returns:
55
+ """
56
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
57
+
58
+ outputs = self.encoder(
59
+ input_ids=input_ids,
60
+ attention_mask=attention_mask,
61
+ inputs_embeds=inputs_embeds,
62
+ head_mask=head_mask,
63
+ output_attentions=output_attentions,
64
+ output_hidden_states=output_hidden_states,
65
+ return_dict=return_dict,
66
+ )
67
+
68
+ hidden_states = outputs[0]
69
+ hidden_states = self.dropout(hidden_states)
70
+ logits = self.classifier(hidden_states)
71
+
72
+ loss = None
73
+ if labels is not None:
74
+ loss_fct = nn.CrossEntropyLoss()
75
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
76
+
77
+ if not return_dict:
78
+ output = (logits, outputs[2:-1])
79
+ return ((loss,) + output) if loss is not None else output
80
+
81
+ return TokenClassifierOutput(
82
+ loss=loss,
83
+ logits=logits,
84
+ hidden_states=outputs.hidden_states,
85
+ attentions=outputs.attentions,
86
+ )
87
+
88
+
89
+ class FlashT5ClassificationHead(nn.Module):
90
+ """Head for sentence-level classification tasks."""
91
+
92
+ def __init__(self, config: FlashT5Config):
93
+ super().__init__()
94
+ self.dense = nn.Linear(config.d_model, config.d_model)
95
+ self.dropout = nn.Dropout(p=config.classifier_dropout)
96
+ self.out_proj = nn.Linear(config.d_model, config.num_labels)
97
+
98
+ # initialize weights
99
+ factor = config.initializer_factor
100
+ self.dense.weight.data.normal_(mean=0.0, std=factor * ((config.d_model) ** -0.5))
101
+ if hasattr(self.dense, "bias") and self.dense.bias is not None:
102
+ self.dense.bias.data.zero_()
103
+ self.out_proj.weight.data.normal_(mean=0.0, std=factor * ((config.d_model) ** -0.5))
104
+ if hasattr(self.out_proj, "bias") and self.out_proj.bias is not None:
105
+ self.out_proj.bias.data.zero_()
106
+
107
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
108
+ hidden_states = self.dropout(hidden_states)
109
+ hidden_states = self.dense(hidden_states)
110
+ hidden_states = torch.tanh(hidden_states)
111
+ hidden_states = self.dropout(hidden_states)
112
+ hidden_states = self.out_proj(hidden_states)
113
+ return hidden_states
114
+
115
+
116
+ class FlashT5ForSequenceClassification(FlashT5PreTrainedModel):
117
+ _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
118
+
119
+ def __init__(self, config: FlashT5Config):
120
+ super().__init__(config)
121
+ self.model_dim = config.d_model
122
+ self.config.problem_type = None
123
+ self.config.is_encoder_decoder = False
124
+
125
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
126
+
127
+ encoder_config = copy.deepcopy(config)
128
+ encoder_config.is_decoder = False
129
+ encoder_config.is_encoder_decoder = False
130
+ encoder_config.use_cache = False
131
+ self.encoder = FlashT5Stack(encoder_config, self.shared)
132
+ self.classification_head = FlashT5ClassificationHead(config)
133
+
134
+ # Initialize weights and apply final processing
135
+ self.post_init()
136
+
137
+ self.model_parallel = False
138
+
139
+ def forward(
140
+ self,
141
+ input_ids: torch.LongTensor = None,
142
+ attention_mask: Optional[torch.Tensor] = None,
143
+ head_mask: Optional[torch.Tensor] = None,
144
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
145
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
146
+ inputs_embeds: Optional[torch.FloatTensor] = None,
147
+ labels: Optional[torch.LongTensor] = None,
148
+ use_cache: Optional[bool] = None,
149
+ output_attentions: Optional[bool] = None,
150
+ output_hidden_states: Optional[bool] = None,
151
+ return_dict: Optional[bool] = None,
152
+ ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
153
+ r"""
154
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
155
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
156
+ config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
157
+ Returns:
158
+ """
159
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
160
+ if labels is not None:
161
+ use_cache = False
162
+
163
+ if input_ids is None and inputs_embeds is not None:
164
+ raise NotImplementedError(
165
+ f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
166
+ )
167
+
168
+
169
+ outputs = self.encoder(
170
+ input_ids=input_ids,
171
+ attention_mask=attention_mask,
172
+ inputs_embeds=inputs_embeds,
173
+ head_mask=head_mask,
174
+ output_attentions=output_attentions,
175
+ output_hidden_states=output_hidden_states,
176
+ return_dict=return_dict,
177
+ )
178
+ sequence_output = outputs[0]
179
+
180
+ eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)
181
+
182
+ if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
183
+ raise ValueError("All examples must have the same number of <eos> tokens.")
184
+ batch_size, _, hidden_size = sequence_output.shape
185
+ sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
186
+ logits = self.classification_head(sentence_representation)
187
+
188
+ loss = None
189
+ if labels is not None:
190
+ labels = labels.to(logits.device)
191
+ if self.config.problem_type is None:
192
+ if self.config.num_labels == 1:
193
+ self.config.problem_type = "regression"
194
+ elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
195
+ self.config.problem_type = "single_label_classification"
196
+ else:
197
+ self.config.problem_type = "multi_label_classification"
198
+
199
+ if self.config.problem_type == "regression":
200
+ loss_fct = nn.MSELoss()
201
+ if self.config.num_labels == 1:
202
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
203
+ else:
204
+ loss = loss_fct(logits, labels)
205
+ elif self.config.problem_type == "single_label_classification":
206
+ loss_fct = nn.CrossEntropyLoss()
207
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
208
+ elif self.config.problem_type == "multi_label_classification":
209
+ loss_fct = nn.BCEWithLogitsLoss()
210
+ loss = loss_fct(logits, labels)
211
+ if not return_dict:
212
+ output = (logits,) + outputs[1:]
213
+ return ((loss,) + output) if loss is not None else output
214
+
215
+ return SequenceClassifierOutput(
216
+ loss=loss,
217
+ logits=logits,
218
+ hidden_states=outputs.hidden_states,
219
+ attentions=outputs.attentions
220
+ )
221
+
222
+
223
+
224
+ ################## Seq2Seq head ##################
225
+ class FlashT5ForQuestionAnswering(FlashT5PreTrainedModel):
226
+ _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
227
+
228
+ def __init__(self, config: FlashT5Config):
229
+ super().__init__(config)
230
+ self.transformer = FlashT5EncoderModel(config)
231
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
232
+
233
+ # Initialize weights and apply final processing
234
+ self.post_init()
235
+
236
+ self.model_parallel = False
237
+
238
+ def forward(
239
+ self,
240
+ input_ids: Optional[torch.LongTensor] = None,
241
+ attention_mask: Optional[torch.FloatTensor] = None,
242
+ head_mask: Optional[torch.FloatTensor] = None,
243
+ inputs_embeds: Optional[torch.FloatTensor] = None,
244
+ start_positions: Optional[torch.LongTensor] = None,
245
+ end_positions: Optional[torch.LongTensor] = None,
246
+ output_attentions: Optional[bool] = None,
247
+ output_hidden_states: Optional[bool] = None,
248
+ return_dict: Optional[bool] = None,
249
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
250
+ r"""
251
+ Returns:
252
+
253
+ Example:
254
+
255
+ ```python
256
+ >>> from transformers import AutoTokenizer, MTxEncoderForQuestionAnswering
257
+
258
+ >>> tokenizer = AutoTokenizer.from_pretrained("MTx-small")
259
+ >>> model = MTxEncoderForQuestionAnswering.from_pretrained("MTx-small")
260
+ >>> input_ids = tokenizer(
261
+ ... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
262
+ ... ).input_ids # Batch size 1
263
+ >>> outputs = model(input_ids=input_ids)
264
+ >>> start_logits = outputs.start_logits
265
+ >>> end_logits = outputs.end_logits
266
+ ```"""
267
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
268
+
269
+ outputs = self.transformer(
270
+ input_ids,
271
+ attention_mask=attention_mask,
272
+ head_mask=head_mask,
273
+ inputs_embeds=inputs_embeds,
274
+ output_attentions=output_attentions,
275
+ output_hidden_states=output_hidden_states,
276
+ return_dict=return_dict,
277
+ )
278
+ sequence_output = outputs[0]
279
+
280
+ logits = self.qa_outputs(sequence_output)
281
+ start_logits, end_logits = logits.split(1, dim=-1)
282
+ start_logits = start_logits.squeeze(-1).contiguous()
283
+ end_logits = end_logits.squeeze(-1).contiguous()
284
+
285
+ total_loss = None
286
+ if start_positions is not None and end_positions is not None:
287
+ # If we are on multi-GPU, split add a dimension
288
+ if len(start_positions.size()) > 1:
289
+ start_positions = start_positions.squeeze(-1).to(start_logits.device)
290
+ if len(end_positions.size()) > 1:
291
+ end_positions = end_positions.squeeze(-1).to(end_logits.device)
292
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
293
+ ignored_index = start_logits.size(1)
294
+ start_positions = start_positions.clamp(0, ignored_index)
295
+ end_positions = end_positions.clamp(0, ignored_index)
296
+
297
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
298
+ start_loss = loss_fct(start_logits, start_positions)
299
+ end_loss = loss_fct(end_logits, end_positions)
300
+ total_loss = (start_loss + end_loss) / 2
301
+
302
+ if not return_dict:
303
+ output = (start_logits, end_logits) + outputs[1:]
304
+ return ((total_loss,) + output) if total_loss is not None else output
305
+
306
+ return QuestionAnsweringModelOutput(
307
+ loss=total_loss,
308
+ start_logits=start_logits,
309
+ end_logits=end_logits,
310
+ hidden_states=outputs.hidden_states,
311
+ attentions=outputs.attentions,
312
+ )
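These heads wrap the FlashT5 encoder stack only, so they load through the auto classes registered in AUTO_MAP. A hedged sketch of using the token-classification head; the checkpoint id is a placeholder and num_labels is illustrative:

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

repo = "CATIE-AQ/FAT5-small"  # placeholder, replace with the actual checkpoint

tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForTokenClassification.from_pretrained(
    repo, num_labels=9, trust_remote_code=True  # resolves to FlashT5ForTokenClassification via AUTO_MAP
)

enc = tokenizer("Le chat dort sur le canapé.", return_tensors="pt")
labels = torch.zeros_like(enc["input_ids"])      # dummy labels, one per token
out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"], labels=labels)
print(out.loss, out.logits.shape)                # logits: (batch, seq_len, num_labels)
```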
fa2_compilable.py ADDED
@@ -0,0 +1,642 @@
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ from typing import Optional, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ # isort: off
9
+ # We need to import the CUDA kernels after importing torch
10
+ import flash_attn_2_cuda as flash_attn_cuda
11
+
12
+ # isort: on
13
+
14
+ torch.library.define("fa2::fwd", "(Tensor q, Tensor k, Tensor v, Tensor out, Tensor alibi_slopes, float dropout_p, float softmax_scale, bool causal, int window_size_left, int window_size_right, Tensor attn_bias, bool return_softmax, Tensor gen_) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)")
15
+
16
+ @torch.library.impl("fa2::fwd", "default")
17
+ def cuda_fa2_fwd(
18
+ q: torch.Tensor,
19
+ k: torch.Tensor,
20
+ v: torch.Tensor,
21
+ out: torch.Tensor,
22
+ alibi_slopes: torch.Tensor,
23
+ dropout_p: float,
24
+ softmax_scale: float,
25
+ causal: bool,
26
+ window_size_left: int,
27
+ window_size_right: int,
28
+ attn_bias: torch.Tensor,
29
+ return_softmax: bool,
30
+ gen_: torch.Tensor,
31
+ ):
32
+
33
+ out, q, k, v, out_padded, attn_bias, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd(q, k, v, out, alibi_slopes, dropout_p, softmax_scale, causal, window_size_left, window_size_right, attn_bias, return_softmax, None)
34
+ return out, q, k, v, out_padded, attn_bias, softmax_lse, S_dmask, rng_state
35
+
36
+ @torch.library.impl_abstract("fa2::fwd", cuda_fa2_fwd)
37
+ def meta_fa2_fwd(
38
+ q: torch.Tensor,
39
+ k: torch.Tensor,
40
+ v: torch.Tensor,
41
+ out: torch.Tensor,
42
+ alibi_slopes: torch.Tensor,
43
+ dropout_p: float,
44
+ softmax_scale: float,
45
+ causal: bool,
46
+ window_size_left: int,
47
+ window_size_right: int,
48
+ attn_bias: torch.Tensor,
49
+ return_softmax: bool,
50
+ gen_: torch.Tensor
51
+ ):
52
+
53
+ round_multiple = lambda x, m: (x + m - 1) // m * m
54
+ batch_size = q.shape[0]
55
+ seqlen_q = q.shape[1]
56
+ seqlen_k = k.shape[1]
57
+ num_heads = q.shape[2]
58
+ head_dim_og = q.shape[3]
59
+ seqlen_q_rounded = round_multiple(seqlen_q, 128)
60
+ seqlen_k_rounded = round_multiple(seqlen_k, 128)
61
+ seqlen_q_rounded_8 = round_multiple(seqlen_q, 8)
62
+ seqlen_k_rounded_8 = round_multiple(seqlen_k, 8)
63
+ head_dim = round_multiple(head_dim_og, 8)
64
+
65
+ if attn_bias is not None:
66
+ batch_size_bias = attn_bias.shape[0]
67
+ num_heads_bias = attn_bias.shape[1]
68
+
69
+ return (torch.empty_strided((batch_size, seqlen_q, num_heads, head_dim_og),
70
+ (head_dim*num_heads*seqlen_q, head_dim*num_heads, head_dim, 1), device=q.device, dtype=q.dtype), # out
71
+ q.new_empty((batch_size, seqlen_q, num_heads, head_dim)), # q_padded
72
+ k.new_empty((batch_size, seqlen_k, num_heads, head_dim)), # k_padded
73
+ v.new_empty((batch_size, seqlen_k, num_heads, head_dim)), # v_padded
74
+ q.new_empty((batch_size, seqlen_q, num_heads, head_dim)), # out_padded
75
+ q.new_empty((batch_size_bias, num_heads_bias, seqlen_q_rounded_8, seqlen_k_rounded_8)) if attn_bias is not None else None, # attn_bias
76
+ q.new_empty((batch_size, num_heads, seqlen_q)), # softmax_lse
77
+ q.new_empty((batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded)) if return_softmax and (dropout_p > 0) else None, # p
78
+ torch.empty((2), dtype=torch.int64, device=q.device) # rng_state
79
+ )
80
+
81
+ torch.library.define("fa2::bwd", "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor dq, Tensor dk, Tensor dv, Tensor alibi_slopes, float dropout_p, float softmax_scale, bool causal, int window_size_left, int window_size_right, bool deterministic, Tensor attn_bias, bool attn_bias_require_grad, Tensor ds, int seqlen_k_orig, Tensor gen_, Tensor rng_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)")
82
+
83
+ @torch.library.impl("fa2::bwd", "default")
84
+ def cuda_fa2_bwd(
85
+ dout: torch.Tensor,
86
+ q: torch.Tensor,
87
+ k: torch.Tensor,
88
+ v: torch.Tensor,
89
+ out: torch.Tensor,
90
+ softmax_lse: torch.Tensor,
91
+ dq: torch.Tensor,
92
+ dk: torch.Tensor,
93
+ dv: torch.Tensor,
94
+ alibi_slopes: torch.Tensor,
95
+ dropout_p: float,
96
+ softmax_scale: float,
97
+ causal: bool,
98
+ window_size_left: int,
99
+ window_size_right: int,
100
+ deterministic: bool,
101
+ attn_bias: torch.Tensor,
102
+ attn_bias_require_grad: bool,
103
+ ds: torch.Tensor,
104
+ seqlen_k_orig: int,
105
+ gen_: torch.Tensor,
106
+ rng_state: torch.Tensor
107
+ ):
108
+ dq, dk, dv, ds, s = flash_attn_cuda.bwd(dout, q, k, v, out, softmax_lse, dq, dk, dv, alibi_slopes, dropout_p, softmax_scale, causal, window_size_left, window_size_right, deterministic, attn_bias, attn_bias_require_grad, ds, None, rng_state)
109
+ return dq, dk, dv, ds, s
110
+
111
+ @torch.library.impl_abstract("fa2::bwd", cuda_fa2_bwd)
112
+ def meta_fa2_bwd(
113
+ dout: torch.Tensor,
114
+ q: torch.Tensor,
115
+ k: torch.Tensor,
116
+ v: torch.Tensor,
117
+ out: torch.Tensor,
118
+ softmax_lse: torch.Tensor,
119
+ dq: torch.Tensor,
120
+ dk: torch.Tensor,
121
+ dv: torch.Tensor,
122
+ alibi_slopes: torch.Tensor,
123
+ dropout_p: float,
124
+ softmax_scale: float,
125
+ causal: bool,
126
+ window_size_left: int,
127
+ window_size_right: int,
128
+ deterministic: bool,
129
+ attn_bias: torch.Tensor,
130
+ attn_bias_require_grad: bool,
131
+ ds: torch.Tensor,
132
+ seqlen_k_orig: int,
133
+ gen_: torch.Tensor,
134
+ rng_state: torch.Tensor
135
+ ):
136
+
137
+ round_multiple = lambda x, m: (x + m - 1) // m * m
138
+ batch_size = dout.shape[0]
139
+ seqlen_q = dout.shape[1]
140
+ seqlen_k = k.shape[1]
141
+ seqlen_q_rounded = round_multiple(seqlen_q, 128)
142
+ num_heads = dout.shape[2]
143
+ head_dim_og = dout.shape[3]
144
+ head_dim = round_multiple(head_dim_og, 8)
145
+ seqlen_q_round8 = round_multiple(seqlen_q, 8)
146
+ seqlen_k_round8 = round_multiple(seqlen_k_orig, 8)
147
+
148
+ if attn_bias is not None:
149
+ batch_size_bias = attn_bias.shape[0]
150
+ num_heads_bias = attn_bias.shape[1]
151
+
152
+ return (torch.empty_strided((batch_size, seqlen_q, num_heads, head_dim_og),
153
+ (head_dim*num_heads*seqlen_q, head_dim*num_heads, head_dim, 1), device=q.device, dtype=q.dtype),
154
+ torch.empty_strided((batch_size, seqlen_k_orig, num_heads, head_dim_og),
155
+ (head_dim*num_heads*seqlen_k, head_dim*num_heads, head_dim, 1), device=k.device, dtype=k.dtype),
156
+ torch.empty_strided((batch_size, seqlen_k, num_heads, head_dim_og),
157
+ (head_dim*num_heads*seqlen_k, head_dim*num_heads, head_dim, 1), device=v.device, dtype=v.dtype),
158
+ torch.empty_strided((batch_size_bias, num_heads_bias, seqlen_q, seqlen_k_orig),
159
+ (num_heads_bias*seqlen_q_round8*seqlen_k_round8, seqlen_q_round8*seqlen_k_round8, seqlen_q_round8, 1), device=v.device, dtype=v.dtype)
160
+ if attn_bias_require_grad else None,
161
+ q.new_empty((batch_size, num_heads, seqlen_q_rounded))
162
+ )
163
+
164
+
165
+ class FlashAttnQKVPackedFunc(torch.autograd.Function):
166
+ @staticmethod
167
+ def forward(
168
+ ctx,
169
+ qkv,
170
+ dropout_p,
171
+ softmax_scale,
172
+ causal,
173
+ window_size_left,
174
+ window_size_right,
175
+ alibi_slopes,
176
+ deterministic,
177
+ attn_bias,
178
+ return_softmax,
179
+ return_ds
180
+ ):
181
+ if softmax_scale is None:
182
+ softmax_scale = qkv.shape[-1] ** (-0.5)
183
+
184
+ out, q_padded, k_padded, v_padded, out_padded, attn_bias_padded, softmax_lse, S_dmask, rng_state = torch.ops.fa2.fwd(
185
+ qkv[:, :, 0],
186
+ qkv[:, :, 1],
187
+ qkv[:, :, 2],
188
+ None,
189
+ alibi_slopes,
190
+ dropout_p,
191
+ softmax_scale,
192
+ causal,
193
+ window_size_left,
194
+ window_size_right,
195
+ attn_bias,
196
+ return_softmax and dropout_p > 0,
197
+ None
198
+ )
199
+
200
+ ## WORKAROUND a Pytorch bug, should use _padded version of the tensors but this is buggy when passing them directly to save_for_backward
201
+ ## For now, this breaks the backward when headdim is not a multiple of 8 and/or seqlen_q, seqlen_k are not a multiple of 8
202
+ ## TODO: make the padding here instead
203
+ ctx.save_for_backward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], out, softmax_lse, rng_state, attn_bias, alibi_slopes)
204
+ #ctx.save_for_backward(q_padded, k_padded, v_padded, out_padded, softmax_lse, rng_state, attn_bias_padded, alibi_slopes)
205
+ ctx.dropout_p = dropout_p
206
+ ctx.softmax_scale = softmax_scale
207
+ ctx.causal = causal
208
+ ctx.window_size_left = window_size_left
209
+ ctx.window_size_right = window_size_right
210
+ ctx.deterministic = deterministic
211
+ ctx.bias_requires_grad = True if attn_bias is not None and return_ds else False
212
+ ctx.seqlen_k_orig = qkv.shape[1]
213
+
214
+ return out if not return_softmax else (out, softmax_lse, S_dmask)
215
+
216
+ @staticmethod
217
+ def backward(ctx, dout, *args):
218
+ q, k, v, out, softmax_lse, rng_state, attn_bias, alibi_slopes = ctx.saved_tensors
219
+
220
+ dq, dk, dv, ds, _ = torch.ops.fa2.bwd(
221
+ dout,
222
+ q,
223
+ k,
224
+ v,
225
+ out,
226
+ softmax_lse,
227
+ None,
228
+ None,
229
+ None,
230
+ alibi_slopes,
231
+ ctx.dropout_p,
232
+ ctx.softmax_scale,
233
+ ctx.causal,
234
+ ctx.window_size_left,
235
+ ctx.window_size_right,
236
+ ctx.deterministic,
237
+ attn_bias,
238
+ ctx.bias_requires_grad,
239
+ None,
240
+ ctx.seqlen_k_orig,
241
+ None,
242
+ rng_state
243
+ )
244
+ dqkv = torch.stack([dq, dk, dv], dim=2)
245
+ return dqkv, None, None, None, None, None, None, None, ds, None, None
246
+
247
+ class FlashAttnKVPackedFunc(torch.autograd.Function):
248
+ @staticmethod
249
+ def forward(
250
+ ctx,
251
+ q,
252
+ kv,
253
+ dropout_p,
254
+ softmax_scale,
255
+ causal,
256
+ window_size_left,
257
+ window_size_right,
258
+ alibi_slopes,
259
+ deterministic,
260
+ attn_bias,
261
+ return_softmax,
262
+ return_ds
263
+ ):
264
+ if softmax_scale is None:
265
+ softmax_scale = q.shape[-1] ** (-0.5)
266
+
267
+ out, q_padded, k_padded, v_padded, out_padded, attn_bias_padded, softmax_lse, S_dmask, rng_state = torch.ops.fa2.fwd(
268
+ q,
269
+ kv[:, :, 0],
270
+ kv[:, :, 1],
271
+ None,
272
+ alibi_slopes,
273
+ dropout_p,
274
+ softmax_scale,
275
+ causal,
276
+ window_size_left,
277
+ window_size_right,
278
+ attn_bias,
279
+ return_softmax and dropout_p > 0,
280
+ None
281
+ )
282
+
283
+ ## WORKAROUND a Pytorch bug, should use _padded version of the tensors but this is buggy when passing them directly to save_for_backward
284
+ ## For now, this breaks the backward when headdim is not a multiple of 8 and/or seqlen_q, seqlen_k are not a multiple of 8
285
+ ## TODO: make the padding here instead
286
+ ctx.save_for_backward(q, kv[:, :, 0], kv[:, :, 1], out, softmax_lse, rng_state, attn_bias, alibi_slopes)
287
+ #ctx.save_for_backward(q_padded, k_padded, v_padded, out_padded, softmax_lse, rng_state, attn_bias_padded, alibi_slopes)
288
+ ctx.dropout_p = dropout_p
289
+ ctx.softmax_scale = softmax_scale
290
+ ctx.causal = causal
291
+ ctx.window_size_left = window_size_left
292
+ ctx.window_size_right = window_size_right
293
+ ctx.deterministic = deterministic
294
+ ctx.bias_requires_grad = True if attn_bias is not None and return_ds else False
295
+ ctx.seqlen_k_orig = kv.shape[1]
296
+ return out if not return_softmax else (out, softmax_lse, S_dmask)
297
+
298
+ @staticmethod
299
+ def backward(ctx, dout, *args):
300
+ q, k, v, out, softmax_lse, rng_state, attn_bias, alibi_slopes = ctx.saved_tensors
301
+
302
+ dq, dk, dv, ds, _ = torch.ops.fa2.bwd(
303
+ dout,
304
+ q,
305
+ k,
306
+ v,
307
+ out,
308
+ softmax_lse,
309
+ None,
310
+ None,
311
+ None,
312
+ alibi_slopes,
313
+ ctx.dropout_p,
314
+ ctx.softmax_scale,
315
+ ctx.causal,
316
+ ctx.window_size_left,
317
+ ctx.window_size_right,
318
+ ctx.deterministic,
319
+ attn_bias,
320
+ ctx.bias_requires_grad,
321
+ None,
322
+ ctx.seqlen_k_orig,
323
+ None,
324
+ rng_state
325
+ )
326
+ dkv = torch.stack([dk, dv], dim=2)
327
+
328
+ return dq, dkv, None, None, None, None, None, None, None, ds, None, None
329
+
330
+ class FlashAttnFunc(torch.autograd.Function):
331
+ @staticmethod
332
+ def forward(
333
+ ctx,
334
+ q,
335
+ k,
336
+ v,
337
+ dropout_p,
338
+ softmax_scale,
339
+ causal,
340
+ window_size_left,
341
+ window_size_right,
342
+ alibi_slopes,
343
+ deterministic,
344
+ attn_bias,
345
+ return_softmax,
346
+ return_ds
347
+ ):
348
+
349
+ batch_size, seqlen_q = q.shape[:2]
350
+ seqlen_k = k.shape[1]
351
+
352
+ if softmax_scale is None:
353
+ softmax_scale = q.shape[-1] ** (-0.5)
354
+
355
+ if attn_bias is not None:
356
+ attn_bias = attn_bias.to(q.dtype)
357
+
358
+ out, q_padded, k_padded, v_padded, out_padded, attn_bias_padded, softmax_lse, S_dmask, rng_state = torch.ops.fa2.fwd(
359
+ q,
360
+ k,
361
+ v,
362
+ None,
363
+ alibi_slopes,
364
+ dropout_p,
365
+ softmax_scale,
366
+ causal,
367
+ window_size_left,
368
+ window_size_right,
369
+ attn_bias,
370
+ return_softmax and dropout_p > 0,
371
+ None
372
+ )
373
+
374
+ ## WORKAROUND a Pytorch bug, should use _padded version of the tensors but this is buggy when passing them directly to save_for_backward
375
+ ## For now, this breaks the backward when headdim is not a multiple of 8 and/or seqlen_q, seqlen_k are not a multiple of 8
376
+ ## TODO: make the padding here instead
377
+ ctx.save_for_backward(q, k, v, out, softmax_lse, rng_state, attn_bias, alibi_slopes)
378
+ #ctx.save_for_backward(q_padded, k_padded, v_padded, out_padded, softmax_lse, rng_state, attn_bias_padded, alibi_slopes)
379
+
380
+ ctx.dropout_p = dropout_p
381
+ ctx.softmax_scale = softmax_scale
382
+ ctx.causal = causal
383
+ ctx.window_size_left = window_size_left
384
+ ctx.window_size_right = window_size_right
385
+ ctx.deterministic = deterministic
386
+ ctx.bias_requires_grad = True if attn_bias is not None and return_ds else False
387
+ ctx.seqlen_k_orig = k.shape[1]
388
+
389
+ return out if not return_softmax else (out, softmax_lse, S_dmask)
390
+
391
+ @staticmethod
392
+ def backward(ctx, dout, *args):
393
+ q, k, v, out, softmax_lse, rng_state, attn_bias, alibi_slopes = ctx.saved_tensors
394
+
395
+ dout = dout.contiguous()
396
+ dq, dk, dv, ds, _ = torch.ops.fa2.bwd(
397
+ dout,
398
+ q,
399
+ k,
400
+ v,
401
+ out,
402
+ softmax_lse,
403
+ None,
404
+ None,
405
+ None,
406
+ alibi_slopes,
407
+ ctx.dropout_p,
408
+ ctx.softmax_scale,
409
+ ctx.causal,
410
+ ctx.window_size_left,
411
+ ctx.window_size_right,
412
+ ctx.deterministic,
413
+ attn_bias,
414
+ ctx.bias_requires_grad,
415
+ None,
416
+ ctx.seqlen_k_orig,
417
+ None,
418
+ rng_state
419
+ )
420
+
421
+ return dq, dk, dv, None, None, None, None, None, None, None, ds, None, None
422
+
423
+
424
+ def flash_attn_qkvpacked_func(
425
+ qkv,
426
+ dropout_p=0.0,
427
+ softmax_scale=None,
428
+ causal=False,
429
+ window_size_left=-1,
430
+ window_size_right=-1, # -1 means infinite context window
431
+ alibi_slopes=None,
432
+ deterministic=False,
433
+ attn_bias=None,
434
+ return_attn_probs=False,
435
+ return_ds=False
436
+ ):
437
+ """dropout_p should be set to 0.0 during evaluation
438
+ If Q, K, V are already stacked into 1 tensor, this function will be faster than
439
+ calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
440
+ of the gradients of Q, K, V.
441
+ For multi-query and grouped-query attention (MQA/GQA), please see
442
+ flash_attn_kvpacked_func and flash_attn_func.
443
+
444
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
445
+ will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.
446
+
447
+ Arguments:
448
+ qkv: (batch_size, seqlen, 3, nheads, headdim)
449
+ dropout_p: float. Dropout probability.
450
+ softmax_scale: float. The scaling of QK^T before applying softmax.
451
+ Default to 1 / sqrt(headdim).
452
+ causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
453
+ window_size: (left, right). If not (-1, -1), implements sliding window local attention.
454
+ alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
455
+ the attention score of query i and key j.
456
+ deterministic: bool. Whether to use the deterministic implementation of the backward pass,
457
+ which is slightly slower and uses more memory. The forward pass is always deterministic.
458
+ return_attn_probs: bool. Whether to return the attention probabilities. This option is for
459
+ testing only. The returned probabilities are not guaranteed to be correct
460
+ (they might not have the right scaling).
461
+ Return:
462
+ out: (batch_size, seqlen, nheads, headdim).
463
+ softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
464
+ logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
465
+ normalization factor).
466
+ S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
467
+ The output of softmax (possibly with different scaling). It also encodes the dropout
468
+ pattern (negative means that location was dropped, nonnegative means it was kept).
469
+ """
470
+ return FlashAttnQKVPackedFunc.apply(
471
+ qkv,
472
+ dropout_p,
473
+ softmax_scale,
474
+ causal,
475
+ window_size_left,
476
+ window_size_right,
477
+ alibi_slopes,
478
+ deterministic,
479
+ attn_bias,
480
+ return_attn_probs,
481
+ return_ds
482
+ )
483
+
484
+
485
+ def flash_attn_kvpacked_func(
486
+ q,
487
+ kv,
488
+ dropout_p=0.0,
489
+ softmax_scale=None,
490
+ causal=False,
491
+ window_size_left=-1,
492
+ window_size_right=-1, # -1 means infinite context window
493
+ alibi_slopes=None,
494
+ deterministic=False,
495
+ attn_bias=None,
496
+ return_attn_probs=False,
497
+ return_ds=False
498
+ ):
499
+ """dropout_p should be set to 0.0 during evaluation
500
+ If K, V are already stacked into 1 tensor, this function will be faster than
501
+ calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
502
+ of the gradients of K, V.
503
+ Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
504
+ than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
505
+ For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
506
+ 0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.
507
+
508
+ If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
509
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
510
+ 1 1 1 1 0
511
+ 1 1 1 1 1
512
+ If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
513
+ 0 0
514
+ 0 0
515
+ 0 0
516
+ 1 0
517
+ 1 1
518
+ If the row of the mask is all zero, the output will be zero.
519
+
520
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
521
+ will only attend to keys between
522
+ [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
523
+
524
+ Arguments:
525
+ q: (batch_size, seqlen, nheads, headdim)
526
+ kv: (batch_size, seqlen, 2, nheads_k, headdim)
527
+ dropout_p: float. Dropout probability.
528
+ softmax_scale: float. The scaling of QK^T before applying softmax.
529
+ Default to 1 / sqrt(headdim).
530
+ causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
531
+ window_size: (left, right). If not (-1, -1), implements sliding window local attention.
532
+ alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
533
+ (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
534
+ is added to the attention score of query i and key j.
535
+ deterministic: bool. Whether to use the deterministic implementation of the backward pass,
536
+ which is slightly slower and uses more memory. The forward pass is always deterministic.
537
+ return_attn_probs: bool. Whether to return the attention probabilities. This option is for
538
+ testing only. The returned probabilities are not guaranteed to be correct
539
+ (they might not have the right scaling).
540
+ Return:
541
+ out: (batch_size, seqlen, nheads, headdim).
542
+ softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
543
+ logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
544
+ normalization factor).
545
+ S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
546
+ The output of softmax (possibly with different scaling). It also encodes the dropout
547
+ pattern (negative means that location was dropped, nonnegative means it was kept).
548
+ """
549
+ return FlashAttnKVPackedFunc.apply(
550
+ q,
551
+ kv,
552
+ dropout_p,
553
+ softmax_scale,
554
+ causal,
555
+ window_size_left,
556
+ window_size_right,
557
+ alibi_slopes,
558
+ deterministic,
559
+ attn_bias,
560
+ return_attn_probs,
561
+ return_ds
562
+ )
563
+
564
+
565
+ def flash_attn_func(
566
+ q,
567
+ k,
568
+ v,
569
+ dropout_p=0.0,
570
+ softmax_scale=None,
571
+ causal=False,
572
+ window_size_left=-1,
573
+ window_size_right=-1, # -1 means infinite context window
574
+ alibi_slopes=None,
575
+ deterministic=False,
576
+ attn_bias=None,
577
+ return_attn_probs=False,
578
+ return_ds=False
579
+ ):
580
+ """dropout_p should be set to 0.0 during evaluation
581
+ Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
582
+ than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
583
+ For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
584
+ 0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.
585
+
586
+ If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
587
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
588
+ 1 1 1 1 0
589
+ 1 1 1 1 1
590
+ If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
591
+ 0 0
592
+ 0 0
593
+ 0 0
594
+ 1 0
595
+ 1 1
596
+ If the row of the mask is all zero, the output will be zero.
597
+
598
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
599
+ will only attend to keys between
600
+ [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
601
+
602
+ Arguments:
603
+ q: (batch_size, seqlen, nheads, headdim)
604
+ k: (batch_size, seqlen, nheads_k, headdim)
605
+ v: (batch_size, seqlen, nheads_k, headdim)
606
+ dropout_p: float. Dropout probability.
607
+ softmax_scale: float. The scaling of QK^T before applying softmax.
608
+ Default to 1 / sqrt(headdim).
609
+ causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
610
+ window_size: (left, right). If not (-1, -1), implements sliding window local attention.
611
+ alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
612
+ (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
613
+ is added to the attention score of query i and key j.
614
+ deterministic: bool. Whether to use the deterministic implementation of the backward pass,
615
+ which is slightly slower and uses more memory. The forward pass is always deterministic.
616
+ return_attn_probs: bool. Whether to return the attention probabilities. This option is for
617
+ testing only. The returned probabilities are not guaranteed to be correct
618
+ (they might not have the right scaling).
619
+ Return:
620
+ out: (batch_size, seqlen, nheads, headdim).
621
+ softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
622
+ logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
623
+ normalization factor).
624
+ S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
625
+ The output of softmax (possibly with different scaling). It also encodes the dropout
626
+ pattern (negative means that location was dropped, nonnegative means it was kept).
627
+ """
628
+ return FlashAttnFunc.apply(
629
+ q,
630
+ k,
631
+ v,
632
+ dropout_p,
633
+ softmax_scale,
634
+ causal,
635
+ window_size_left,
636
+ window_size_right,
637
+ alibi_slopes,
638
+ deterministic,
639
+ attn_bias,
640
+ return_attn_probs,
641
+ return_ds,
642
+ )
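fa2_compilable.py registers the FlashAttention-2 CUDA entry points as torch.library custom ops so they can be traced by torch.compile, and threads an optional additive attn_bias (and its gradient ds) through both passes. The sketch below is an assumption-heavy illustration: it presumes the patched flash-attn build whose flash_attn_2_cuda.fwd/bwd accept a bias argument (the stock wheel does not), a CUDA device with bf16 support, and purely illustrative shapes:

```python
import torch
from fa2_compilable import flash_attn_func  # assumes this file and the patched flash-attn build are importable

B, L, H, D = 2, 512, 8, 64
q = torch.randn(B, L, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
k = torch.randn(B, L, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
v = torch.randn(B, L, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
bias = torch.randn(B, H, L, L, device="cuda", dtype=torch.bfloat16, requires_grad=True)  # e.g. a position bias

def attend(q, k, v, bias):
    # return_ds=True asks the backward pass to also produce the gradient of the additive bias
    return flash_attn_func(q, k, v, attn_bias=bias, return_ds=True)

out = attend(q, k, v, bias)               # (B, L, H, D)
out.sum().backward()                      # dq, dk, dv and bias.grad all come from fa2::bwd
print(out.shape, bias.grad.shape)

# Because fwd/bwd are registered through torch.library, the same call should also
# survive compilation, e.g. attend_compiled = torch.compile(attend).
```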
flash_attention_v2_bias.py ADDED
@@ -0,0 +1,859 @@
1
+ # Copyright 2023 BAAI
2
+ # Copyright 2024 CATIE
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Modifications to the original file
17
+ # - Support for biases following https://github.com/FlagOpen/FlagAttention/pull/5
18
+ # - Support for shape (1,1,q,k) biases
19
+
20
+ import math
21
+ import torch
22
+ import triton
23
+ import triton.language as tl
24
+
25
+ # Wrapper for the Triton kernel for torch.compile - should be unnecessary from PyTorch 2.3?
26
+ torch.library.define("flasht5::flash_attn_v2_fwd", "(Tensor q, Tensor k, Tensor v, Tensor bias, bool causal, float sm_scale, int BLOCK_M, int BLOCK_N, int num_warps, int num_stages) -> (Tensor, Tensor)")
27
+
28
+ @torch.library.impl("flasht5::flash_attn_v2_fwd", "default")
29
+ def flash_attn_v2_fwd(q, k, v, bias, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages):
30
+
31
+ B, H, M, D = q.shape
32
+ N = k.shape[2]
33
+ P_SEQ = N - M
34
+ larger_m = M > N
35
+
36
+ # Trick to support shape such as (1, 1, seqlen_q, seqlen_k)
37
+ bias_batch_stride = bias.stride(0) if bias is not None else 0
38
+ bias_heads_stride = bias.stride(1) if bias is not None else 0
39
+ if bias is not None:
40
+ if (bias.shape[0] != q.shape[0]) and (bias.shape[0] == 1):
41
+ bias_batch_stride = 0
42
+ if (bias.shape[1] != q.shape[1]) and (bias.shape[1] == 1):
43
+ bias_heads_stride = 0
44
+
45
+ divisible_m = M % BLOCK_M == 0
46
+ divisible_n = N % BLOCK_N == 0
47
+ # consider using 3d grid to avoid div & rem
48
+ grid = (triton.cdiv(M, BLOCK_M), H, B)
49
+ o = torch.empty_like(q)
50
+ L = torch.empty((B, H, M), device=q.device, dtype=torch.float32)
51
+
52
+ _fwd_kernel[grid](
53
+ q, k, v, bias, sm_scale,
54
+ L, o,
55
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
56
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
57
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
58
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
59
+ bias_batch_stride, bias_heads_stride,
60
+ bias.stride(2) if bias is not None else 0,
61
+ bias.stride(3) if bias is not None else 0,
62
+ B, H, M, N, P_SEQ,
63
+ BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_DMODEL=D,
64
+ IS_CAUSAL=causal, LARGER_M=larger_m,
65
+ DIVISIBLE_M=divisible_m, DIVISIBLE_N=divisible_n,
66
+ HAS_BIAS=(bias is not None),
67
+ num_warps=num_warps, num_stages=num_stages,
68
+ )
69
+
70
+ return o, L
71
+
72
+
73
+ @torch.library.impl_abstract("flasht5::flash_attn_v2_fwd", flash_attn_v2_fwd)
74
+ def flash_attn_v2_fwd_abstract(q, k, v, bias, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages):
75
+ B, H, M, D = q.shape
76
+ o = torch.empty_like(q)
77
+ L = torch.empty((B, H, M), dtype=torch.float32, device=q.device)
78
+
79
+ return o, L
80
+
81
+ torch.library.define("flasht5::flash_attn_v2_bwd", "(Tensor o, Tensor do, Tensor q, Tensor k, Tensor v, Tensor bias, Tensor L, bool causal, float sm_scale, int BLOCK_M, int BLOCK_N, int num_warps, int num_stages) -> (Tensor, Tensor, Tensor, Tensor)")
82
+
83
+ @torch.library.impl("flasht5::flash_attn_v2_bwd", "default")
84
+ def flash_attn_v2_bwd(o, do, q, k, v, bias, L, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages):
85
+
86
+ B, H, M, D = q.shape
87
+ N = k.shape[2]
88
+ P_SEQ = N - M
89
+ larger_m = M > N
90
+
91
+ divisible_m = M % BLOCK_M == 0
92
+ divisible_n = N % BLOCK_N == 0
93
+
94
+ # Trick to support shape such as (1, 1, seqlen_q, seqlen_k)
95
+ bias_batch_stride = bias.stride(0) if bias is not None else 0
96
+ bias_heads_stride = bias.stride(1) if bias is not None else 0
97
+ if bias is not None:
98
+ if (bias.shape[0] != q.shape[0]) and (bias.shape[0] == 1):
99
+ bias_batch_stride = 0
100
+ if (bias.shape[1] != q.shape[1]) and (bias.shape[1] == 1):
101
+ bias_heads_stride = 0
102
+
103
+ delta = torch.empty_like(L)
104
+ grid = (triton.cdiv(M, BLOCK_M), H, B)
105
+
106
+ _bwd_preprocess[grid](
107
+ o, do,
108
+ delta,
109
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
110
+ do.stride(0), do.stride(1), do.stride(2), do.stride(3),
111
+ delta.stride(0), delta.stride(1), delta.stride(2),
112
+ M,
113
+ BLOCK_M=BLOCK_M, D_HEAD=D,
114
+ DIVISIBLE_M=divisible_m,
115
+ )
116
+
117
+ dk = torch.empty_like(k)
118
+ dv = torch.empty_like(v)
119
+
120
+ HAS_BIAS = bias is not None
121
+ RETURN_DS = HAS_BIAS
122
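+ # When the bias is broadcast over batch and/or heads (stride 0), several (batch, head) programs accumulate into the same ds tile, so ds is zero-initialized and updated with atomic adds.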
+ USE_DS_ATOMIC_ADD = (bias_batch_stride == 0) or (bias_heads_stride == 0)
123
+ ds = None
124
+ if RETURN_DS:
125
+ ds = torch.empty_like(bias)
126
+ if USE_DS_ATOMIC_ADD:
127
+ ds = ds.zero_()
128
+
129
+ grid = (triton.cdiv(N, BLOCK_N), H, B)
130
+ _bwd_kv_kernel[grid](
131
+ q, k, v, bias, sm_scale, do,
132
+ dk, dv, ds,
133
+ L, delta,
134
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
135
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
136
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
137
+ bias_batch_stride, bias_heads_stride,
138
+ bias.stride(2) if HAS_BIAS else 0,
139
+ bias.stride(3) if HAS_BIAS else 0,
140
+ do.stride(0), do.stride(1), do.stride(2), do.stride(3),
141
+ dk.stride(0), dk.stride(1), dk.stride(2), dk.stride(3),
142
+ dv.stride(0), dv.stride(1), dv.stride(2), dv.stride(3),
143
+ B, H, M, N, P_SEQ,
144
+ BLOCK_M=BLOCK_M, BLOCK_DMODEL=D, BLOCK_N=BLOCK_N, CAUSAL=causal,
145
+ DIVISIBLE_M=divisible_m, DIVISIBLE_N=divisible_n,
146
+ HAS_BIAS=HAS_BIAS,
147
+ RETURN_DS=RETURN_DS, USE_DS_ATOMIC_ADD=USE_DS_ATOMIC_ADD,
148
+ num_stages=num_stages, num_warps=num_warps,
149
+ )
150
+
151
+ dq = torch.empty_like(q)
152
+ grid = (triton.cdiv(M, BLOCK_M), H, B)
153
+ _bwd_q_kernel[grid](
154
+ q, k, v, bias, sm_scale, do,
155
+ dq,
156
+ L, delta,
157
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
158
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
159
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
160
+ bias_batch_stride, bias_heads_stride,
161
+ bias.stride(2) if HAS_BIAS else 0,
162
+ bias.stride(3) if HAS_BIAS else 0,
163
+ do.stride(0), do.stride(1), do.stride(2), do.stride(3),
164
+ dq.stride(0), dq.stride(1), dq.stride(2), dq.stride(3),
165
+ B, H, M, N, P_SEQ,
166
+ BLOCK_M=BLOCK_M, BLOCK_DMODEL=D, BLOCK_N=BLOCK_N,
167
+ CAUSAL=causal, LARGER_M=larger_m,
168
+ DIVISIBLE_M=divisible_m, DIVISIBLE_N=divisible_n,
169
+ HAS_BIAS=HAS_BIAS,
170
+ num_stages=num_stages, num_warps = num_warps,
171
+ )
172
+
173
+ return dq, dk, dv, ds
174
+
175
+ @torch.library.impl_abstract("flasht5::flash_attn_v2_bwd", flash_attn_v2_bwd)
176
+ def flash_attn_v2_bwd_abstract(o, do, q, k, v, bias, L, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages):
177
+ dq = torch.empty_like(q)
178
+ dk = torch.empty_like(k)
179
+ dv = torch.empty_like(v)
180
+ ds = torch.empty_like(bias) if bias is not None else None
181
+
182
+ return dq, dk, dv, ds
183
+
184
+ class FlashAttention(torch.autograd.Function):
185
+ @staticmethod
186
+ def forward(ctx, q, k, v, bias, causal, sm_scale):
187
+ Dq, Dk, Dv = q.shape[-1], k.shape[-1], v.shape[-1]
188
+
189
+ assert Dq == Dk == Dv
190
+ assert Dk in {16, 32, 64, 128}
191
+
192
+ B, H, M, D = q.shape
193
+ N = k.shape[2]
194
+
195
+ if sm_scale is None:
196
+ sm_scale = 1. / math.sqrt(D)
197
+
198
+ config = get_fwd_config(B, H, M, N, D, causal)
199
+ BLOCK_M, BLOCK_N, num_stages, num_warps = config
200
+
201
+ o, L = torch.ops.flasht5.flash_attn_v2_fwd(q, k, v, bias, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages)
202
+
203
+ # autograd context maintenance
204
+ ctx.save_for_backward(q, k, v, bias, o, L)
205
+ ctx.sm_scale = sm_scale
206
+ ctx.causal = causal
207
+
208
+ return o
209
+
210
+ @staticmethod
211
+ def backward(ctx, do, *ignored):
212
+ q, k, v, bias, o, L = ctx.saved_tensors
213
+ sm_scale = ctx.sm_scale
214
+ causal = ctx.causal
215
+
216
+ B, H, M, D = q.shape
217
+ N = k.shape[2]
218
+
219
+ if sm_scale is None:
220
+ sm_scale = 1. / math.sqrt(D)
221
+
222
+ config = get_bwd_config(B, H, M, N, D, causal)
223
+ BLOCK_M, BLOCK_N, num_stages, num_warps = config
224
+
225
+ dq, dk, dv, ds = torch.ops.flasht5.flash_attn_v2_bwd(o, do, q, k, v, bias, L, causal, sm_scale, BLOCK_M, BLOCK_N, num_warps, num_stages)
226
+
227
+ return dq, dk, dv, ds, None, None, None, None
228
+
229
+
230
+ def attention(q, k, v, bias, causal=False, sm_scale=None):
231
+ """
232
+ An implementation of FlashAttention v2 (https://arxiv.org/abs/2307.08691), extended with support for additive attention biases.
233
+
234
+ Arguments:
235
+ q(torch.Tensor): The queries. The shape is (batch_size, nheads, seqlen_q, headdim).
236
+ k(torch.Tensor): The keys. The shape is (batch_size, nheads, seqlen_k, headdim).
237
+ v(torch.Tensor): The values. The shape is (batch_size, nheads, seqlen_k, headdim).
+ bias(torch.Tensor): Optional additive bias on the attention scores, broadcastable to (batch_size, nheads, seqlen_q, seqlen_k).
238
+ causal(bool): Whether causal masking is applied to attention scores before applying softmax.
239
+ sm_scale(float): The scaling of attention scores before applying softmax.
240
+
241
+ Returns:
242
+ out(torch.Tensor): The output. The shape is (batch_size, nheads, seqlen_q, headdim).
243
+ """
244
+ return FlashAttention.apply(q, k, v, bias, causal, sm_scale)
245
+
246
+
247
+ # --------------------------- Forward ---------------------------
248
+ # NOTE: this function can be overwritten at runtime to use your custom config
249
+ def get_fwd_config(B, H, M, N, D, causal):
250
+ if torch.cuda.get_device_capability() == (8, 0):
251
+ if not causal:
252
+ if D <= 64:
253
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 64, 3, 4
254
+ else:
255
+ if M <= 1024:
256
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 32, 3, 4
257
+ else:
258
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 128, 3, 8
259
+ else:
260
+ if D <= 64:
261
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 64, 4, 4
262
+ else:
263
+ if M <= 1024:
264
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 32, 2, 4
265
+ else:
266
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 128, 3, 8
267
+ elif torch.cuda.get_device_capability() == (8, 6):
268
+ if not causal:
269
+ if D <= 64:
270
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 64, 3, 4
271
+ else:
272
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 32, 2, 4
273
+ else: # causal
274
+ if D <= 64:
275
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 64, 64, 3, 4
276
+ else:
277
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 128, 32, 2, 4
278
+ else:
279
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 32, 32, 1, 4
280
+ return (BLOCK_M, BLOCK_N, num_stages, num_warps)
281
+
282
+
283
+ @triton.jit
284
+ def _fwd_kernel(
285
+ Q, K, V, B, sm_scale,
286
+ L, O,
287
+ stride_qz, stride_qh, stride_qm, stride_qk,
288
+ stride_kz, stride_kh, stride_kn, stride_kk,
289
+ stride_vz, stride_vh, stride_vn, stride_vk,
290
+ stride_oz, stride_oh, stride_om, stride_ok,
291
+ stride_bz, stride_bh, stride_bm, stride_bn,
292
+ Z, H, M, N, P_SEQ,
293
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,
294
+ IS_CAUSAL: tl.constexpr, LARGER_M: tl.constexpr,
295
+ DIVISIBLE_M: tl.constexpr, DIVISIBLE_N: tl.constexpr,
296
+ HAS_BIAS: tl.constexpr,
297
+ ):
298
+ input_dtype = Q.dtype.element_ty
299
+ # -- grid id --
300
+ start_m = tl.program_id(0)
301
+ off_h = tl.program_id(1)
302
+ off_z = tl.program_id(2)
303
+
304
+ # scale sm_scale by log_2(e) and use
305
+ # 2^x instead of exp in the loop because CSE and LICM
306
+ # don't work as expected with `exp` in the loop
307
+ log2e: tl.constexpr = 1.4426950408889634
308
+
309
+ # offset pointers for (batch, head)
310
+ Q += off_z * stride_qz + off_h * stride_qh
311
+ K += off_z * stride_kz + off_h * stride_kh
312
+ V += off_z * stride_vz + off_h * stride_vh
313
+ O += off_z * stride_oz + off_h * stride_oh
314
+ if HAS_BIAS:
315
+ B += off_z * stride_bz + off_h * stride_bh
316
+ L += (off_z * H + off_h) * M # l's shape is (B, H, M)
317
+
318
+ offs_m_base = tl.arange(0, BLOCK_M)
319
+ offs_m = start_m * BLOCK_M + offs_m_base
320
+ offs_n_base = tl.arange(0, BLOCK_N)
321
+ offs_k = tl.arange(0, BLOCK_DMODEL)
322
+
323
+ # initialize pointers to value-like data
324
+ q_ptrs = Q + (offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk) # (BLOCK_M, BLOCK_DMODEL)
325
+ o_ptrs = O + (offs_m[:, None] * stride_om + offs_k[None, :] * stride_ok) # (BLOCK_M, BLOCK_DMODEL)
326
+ l_ptrs = L + offs_m
327
+
328
+ # initialize pointer to m and l, fp32 for accumulators
329
+ m_i = tl.full([BLOCK_M], value=-float("inf"), dtype=tl.float32)
330
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
331
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
332
+
333
+ # load q
334
+ mask_m = offs_m < M
335
+ if DIVISIBLE_M:
336
+ q = tl.load(q_ptrs, cache_modifier=".cg")
337
+ else:
338
+ q = tl.load(q_ptrs, mask=mask_m[:, None], cache_modifier=".cg")
339
+
340
+ # Dot-I trick: multiplying q by the identity keeps q in registers and saves shared memory
341
+ if BLOCK_DMODEL < 128:
342
+ I = tl.where(offs_k[:, None] == offs_k,
343
+ tl.full((BLOCK_DMODEL, BLOCK_DMODEL), 1.0, dtype=input_dtype),
344
+ tl.full((BLOCK_DMODEL, BLOCK_DMODEL), 0.0, dtype=input_dtype))
345
+ q = tl.dot(q, I).to(input_dtype)
346
+ # else:
347
+ # I = tl.where(offs_m_base[:, None] == offs_m_base,
348
+ # tl.full((BLOCK_M, BLOCK_M), 1.0, dtype=input_dtype),
349
+ # tl.full((BLOCK_M, BLOCK_M), 0.0, dtype=input_dtype))
350
+ # q = tl.dot(I, q).to(input_dtype)
351
+
352
+ # NOTE: Loop-Bound-For-N
353
+ # The indices in m-dimension that this block may access is in `[start_m * BLOCK_M, (start_m + 1) * BLOCK_M)`.
354
+ # According to the rule of causal masking, then max index in n-dimension that this block may access
355
+ # is `P_SEQ + (start_m + 1) * BLOCK_M`.
356
+ # However, the upper bound of index in n-dimension should never exceed the sequence length of k/v(`P_SEQ + N_CTX`).
357
+ # `P_SEQ + (start_m + 1) * BLOCK_M` may be larger than `N`.
358
+ # At this case, there would be illegal memory access when loading k & v tiles
359
+ # if mask_n is not applied for loading(only when `DIVISIBLE_N`` is true).
360
+ # See also https://github.com/FlagOpen/FlagAttention/pull/8
361
+ if IS_CAUSAL:
362
+ hi = tl.minimum(N, P_SEQ + (start_m + 1) * BLOCK_M)
363
+ if LARGER_M:
364
+ hi = tl.maximum(0, hi)
365
+ else:
366
+ hi = N
367
+
368
+ # loop over k, v and update accumulators
369
+ offs_n_init = offs_n_base
370
+ k_ptrs = K + (offs_k[:, None] * stride_vk + offs_n_init[None, :] * stride_vn) # (BLOCK_DMODEL, BLOCK_N)
371
+ v_ptrs = V + (offs_n_init[:, None] * stride_kn + offs_k[None, :] * stride_kk) # (BLOCK_N, BLOCK_DMODEL)
372
+ if HAS_BIAS:
373
+ bias_ptrs = B + (offs_m[:, None] * stride_bm + offs_n_init[None, :] * stride_bn) # (BLOCK_M, BLOCK_N)
374
+
375
+ for start_n in range(0, hi, BLOCK_N):
376
+ start_n = tl.multiple_of(start_n, BLOCK_N)
377
+ offs_n = start_n + offs_n_base
378
+
379
+ # -- load k, v --
380
+ mask_n = offs_n < N
381
+ if DIVISIBLE_N:
382
+ k = tl.load(k_ptrs, cache_modifier=".cg")
383
+ v = tl.load(v_ptrs, cache_modifier=".cg")
384
+ else:
385
+ k = tl.load(k_ptrs, mask=mask_n[None, :], cache_modifier=".cg")
386
+ v = tl.load(v_ptrs, mask=mask_n[:, None], cache_modifier=".cg")
387
+
388
+ # -- load bias --
389
+ if HAS_BIAS:
390
+ if DIVISIBLE_M and DIVISIBLE_N:
391
+ b = tl.load(bias_ptrs)
392
+ else:
393
+ b = tl.load(bias_ptrs, mask_m[:, None] & mask_n[None, :])
394
+
395
+ # -- compute qk ---
396
+ s = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
397
+ s += tl.dot(q, k) * sm_scale
398
+ if HAS_BIAS:
399
+ s += b
400
+
401
+ if not DIVISIBLE_N:
402
+ s = tl.where(mask_n[None, :], s, float("-inf"))
403
+ if IS_CAUSAL:
404
+ causal_mask = (P_SEQ + offs_m[:, None]) >= offs_n[None, :]
405
+ s = tl.where(causal_mask, s, float("-inf"))
406
+
407
+ # -- compute scaling constant ---
408
+ m_i_new = tl.maximum(m_i, tl.max(s, 1))
409
+ alpha = tl.math.exp2((m_i - m_i_new)*log2e)
410
+ p = tl.math.exp2((s - m_i_new[:, None])*log2e)
411
+
412
+ # -- scale and update acc: acc *= alpha[:, None]--
413
+ acc *= alpha[:, None]
414
+ acc += tl.dot(p.to(input_dtype), v)
415
+
416
+ # -- update m_i and l_i --
417
+ l_i = l_i * alpha + tl.sum(p, 1)
418
+ m_i = m_i_new
419
+ # update pointers
420
+ k_ptrs += BLOCK_N * stride_kn
421
+ v_ptrs += BLOCK_N * stride_vn
422
+ if HAS_BIAS:
423
+ bias_ptrs += BLOCK_N * stride_bn
424
+
425
+ # write back l & o
426
+ if IS_CAUSAL and LARGER_M:
427
+ is_empty_line = (offs_m + P_SEQ) < 0
428
+ acc = tl.where(is_empty_line[:, None], 0.0, acc * (1.0 / l_i[:, None]))
429
+ l = tl.where(is_empty_line, float("-inf"), m_i + tl.log(l_i))
430
+ else:
431
+ acc = acc * (1.0 / l_i[:, None])
432
+ l = m_i + tl.log(l_i) # log(normalizer)
433
+
434
+ if DIVISIBLE_M:
435
+ tl.store(l_ptrs, l, cache_modifier=".cg")
436
+ tl.store(o_ptrs, acc.to(input_dtype), cache_modifier=".cg")
437
+ else:
438
+ tl.store(l_ptrs, l, mask=mask_m, cache_modifier=".cg")
439
+ tl.store(o_ptrs, acc.to(input_dtype), mask=mask_m[:, None], cache_modifier=".cg")
440
+
441
+
442
+ # --------------------------- Backward ---------------------------
443
+ # NOTE: this function can be overwritten at runtime to use your custom config
444
+ def get_bwd_config(B, H, M, N, D, causal):
445
+ if torch.cuda.get_device_capability() == (8, 0):
446
+ if not causal:
447
+ BLOCK_M = 128 if D <= 64 else 64
448
+ BLOCK_N = 64
449
+ num_stages = 2
450
+ num_warps = 4
451
+ else:
452
+ BLOCK_M = 64
453
+ BLOCK_N = 64
454
+ num_stages = 3 if D <= 64 else 2
455
+ num_warps = 4
456
+ elif torch.cuda.get_device_capability() == (8, 6): # tune for RTX-3090, device_capability(8, 6)
457
+ if not causal:
458
+ if D <= 64:
459
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 64, 64, 2, 4
460
+ else:
461
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 64, 64, 2, 8
462
+ else:
463
+ if D <= 64:
464
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 64, 64, 2, 4
465
+ else:
466
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 32, 32, 2, 4
467
+ else:
468
+ BLOCK_M, BLOCK_N, num_stages, num_warps = 32, 32, 1, 4
469
+ return (BLOCK_M, BLOCK_N, num_stages, num_warps)
470
+
471
+
472
+ @triton.jit
473
+ def _bwd_preprocess(
474
+ Out, DO,
475
+ Delta,
476
+ stride_oz, stride_oh, stride_om, stride_ok,
477
+ stride_doz, stride_doh, stride_dom, stride_dok,
478
+ stride_dz, stride_dh, stride_dm,
479
+ M,
480
+ BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr,
481
+ DIVISIBLE_M: tl.constexpr,
482
+ ):
483
+ off_h = tl.program_id(1)
484
+ off_z = tl.program_id(2)
485
+ Out += off_z * stride_oz + off_h * stride_oh
486
+ DO += off_z * stride_doz + off_h * stride_doh
487
+ Delta += off_z * stride_dz + off_h * stride_dh
488
+
489
+ # compute (Out * Dout).sum() for vector interpretation
490
+ off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
491
+ off_n = tl.arange(0, D_HEAD)
492
+
493
+ # load
494
+ o_ptrs = Out + off_m[:, None] * stride_om + off_n[None, :] * stride_ok
495
+ do_ptrs = DO + off_m[:, None] * stride_dom + off_n[None, :] * stride_dok
496
+
497
+ if DIVISIBLE_M:
498
+ o = tl.load(o_ptrs).to(tl.float32)
499
+ do = tl.load(do_ptrs).to(tl.float32)
500
+ else:
501
+ mask_m = off_m < M
502
+ o = tl.load(o_ptrs, mask=mask_m[:, None]).to(tl.float32)
503
+ do = tl.load(do_ptrs, mask=mask_m[:, None]).to(tl.float32)
504
+
505
+ # compute
506
+ delta = tl.sum(o * do, axis=1)
507
+ # write-back
508
+ d_ptrs = Delta + off_m * stride_dm
509
+ if DIVISIBLE_M:
510
+ tl.store(d_ptrs, delta)
511
+ else:
512
+ tl.store(d_ptrs, delta, mask=mask_m)
513
+
514
+
515
+ @triton.jit
516
+ def _bwd_kv_kernel(
517
+ Q, K, V, B, sm_scale, DO,
518
+ DK, DV, DS,
519
+ L,
520
+ D,
521
+ stride_qz, stride_qh, stride_qm, stride_qk,
522
+ stride_kz, stride_kh, stride_kn, stride_kk,
523
+ stride_vz, stride_vh, stride_vn, stride_vk,
524
+ stride_bz, stride_bh, stride_bm, stride_bn,
525
+ stride_doz, stride_doh, stride_dom, stride_dok,
526
+ stride_dkz, stride_dkh, stride_dkn, stride_dkk,
527
+ stride_dvz, stride_dvh, stride_dvn, stride_dvk,
528
+ Z, H, M, N, P_SEQ,
529
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,
530
+ CAUSAL: tl.constexpr,
531
+ DIVISIBLE_M: tl.constexpr, DIVISIBLE_N: tl.constexpr,
532
+ HAS_BIAS: tl.constexpr,
533
+ RETURN_DS: tl.constexpr, USE_DS_ATOMIC_ADD: tl.constexpr,
534
+ ):
535
+ input_dtype = Q.dtype.element_ty
536
+ # -- grid id --
537
+ start_n = tl.program_id(0)
538
+ off_h = tl.program_id(1)
539
+ off_z = tl.program_id(2)
540
+ log2e: tl.constexpr = 1.4426950408889634
541
+ qk_scale = sm_scale * log2e
542
+
543
+ # offset pointers for (batch, head)
544
+ Q += off_z * stride_qz + off_h * stride_qh
545
+ K += off_z * stride_kz + off_h * stride_kh
546
+ V += off_z * stride_vz + off_h * stride_vh
547
+ if HAS_BIAS:
548
+ B += off_z * stride_bz + off_h * stride_bh
549
+ DO += off_z * stride_doz + off_h * stride_doh
550
+
551
+ # offset pointers for batch/head
552
+ DK += off_z * stride_dkz + off_h * stride_dkh
553
+ DV += off_z * stride_dvz + off_h * stride_dvh
554
+ if RETURN_DS:
555
+ DS += off_z * stride_bz + off_h * stride_bh
556
+
557
+ # offset pointers for batch/head
558
+ D += (off_z * H + off_h) * M
559
+ L += (off_z * H + off_h) * M
560
+
561
+ if CAUSAL:
562
+ lo = tl.maximum(start_n * BLOCK_N - P_SEQ, 0)
563
+ lo = (lo // BLOCK_M) * BLOCK_M
564
+ else:
565
+ lo = 0
566
+
567
+ offs_m_init = lo + tl.arange(0, BLOCK_M)
568
+ offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
569
+ offs_m_base = tl.arange(0, BLOCK_M)
570
+ offs_k = tl.arange(0, BLOCK_DMODEL)
571
+
572
+ # initialize pointers to value-like data
573
+ q_ptrs = Q + (offs_m_init[:, None] * stride_qm + offs_k[None, :] * stride_qk) # (BLOCK_M, BLOCK_DMODEL)
574
+ k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) # (BLOCK_N, BLOCK_DMODEL)
575
+ v_ptrs = V + (offs_n[:, None] * stride_vn + offs_k[None, :] * stride_vk) # (BLOCK_N, BLOCK_DMODEL)
576
+ do_ptrs = DO + (offs_m_init[:, None] * stride_dom + offs_k[None, :] * stride_dok) # (BLOCK_M, BLOCK_DMODEL)
577
+
578
+ dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_k[None, :] * stride_dvk) # (BLOCK_N, BLOCK_DMODEL)
579
+ dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_k[None, :] * stride_dkk) # (BLOCK_N, BLOCK_DMODEL)
580
+
581
+ if HAS_BIAS:
582
+ bias_ptrs = B + (offs_m_init[:, None] * stride_bm + offs_n[None, :] * stride_bn)
583
+
584
+ if RETURN_DS:
585
+ ds_ptrs = DS + (offs_m_init[:, None] * stride_bm + offs_n[None, :] * stride_bn)
586
+
587
+ # k and v stay in SRAM throughout
588
+ mask_n = offs_n < N
589
+ if DIVISIBLE_N:
590
+ v = tl.load(v_ptrs)
591
+ k = tl.load(k_ptrs)
592
+ else:
593
+ v = tl.load(v_ptrs, mask=mask_n[:, None])
594
+ k = tl.load(k_ptrs, mask=mask_n[:, None])
595
+
596
+ # initialize dk and dv
597
+ dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
598
+ dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
599
+
600
+ # loop over a col
601
+ for start_m in range(lo, M, BLOCK_M):
602
+ start_m = tl.multiple_of(start_m, BLOCK_M)
603
+ offs_m = start_m + offs_m_base
604
+ causal_mask = (P_SEQ + offs_m[:, None]) >= (offs_n[None, :]) # (BLOCK_M, BLOCK_N)
605
+
606
+ # load q1, k1, q2, k2, v, do on-chip
607
+ mask_m = offs_m < M
608
+ if DIVISIBLE_M:
609
+ q = tl.load(q_ptrs)
610
+ else:
611
+ valid_mask = mask_m[:, None] # & mask_n
612
+ q = tl.load(q_ptrs, mask=mask_m[:, None])
613
+
614
+ # load bias
615
+ if HAS_BIAS:
616
+ if DIVISIBLE_M and DIVISIBLE_N:
617
+ b = tl.load(bias_ptrs)
618
+ else:
619
+ b = tl.load(bias_ptrs, mask=mask_m[:, None] & mask_n[None, :])
620
+
621
+ # recompute p = softmax(qk * sm_scale, dim=-1)
622
+ s = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
623
+ s += tl.dot(q, tl.trans(k)) * sm_scale
624
+
625
+ if HAS_BIAS:
626
+ s += b
627
+
628
+ # NOTE: since softmax in backward is pointwise, the normalizer has been saved in the forward pass.
629
+ # So masking on s is not needed.
630
+ # s = tl.where(valid_mask, s , float("-inf"))
631
+ # if CAUSAL:
632
+ # s = tl.where(causal_mask, s, float("-inf"))
633
+
634
+ # -- recompute p ---
635
+ if DIVISIBLE_M:
636
+ l = tl.load(L + offs_m)
637
+ else:
638
+ l = tl.load(L + offs_m, mask=mask_m)
639
+ p = tl.math.exp2((s - l[:, None])*log2e) # (BLOCK_M, BLOCK_N)
640
+
641
+ if not DIVISIBLE_M:
642
+ p = tl.where(valid_mask, p, 0.0)
643
+ if CAUSAL:
644
+ p = tl.where(causal_mask, p, 0.0)
645
+
646
+ # compute dv = dot(p, do)
647
+ if DIVISIBLE_M:
648
+ do = tl.load(do_ptrs)
649
+ else:
650
+ do = tl.load(do_ptrs, mask=mask_m[:, None]) # (BLOCK_M, BLOCK_DMODEL)
651
+ dv += tl.dot(tl.trans(p.to(do.dtype)), do) # (BLOCK_N, BLOCK_DMODEL) # still correct
652
+
653
+ # compute dp = dot(v, do)
654
+ if DIVISIBLE_M:
655
+ delta = tl.load(D + offs_m)
656
+ else:
657
+ delta = tl.load(D + offs_m, mask=mask_m)
658
+ dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
659
+ dp += tl.dot(do, tl.trans(v))
660
+
661
+ # compute ds = p * (dp - delta[:, None])
662
+ ds = p * (dp - delta[:, None]) # (BLOCK_M, BLOCK_N)
663
+
664
+ if not DIVISIBLE_M:
665
+ ds = tl.where(valid_mask, ds, 0.0)
666
+ if CAUSAL:
667
+ ds = tl.where(causal_mask, ds, 0.0)
668
+ ds = ds.to(input_dtype)
669
+
670
+ if RETURN_DS:
671
+ if DIVISIBLE_M and DIVISIBLE_N:
672
+ if USE_DS_ATOMIC_ADD:
673
+ tl.atomic_add(ds_ptrs, ds)
674
+ else:
675
+ tl.store(ds_ptrs, ds)
676
+ else:
677
+ if USE_DS_ATOMIC_ADD:
678
+ tl.atomic_add(ds_ptrs, ds, mask=mask_m[:, None] & mask_n[None, :])
679
+ else:
680
+ tl.store(ds_ptrs, ds, mask=mask_m[:, None] & mask_n[None, :])
681
+
682
+ # compute dk = dot(ds.T, q) masking
683
+ dk += tl.dot(tl.trans(ds), q)
684
+
685
+ # increment pointers
686
+ q_ptrs += BLOCK_M * stride_qm
687
+ do_ptrs += BLOCK_M * stride_dom
688
+ if HAS_BIAS:
689
+ bias_ptrs += BLOCK_M * stride_bm
690
+ if RETURN_DS:
691
+ ds_ptrs += BLOCK_M * stride_bm
692
+
693
+ dk *= sm_scale
694
+ if DIVISIBLE_N:
695
+ tl.store(dk_ptrs, dk.to(input_dtype)) # (BLOCK_N, BLOCK_DMODEL)
696
+ tl.store(dv_ptrs, dv.to(input_dtype)) # (BLOCK_N, BLOCK_DMODEL,)
697
+ else:
698
+ tl.store(dk_ptrs, dk.to(input_dtype), mask=mask_n[:, None]) # (BLOCK_N, BLOCK_DMODEL)
699
+ tl.store(dv_ptrs, dv.to(input_dtype), mask=mask_n[:, None]) # (BLOCK_N, BLOCK_DMODEL,)
700
+
701
+
702
+ @triton.jit
703
+ def _bwd_q_kernel(
704
+ Q, K, V, B, sm_scale, DO,
705
+ DQ,
706
+ L,
707
+ D,
708
+ stride_qz, stride_qh, stride_qm, stride_qk,
709
+ stride_kz, stride_kh, stride_kn, stride_kk,
710
+ stride_vz, stride_vh, stride_vn, stride_vk,
711
+ stride_bz, stride_bh, stride_bm, stride_bn,
712
+ stride_doz, stride_doh, stride_dom, stride_dok,
713
+ stride_dqz, stride_dqh, stride_dqm, stride_dqk,
714
+ Z, H, M, N, P_SEQ,
715
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,
716
+ CAUSAL: tl.constexpr, LARGER_M: tl.constexpr,
717
+ DIVISIBLE_M: tl.constexpr, DIVISIBLE_N: tl.constexpr,
718
+ HAS_BIAS: tl.constexpr
719
+ ):
720
+ input_dtype = Q.dtype.element_ty
721
+ # -- grid id --
722
+ start_m = tl.program_id(0)
723
+ off_h = tl.program_id(1)
724
+ off_z = tl.program_id(2)
725
+
726
+ # scale sm_scale by log_2(e) and use
727
+ # 2^x instead of exp in the loop because CSE and LICM
728
+ # don't work as expected with `exp` in the loop
729
+ log2e: tl.constexpr = 1.4426950408889634
730
+
731
+ # offset pointers for (batch, head)
732
+ Q += off_z * stride_qz + off_h * stride_qh
733
+ K += off_z * stride_kz + off_h * stride_kh
734
+ V += off_z * stride_vz + off_h * stride_vh
735
+ if HAS_BIAS:
736
+ B += off_z * stride_bz + off_h * stride_bh
737
+ DO += off_z * stride_doz + off_h * stride_doh
738
+ D += (off_z * H + off_h) * M
739
+ L += (off_z * H + off_h) * M
740
+
741
+ # offset pointers for batch/head
742
+ DQ += off_z * stride_dqz + off_h * stride_dqh
743
+
744
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
745
+ offs_n_base = tl.arange(0, BLOCK_N)
746
+ offs_n_init = offs_n_base
747
+ offs_k = tl.arange(0, BLOCK_DMODEL)
748
+
749
+ # initialize pointers to value-like data
750
+ q_ptrs = Q + (offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk) # (BLOCK_M, BLOCK_DMODEL)
751
+ k_ptrs = K + (offs_n_init[:, None] * stride_kn + offs_k[None, :] * stride_kk) # (BLOCK_N, BLOCK_DMODEL)
752
+ v_ptrs = V + (offs_n_init[:, None] * stride_vn + offs_k[None, :] * stride_vk) # (BLOCK_N, BLOCK_DMODEL)
753
+
754
+ if HAS_BIAS:
755
+ bias_ptrs = B + (offs_m[:, None] * stride_bm + offs_n_init[None, :] * stride_bn)
756
+
757
+ dq_ptrs = DQ + (offs_m[:, None] * stride_dqm + offs_k[None, :] * stride_dqk) # (BLOCK_M, BLOCK_DMODEL)
758
+ do_ptrs = DO + (offs_m[:, None] * stride_dom + offs_k[None, :] * stride_dok) # (BLOCK_M, BLOCK_DMODEL)
759
+
760
+ # pointer to row-wise quantities in value-like data
761
+ d_ptrs = D + offs_m
762
+ l_ptrs = L + offs_m
763
+
764
+ # load q: it will stay in SRAM throughout
765
+ mask_m = offs_m < M
766
+ if DIVISIBLE_M:
767
+ q = tl.load(q_ptrs)
768
+ do = tl.load(do_ptrs)
769
+ delta = tl.load(d_ptrs)
770
+ l = tl.load(l_ptrs)
771
+ else:
772
+ q = tl.load(q_ptrs, mask=mask_m[:, None])
773
+ do = tl.load(do_ptrs, mask=mask_m[:, None])
774
+ delta = tl.load(d_ptrs, mask=mask_m)
775
+ l = tl.load(l_ptrs, mask=mask_m)
776
+
777
+ # initialize dq
778
+ dq = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
779
+
780
+ # loop over k, v and update accumulator
781
+ # see note "Loop-Bound-For-N"
782
+ if CAUSAL:
783
+ hi = tl.minimum(N, P_SEQ + (start_m + 1) * BLOCK_M)
784
+ if LARGER_M:
785
+ hi = tl.maximum(0, hi)
786
+ else:
787
+ hi = N
788
+
789
+ # loop over a row
790
+ for start_n in range(0, hi, BLOCK_N):
791
+ offs_n = start_n + offs_n_base
792
+
793
+ # load k1, k2, v on chip
794
+ mask_n = offs_n < N
795
+ if DIVISIBLE_N:
796
+ v = tl.load(v_ptrs)
797
+ k = tl.load(k_ptrs)
798
+ else:
799
+ v = tl.load(v_ptrs, mask=mask_n[:, None])
800
+ k = tl.load(k_ptrs, mask=mask_n[:, None])
801
+
802
+ # load bias
803
+ if HAS_BIAS:
804
+ if DIVISIBLE_M and DIVISIBLE_N:
805
+ b = tl.load(bias_ptrs)
806
+ else:
807
+ b = tl.load(bias_ptrs, mask=mask_m[:, None] & mask_n[None, :])
808
+
809
+ # recompute p = softmax(qk * sm_scale, dim=-1)
810
+ if not DIVISIBLE_N:
811
+ valid_mask = mask_n # & mask_m[:, None]
812
+ if CAUSAL:
813
+ causal_mask = (P_SEQ + offs_m[:, None]) >= (offs_n[None, :]) # (BLOCK_M, BLOCK_N)
814
+
815
+ s = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
816
+ s += tl.dot(q, tl.trans(k)) * sm_scale
817
+ if HAS_BIAS:
818
+ s += b
819
+
820
+ # NOTE: since softmax in backward is pointwise, the normalizer has been saved in the forward pass.
821
+ # So masking on s is not needed.
822
+ # if CAUSAL:
823
+ # s = tl.where(causal_mask & valid_mask, s, float("-inf"))
824
+ # else:
825
+ # s = tl.where(valid_mask, s, float("-inf"))
826
+ p = tl.math.exp2((s - l[:, None])*log2e) # (BLOCK_M, BLOCK_N)
827
+
828
+ # compute dp = dot(v, do)
829
+ dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
830
+ dp += tl.dot(do.to(input_dtype), tl.trans(v))
831
+ # no need to mask dp
832
+ # if CAUSAL:
833
+ # dp = tl.where(causal_mask & valid_mask, dp, 0.0)
834
+ # else:
835
+ # dp = tl.where(valid_mask, dp, 0.0)
836
+
837
+ # compute ds = p * (dp - delta[:, None])
838
+ # move scale out to dq at last
839
+ ds = p * (dp - delta[:, None]) # (BLOCK_M, BLOCK_N)
840
+
841
+ # mask ds to ensure no small values
842
+ if not DIVISIBLE_N:
843
+ ds = tl.where(valid_mask, ds, 0.0)
844
+ if CAUSAL:
845
+ ds = tl.where(causal_mask, ds, 0.0)
846
+
847
+ dq += tl.dot(ds.to(input_dtype), k)
848
+
849
+ # increment pointers
850
+ k_ptrs += BLOCK_N * stride_kn
851
+ v_ptrs += BLOCK_N * stride_vn
852
+ if HAS_BIAS:
853
+ bias_ptrs += BLOCK_N * stride_bn
854
+
855
+ dq *= sm_scale
856
+ if DIVISIBLE_M:
857
+ tl.store(dq_ptrs, dq.to(input_dtype))
858
+ else:
859
+ tl.store(dq_ptrs, dq.to(input_dtype), mask=mask_m[:, None])
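A minimal sketch of how the attention wrapper above can be exercised and checked against a plain PyTorch attention (assuming the kernel is importable as flash_attention_v2_bias, as done in modeling_flash_t5.py below; shapes, dtypes and the broadcast bias are illustrative):

import math
import torch
from flash_attention_v2_bias import attention  # assumed import path

B, H, M, N, D = 2, 8, 128, 128, 64
q = torch.randn(B, H, M, D, device="cuda", dtype=torch.float16)
k = torch.randn(B, H, N, D, device="cuda", dtype=torch.float16)
v = torch.randn(B, H, N, D, device="cuda", dtype=torch.float16)
# bias broadcast over batch and heads, relying on the (1, 1, seqlen_q, seqlen_k) stride trick
bias = torch.randn(1, 1, M, N, device="cuda", dtype=torch.float16)

out = attention(q, k, v, bias)  # causal=False, sm_scale defaults to 1/sqrt(D)

# plain PyTorch reference
scores = q.float() @ k.float().transpose(2, 3) / math.sqrt(D) + bias.float()
ref = torch.softmax(scores, dim=-1) @ v.float()
print((out.float() - ref).abs().max())  # should be small (fp16 rounding only)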
gated_mlp.py ADDED
@@ -0,0 +1,729 @@
1
+ import torch
2
+ import math
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ from torch.cuda.amp import custom_bwd, custom_fwd
7
+
8
+ def to_tl_dtype(input):
9
+ if input == torch.float32:
10
+ return tl.float32
11
+ elif input == torch.float16:
12
+ return tl.float16
13
+ elif input == torch.bfloat16:
14
+ return tl.bfloat16
15
+ elif input == torch.int64:
16
+ return tl.int64
17
+ else:
18
+ raise ValueError(f"Unable to convert the given input: '{input}'.")
19
+
20
+ ## Activation function from https://github.com/facebookresearch/xformers/blob/main/xformers/triton/k_activations.py
21
+
22
+ _kAlpha = math.sqrt(2.0 / math.pi)
23
+
24
+ def gelu_torch(x):
25
+ """
26
+ GeLU_ activation - Gaussian error linear unit
27
+
28
+ .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf
29
+ """
30
+ return 0.5 * x * (1 + torch.tanh(_kAlpha * (x + 0.044715 * x * x * x)))
31
+
32
+ def gelu_grad_torch(x):
33
+ # CREDITS: Fast implementation proposed in
34
+ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/fused_bias_gelu.py#L30
35
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
36
+ return 0.5 * x * (
37
+ (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
38
+ ) + 0.5 * (1 + tanh_out)
39
+
40
+ # Tanh helper (used by the GELU approximation below) and ReLU
41
+ @triton.jit
42
+ def tanh(x):
43
+ # Tanh is just a scaled sigmoid
44
+ return 2 * tl.sigmoid(2 * x) - 1
45
+
46
+ @triton.jit
47
+ def relu(x):
48
+ """
49
+ ReLU_ activation function
50
+
51
+ .. _ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
52
+ """
53
+ return tl.where(x >= 0, x, 0.0)
54
+
55
+
56
+ @triton.jit
57
+ def relu_grad(x):
58
+ # The ReLU derivative only depends on the sign of its input:
59
+ # it is 1 where x >= 0 and 0 elsewhere. Here x is the saved
60
+ # pre-activation input, not the incoming gradient.
61
+ return tl.where(x >= 0, 1.0, 0.0)
62
+
63
+ @triton.jit
64
+ def gelu(x):
65
+ """
66
+ GeLU_ activation - Gaussian error linear unit
67
+
68
+ .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf
69
+ """
70
+ return 0.5 * x * (1 + tanh(_kAlpha * (x + 0.044715 * x * x * x)))
71
+
72
+
73
+ @triton.jit
74
+ def gelu_grad(x):
75
+ # CREDITS: Fast implementation proposed in
76
+ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/fused_bias_gelu.py#L30
77
+ tanh_out = tanh(0.79788456 * x * (1 + 0.044715 * x * x))
78
+ return 0.5 * x * (
79
+ (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
80
+ ) + 0.5 * (1 + tanh_out)
81
+
82
+
83
+ @triton.jit
84
+ def gated_matmul_fwd(
85
+ # Pointers to matrices
86
+ out, input, w1, w2,
87
+ act_input_1, act_input_2,
88
+ # Matrix dimensions
89
+ M, N, K,
90
+ stride_om,
91
+ stride_im,
92
+ stride_wn,
93
+ # Meta-parameters
94
+ dtype: tl.constexpr,
95
+ BLOCK_M: tl.constexpr, GROUP_M: tl.constexpr,
96
+ BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
97
+ USE_GELU: tl.constexpr,
98
+ SAVE_ACTIVATION_INPUTS: tl.constexpr,
99
+ IS_EVEN_MNK: tl.constexpr
100
+ ):
101
+
102
+ """
103
+ Kernel for computing Out = activation(A x W + C)
104
+
105
+ - Input has shape (M, K)
106
+ - Weight 1 has shape (K, N)
107
+ - Weight 2 has shape (K, N)
108
+ - Output has shape (M, N)
109
+
110
+ """
111
+
112
+ pid = tl.program_id(0)
113
+
114
+ num_pid_m = tl.cdiv(M, BLOCK_M) # number of program ids along the M axis
115
+ num_pid_n = tl.cdiv(N, BLOCK_N) # number of program ids along the N axis
116
+
117
+ num_pid_in_group = GROUP_M * num_pid_n # number of programs in group
118
+ group_id = pid // num_pid_in_group # id of the group this program is in
119
+ first_pid_m = group_id * GROUP_M # row-id of the first program in the group
120
+ GROUP_M = min(
121
+ num_pid_m - first_pid_m, GROUP_M
122
+ ) # if `num_pid_m` isn't divisible by `GROUP_M`, the last group is smaller
123
+
124
+ # *within groups*, programs are ordered in a column-major order
125
+ # row-id /col-id of the program in the *launch grid*
126
+ pid_m = first_pid_m + (pid % GROUP_M)
127
+ pid_n = (pid % num_pid_in_group) // GROUP_M
128
+
129
+ input_block_ptr = tl.make_block_ptr(
130
+ base=input,
131
+ shape=(M, K),
132
+ strides=(stride_im, 1),
133
+ offsets=(pid_m * BLOCK_M, 0),
134
+ block_shape=(BLOCK_M, BLOCK_K),
135
+ order=(1, 0),
136
+ )
137
+
138
+ w1_block_ptr = tl.make_block_ptr(
139
+ base=w1,
140
+ shape=(K, N),
141
+ strides=(1, stride_wn),
142
+ offsets=(0, pid_n * BLOCK_N),
143
+ block_shape=(BLOCK_K, BLOCK_N),
144
+ order=(0, 1),
145
+ )
146
+
147
+ w2_block_ptr = tl.make_block_ptr(
148
+ base=w2,
149
+ shape=(K, N),
150
+ strides=(1, stride_wn),
151
+ offsets=(0, pid_n * BLOCK_N),
152
+ block_shape=(BLOCK_K, BLOCK_N),
153
+ order=(0, 1),
154
+ )
155
+
156
+ # initialize and iteratively update accumulator
157
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
158
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
159
+
160
+ for i in range(0, K, BLOCK_K):
161
+
162
+ if IS_EVEN_MNK:
163
+ x = tl.load(input_block_ptr)
164
+ w1_blk = tl.load(w1_block_ptr)
165
+ w2_blk = tl.load(w2_block_ptr)
166
+ else:
167
+ x = tl.load(input_block_ptr, boundary_check=(0, 1))
168
+ w1_blk = tl.load(w1_block_ptr, boundary_check=(0, 1))
169
+ w2_blk = tl.load(w2_block_ptr, boundary_check=(0, 1))
170
+
171
+ acc1 += tl.dot(x, w1_blk)
172
+ acc2 += tl.dot(x, w2_blk)
173
+
174
+ input_block_ptr = tl.advance(input_block_ptr, (0, BLOCK_K))
175
+ w1_block_ptr = tl.advance(w1_block_ptr, (BLOCK_K, 0))
176
+ w2_block_ptr = tl.advance(w2_block_ptr, (BLOCK_K, 0))
177
+
178
+ if SAVE_ACTIVATION_INPUTS:
179
+ act_in_1_ptrs = tl.make_block_ptr(
180
+ base=act_input_1,
181
+ shape=(M, N),
182
+ strides=(stride_om, 1),
183
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
184
+ block_shape=(BLOCK_M, BLOCK_N),
185
+ order=(1, 0),
186
+ )
187
+
188
+ act_in_2_ptrs = tl.make_block_ptr(
189
+ base=act_input_2,
190
+ shape=(M, N),
191
+ strides=(stride_om, 1),
192
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
193
+ block_shape=(BLOCK_M, BLOCK_N),
194
+ order=(1, 0),
195
+ )
196
+
197
+ if IS_EVEN_MNK:
198
+ tl.store(act_in_1_ptrs, acc1.to(dtype))
199
+ tl.store(act_in_2_ptrs, acc2.to(dtype))
200
+ else:
201
+ tl.store(act_in_1_ptrs, acc1.to(dtype), boundary_check=(0, 1))
202
+ tl.store(act_in_2_ptrs, acc2.to(dtype), boundary_check=(0, 1))
203
+
204
+ if USE_GELU:
205
+ acc1 = gelu(acc1)
206
+ else:
207
+ acc1 = relu(acc1)
208
+
209
+ # gating
210
+ acc = acc1 * acc2
211
+
212
+ # write back result
213
+ out_ptrs = tl.make_block_ptr(
214
+ base=out,
215
+ shape=(M, N),
216
+ strides=(stride_om, 1),
217
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
218
+ block_shape=(BLOCK_M, BLOCK_N),
219
+ order=(1, 0),
220
+ )
221
+
222
+ if IS_EVEN_MNK:
223
+ tl.store(out_ptrs, acc.to(dtype))
224
+ else:
225
+ tl.store(out_ptrs, acc.to(dtype), boundary_check=(0, 1))
226
+
227
+ @triton.jit
228
+ def gated_matmul_bwd_ygrad(
229
+ dout,
230
+ y1_grad, y2_grad,
231
+ act_input_1, act_input_2,
232
+ M, N,
233
+ stride_dom,
234
+ # Meta-parameters
235
+ dtype: tl.constexpr,
236
+ BLOCK_M: tl.constexpr,
237
+ BLOCK_N: tl.constexpr,
238
+ USE_GELU: tl.constexpr,
239
+ IS_EVEN_MNK: tl.constexpr):
240
+
241
+ """
242
+ Kernel for backward gated MLP
243
+
244
+ Ref :
245
+ y2_grad = torch.mul(gelu(x @ w1), dout)
246
+ y1_grad = torch.mul(gelu_grad(x @ w1) * (x @ w2), dout)
247
+ """
248
+
249
+ pid_m = tl.program_id(0)
250
+ pid_n = tl.program_id(1)
251
+
252
+ # block pointers
253
+ actin_1_block_ptr = tl.make_block_ptr(
254
+ base=act_input_1,
255
+ shape=(M, N),
256
+ strides=(stride_dom, 1),
257
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
258
+ block_shape=(BLOCK_M, BLOCK_N),
259
+ order=(1, 0),
260
+ )
261
+
262
+ actin_2_block_ptr = tl.make_block_ptr(
263
+ base=act_input_2,
264
+ shape=(M, N),
265
+ strides=(stride_dom, 1),
266
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
267
+ block_shape=(BLOCK_M, BLOCK_N),
268
+ order=(1, 0),
269
+ )
270
+
271
+ dout_block_ptr = tl.make_block_ptr(
272
+ base=dout,
273
+ shape=(M, N),
274
+ strides=(stride_dom, 1),
275
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
276
+ block_shape=(BLOCK_M, BLOCK_N),
277
+ order=(1, 0),
278
+ )
279
+
280
+ if IS_EVEN_MNK:
281
+ dout_blk = tl.load(dout_block_ptr)
282
+ actin_1_blk = tl.load(actin_1_block_ptr)
283
+ actin_2_blk = tl.load(actin_2_block_ptr)
284
+ else:
285
+ dout_blk = tl.load(dout_block_ptr, boundary_check=(0, 1))
286
+ actin_1_blk = tl.load(actin_1_block_ptr, boundary_check=(0, 1))
287
+ actin_2_blk = tl.load(actin_2_block_ptr, boundary_check=(0, 1))
288
+
289
+ if USE_GELU:
290
+ actin_act = gelu(actin_1_blk)
291
+ actin_act_grad = gelu_grad(actin_1_blk)
292
+ else:
293
+ actin_act = relu(actin_1_blk)
294
+ actin_act_grad = relu_grad(actin_1_blk)
295
+
296
+ actin_act *= dout_blk # y2_grad
297
+ actin_act_grad *= actin_2_blk
298
+ actin_act_grad *= dout_blk # y1_grad
299
+
300
+ y1_grad_ptrs = tl.make_block_ptr(
301
+ base=y1_grad,
302
+ shape=(M, N),
303
+ strides=(stride_dom, 1),
304
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
305
+ block_shape=(BLOCK_M, BLOCK_N),
306
+ order=(1, 0),
307
+ )
308
+
309
+ y2_grad_ptrs = tl.make_block_ptr(
310
+ base=y2_grad,
311
+ shape=(M, N),
312
+ strides=(stride_dom, 1),
313
+ offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
314
+ block_shape=(BLOCK_M, BLOCK_N),
315
+ order=(1, 0),
316
+ )
317
+
318
+ if IS_EVEN_MNK:
319
+ tl.store(y1_grad_ptrs, actin_act_grad.to(dtype))
320
+ tl.store(y2_grad_ptrs, actin_act.to(dtype))
321
+ else:
322
+ tl.store(y1_grad_ptrs, actin_act_grad.to(dtype), boundary_check=(0, 1))
323
+ tl.store(y2_grad_ptrs, actin_act.to(dtype), boundary_check=(0, 1))
324
+
325
+
326
+ @triton.jit
327
+ def gated_matmul_bwd_input(
328
+ # Pointers to matrices
329
+ w1, w2, # weights inputs
330
+ y1_grad, y2_grad, # partial computation
331
+ din, # outputs
332
+ # Matrix dimensions
333
+ M, N, K,
334
+ stride_dom, stride_im,
335
+ stride_wn,
336
+ # Meta-parameters
337
+ dtype: tl.constexpr,
338
+ BLOCK_M: tl.constexpr, GROUP_M: tl.constexpr,
339
+ BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
340
+ IS_EVEN_MNK: tl.constexpr
341
+ ):
342
+
343
+ """
344
+ Kernel for backward gated MLP
345
+ We group along the N axis
346
+
347
+ Ref :
348
+ x_grad = torch.matmul(y2_grad, w2.t()) + torch.matmul(y1_grad, w1.t())
349
+ """
350
+
351
+ pid = tl.program_id(0)
352
+
353
+ num_pid_m = tl.cdiv(M, BLOCK_M) # number of program ids along the M axis
354
+ num_pid_k = tl.cdiv(K, BLOCK_K) # number of program ids along the K axis
355
+
356
+ num_pid_in_group = GROUP_M * num_pid_k # number of programs in group
357
+ group_id = pid // num_pid_in_group # id of the group this program is in
358
+ first_pid_m = group_id * GROUP_M # row-id of the first program in the group
359
+ GROUP_M = min(
360
+ num_pid_m - first_pid_m, GROUP_M
361
+ ) # if `num_pid_m` isn't divisible by `GROUP_M`, the last group is smaller
362
+
363
+ # *within groups*, programs are ordered in a column-major order
364
+ # row-id /col-id of the program in the *launch grid*
365
+ pid_m = first_pid_m + (pid % GROUP_M)
366
+ pid_k = (pid % num_pid_in_group) // GROUP_M
367
+
368
+ y1_grad_block_ptr = tl.make_block_ptr(
369
+ base=y1_grad,
370
+ shape=(M, N),
371
+ strides=(stride_dom, 1),
372
+ offsets=(pid_m * BLOCK_M, 0),
373
+ block_shape=(BLOCK_M, BLOCK_N),
374
+ order=(1, 0),
375
+ )
376
+
377
+ y2_grad_block_ptr = tl.make_block_ptr(
378
+ base=y2_grad,
379
+ shape=(M, N),
380
+ strides=(stride_dom, 1),
381
+ offsets=(pid_m * BLOCK_M, 0),
382
+ block_shape=(BLOCK_M, BLOCK_N),
383
+ order=(1, 0),
384
+ )
385
+
386
+ w1_block_ptr = tl.make_block_ptr(
387
+ base=w1,
388
+ shape=(N, K),
389
+ strides=(stride_wn, 1),
390
+ offsets=(0, pid_k * BLOCK_K),
391
+ block_shape=(BLOCK_N, BLOCK_K),
392
+ order=(1, 0),
393
+ )
394
+
395
+ w2_block_ptr = tl.make_block_ptr(
396
+ base=w2,
397
+ shape=(N, K),
398
+ strides=(stride_wn, 1),
399
+ offsets=(0, pid_k * BLOCK_K),
400
+ block_shape=(BLOCK_N, BLOCK_K),
401
+ order=(1, 0),
402
+ )
403
+
404
+ # initialize and iteratively update accumulator
405
+ acc_dx = tl.zeros((BLOCK_M, BLOCK_K), dtype=tl.float32)
406
+
407
+ for i in range(0, N, BLOCK_N):
408
+
409
+ if IS_EVEN_MNK:
410
+ w1_blk = tl.load(w1_block_ptr)
411
+ w2_blk = tl.load(w2_block_ptr)
412
+ y1_grad_blk = tl.load(y1_grad_block_ptr)
413
+ y2_grad_blk = tl.load(y2_grad_block_ptr)
414
+ else:
415
+ w1_blk = tl.load(w1_block_ptr, boundary_check=(0, 1))
416
+ w2_blk = tl.load(w2_block_ptr, boundary_check=(0, 1))
417
+ y1_grad_blk = tl.load(y1_grad_block_ptr, boundary_check=(0, 1))
418
+ y2_grad_blk = tl.load(y2_grad_block_ptr, boundary_check=(0, 1))
419
+
420
+ acc_dx += tl.dot(y2_grad_blk, w2_blk)
421
+ acc_dx += tl.dot(y1_grad_blk, w1_blk)
422
+
423
+ w1_block_ptr = tl.advance(w1_block_ptr, (BLOCK_N, 0))
424
+ w2_block_ptr = tl.advance(w2_block_ptr, (BLOCK_N, 0))
425
+ y1_grad_block_ptr = tl.advance(y1_grad_block_ptr, (0, BLOCK_N))
426
+ y2_grad_block_ptr = tl.advance(y2_grad_block_ptr, (0, BLOCK_N))
427
+
428
+ # write back result
429
+ dx_ptrs = tl.make_block_ptr(
430
+ base=din,
431
+ shape=(M, K),
432
+ strides=(stride_im, 1),
433
+ offsets=(pid_m * BLOCK_M, pid_k * BLOCK_K),
434
+ block_shape=(BLOCK_M, BLOCK_K),
435
+ order=(1, 0),
436
+ )
437
+
438
+ if IS_EVEN_MNK:
439
+ tl.store(dx_ptrs, acc_dx.to(dtype))
440
+ else:
441
+ tl.store(dx_ptrs, acc_dx.to(dtype), boundary_check=(0, 1))
442
+
443
+
444
+ @triton.jit
445
+ def gated_matmul_bwd_weights(
446
+ # Pointers to matrices
447
+ input,
448
+ y1_grad, y2_grad, # precomputations
449
+ dw1, dw2, # outputs
450
+ # Matrix dimensions
451
+ M, N, K,
452
+ stride_dom, stride_im,
453
+ stride_wn,
454
+ # Meta-parameters
455
+ dtype: tl.constexpr,
456
+ BLOCK_M: tl.constexpr, GROUP_N: tl.constexpr,
457
+ BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
458
+ IS_EVEN_MNK: tl.constexpr
459
+ ):
460
+
461
+ """
462
+ Kernel for backward gated MLP
463
+ We group along the M axis
464
+
465
+ Ref :
466
+ w1_grad = torch.matmul(y1_grad.t(), x)
467
+ w2_grad = torch.matmul(y2_grad.t(), x)
468
+ """
469
+
470
+ pid = tl.program_id(0)
471
+
472
+ num_pid_n = tl.cdiv(N, BLOCK_N) # number of program ids along the N axis
473
+ num_pid_k = tl.cdiv(K, BLOCK_K) # number of program ids along the K axis
474
+
475
+ num_pid_in_group = GROUP_N * num_pid_k # number of programs in group
476
+ group_id = pid // num_pid_in_group # id of the group this program is in
477
+ first_pid_n = group_id * GROUP_N # row-id of the first program in the group
478
+ GROUP_N = min(
479
+ num_pid_n - first_pid_n, GROUP_N
480
+ ) # if `num_pid_n` isn't divisible by `GROUP_N`, the last group is smaller
481
+
482
+ # *within groups*, programs are ordered in a column-major order
483
+ # row-id /col-id of the program in the *launch grid*
484
+ pid_n = first_pid_n + (pid % GROUP_N)
485
+ pid_k = (pid % num_pid_in_group) // GROUP_N
486
+
487
+ # block pointers
488
+ y1_grad_block_ptr = tl.make_block_ptr(
489
+ base=y1_grad,
490
+ shape=(N, M),
491
+ strides=(1, stride_dom),
492
+ offsets=(pid_n * BLOCK_N, 0),
493
+ block_shape=(BLOCK_N, BLOCK_M),
494
+ order=(0, 1),
495
+ )
496
+
497
+ y2_grad_block_ptr = tl.make_block_ptr(
498
+ base=y2_grad,
499
+ shape=(N, M),
500
+ strides=(1, stride_dom),
501
+ offsets=(pid_n * BLOCK_N, 0),
502
+ block_shape=(BLOCK_N, BLOCK_M),
503
+ order=(0, 1),
504
+ )
505
+
506
+ input_block_ptr = tl.make_block_ptr(
507
+ base=input,
508
+ shape=(M, K),
509
+ strides=(stride_im, 1),
510
+ offsets=(0, pid_k * BLOCK_K),
511
+ block_shape=(BLOCK_M, BLOCK_K),
512
+ order=(1, 0),
513
+ )
514
+
515
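+ # NOTE: `ref` is never used below; this dummy scalar load appears intentional (likely a Triton workaround) and is kept as-is.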
+ ref = tl.load(input + tl.arange(0, 1))
516
+
517
+ # initialize and iteratively update accumulator
518
+ acc_dw1 = tl.zeros((BLOCK_N, BLOCK_K), dtype=tl.float32)
519
+ acc_dw2 = tl.zeros((BLOCK_N, BLOCK_K), dtype=tl.float32)
520
+
521
+ for i in range(0, M, BLOCK_M):
522
+
523
+ if IS_EVEN_MNK:
524
+ y1grad_blk = tl.load(y1_grad_block_ptr)
525
+ y2grad_blk = tl.load(y2_grad_block_ptr)
526
+ x = tl.load(input_block_ptr)
527
+ else:
528
+ y1grad_blk = tl.load(y1_grad_block_ptr, boundary_check=(0, 1))
529
+ y2grad_blk = tl.load(y2_grad_block_ptr, boundary_check=(0, 1))
530
+ x = tl.load(input_block_ptr, boundary_check=(0, 1))
531
+
532
+ acc_dw1 += tl.dot(y1grad_blk, x)
533
+ acc_dw2 += tl.dot(y2grad_blk, x)
534
+
535
+ y1_grad_block_ptr = tl.advance(y1_grad_block_ptr, (0, BLOCK_M))
536
+ y2_grad_block_ptr = tl.advance(y2_grad_block_ptr, (0, BLOCK_M))
537
+ input_block_ptr = tl.advance(input_block_ptr, (BLOCK_M, 0))
538
+
539
+ # write back result
540
+ dw1_ptrs = tl.make_block_ptr(
541
+ base=dw1,
542
+ shape=(N, K),
543
+ strides=(stride_wn, 1),
544
+ offsets=(pid_n * BLOCK_N, pid_k * BLOCK_K),
545
+ block_shape=(BLOCK_N, BLOCK_K),
546
+ order=(1, 0),
547
+ )
548
+
549
+ dw2_ptrs = tl.make_block_ptr(
550
+ base=dw2,
551
+ shape=(N, K),
552
+ strides=(stride_wn, 1),
553
+ offsets=(pid_n * BLOCK_N, pid_k * BLOCK_K),
554
+ block_shape=(BLOCK_N, BLOCK_K),
555
+ order=(1, 0),
556
+ )
557
+
558
+ if IS_EVEN_MNK:
559
+ tl.store(dw1_ptrs, acc_dw1.to(dtype))
560
+ tl.store(dw2_ptrs, acc_dw2.to(dtype))
561
+ else:
562
+ tl.store(dw1_ptrs, acc_dw1.to(dtype), boundary_check=(0, 1))
563
+ tl.store(dw2_ptrs, acc_dw2.to(dtype), boundary_check=(0, 1))
564
+
565
+
566
+ class GatedMLP(torch.autograd.Function):
567
+ @staticmethod
568
+ @custom_fwd
569
+ def forward(ctx, x, w1, w2, use_gelu=True):
570
+
571
+ BLOCK_M = 128
572
+ BLOCK_N = 64
573
+ BLOCK_K = 64
574
+ GROUP_M = 8
575
+
576
+ SAVE_ACT_IN = x.requires_grad
577
+
578
+ if torch.is_autocast_enabled():
579
+ x = x.to(torch.get_autocast_gpu_dtype())
580
+ w1 = w1.to(torch.get_autocast_gpu_dtype())
581
+ w2 = w2.to(torch.get_autocast_gpu_dtype())
582
+
583
+ assert x.is_contiguous()
584
+ assert w1.is_contiguous()
585
+ assert w2.is_contiguous()
586
+ assert w1.shape == w2.shape
587
+ assert x.shape[-1] == w1.shape[1]
588
+ assert x.shape[-1] == w2.shape[1]
589
+
590
+ x_ = x if x.ndim == 2 else x.flatten(0, -2)
591
+
592
+ M, K = x_.shape
593
+ N, K = w1.shape
594
+
595
+ IS_EVEN_MNK = ((M % BLOCK_M) == 0) and ((N % BLOCK_N) == 0) and ((K % BLOCK_K) == 0)
596
+
597
+ out = torch.empty((M, N), device=x.device, dtype=x.dtype)
598
+
599
+ tl_dtype = to_tl_dtype(x.dtype)
600
+
601
+ act_input_1, act_input_2 = None, None
602
+ if SAVE_ACT_IN:
603
+ act_input_1 = torch.empty_like(out)
604
+ act_input_2 = torch.empty_like(out)
605
+
606
+ grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),)
607
+ gated_matmul_fwd[grid](
608
+ out,
609
+ x_, w1, w2,
610
+ act_input_1, act_input_2,
611
+ M, N, K,
612
+ out.stride(0), x_.stride(0),
613
+ w1.stride(0),
614
+ tl_dtype,
615
+ BLOCK_M, GROUP_M, BLOCK_N, BLOCK_K,
616
+ use_gelu,
617
+ SAVE_ACT_IN,
618
+ IS_EVEN_MNK,
619
+ )
620
+
621
+ ctx.save_for_backward(x_, w1, w2, act_input_1, act_input_2)
622
+ ctx.use_gelu = use_gelu
623
+ ctx.is_even_nmk = IS_EVEN_MNK
624
+ ctx.x_shape = x.shape
625
+
626
+ out = out if x.ndim == 2 else out.reshape(*x.shape[:-1], N)
627
+
628
+ return out
629
+
630
+ @staticmethod
631
+ @custom_bwd
632
+ def backward(ctx, dout):
633
+ BLOCK_M = 64
634
+ BLOCK_N = 64
635
+ BLOCK_K = 64
636
+ GROUP_M = 8
637
+
638
+ x_, w1, w2, act_input_1, act_input_2 = ctx.saved_tensors
639
+
640
+ M, K = x_.shape
641
+ N, K = w1.shape
642
+
643
+ tl_dtype = to_tl_dtype(x_.dtype)
644
+
645
+ '''
646
+ din = torch.empty_like(x_)
647
+ dw1 = torch.empty_like(w1)
648
+ dw2 = torch.empty_like(w2)
649
+
650
+ dout_ = dout if dout.ndim == 2 else dout.flatten(0, -2)
651
+
652
+ y1_grad = torch.empty_like(dout_)
653
+ y2_grad = torch.empty_like(dout_)
654
+
655
+ grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N))
656
+ gated_matmul_bwd_ygrad[grid](
657
+ dout_,
658
+ y1_grad, y2_grad,
659
+ act_input_1, act_input_2,
660
+ M, N,
661
+ dout_.stride(0),
662
+ # Meta-parameters
663
+ tl_dtype,
664
+ BLOCK_M, BLOCK_N,
665
+ ctx.use_gelu,
666
+ ctx.is_even_nmk)
667
+
668
+ grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(K, BLOCK_K),)
669
+ gated_matmul_bwd_input[grid](
670
+ w1, w2,
671
+ y1_grad, y2_grad,
672
+ din,
673
+ M, N, K,
674
+ dout_.stride(0), x_.stride(0),
675
+ w1.stride(0),
676
+ tl_dtype,
677
+ BLOCK_M, GROUP_M,
678
+ BLOCK_N, BLOCK_K,
679
+ ctx.is_even_nmk)
680
+
681
+ # reorder sizes
682
+ BLOCK_M = 64
683
+ BLOCK_N = 64
684
+ grid = (triton.cdiv(N, BLOCK_N) * triton.cdiv(K, BLOCK_K),)
685
+ gated_matmul_bwd_weights[grid](
686
+ x_,
687
+ y1_grad, y2_grad,
688
+ dw1, dw2,
689
+ M, N, K,
690
+ y1_grad.stride(0), x_.stride(0),
691
+ dw1.stride(0),
692
+ tl_dtype,
693
+ BLOCK_M, GROUP_M,
694
+ BLOCK_N, BLOCK_K,
695
+ ctx.is_even_nmk)
696
+
697
+ din = din if len(ctx.x_shape) == 2 else din.reshape(ctx.x_shape)
698
+ '''
699
+
700
+ dout_ = dout if dout.ndim == 2 else dout.flatten(0, -2)
701
+
702
+ y1_grad = torch.empty_like(dout_)
703
+ y2_grad = torch.empty_like(dout_)
704
+
705
+ grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N))
706
+ gated_matmul_bwd_ygrad[grid](
707
+ dout_,
708
+ y1_grad, y2_grad,
709
+ act_input_1, act_input_2,
710
+ M, N,
711
+ dout_.stride(0),
712
+ # Meta-parameters
713
+ tl_dtype,
714
+ BLOCK_M, BLOCK_N,
715
+ ctx.use_gelu,
716
+ ctx.is_even_nmk)
717
+
718
+ #y2_grad = torch.mul(gelu_torch(x_ @ w1.t()), dout_)
719
+ #y1_grad = torch.mul(gelu_grad_torch(x_ @ w1.t()) * (x_ @ w2.t()), dout_)
720
+
721
+ din = torch.matmul(y2_grad, w2) + torch.matmul(y1_grad, w1)
722
+ dw1 = torch.matmul(y1_grad.t(), x_)
723
+ dw2 = torch.matmul(y2_grad.t(), x_)
724
+
725
+ din = din if len(ctx.x_shape) == 2 else din.reshape(ctx.x_shape)
726
+
727
+ return din, dw1, dw2, None
728
+
729
+ gated_mlp = GatedMLP.apply
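A minimal sketch of how the fused gated MLP above can be exercised and checked against the PyTorch formulation it implements, out = act(x @ w1^T) * (x @ w2^T) (assuming the module is importable as gated_mlp; shapes are illustrative):

import torch
from gated_mlp import gated_mlp, gelu_torch  # assumed import path

B, S, K, N = 4, 64, 512, 1024  # batch, sequence length, d_model, d_ff
x = torch.randn(B, S, K, device="cuda", dtype=torch.float16)
w1 = torch.randn(N, K, device="cuda", dtype=torch.float16)
w2 = torch.randn(N, K, device="cuda", dtype=torch.float16)

out = gated_mlp(x, w1, w2, True)  # use_gelu=True

# plain PyTorch reference (tanh-approximated GELU, matching gelu_torch above)
ref = gelu_torch(x.float() @ w1.float().t()) * (x.float() @ w2.float().t())
print((out.float() - ref).abs().max())  # should be small (fp16 rounding only)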
modeling_flash_t5.py ADDED
@@ -0,0 +1,839 @@
1
+ # From: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import copy
6
+ import math
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ from torch import nn
11
+ from torch.nn import CrossEntropyLoss
12
+ import torch.nn.functional as F
13
+
14
+ from transformers.modeling_utils import ModuleUtilsMixin
15
+ from transformers.modeling_outputs import ModelOutput, Seq2SeqModelOutput, BaseModelOutput
16
+ from transformers import PreTrainedModel
17
+
18
+ try:
19
+ from .rms_norm import fast_rms_layernorm
20
+ except ImportError:
21
+ fast_rms_layernorm = None
22
+
23
+ try:
24
+ from .cross_entropy_loss import fast_cross_entropy_loss
25
+ except ImportError:
26
+ fast_cross_entropy_loss = None
27
+
28
+ try:
29
+ from .flash_attention_v2_bias import attention as flash_attention_triton
30
+ except ImportError:
31
+ flash_attention_triton = None
32
+
33
+ try:
34
+ from .gated_mlp import gated_mlp
35
+ except ImportError:
36
+ gated_mlp = None
37
+
38
+ try:
39
+ #from flash_attn import flash_attn_kvpacked_func, flash_attn_func
40
+ from .fa2_compilable import flash_attn_kvpacked_func, flash_attn_func
41
+ except ImportError:
42
+ flash_attn_kvpacked_func, flash_attn_func = None, None
43
+
44
+ from .attn_ref import attn_ref
45
+
46
+ from .configuration_flash_t5 import FlashT5Config
47
+ from .positional_encoding import ALiBiPositionalEncoding, RelativePositionalEncoding, RotaryPositionalEncoding
48
+
49
+ @dataclass
50
+ class EncoderOutput(ModelOutput):
51
+ hidden_states: torch.FloatTensor = None
52
+ attention_mask: torch.FloatTensor = None
53
+
54
+ @dataclass
55
+ class Seq2SeqLMOutput(ModelOutput):
56
+ loss: torch.FloatTensor = None
57
+ logits: torch.FloatTensor = None
58
+ encoder_outputs: EncoderOutput = None
59
+
60
+
61
+ class FlashT5CrossEntropyLoss(nn.Module):
62
+ def __init__(self, z_loss_factor=0.0, label_smoothing=0.0, use_triton_crossentropy=False):
63
+
64
+ super().__init__()
65
+
66
+ if use_triton_crossentropy and fast_cross_entropy_loss is None:
67
+ raise ImportError("fast_cross_entropy_loss is not available")
68
+
69
+ self.use_triton_crossentropy = use_triton_crossentropy
70
+ self.z_loss_factor = z_loss_factor
71
+
72
+ self.cross_entropy_loss = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
73
+
74
+ def compute_zloss(self, logits: torch.Tensor, z_loss: float):
75
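+ # z-loss as in T5X/PaLM: penalize log(Z)^2, where Z is the softmax partition function, to keep logits from drifting.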
+ logits_sum = torch.logsumexp(logits, dim=-1, keepdim=True)
76
+ log_z = torch.squeeze(logits_sum, axis=-1)
77
+ total_z_loss = z_loss * torch.square(log_z)
78
+ return total_z_loss.mean()
79
+
80
+ def forward(self, logits, labels):
81
+
82
+ if self.use_triton_crossentropy:
83
+ return fast_cross_entropy_loss(logits, labels, z_loss_factor=self.z_loss_factor)
84
+
85
+ # use standard method
86
+ batch, seq_len, d = logits.shape
87
+ logits_flatten = logits.float().view(batch*seq_len, d) # Must cast to float32 for numerical stability
88
+ labels_flatten = labels.view(-1)
89
+ loss = self.cross_entropy_loss(logits_flatten, labels_flatten)
90
+ z_loss = 0.0
91
+ if self.z_loss_factor != 0.0:
92
+ z_loss = self.compute_zloss(logits_flatten[labels_flatten != -100],
93
+ z_loss=self.z_loss_factor)
94
+ return loss, z_loss
95
+
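For orientation, a minimal standalone sketch of the loss combination performed above: the usual cross-entropy plus a z-loss term `factor * mean(logsumexp(logits)^2)`. The factor below is an arbitrary example value, not a recommended setting.

```python
import torch
import torch.nn as nn

logits = torch.randn(2, 5, 32)             # (batch, seq_len, vocab_size)
labels = torch.randint(0, 32, (2, 5))
flat_logits = logits.float().view(-1, 32)  # cast to fp32 for numerical stability
flat_labels = labels.view(-1)

ce = nn.CrossEntropyLoss()(flat_logits, flat_labels)
log_z = torch.logsumexp(flat_logits, dim=-1)
z_loss = 1e-4 * log_z.square().mean()      # 1e-4 is an arbitrary example factor
total = ce + z_loss
```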
96
+ class FlashT5LayerNorm(nn.Module):
97
+ def __init__(self, hidden_size, eps=1e-6, use_triton_layernorm=False):
98
+ """
99
+ Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
100
+ """
101
+ super().__init__()
102
+
103
+ if use_triton_layernorm and fast_rms_layernorm is None:
104
+ raise ImportError("fast_rms_layernorm is not available")
105
+
106
+ self.use_triton_layernorm = use_triton_layernorm
107
+ self.weight = nn.Parameter(torch.ones(hidden_size))
108
+ self.variance_epsilon = eps
109
+
110
+ def forward(self, hidden_states):
111
+
112
+ if self.use_triton_layernorm:
113
+ return fast_rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
114
+
115
+ # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
116
+ # Square Layer Normalization (https://arxiv.org/abs/1910.07467), thus the variance is calculated
117
+ # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
118
+ # half-precision inputs is done in fp32
119
+
120
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
121
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
122
+
123
+ # convert into half-precision if necessary
124
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
125
+ hidden_states = hidden_states.to(self.weight.dtype)
126
+
127
+ return self.weight * hidden_states
128
+
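A minimal eager sketch of the RMS normalization computed above (the hidden size of 8 is an example value); the squared mean is accumulated in fp32 before rescaling and casting back:

```python
import torch

x = torch.randn(2, 4, 8, dtype=torch.bfloat16)   # (batch, seq, hidden)
w = torch.ones(8, dtype=torch.bfloat16)          # learned scale

variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
y = (x * torch.rsqrt(variance + 1e-6)).to(w.dtype) * w
```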
129
+ class FlashT5DenseAct(nn.Module):
130
+ def __init__(self, config: FlashT5Config):
131
+ super().__init__()
132
+ self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
133
+ self.dropout = nn.Dropout(config.dropout_rate)
134
+ self.act = torch.nn.GELU(approximate='tanh') if config.use_gelu_act else torch.nn.ReLU()
135
+
136
+ def forward(self, hidden_states):
137
+ hidden_states = self.wi(hidden_states)
138
+ hidden_states = self.act(hidden_states)
139
+ hidden_states = self.dropout(hidden_states)
140
+ # Note: the dtype re-cast against self.wo found in the upstream T5 code was removed
+ # here because the wo projection belongs to FlashT5LayerFF, not to this module;
+ # referencing self.wo here would raise an AttributeError.
+ return hidden_states
148
+
149
+ class FlashT5DenseGatedAct(nn.Module):
150
+ def __init__(self, config: FlashT5Config):
151
+ super().__init__()
152
+ self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
153
+ self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
154
+ self.dropout = nn.Dropout(config.dropout_rate)
155
+ self.act = torch.nn.GELU(approximate='tanh') if config.use_gelu_act else torch.nn.ReLU()
156
+
157
+ self.use_triton_gated_mlp = config.use_triton_gated_mlp
158
+ if self.use_triton_gated_mlp and gated_mlp is None:
159
+ raise ImportError("gated_mlp is not available")
160
+ self.use_gelu_act = config.use_gelu_act
161
+
162
+ def forward(self, hidden_states):
163
+
164
+ if self.use_triton_gated_mlp:
165
+ return gated_mlp(hidden_states, self.wi_0.weight, self.wi_1.weight, self.use_gelu_act)
166
+
167
+ hidden_act = self.act(self.wi_0(hidden_states))
168
+ hidden_linear = self.wi_1(hidden_states)
169
+ hidden_states = hidden_act * hidden_linear
170
+ hidden_states = self.dropout(hidden_states)
171
+
172
+ return hidden_states
173
+
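A short eager sketch of the gated ("GLU") feed-forward path above, assuming example sizes d_model=16 and d_ff=32; the down-projection `wo` is applied afterwards by FlashT5LayerFF:

```python
import torch
import torch.nn as nn

x = torch.randn(2, 4, 16)             # (batch, seq, d_model)
wi_0 = nn.Linear(16, 32, bias=False)  # gate branch
wi_1 = nn.Linear(16, 32, bias=False)  # linear branch
act = nn.GELU(approximate='tanh')

h = act(wi_0(x)) * wi_1(x)            # element-wise gating, shape (2, 4, 32)
```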
174
+ class FlashT5LayerFF(nn.Module):
175
+ def __init__(self, config: FlashT5Config):
176
+ super().__init__()
177
+ if config.use_glu_mlp:
178
+ self.act = FlashT5DenseGatedAct(config)
179
+ else:
180
+ self.act = FlashT5DenseAct(config)
181
+
182
+ self.layer_norm = FlashT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon, use_triton_layernorm=config.use_triton_layernorm)
183
+ self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
184
+ self.dropout = nn.Dropout(config.dropout_rate)
185
+
186
+ def forward(self, hidden_states):
187
+ forwarded_states = self.layer_norm(hidden_states).type_as(hidden_states)
188
+ forwarded_states = self.act(forwarded_states)
189
+ forwarded_states = self.wo(forwarded_states)
190
+ hidden_states = hidden_states + self.dropout(forwarded_states)
191
+ return hidden_states
192
+
193
+
194
+ class FlashT5Attention(nn.Module, ModuleUtilsMixin):
195
+ def __init__(self, config: FlashT5Config, has_positional_encoding=False, is_causal=False):
196
+ super().__init__()
197
+ self.is_decoder = config.is_decoder
198
+ self.has_positional_encoding = has_positional_encoding
199
+ self.is_causal = is_causal
200
+ self.relative_attention_num_buckets = config.relative_attention_num_buckets
201
+ self.relative_attention_max_distance = config.relative_attention_max_distance
202
+ self.d_model = config.d_model
203
+ self.key_value_proj_dim = config.d_kv
204
+ self.n_heads = config.num_heads
205
+ self.p_dropout = config.attention_dropout_rate
206
+ self.inner_dim = self.n_heads * self.key_value_proj_dim
207
+ self.use_flash_attention = config.use_flash_attention
208
+ self.position_encoding_type = config.position_encoding_type
209
+ self.max_sequence_length = config.max_sequence_length
210
+ self.softmax_scale = 1.0/math.sqrt(self.n_heads)
211
+ self.use_full_bias_size = config.use_full_bias_size
212
+
213
+ if self.use_flash_attention == "triton" and flash_attention_triton is None:
214
+ raise ImportError("flash_attention_triton is not available")
215
+ elif self.use_flash_attention == "fa2" and flash_attn_func is None:
216
+ raise ImportError("Flash Attention 2 is not available")
217
+
218
+ assert (self.p_dropout == 0.0) or (self.use_flash_attention != "triton"), "Triton attention does not support dropout"
219
+
220
+ self.pe_encoding = None
221
+ if self.position_encoding_type == "ALiBi" and has_positional_encoding:
222
+ # build alibi matrix with an upper bound on seq length
223
+ self.pe_encoding = ALiBiPositionalEncoding(self.max_sequence_length, self.n_heads, config.alibi_mode, config.use_randomized_position_encoding)
224
+ elif self.position_encoding_type == "t5" and has_positional_encoding:
225
+ self.pe_encoding = RelativePositionalEncoding(self.relative_attention_num_buckets, self.relative_attention_max_distance, self.n_heads, self.max_sequence_length, config.use_randomized_position_encoding)
226
+ elif self.position_encoding_type == "RoPE":
227
+ self.pe_encoding = RotaryPositionalEncoding(int(self.key_value_proj_dim * config.rotary_emb_fraction), self.max_sequence_length, config.rotary_base, config.rotary_interleaved, config.rotary_scale_base, config.use_randomized_position_encoding)
228
+
229
+ self.Wq = nn.Linear(self.d_model, self.inner_dim, bias=False)
230
+ self.Wk = nn.Linear(self.d_model, self.inner_dim, bias=False)
231
+ self.Wv = nn.Linear(self.d_model, self.inner_dim, bias=False)
232
+ self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
233
+
234
+ def forward(
235
+ self,
236
+ hidden_states,
237
+ mask=None,
238
+ key_value_states=None,
239
+ position_bias=None,
240
+ ):
241
+ """
242
+ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
243
+ """
244
+ # Input is (batch_size, seq_length, dim)
245
+ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
246
+ batch_size, seq_length = hidden_states.shape[:2]
247
+ key_length = seq_length if key_value_states is None else key_value_states.shape[1]
248
+ q = self.Wq(hidden_states)
249
+ if key_value_states is None:
250
+ k = self.Wk(hidden_states)
251
+ v = self.Wv(hidden_states)
252
+ else:
253
+ k = self.Wk(key_value_states)
254
+ v = self.Wv(key_value_states)
255
+
256
+ q = q.view(batch_size, seq_length, self.n_heads, self.key_value_proj_dim)
257
+ k = k.view(batch_size, key_length, self.n_heads, self.key_value_proj_dim)
258
+ v = v.view(batch_size, key_length, self.n_heads, self.key_value_proj_dim)
259
+
260
+ if position_bias is None and self.pe_encoding is not None:
261
+ q, k, v, position_bias = self.pe_encoding(q, k, v)
262
+
263
+ if position_bias is not None and self.use_full_bias_size and (self.use_flash_attention == "fa2" or self.use_flash_attention == "triton"):
264
+ position_bias = position_bias.expand(q.shape[0], q.shape[2], q.shape[1], k.shape[1]).contiguous()
265
+
266
+ if self.use_flash_attention == "fa2":
267
+ output = flash_attn_func(q, k, v, dropout_p=self.p_dropout, softmax_scale=self.softmax_scale, attn_bias=position_bias, causal=self.is_causal)
268
+ elif self.use_flash_attention == "triton":
269
+ q = q.permute(0, 2, 1, 3)
270
+ k = k.permute(0, 2, 1, 3)
271
+ v = v.permute(0, 2, 1, 3)
272
+ output = flash_attention_triton(q, k, v, position_bias, self.is_causal, self.softmax_scale)
273
+ output = output.permute(0, 2, 1, 3)
274
+ else: # fall back to the eager reference attention (attn_ref)
275
+ q = q.permute(0, 2, 1, 3)
276
+ k = k.permute(0, 2, 1, 3)
277
+ v = v.permute(0, 2, 1, 3)
278
+ output = attn_ref(q, k, v, position_bias, dropout_p=self.p_dropout, sm_scale=self.softmax_scale, causal=self.is_causal)
279
+ output = output.permute(0, 2, 1, 3)
280
+
281
+ output = self.o(output.reshape(output.shape[0], output.shape[1], self.inner_dim))
282
+ return (output, position_bias)
283
+
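A hedged sketch of the fallback ("ref") attention path above, assuming attn_ref.py from this repository is importable: tensors are projected in (batch, seq, heads, head_dim) layout and permuted to (batch, heads, seq, head_dim) before the call; the scale below is just an example value.

```python
import math
import torch
from attn_ref import attn_ref  # reference attention shipped in this repository

B, S, H, D = 2, 8, 4, 16
q = torch.randn(B, S, H, D).permute(0, 2, 1, 3)      # (B, H, S, D)
k = torch.randn(B, S, H, D).permute(0, 2, 1, 3)
v = torch.randn(B, S, H, D).permute(0, 2, 1, 3)
bias = torch.zeros(1, H, S, S)                       # additive position bias (ALiBi / T5 buckets)

out = attn_ref(q, k, v, bias, sm_scale=1.0 / math.sqrt(D), causal=False)
out = out.permute(0, 2, 1, 3).reshape(B, S, H * D)   # back to (B, S, inner_dim)
```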
284
+
285
+ class FlashT5LayerSelfAttention(nn.Module):
286
+ def __init__(self, config, has_positional_encoding=False):
287
+ super().__init__()
288
+ self.self_attention = FlashT5Attention(config, has_positional_encoding=has_positional_encoding, is_causal=config.is_decoder)
289
+ self.layer_norm = FlashT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon, use_triton_layernorm=config.use_triton_layernorm)
290
+ self.dropout = nn.Dropout(config.dropout_rate)
291
+
292
+ def forward(
293
+ self,
294
+ hidden_states,
295
+ attention_mask=None,
296
+ position_bias=None,
297
+ ):
298
+ normed_hidden_states = self.layer_norm(hidden_states).type_as(hidden_states)
299
+ attention_output = self.self_attention(
300
+ normed_hidden_states,
301
+ mask=attention_mask,
302
+ position_bias=position_bias,
303
+ )
304
+ hidden_states = hidden_states + self.dropout(attention_output[0])
305
+ outputs = (hidden_states,) + attention_output[1:]
306
+ return outputs
307
+
308
+
309
+ class FlashT5LayerCrossAttention(nn.Module):
310
+ def __init__(self, config):
311
+ super().__init__()
312
+ self.cross_attention = FlashT5Attention(config, has_positional_encoding=False)
313
+ self.layer_norm = FlashT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon, use_triton_layernorm=config.use_triton_layernorm)
314
+ self.dropout = nn.Dropout(config.dropout_rate)
315
+
316
+ def forward(
317
+ self,
318
+ hidden_states,
319
+ key_value_states,
320
+ attention_mask=None,
321
+ position_bias=None,
322
+ ):
323
+ normed_hidden_states = self.layer_norm(hidden_states)
324
+ attention_output = self.cross_attention(
325
+ normed_hidden_states,
326
+ mask=attention_mask,
327
+ key_value_states=key_value_states,
328
+ position_bias=position_bias,
329
+ )
330
+ layer_output = hidden_states + self.dropout(attention_output[0])
331
+ outputs = (layer_output,) + attention_output[1:]
332
+ return outputs
333
+
334
+
335
+ class FlashT5Block(nn.Module):
336
+ def __init__(self, config, has_positional_encoding=False):
337
+ super().__init__()
338
+ self.is_decoder = config.is_decoder
339
+
340
+ self.self_attention_layer = FlashT5LayerSelfAttention(config, has_positional_encoding=has_positional_encoding)
341
+
342
+ if self.is_decoder:
343
+ self.cross_attention_layer = FlashT5LayerCrossAttention(config)
344
+
345
+ self.ff_layer = FlashT5LayerFF(config)
346
+
347
+ def forward(
348
+ self,
349
+ hidden_states,
350
+ attention_mask=None,
351
+ position_bias=None,
352
+ encoder_hidden_states=None,
353
+ encoder_attention_mask=None,
354
+ encoder_decoder_position_bias=None,
355
+ ):
356
+ self_attention_outputs = self.self_attention_layer(
357
+ hidden_states,
358
+ attention_mask=attention_mask,
359
+ position_bias=position_bias,
360
+ )
361
+ hidden_states = self_attention_outputs[0]
362
+ attention_outputs = self_attention_outputs[1:] # Relative position weights
363
+
364
+ if self.is_decoder and encoder_hidden_states is not None:
365
+ cross_attention_outputs = self.cross_attention_layer(
366
+ hidden_states,
367
+ key_value_states=encoder_hidden_states,
368
+ attention_mask=encoder_attention_mask,
369
+ position_bias=encoder_decoder_position_bias,
370
+ )
371
+ hidden_states = cross_attention_outputs[0]
372
+
373
+ # Keep relative position weights
374
+ attention_outputs = attention_outputs + cross_attention_outputs[1:]
375
+
376
+ # Apply Feed Forward layer
377
+ hidden_states = self.ff_layer(hidden_states)
378
+
379
+ outputs = (hidden_states,) + attention_outputs
380
+ return outputs # hidden-states, (self-attention position bias), (cross-attention position bias)
381
+
382
+ class FlashT5Stack(nn.Module, ModuleUtilsMixin):
383
+ def __init__(self, config, embed_tokens):
384
+ super().__init__()
385
+ assert embed_tokens is not None
386
+
387
+ self.config = config
388
+ self.embed_tokens = embed_tokens
389
+ self.is_decoder = config.is_decoder
390
+ self.use_flash_attention = config.use_flash_attention
391
+
392
+ self.block = nn.ModuleList(
393
+ [FlashT5Block(config, has_positional_encoding=bool(i == 0)) for i in range(config.num_layers)]
394
+ )
395
+
396
+ self.final_layer_norm = FlashT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon, use_triton_layernorm=config.use_triton_layernorm)
397
+ self.dropout = nn.Dropout(config.dropout_rate)
398
+
399
+ def forward(
400
+ self,
401
+ input_ids=None,
402
+ attention_mask=None,
403
+ encoder_hidden_states=None,
404
+ encoder_attention_mask=None,
405
+ inputs_embeds=None,
406
+ head_mask=None,
407
+ cross_attn_head_mask=None,
408
+ past_key_values=None,
409
+ use_cache=None,
410
+ output_attentions=None,
411
+ output_hidden_states=None,
412
+ return_dict=None) -> BaseModelOutput:
413
+ # Allow calls with either input_ids or pre-computed inputs_embeds
+ if inputs_embeds is None:
+ input_shape = input_ids.size()
+ inputs_embeds = self.embed_tokens(input_ids)
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+ batch_size, seq_length = input_shape
+
+ if torch.is_autocast_enabled() and inputs_embeds.device.type == 'cuda':
+ inputs_embeds = inputs_embeds.to(torch.get_autocast_gpu_dtype())
421
+
422
+ # Masking
423
+ if attention_mask is None:
424
+ attention_mask = torch.ones(batch_size, seq_length, device=inputs_embeds.device, dtype=torch.bool)
425
+
426
+ if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
427
+ encoder_seq_length = encoder_hidden_states.shape[1]
428
+ encoder_attention_mask = torch.ones(
429
+ batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.bool
430
+ )
431
+
432
+ position_bias = None
433
+ encoder_decoder_position_bias = None
434
+
435
+ hidden_states = self.dropout(inputs_embeds)
436
+
437
+ for _, layer_module in enumerate(self.block):
438
+ layer_outputs = layer_module(
439
+ hidden_states,
440
+ attention_mask=attention_mask,
441
+ position_bias=position_bias,
442
+ encoder_hidden_states=encoder_hidden_states,
443
+ encoder_attention_mask=encoder_attention_mask,
444
+ encoder_decoder_position_bias=encoder_decoder_position_bias,
445
+ )
446
+
447
+ # We share the position biases between the layers - the first layer stores them
448
+ position_bias = layer_outputs[1]
449
+ if self.is_decoder and encoder_hidden_states is not None:
450
+ encoder_decoder_position_bias = layer_outputs[2]
451
+
452
+ hidden_states = layer_outputs[0]
453
+
454
+ hidden_states = self.final_layer_norm(hidden_states).type_as(hidden_states)
455
+ hidden_states = self.dropout(hidden_states)
456
+
457
+ return BaseModelOutput(
458
+ last_hidden_state=hidden_states
459
+ )
460
+
461
+
462
+ class FlashT5PreTrainedModel(PreTrainedModel):
463
+ """
464
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
465
+ models.
466
+ """
467
+
468
+ config_class = FlashT5Config
469
+ base_model_prefix = "transformer"
470
+ is_parallelizable = False
471
+ supports_gradient_checkpointing = True
472
+ _no_split_modules = ["FlashT5Block"]
473
+ _keep_in_fp32_modules = []
474
+
475
+ def _init_weights(self, module):
476
+ factor = self.config.initializer_factor # Used for testing weights initialization
477
+ if isinstance(module, FlashT5LayerNorm):
478
+ module.weight.data.fill_(factor * 1.0)
479
+ elif isinstance(module, (FlashT5ForConditionalGeneration)):
480
+ module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
481
+ if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
482
+ module.lm_head.weight.data.normal_(mean=0.0, std=factor * self.config.d_model ** -0.5)
483
+ elif isinstance(module, FlashT5DenseGatedAct):
484
+ d_ff, d_model = module.wi_0.weight.data.size()
485
+ module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
486
+ module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
487
+ elif isinstance(module, FlashT5LayerFF):
488
+ d_ff, d_model = module.wo.weight.data.size()
489
+ module.wo.weight.data.normal_(mean=0.0, std=factor * ((d_ff) ** -0.5))
490
+ elif isinstance(module, FlashT5Attention):
491
+ d_model = self.config.d_model
492
+ key_value_proj_dim = self.config.d_kv
493
+ n_heads = self.config.num_heads
494
+ module.Wq.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
495
+ module.Wk.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
496
+ module.Wv.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
497
+ module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
498
+ if module.has_positional_encoding:
499
+ if hasattr(module.pe_encoding, "relative_attention_bias"):
500
+ module.pe_encoding.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
501
+
502
+ def _shift_right(self, input_ids):
503
+ decoder_start_token_id = self.config.decoder_start_token_id
504
+ pad_token_id = self.config.pad_token_id
505
+
506
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
507
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
508
+ shifted_input_ids[..., 0] = decoder_start_token_id
509
+
510
+ # replace possible -100 values in labels by `pad_token_id`
511
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
512
+
513
+ return shifted_input_ids
514
+
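A toy illustration of _shift_right, assuming decoder_start_token_id=0 and pad_token_id=0:

```python
import torch

labels = torch.tensor([[42, 43, 1, -100]])
shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = 0                        # decoder_start_token_id
shifted.masked_fill_(shifted == -100, 0)   # replace label padding with pad_token_id
# shifted -> tensor([[ 0, 42, 43,  1]])
```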
515
+
516
+ class FlashT5Model(FlashT5PreTrainedModel):
517
+ def __init__(self, config: FlashT5Config):
518
+ super().__init__(config)
519
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
520
+
521
+ encoder_config = copy.deepcopy(config)
522
+ encoder_config.is_decoder = False
523
+ encoder_config.use_cache = False
524
+ encoder_config.is_encoder_decoder = False
525
+ self.encoder = FlashT5Stack(encoder_config, self.shared)
526
+
527
+ decoder_config = copy.deepcopy(config)
528
+ decoder_config.is_decoder = True
529
+ decoder_config.is_encoder_decoder = False
530
+ decoder_config.num_layers = config.num_decoder_layers
531
+ self.decoder = FlashT5Stack(decoder_config, self.shared)
532
+
533
+ # Initialize weights and apply final processing
534
+ self.post_init()
535
+
536
+ # Model parallel
537
+ self.model_parallel = False
538
+ self.device_map = None
539
+
540
+ def get_input_embeddings(self):
541
+ return self.shared
542
+
543
+ def set_input_embeddings(self, new_embeddings):
544
+ self.shared = new_embeddings
545
+ # FlashT5Stack has no set_input_embeddings helper; assign the embedding module directly
+ self.encoder.embed_tokens = new_embeddings
+ self.decoder.embed_tokens = new_embeddings
547
+
548
+ def get_encoder(self):
549
+ return self.encoder
550
+
551
+ def get_decoder(self):
552
+ return self.decoder
553
+
554
+ def forward(
555
+ self,
556
+ input_ids: Optional[torch.LongTensor] = None,
557
+ attention_mask: Optional[torch.FloatTensor] = None,
558
+ decoder_input_ids: Optional[torch.LongTensor] = None,
559
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
560
+ head_mask: Optional[torch.FloatTensor] = None,
561
+ decoder_head_mask: Optional[torch.FloatTensor] = None,
562
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
563
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
564
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
565
+ inputs_embeds: Optional[torch.Tensor] = None,
566
+ decoder_inputs_embeds: Optional[torch.Tensor] = None,
567
+ use_cache: Optional[bool] = None,
568
+ output_attentions: Optional[bool] = None,
569
+ output_hidden_states: Optional[bool] = None,
570
+ return_dict: Optional[bool] = None,
571
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
572
+
573
+ # Encode if needed (training, first prediction pass)
574
+ if encoder_outputs is None:
575
+ encoder_outputs = self.encoder(
576
+ input_ids=input_ids,
577
+ attention_mask=attention_mask,
578
+ inputs_embeds=inputs_embeds
579
+ )
580
+
581
+ hidden_states = encoder_outputs[0]
582
+
583
+ # Decode
584
+ decoder_outputs = self.decoder(
585
+ input_ids=decoder_input_ids,
586
+ attention_mask=decoder_attention_mask,
587
+ inputs_embeds=decoder_inputs_embeds,
588
+ encoder_hidden_states=hidden_states,
589
+ encoder_attention_mask=attention_mask
590
+ )
591
+
592
+ return Seq2SeqModelOutput(
593
+ last_hidden_state=decoder_outputs.last_hidden_state,
594
+ decoder_hidden_states=decoder_outputs.hidden_states,
595
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
596
+ encoder_hidden_states=encoder_outputs.hidden_states,
597
+ )
598
+
599
+ class FlashT5ForConditionalGeneration(FlashT5PreTrainedModel):
600
+
601
+ def __init__(self, config: FlashT5Config):
602
+ super().__init__(config)
603
+ config.is_encoder_decoder = False
604
+ assert not config.tie_word_embeddings
605
+
606
+ self.config = config
607
+ self.model_dim = config.d_model
608
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
609
+
610
+ encoder_config = copy.deepcopy(config)
611
+ encoder_config.is_decoder = False
612
+ self.encoder = FlashT5Stack(encoder_config, self.shared)
613
+
614
+ decoder_config = copy.deepcopy(config)
615
+ decoder_config.is_decoder = True
616
+ decoder_config.num_layers = config.num_decoder_layers
617
+ self.decoder = FlashT5Stack(decoder_config, self.shared)
618
+
619
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
620
+
621
+ self.loss_fct = FlashT5CrossEntropyLoss(z_loss_factor=config.z_loss,
622
+ label_smoothing=config.label_smoothing,
623
+ use_triton_crossentropy=config.use_triton_crossentropy)
624
+
625
+ # Initialize weights and apply final processing
626
+ self.post_init()
627
+
628
+ def prepare_inputs_for_generation(
629
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
630
+ ):
631
+ # do nothing
632
+ model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
633
+
634
+ return model_inputs
635
+
636
+ def get_input_embeddings(self):
637
+ return self.shared
638
+
639
+ def set_input_embeddings(self, value):
640
+ self.shared = value
641
+
642
+ def generate(
643
+ self,
644
+ input_ids: Optional[torch.LongTensor] = None,
645
+ attention_mask: Optional[torch.FloatTensor] = None,
646
+ max_length = 32,
647
+ **kwargs,
648
+ ) -> torch.LongTensor:
649
+ """
650
+ input_ids: B x L_encoder, int64
651
+ attention_mask: B x L_encoder, int64
652
+ 1 for tokens to attend to, 0 for tokens to ignore
653
+
654
+ Generation:
655
+ Starts with 0, ends with 1, padding is 0
656
+
657
+ # For 20 input/outputs, the diff between my implementation and HF is 9.8s vs 11.4s
658
+ """
659
+ B, _ = input_ids.size()
660
+ labels = torch.zeros(B, 1, dtype=torch.long, device=input_ids.device)
661
+ encoder_outputs = None
662
+
663
+ for _ in range(max_length):
664
+ out = self.forward(
665
+ input_ids=input_ids,
666
+ attention_mask=attention_mask,
667
+ decoder_input_ids=labels,
668
+ encoder_outputs=encoder_outputs,
669
+ )
670
+ encoder_outputs = out.encoder_outputs
671
+ top_labels = out.logits[:, -1].argmax(-1).unsqueeze(-1)
672
+ labels = torch.cat([labels, top_labels], dim=-1)
673
+
674
+ if (labels == 1).sum(-1).clamp(min=0, max=1).sum().item() == B:
675
+ break
676
+
677
+ labels[:, -1] = 1
678
+
679
+ # Mask out the padding, i.e., all positions after the first 1 with 0
680
+ B, L = labels.size()
681
+ mask = torch.arange(L, device=labels.device).unsqueeze(0) <= (labels == 1).long().argmax(-1).unsqueeze(-1)
682
+ labels = labels.masked_fill(~mask, 0)
683
+
684
+ return labels
685
+
686
+ def forward(
687
+ self,
688
+ input_ids: Optional[torch.LongTensor] = None,
689
+ attention_mask: Optional[torch.FloatTensor] = None,
690
+ decoder_input_ids: Optional[torch.LongTensor] = None,
691
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
692
+ labels: Optional[torch.LongTensor] = None,
693
+ encoder_outputs = None,
694
+ ) -> Seq2SeqLMOutput:
695
+ """
696
+ input_ids: B x L_encoder, int64
697
+ attention_mask: B x L_encoder, int64
698
+ 1 for tokens to attend to, 0 for tokens to ignore
699
+ labels: B x L_decoder, int64
700
+ """
701
+ if encoder_outputs is None:
702
+ encoder_outputs = self.encoder(
703
+ input_ids=input_ids,
704
+ attention_mask=attention_mask,
705
+ )
706
+
707
+ hidden_states = encoder_outputs[0]
708
+
709
+ if labels is not None and decoder_input_ids is None:
710
+ decoder_input_ids = self._shift_right(labels)
711
+
712
+ decoder_outputs = self.decoder(
713
+ input_ids=decoder_input_ids,
714
+ attention_mask=decoder_attention_mask,
715
+ encoder_hidden_states=hidden_states,
716
+ encoder_attention_mask=attention_mask,
717
+ )
718
+
719
+ sequence_output = decoder_outputs[0]
720
+ lm_logits = self.lm_head(sequence_output)
721
+
722
+ loss = None
723
+ if labels is not None:
724
+ loss, z_loss = self.loss_fct(lm_logits, labels)
725
+ loss += z_loss
726
+
727
+ return Seq2SeqLMOutput(
728
+ loss=loss,
729
+ logits=lm_logits,
730
+ encoder_outputs=encoder_outputs,
731
+ )
732
+
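A hedged usage sketch of the greedy decoding loop implemented by generate above; the checkpoint id is a placeholder, not a published model:

```python
from transformers import AutoTokenizer

# tok = AutoTokenizer.from_pretrained("<flash-t5-checkpoint>")  # placeholder id
# model = FlashT5ForConditionalGeneration.from_pretrained("<flash-t5-checkpoint>")
# batch = tok(["summarize: ..."], return_tensors="pt")
# out_ids = model.generate(batch.input_ids, batch.attention_mask, max_length=32)
# print(tok.batch_decode(out_ids, skip_special_tokens=True))
```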
733
+
734
+
735
+ class FlashT5EncoderModel(FlashT5PreTrainedModel):
736
+ _tied_weights_keys = ["encoder.embed_tokens.weight"]
737
+
738
+ def __init__(self, config: FlashT5Config):
739
+ super().__init__(config)
740
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
741
+
742
+ encoder_config = copy.deepcopy(config)
743
+ encoder_config.use_cache = False
744
+ encoder_config.is_encoder_decoder = False
745
+ self.encoder = FlashT5Stack(encoder_config, self.shared)
746
+
747
+ # Initialize weights and apply final processing
748
+ self.post_init()
749
+
750
+ # Model parallel
751
+ self.model_parallel = False
752
+ self.device_map = None
753
+
754
+
755
+ def parallelize(self, device_map=None):
756
+ warnings.warn(
757
+ "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
758
+ " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
759
+ " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
760
+ " 'block.1': 1, ...}",
761
+ FutureWarning,
762
+ )
763
+ self.device_map = (
764
+ get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
765
+ if device_map is None
766
+ else device_map
767
+ )
768
+ assert_device_map(self.device_map, len(self.encoder.block))
769
+ self.encoder.parallelize(self.device_map)
770
+ self.model_parallel = True
771
+
772
+ def deparallelize(self):
773
+ warnings.warn(
774
+ "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
775
+ FutureWarning,
776
+ )
777
+ self.encoder.deparallelize()
778
+ self.encoder = self.encoder.to("cpu")
779
+ self.model_parallel = False
780
+ self.device_map = None
781
+ torch.cuda.empty_cache()
782
+
783
+ def get_input_embeddings(self):
784
+ return self.shared
785
+
786
+ def set_input_embeddings(self, new_embeddings):
787
+ self.shared = new_embeddings
788
+ self.encoder.embed_tokens = new_embeddings  # FlashT5Stack stores the embedding module directly
789
+
790
+ def get_encoder(self):
791
+ return self.encoder
792
+
793
+ def _prune_heads(self, heads_to_prune):
794
+ """
795
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
796
+ class PreTrainedModel
797
+ """
798
+ for layer, heads in heads_to_prune.items():
799
+ self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
800
+
801
+ def forward(
802
+ self,
803
+ input_ids: Optional[torch.LongTensor] = None,
804
+ attention_mask: Optional[torch.FloatTensor] = None,
805
+ head_mask: Optional[torch.FloatTensor] = None,
806
+ inputs_embeds: Optional[torch.FloatTensor] = None,
807
+ output_attentions: Optional[bool] = None,
808
+ output_hidden_states: Optional[bool] = None,
809
+ return_dict: Optional[bool] = None,
810
+ ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
811
+ r"""
812
+ Returns:
813
+
814
+ Example:
815
+
816
+ ```python
817
+ >>> from transformers import AutoTokenizer, T5EncoderModel
818
+
819
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
820
+ >>> model = T5EncoderModel.from_pretrained("t5-small")
821
+ >>> input_ids = tokenizer(
822
+ ... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
823
+ ... ).input_ids # Batch size 1
824
+ >>> outputs = model(input_ids=input_ids)
825
+ >>> last_hidden_states = outputs.last_hidden_state
826
+ ```"""
827
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
828
+
829
+ encoder_outputs = self.encoder(
830
+ input_ids=input_ids,
831
+ attention_mask=attention_mask,
832
+ inputs_embeds=inputs_embeds,
833
+ head_mask=head_mask,
834
+ output_attentions=output_attentions,
835
+ output_hidden_states=output_hidden_states,
836
+ return_dict=return_dict,
837
+ )
838
+
839
+ return encoder_outputs
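Similarly, a hedged sketch of using the encoder-only wrapper to get token-level hidden states; the repository id is again a placeholder:

```python
import torch
from transformers import AutoTokenizer

# tok = AutoTokenizer.from_pretrained("<flash-t5-checkpoint>")  # placeholder id
# enc = FlashT5EncoderModel.from_pretrained("<flash-t5-checkpoint>")
# with torch.no_grad():
#     out = enc(**tok("Bonjour le monde", return_tensors="pt"))
# embeddings = out.last_hidden_state                            # (1, seq_len, d_model)
```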
positional_encoding.py ADDED
@@ -0,0 +1,337 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange, repeat
5
+
6
+ from flash_attn.layers.rotary import apply_rotary_emb_qkv_, apply_rotary_emb_func, apply_rotary_emb_kv_
7
+
8
+ class RelativePositionalEncoding(nn.Module):
9
+
10
+ def __init__(self, relative_attention_num_buckets, relative_attention_max_distance, n_heads, max_sequence_length, bidirectional=True, randomized_position=False):
11
+
12
+ super().__init__()
13
+
14
+ self.relative_attention_num_buckets = relative_attention_num_buckets
15
+ self.relative_attention_max_distance = relative_attention_max_distance
16
+ self.n_heads = n_heads
17
+ self.max_sequence_length = max_sequence_length
18
+ self.bidirectional = bidirectional
19
+ self.randomized_position = randomized_position
20
+
21
+ self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
22
+
23
+ @staticmethod
24
+ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
25
+ """
26
+ Adapted from Mesh Tensorflow:
27
+ https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
28
+
29
+ Translate relative position to a bucket number for relative attention. The relative position is defined as
30
+ memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
31
+ position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
32
+ small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
33
+ positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
34
+ This should allow for more graceful generalization to longer sequences than the model has been trained on
35
+
36
+ Args:
37
+ relative_position: an int32 Tensor
38
+ bidirectional: a boolean - whether the attention is bidirectional
39
+ num_buckets: an integer
40
+ max_distance: an integer
41
+
42
+ Returns:
43
+ a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
44
+ """
45
+ relative_buckets = 0
46
+ if bidirectional:
47
+ num_buckets //= 2
48
+ relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
49
+ relative_position = torch.abs(relative_position)
50
+ else:
51
+ relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
52
+ # now relative_position is in the range [0, inf)
53
+
54
+ # half of the buckets are for exact increments in positions
55
+ max_exact = num_buckets // 2
56
+ is_small = relative_position < max_exact
57
+
58
+ # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
59
+ relative_position_if_large = max_exact + (
60
+ torch.log(relative_position.float() / max_exact)
61
+ / math.log(max_distance / max_exact)
62
+ * (num_buckets - max_exact)
63
+ ).to(torch.long)
64
+ relative_position_if_large = torch.min(
65
+ relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
66
+ )
67
+
68
+ relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
69
+ return relative_buckets
70
+
71
+ def compute_bias(self, query_length, key_length, device=None):
72
+ """Compute binned relative position bias"""
73
+ if device is None:
74
+ device = self.relative_attention_bias.weight.device
75
+
76
+ if self.randomized_position:
77
+ context_position = torch.arange(self.max_sequence_length, dtype=torch.long, device=device)
78
+ context_indices_rand, _ = torch.sort(torch.randperm(self.max_sequence_length)[:query_length])
79
+ context_indices_rand[0] = 0 # root the first element of the sequence
80
+ context_position = context_position[context_indices_rand][:, None]
81
+
82
+ memory_position = torch.arange(self.max_sequence_length, dtype=torch.long, device=device)
83
+ memory_indices_rand, _ = torch.sort(torch.randperm(self.max_sequence_length)[:key_length])
84
+ memory_indices_rand[0] = 0 # root the first element of the sequence
85
+ memory_position = memory_position[memory_indices_rand][None, :]
86
+ else:
87
+ context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
88
+ memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
89
+
90
+ relative_position = memory_position - context_position # shape (query_length, key_length)
91
+
92
+ relative_position_bucket = self._relative_position_bucket(
93
+ relative_position, # shape (query_length, key_length)
94
+ bidirectional=self.bidirectional,
95
+ num_buckets=self.relative_attention_num_buckets,
96
+ max_distance=self.relative_attention_max_distance,
97
+ )
98
+ values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
99
+ values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
100
+ return values
101
+
102
+ def forward(self, q, k=None, v=None):
103
+
104
+ query_length = q.shape[1]
105
+ key_length = k.shape[1] if k is not None else query_length
106
+ bias = self.compute_bias(query_length, key_length, device=q.device).contiguous().to(q.dtype)
107
+
108
+ return q, k, v, bias
109
+
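A quick shape check of the T5-style bucketed bias (the hyper-parameters are example values; importing this module requires flash_attn to be installed because of the rotary imports at the top):

```python
import torch

pe = RelativePositionalEncoding(
    relative_attention_num_buckets=32,
    relative_attention_max_distance=128,
    n_heads=4,
    max_sequence_length=512,
)
bias = pe.compute_bias(query_length=8, key_length=8, device=torch.device("cpu"))
# bias.shape -> torch.Size([1, 4, 8, 8]); it is added to the attention logits.
```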
110
+
111
+ class ALiBiPositionalEncoding(nn.Module):
112
+
113
+ def __init__(self, max_sequence_length, num_heads, mode='symetric', randomized_position=False):
114
+
115
+ super().__init__()
116
+
117
+ self.max_sequence_length = max_sequence_length
118
+ self.num_heads = num_heads
119
+ self.mode = mode
120
+ self.randomized_position = randomized_position
121
+
122
+ self.alibi_bias = self.build_alibi_bias_matrix(num_heads, max_sequence_length, mode)
123
+
124
+ @staticmethod
125
+ def fill_with_neg_inf(t):
126
+ """FP16-compatible function that fills a tensor with -inf."""
127
+ return t.float().fill_(float("-inf")).type_as(t)
128
+
129
+ def get_slopes(self, n):
130
+
131
+ def get_slopes_power_of_2(n):
132
+ start = (2**(-2**-(math.log2(n)-3)))
133
+ ratio = start
134
+ return [start*ratio**i for i in range(n)]
135
+
136
+ if math.log2(n).is_integer():
137
+ return get_slopes_power_of_2(n) #In the paper, we only train models that have 2^a heads for some a. This function has
138
+ else: #some good properties that only occur when the input is a power of 2. To maintain that even
139
+ closest_power_of_2 = 2**math.floor(math.log2(n)) #when the number of heads is not a power of 2, we use this workaround.
140
+ return get_slopes_power_of_2(closest_power_of_2) + self.get_slopes(2*closest_power_of_2)[0::2][:n-closest_power_of_2]
141
+
142
+ def build_symetric_alibi_bias_matrix(self, num_heads, maxpos):
143
+
144
+ context_position = torch.arange(maxpos)[:, None]
145
+ memory_position = torch.arange(maxpos)[None, :]
146
+
147
+ relative_position = memory_position - context_position
148
+ relative_position = torch.abs(relative_position).unsqueeze(0).expand(num_heads, -1,-1)
149
+
150
+ slopes = torch.Tensor(self.get_slopes(num_heads)) * -1
151
+ alibi = slopes.unsqueeze(1).unsqueeze(1) * relative_position
152
+ return alibi.view(1, num_heads, maxpos, maxpos)
153
+
154
+ def build_asymetric_alibi_bias_matrix(self, num_heads, maxpos):
155
+ _future_mask_right = torch.triu(self.fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1).unsqueeze(0).repeat(num_heads // 2, 1, 1)
156
+ _future_mask_left = torch.tril(self.fill_with_neg_inf(torch.zeros([maxpos, maxpos])), -1).unsqueeze(0).repeat(num_heads // 2, 1, 1)
157
+
158
+ nonsym_mask = torch.cat((_future_mask_right, _future_mask_left), dim = 0).unsqueeze(0)
159
+ slopes = torch.Tensor(self.get_slopes(num_heads // 2)) * -1
160
+
161
+ context_position = torch.arange(maxpos)[:, None]
162
+ memory_position = torch.arange(maxpos)[None, :]
163
+
164
+ relative_position = memory_position - context_position
165
+ relative_position = torch.abs(relative_position).unsqueeze(0).expand(num_heads // 2, -1,-1)
166
+
167
+ alibi = slopes.unsqueeze(1).unsqueeze(1) * relative_position
168
+ alibi = alibi.view(1, num_heads // 2, maxpos, maxpos)
169
+ alibi = alibi.repeat(1, 2, 1, 1)
170
+
171
+ return alibi.view(1, num_heads, maxpos, maxpos) + nonsym_mask.view(1, num_heads, maxpos, maxpos)
172
+
173
+
174
+ def build_alibi_bias_matrix(self, num_heads, maxpos, mode='symetric'):
175
+ if mode == 'symetric':
176
+ return self.build_symetric_alibi_bias_matrix(num_heads, maxpos)
177
+ elif mode == 'asymetric':
178
+ return self.build_asymetric_alibi_bias_matrix(num_heads, maxpos)
179
+ else:
180
+ raise ValueError("ALiBi mode " + mode + " is not implemented.")
181
+
182
+ def forward(self, q, k=None, v=None):
183
+
184
+ query_length = q.shape[1]
185
+ key_length = k.shape[1] if k is not None else query_length
186
+ assert (self.alibi_bias.shape[2] >= query_length) and (self.alibi_bias.shape[3] >= key_length), "Sequence length larger than allowed alibi bound"
187
+
188
+ if self.randomized_position:
189
+ query_indices_rand, _ = torch.sort(torch.randperm(self.max_sequence_length)[:query_length])
190
+ key_indices_rand, _ = torch.sort(torch.randperm(self.max_sequence_length)[:key_length])
191
+
192
+ # ground sequences
193
+ query_indices_rand[0] = 0
194
+ key_indices_rand[0] = 0
195
+
196
+ # index queries and keys separately to keep the full (query_length, key_length) grid
+ bias = self.alibi_bias[:, :, query_indices_rand][:, :, :, key_indices_rand].to(q.device)
197
+
198
+ else:
199
+ bias = self.alibi_bias[:, :, :query_length, :key_length].to(q.device)
200
+
201
+ return q, k, v, bias.to(q.dtype).contiguous()
202
+
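A small sketch of the symmetric ALiBi bias with example sizes: for a power-of-two head count the slopes form a geometric sequence, and the forward pass slices the precomputed (1, heads, maxpos, maxpos) matrix:

```python
import torch

alibi = ALiBiPositionalEncoding(max_sequence_length=64, num_heads=8, mode='symetric')
slopes = alibi.get_slopes(8)     # [0.5, 0.25, ..., 0.00390625]

q = torch.randn(2, 16, 8, 32)    # (batch, seq, heads, head_dim)
_, _, _, bias = alibi(q)         # bias: (1, 8, 16, 16), more negative at larger distances
```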
203
+ class RotaryPositionalEncoding(nn.Module):
204
+
205
+ def __init__(self, dim,
206
+ max_sequence_length,
207
+ base=10000.0,
208
+ interleaved=False,
209
+ scale_base=None,
210
+ randomized_position=False):
211
+
212
+ super().__init__()
213
+
214
+ self.max_sequence_length = max_sequence_length
215
+ self.randomized_position = randomized_position
216
+
217
+ self.dim = dim
218
+ self.base = base
219
+ self.interleaved = interleaved
220
+ self.scale_base = scale_base
221
+
222
+ inv_freq = self._compute_inv_freq()
223
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
224
+
225
+ scale = (
226
+ (torch.arange(0, dim, 2, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
227
+ if scale_base is not None
228
+ else None
229
+ )
230
+ self.register_buffer("scale", scale, persistent=False)
231
+
232
+ self._cos_cached = None
233
+ self._sin_cached = None
234
+ self._cos_k_cached = None
235
+ self._sin_k_cached = None
236
+
237
+ def _compute_inv_freq(self, device=None):
238
+ return 1.0 / (
239
+ self.base
240
+ ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
241
+ )
242
+
243
+ def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
244
+ # Reset the tables if the sequence length has changed,
245
+ # if we're on a new device (possibly due to tracing for instance),
246
+ # or if we're switching from inference mode to training
247
+ if (
248
+ self._cos_cached is None
249
+ or self._cos_cached.device != device
250
+ or self._cos_cached.dtype != dtype
251
+ or (self.training and self._cos_cached.is_inference())
252
+ ):
253
+ # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
254
+ # And the output of arange can be quite large, so bf16 would lose a lot of precision.
255
+ # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
256
+ inv_freq = self._compute_inv_freq(device=device)
257
+
258
+ # Don't do einsum, it converts fp32 to fp16 under AMP
259
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
260
+ t = torch.arange(seqlen, device=device, dtype=dtype)
261
+ freqs = torch.outer(t, inv_freq)
262
+ if self.scale is None:
263
+ self._cos_cached = torch.cos(freqs).to(dtype)
264
+ self._sin_cached = torch.sin(freqs).to(dtype)
265
+ self._cos_k_cached = None
266
+ self._sin_k_cached = None
267
+ else:
268
+ power = (
269
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
270
+ - seqlen // 2
271
+ ) / self.scale_base
272
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
273
+ # We want the multiplication by scale to happen in fp32
274
+ self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
275
+ self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
276
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
277
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
278
+
279
+ def forward(self, q, k=None, v=None):
280
+
281
+ if self._cos_cached is None:
282
+ self._update_cos_sin_cache(self.max_sequence_length, device=q.device, dtype=q.dtype)
283
+
284
+ if k is None and v is None:
285
+ q = apply_rotary_emb_qkv_(
286
+ q,
287
+ self._cos_cached,
288
+ self._sin_cached,
289
+ self._cos_k_cached,
290
+ self._sin_k_cached,
291
+ interleaved=self.interleaved,
292
+ seqlen_offsets=0
293
+ )
294
+ elif v is None and k is not None:
295
+ q = apply_rotary_emb_func(
296
+ q,
297
+ self._cos_cached,
298
+ self._sin_cached,
299
+ interleaved=self.interleaved,
300
+ inplace=True,
301
+ seqlen_offsets=0
302
+ )
303
+
304
+ k = apply_rotary_emb_kv_(
305
+ k,
306
+ self._cos_cached if self._cos_k_cached is None else self._cos_k_cached,
307
+ self._sin_cached if self._sin_k_cached is None else self._sin_k_cached,
308
+ interleaved=self.interleaved,
309
+ seqlen_offsets=0,
310
+ )
311
+ else:
312
+ q = apply_rotary_emb_func(
313
+ q,
314
+ self._cos_cached,
315
+ self._sin_cached,
316
+ interleaved=self.interleaved,
317
+ inplace=True,
318
+ seqlen_offsets=0
319
+ )
320
+
321
+ k = apply_rotary_emb_func(
322
+ k,
323
+ self._cos_cached if self._cos_k_cached is None else self._cos_k_cached,
324
+ self._sin_cached if self._sin_k_cached is None else self._sin_k_cached,
325
+ interleaved=self.interleaved,
326
+ seqlen_offsets=0,
327
+ )
328
+
329
+ # v is returned unchanged: the rotary rotation is only applied to queries and keys,
+ # consistently with the branches above.
336
+
337
+ return q, k, v, None
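For intuition, a plain PyTorch re-statement of the interleaved rotary rotation applied by the flash_attn kernels above (a reference sketch, not the fused implementation):

```python
import torch

def rope_interleaved_ref(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    # x: (batch, seq, heads, head_dim) with an even head_dim
    b, s, h, d = x.shape
    inv_freq = 1.0 / (base ** (torch.arange(0, d, 2, dtype=torch.float32) / d))
    angles = torch.outer(torch.arange(s, dtype=torch.float32), inv_freq)  # (seq, head_dim/2)
    cos = angles.cos()[None, :, None, :]
    sin = angles.sin()[None, :, None, :]
    x1, x2 = x[..., 0::2], x[..., 1::2]      # interleaved channel pairs
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out
```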
rms_norm.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ # Copyright 2024 CATIE. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Modifications to the original file
17
+ # - add weights gradients
18
+ # - remove the mask if size is a power of 2
19
+ # - support for torch.compile
20
+
21
+ import triton
22
+ import triton.language as tl
23
+ import torch
24
+
25
+
26
+ MAX_FUSED_SIZE = 65536
27
+ next_power_of_2 = triton.next_power_of_2
28
+
29
+ def calculate_settings(n):
30
+ BLOCK_SIZE = next_power_of_2(n)
31
+ if BLOCK_SIZE > MAX_FUSED_SIZE:
32
+ raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\
33
+ f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.")
34
+ num_warps = 4
35
+ if BLOCK_SIZE >= 32768: num_warps = 32
36
+ elif BLOCK_SIZE >= 8192: num_warps = 16
37
+ elif BLOCK_SIZE >= 2048: num_warps = 8
38
+ return BLOCK_SIZE, num_warps
39
+
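For instance (assuming Triton is installed so this module imports), typical hidden sizes map to:

```python
BLOCK_SIZE, num_warps = calculate_settings(768)    # -> (1024, 4)
BLOCK_SIZE, num_warps = calculate_settings(3072)   # -> (4096, 8)
```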
40
+
41
+ @triton.jit
42
+ def _rms_layernorm_forward(
43
+ Y, Y_row_stride,
44
+ X, X_row_stride,
45
+ W, W_row_stride,
46
+ r, r_row_stride,
47
+ n_cols, eps,
48
+ BLOCK_SIZE : tl.constexpr,
49
+ IS_EVEN_X: tl.constexpr
50
+ ):
51
+ """
52
+ Fast RMS Layernorm kernel
53
+ Inspiration from a Triton tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
55
+ """
56
+ row_idx = tl.program_id(0)
57
+ col_offsets = tl.arange(0, BLOCK_SIZE)
58
+ mask = col_offsets < n_cols
59
+
60
+ Y += row_idx * Y_row_stride
61
+ X += row_idx * X_row_stride
62
+ r += row_idx * r_row_stride
63
+
64
+ if IS_EVEN_X:
65
+ X_row = tl.load(X + col_offsets).to(tl.float32)
66
+ W_row = tl.load(W + col_offsets)
67
+ else:
68
+ X_row = tl.load(X + col_offsets, mask=mask, other=0).to(tl.float32)
69
+ W_row = tl.load(W + col_offsets, mask=mask, other=0)
70
+
71
+ row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
72
+ inv_var = tl.math.rsqrt(row_var + eps)
73
+ tl.store(r, inv_var)
74
+ normed = X_row * inv_var
75
+ normed = normed.to(W_row.dtype) # Exact copy from HF
76
+ output = normed * W_row
77
+
78
+ if IS_EVEN_X:
79
+ tl.store(Y + col_offsets, output)
80
+ else:
81
+ tl.store(Y + col_offsets, output, mask=mask)
82
+
83
+ @triton.jit
84
+ def _rms_layernorm_backward(
85
+ dY, dY_row_stride,
86
+ X, X_row_stride,
87
+ W, W_row_stride,
88
+ r, r_row_stride,
89
+ dW, dW_row_stride,
90
+ dX, dX_row_stride,
91
+ n_cols, eps,
92
+ BLOCK_SIZE : tl.constexpr,
93
+ IS_EVEN_X: tl.constexpr
94
+ ):
95
+ """
96
+ Fast RMS Layernorm kernel for the backward pass
97
+ Inspiration from a Triton tutorial:
98
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
99
+ """
100
+ row_idx = tl.program_id(0)
101
+ col_offsets = tl.arange(0, BLOCK_SIZE)
102
+ mask = col_offsets < n_cols
103
+
104
+ dY += row_idx * dY_row_stride
105
+ X += row_idx * X_row_stride
106
+ r += row_idx * r_row_stride
107
+ dW += row_idx * dW_row_stride
108
+ dX += row_idx * dX_row_stride
109
+
110
+ if IS_EVEN_X:
111
+ dY_row = tl.load(dY + col_offsets).to(tl.float32)
112
+ X_row = tl.load(X + col_offsets).to(tl.float32)
113
+ W_row = tl.load(W + col_offsets).to(tl.float32)
114
+ else:
115
+ dY_row = tl.load(dY + col_offsets, mask=mask, other=0).to(tl.float32)
116
+ X_row = tl.load(X + col_offsets, mask=mask, other=0).to(tl.float32)
117
+ W_row = tl.load(W + col_offsets, mask=mask, other=0).to(tl.float32)
118
+
119
+ # Get saved row variance
120
+ inv_var = tl.load(r).to(tl.float32)
121
+ normed = X_row * inv_var
122
+ dW_row = dY_row * normed
123
+
124
+ dY_W = dY_row * W_row
125
+ rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0)
126
+ output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed)
127
+
128
+ if IS_EVEN_X:
129
+ tl.store(dW + col_offsets, dW_row)
130
+ tl.store(dX + col_offsets, output)
131
+ else:
132
+ tl.store(dW + col_offsets, dW_row, mask=mask)
133
+ tl.store(dX + col_offsets, output, mask=mask)
134
+
135
+
136
+ # Wrapper for the Triton kernel for torch.compile - should be unnecessary from PyTorch 2.3?
137
+ torch.library.define("flasht5::rmsnorm_triton_fwd", "(Tensor X, Tensor W, float eps, int n_cols, int n_rows, int BLOCK_SIZE, int num_warps) -> (Tensor, Tensor)")
138
+
139
+ @torch.library.impl("flasht5::rmsnorm_triton_fwd", "default")
140
+ def rmsnorm_triton_fwd(X, W, eps, n_cols, n_rows, BLOCK_SIZE, num_warps):
141
+ Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device="cuda")
142
+ r = torch.empty(n_rows, dtype=torch.float32, device="cuda")
143
+
144
+ _rms_layernorm_forward[(n_rows,)](
145
+ Y, Y.stride(0),
146
+ X, X.stride(0),
147
+ W, W.stride(0),
148
+ r, r.stride(0),
149
+ n_cols, eps,
150
+ BLOCK_SIZE=BLOCK_SIZE,
151
+ IS_EVEN_X=((n_cols % BLOCK_SIZE) == 0),
152
+ num_warps=num_warps
153
+ )
154
+
155
+ return Y, r
156
+
157
+
158
+ @torch.library.impl_abstract("flasht5::rmsnorm_triton_fwd", rmsnorm_triton_fwd)
159
+ def rmsnorm_triton_fwd_abstract(X, W, eps, n_cols, n_rows, BLOCK_SIZE, num_warps):
160
+ Y = X.new_empty((n_rows, n_cols))
161
+ r = X.new_empty((n_rows))
162
+ return Y, r
163
+
164
+ torch.library.define("flasht5::rmsnorm_triton_bwd", "(Tensor dY, Tensor r, Tensor X, Tensor W, float eps, int n_cols, int n_rows, int BLOCK_SIZE, int num_warps) -> (Tensor, Tensor)")
165
+
166
+ @torch.library.impl("flasht5::rmsnorm_triton_bwd", "default")
167
+ def rmsnorm_triton_bwd(dY, r, X, W, eps, n_cols, n_rows, BLOCK_SIZE, num_warps):
168
+
169
+ dX = torch.empty_like(dY)
170
+ dW = torch.empty_like(dY)
171
+
172
+ _rms_layernorm_backward[(n_rows,)](
173
+ dY, dY.stride(0),
174
+ X, X.stride(0),
175
+ W, 1,
176
+ r, 1,
177
+ dW, dW.stride(0),
178
+ dX, dX.stride(0),
179
+ n_cols, eps,
180
+ BLOCK_SIZE=BLOCK_SIZE,
181
+ IS_EVEN_X=((n_cols % BLOCK_SIZE) == 0),
182
+ num_warps=num_warps,
183
+ )
184
+
185
+ return dX, dW
186
+
187
+
188
+ @torch.library.impl_abstract("flasht5::rmsnorm_triton_bwd", rmsnorm_triton_bwd)
189
+ def rmsnorm_triton_bwd_abstract(dY, r, X, W, eps, n_cols, n_rows, BLOCK_SIZE, num_warps):
190
+ return torch.empty_like(dY), torch.empty_like(dY)
191
+
192
+
193
+ class Fast_RMS_Layernorm(torch.autograd.Function):
194
+ @staticmethod
195
+ def forward(ctx, X, W, eps):
196
+ shape = X.shape
197
+ dim = shape[-1]
198
+ X = X.view(-1, dim)
199
+ n_rows, n_cols = X.shape
200
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
201
+
202
+ Y, r = torch.ops.flasht5.rmsnorm_triton_fwd(X, W, eps, n_cols, n_rows, BLOCK_SIZE, num_warps)
203
+
204
+ ctx.eps = eps
205
+ ctx.BLOCK_SIZE = BLOCK_SIZE
206
+ ctx.num_warps = num_warps
207
+ ctx.save_for_backward(X, W, r)
208
+ return Y.view(*shape)
209
+
210
+ @staticmethod
211
+ def backward(ctx, dY):
212
+ shape = dY.shape
213
+ dim = shape[-1]
214
+ dY = dY.view(-1, dim)
215
+ X, W, r = ctx.saved_tensors
216
+ n_rows, n_cols = dY.shape
217
+ # the custom op allocates and returns the gradients, in (dX, dW) order
+ dX, dW = torch.ops.flasht5.rmsnorm_triton_bwd(dY, r, X, W, ctx.eps, n_cols, n_rows, ctx.BLOCK_SIZE, ctx.num_warps)
221
+
222
+ dX = dX.view(*shape)
223
+ return dX, dW.sum(0), None
224
+
225
+ def fast_rms_layernorm(X, W, eps):
226
+ out = Fast_RMS_Layernorm.apply(X, W, eps)
227
+ return out
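A hedged end-to-end check (requires a CUDA device with Triton) comparing the fused kernel against the eager RMS norm used elsewhere in this repository:

```python
import torch

if torch.cuda.is_available():
    x = torch.randn(4, 128, 512, device="cuda", dtype=torch.float16)
    w = torch.ones(512, device="cuda", dtype=torch.float16)

    y_fused = fast_rms_layernorm(x, w, 1e-6)

    variance = x.float().pow(2).mean(-1, keepdim=True)
    y_ref = (x * torch.rsqrt(variance + 1e-6)).to(w.dtype) * w

    assert torch.allclose(y_fused, y_ref, atol=1e-2, rtol=1e-2)
```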