Update bert_layers.py
Toggle Triton attention on/off via config
bert_layers.py (+2, −1)
```diff
@@ -126,6 +126,7 @@ class BertUnpadSelfAttention(nn.Module):
             warnings.warn(
                 'Unable to import Triton; defaulting MosaicBERT attention implementation to pytorch (this will reduce throughput when using this model).'
             )
+        self.flash_attn_triton_disabled = (flash_attn_qkvpacked_func is None) or (config.flash_attn_type != 'triton')
 
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
                 max_seqlen_in_batch: int, indices: torch.Tensor,
@@ -158,7 +159,7 @@ class BertUnpadSelfAttention(nn.Module):
                         'b s (t h d) -> b s t h d',
                         t=3,
                         h=self.num_attention_heads)
-        if self.p_dropout or flash_attn_qkvpacked_func is None:
+        if self.p_dropout or self.flash_attn_triton_disabled:
             # if we have nonzero attention dropout (e.g. during fine-tuning) or no Triton, compute attention in PyTorch
             q = qkv[:, :, 0, :, :].permute(0, 2, 1, 3)  # b h s d
             k = qkv[:, :, 1, :, :].permute(0, 2, 3, 1)  # b h d s
```
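For context, a hedged usage sketch of the new knob: the attribute name `flash_attn_type` comes from the diff above, but the config construction is hypothetical and assumes MosaicBERT uses a `transformers`-style `BertConfig` that gets threaded down to `BertUnpadSelfAttention`.

```python
# Hypothetical usage sketch (not part of this commit): force the PyTorch
# attention path by setting flash_attn_type to anything other than 'triton'.
from transformers import BertConfig

config = BertConfig()              # assumption: MosaicBERT accepts a BertConfig-style object
config.flash_attn_type = 'torch'   # any value != 'triton' sets flash_attn_triton_disabled = True

# With this config, BertUnpadSelfAttention.forward() always takes the PyTorch
# branch, even when the Triton flash-attention kernel imports successfully.
```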
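The two `permute` calls in the PyTorch branch lay out `q` as `(b, h, s, d)` and `k` as `(b, h, d, s)` so the score matrix falls out of a single matmul. Below is a minimal sketch of that scaled-dot-product fallback under those layouts; the function name is hypothetical, and the repo's actual branch also adds an attention bias before the softmax, omitted here.

```python
import math

import torch


def pytorch_attention_sketch(qkv: torch.Tensor, p_dropout: float = 0.0) -> torch.Tensor:
    """Minimal sketch of the PyTorch fallback; qkv has shape (b, s, 3, h, d)."""
    q = qkv[:, :, 0, :, :].permute(0, 2, 1, 3)  # b h s d
    k = qkv[:, :, 1, :, :].permute(0, 2, 3, 1)  # b h d s (pre-transposed for the matmul)
    v = qkv[:, :, 2, :, :].permute(0, 2, 1, 3)  # b h s d
    scores = torch.matmul(q, k) / math.sqrt(q.size(-1))               # b h s s
    probs = torch.nn.functional.dropout(scores.softmax(dim=-1), p=p_dropout)
    return torch.matmul(probs, v)                                     # b h s d
```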