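Fix CPU inference: import the flash-attn Triton rotary kernel lazily and pass use_flash_attn through to the rotary embedding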

Browse files

Signed-off-by: Meow <ongjackm@gmail.com>

Files changed (3)

mha.py +1 -0
modeling_xlm_roberta.py +1 -3
rotary.py +22 -11

mha.py CHANGED Viewed

@@ -463,6 +463,7 @@ class MHA(nn.Module):
                 scale_base=rotary_emb_scale_base,
                 interleaved=rotary_emb_interleaved,
                 device=device,
             )
         if fused_bias_fc and FusedDense is None:

                 scale_base=rotary_emb_scale_base,
                 interleaved=rotary_emb_interleaved,
                 device=device,
+                use_flash_attn=use_flash_attn,
             )
         if fused_bias_fc and FusedDense is None:

modeling_xlm_roberta.py CHANGED Viewed

@@ -63,9 +63,7 @@ logger = logging.getLogger(__name__)
 def get_use_flash_attn(config: XLMRobertaFlashConfig):
-    if not getattr(config, "use_flash_attn", False):
-        return False
-    if not torch.cuda.is_available():
         return False
     if importlib.util.find_spec("flash_attn") is None:
         logger.warning(

 def get_use_flash_attn(config: XLMRobertaFlashConfig):
+    if not getattr(config, "use_flash_attn", False) or not torch.cuda.is_available():
         return False
     if importlib.util.find_spec("flash_attn") is None:
         logger.warning(

rotary.py CHANGED Viewed

@@ -4,20 +4,11 @@
 # Copyright (c) 2023, Tri Dao.
-import math
 from typing import Optional, Tuple, Union
 import torch
 from einops import rearrange, repeat
-if torch.cuda.is_available():
-    try:
-        from flash_attn.ops.triton.rotary import apply_rotary
-    except ImportError:
-        def apply_rotary(*args, **kwargs):
-            raise RuntimeError("RoPE requires flash-attention to be installed")
 def rotate_half(x, interleaved=False):
     if not interleaved:
@@ -69,6 +60,8 @@ class ApplyRotaryEmb(torch.autograd.Function):
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
     ):
         out = apply_rotary(
             x,
             cos,
@@ -95,6 +88,8 @@ class ApplyRotaryEmb(torch.autograd.Function):
     @staticmethod
     def backward(ctx, do):
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
@@ -169,12 +164,15 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
         seqlen_offsets: Union[int, torch.Tensor] = 0,
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
     ):
         # batch, seqlen, three, nheads, headdim = qkv.shape
         assert qkv.shape[-3] == 3
         if cos_k is None and sin_k is None and qkv.is_contiguous():
-            if torch.cuda.is_available():
                 # Call 1 kernel instead of 2 kernels
                 # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
                 # dimensions, we get the same tensor
@@ -205,6 +203,8 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
                 )
                 qkv = torch.stack((q_rot, k_rot, qkv[:, :, 2]), dim=2)
         else:
             cos_k = cos if cos_k is None else cos_k
             sin_k = sin if sin_k is None else sin_k
             q, k = qkv[..., 0, :, :], qkv[..., 1, :, :]
@@ -241,6 +241,8 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dqkv):
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cos_k, sin_k, cu_seqlens, seqlen_offsets = ctx.saved_tensors
@@ -301,6 +303,7 @@ def apply_rotary_emb_qkv_(
     seqlen_offsets: Union[int, torch.Tensor] = 0,
     cu_seqlens: Optional[torch.Tensor] = None,
     max_seqlen: Optional[int] = None,
 ):
     """
     Arguments:
@@ -321,7 +324,7 @@ def apply_rotary_emb_qkv_(
     Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
     """
     return ApplyRotaryEmbQKV_.apply(
-        qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets, cu_seqlens, max_seqlen
     )
@@ -337,6 +340,8 @@ class ApplyRotaryEmbKV_(torch.autograd.Function):
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
     ):
         # batch, seqlen, two, nheads, headdim = kv.shape
         assert kv.shape[-3] == 2
         k = kv[..., 0, :, :]
@@ -364,6 +369,8 @@ class ApplyRotaryEmbKV_(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dkv):
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
@@ -443,6 +450,7 @@ class RotaryEmbedding(torch.nn.Module):
         scale_base=None,
         pos_idx_in_fp32=True,
         device=None,
     ):
         """
         interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
@@ -462,6 +470,7 @@ class RotaryEmbedding(torch.nn.Module):
         self.dim = dim
         self._base = float(base)
         self.pos_idx_in_fp32 = pos_idx_in_fp32
         # Generate and save the inverse frequency buffer (non trainable)
         inv_freq = self._compute_inv_freq(device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
@@ -588,6 +597,7 @@ class RotaryEmbedding(torch.nn.Module):
                     seqlen_offsets=seqlen_offset,
                     cu_seqlens=cu_seqlens,
                     max_seqlen=max_seqlen,
                 )
             else:
                 return apply_rotary_emb_qkv_(
@@ -600,6 +610,7 @@ class RotaryEmbedding(torch.nn.Module):
                     seqlen_offsets=seqlen_offset,
                     cu_seqlens=cu_seqlens,
                     max_seqlen=max_seqlen,
                 )
         else:
             q = qkv

 # Copyright (c) 2023, Tri Dao.
 from typing import Optional, Tuple, Union
 import torch
 from einops import rearrange, repeat
 def rotate_half(x, interleaved=False):
     if not interleaved:
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
     ):
+        from flash_attn.ops.triton.rotary import apply_rotary
         out = apply_rotary(
             x,
             cos,
     @staticmethod
     def backward(ctx, do):
+        from flash_attn.ops.triton.rotary import apply_rotary
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
         seqlen_offsets: Union[int, torch.Tensor] = 0,
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
+        use_flash_attn: bool = True,
     ):
         # batch, seqlen, three, nheads, headdim = qkv.shape
         assert qkv.shape[-3] == 3
         if cos_k is None and sin_k is None and qkv.is_contiguous():
+            if use_flash_attn:
+                from flash_attn.ops.triton.rotary import apply_rotary
                 # Call 1 kernel instead of 2 kernels
                 # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
                 # dimensions, we get the same tensor
                 )
                 qkv = torch.stack((q_rot, k_rot, qkv[:, :, 2]), dim=2)
         else:
+            from flash_attn.ops.triton.rotary import apply_rotary
             cos_k = cos if cos_k is None else cos_k
             sin_k = sin if sin_k is None else sin_k
             q, k = qkv[..., 0, :, :], qkv[..., 1, :, :]
     @staticmethod
     def backward(ctx, dqkv):
+        from flash_attn.ops.triton.rotary import apply_rotary
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cos_k, sin_k, cu_seqlens, seqlen_offsets = ctx.saved_tensors
     seqlen_offsets: Union[int, torch.Tensor] = 0,
     cu_seqlens: Optional[torch.Tensor] = None,
     max_seqlen: Optional[int] = None,
+    use_flash_attn=True,
 ):
     """
     Arguments:
     Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
     """
     return ApplyRotaryEmbQKV_.apply(
+        qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets, cu_seqlens, max_seqlen, use_flash_attn,
     )
         cu_seqlens: Optional[torch.Tensor] = None,
         max_seqlen: Optional[int] = None,
     ):
+        from flash_attn.ops.triton.rotary import apply_rotary
         # batch, seqlen, two, nheads, headdim = kv.shape
         assert kv.shape[-3] == 2
         k = kv[..., 0, :, :]
     @staticmethod
     def backward(ctx, dkv):
+        from flash_attn.ops.triton.rotary import apply_rotary
         seqlen_offsets = ctx.seqlen_offsets
         if seqlen_offsets is None:
             cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
         scale_base=None,
         pos_idx_in_fp32=True,
         device=None,
+        use_flash_attn=True,
     ):
         """
         interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
         self.dim = dim
         self._base = float(base)
         self.pos_idx_in_fp32 = pos_idx_in_fp32
+        self.use_flash_attn = use_flash_attn
         # Generate and save the inverse frequency buffer (non trainable)
         inv_freq = self._compute_inv_freq(device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
                     seqlen_offsets=seqlen_offset,
                     cu_seqlens=cu_seqlens,
                     max_seqlen=max_seqlen,
+                    use_flash_attn=self.use_flash_attn,
                 )
             else:
                 return apply_rotary_emb_qkv_(
                     seqlen_offsets=seqlen_offset,
                     cu_seqlens=cu_seqlens,
                     max_seqlen=max_seqlen,
+                    use_flash_attn=self.use_flash_attn,
                 )
         else:
             q = qkv