myownskyW7 committed
Commit 8662ae5
Parent(s): 4ef8d94

Upload modeling_InternLM.py


Remove the dependency on flash-attention and rotary_emb
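The fused CUDA rotary kernels are replaced by a plain PyTorch implementation of the GPT-NeoX-style half rotation. As a rough sketch of what that rotation computes (function and variable names below are illustrative, not taken from the file):

import torch

def rotate_neox_style(q, cos, sin):
    # q: (batch, seqlen, nheads, rotary_dim); cos/sin: (seqlen, rotary_dim // 2)
    q1, q2 = q.chunk(2, dim=-1)                    # split into first / second half of the head dims
    cos, sin = cos[:, None, :], sin[:, None, :]    # broadcast over heads
    return torch.cat([q1 * cos - q2 * sin,         # rotated first half
                      q1 * sin + q2 * cos],        # rotated second half
                     dim=-1)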

Files changed (1)
  1. modeling_InternLM.py +43 -44
modeling_InternLM.py CHANGED
@@ -2,12 +2,10 @@ import math
 from typing import List, Union
 from typing import Optional, Tuple
 
-import rotary_emb
 import torch
 import torch.utils.checkpoint
 import torch.utils.checkpoint
 from einops import rearrange
-from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
@@ -23,51 +21,70 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLMXComposerConfig"
 
 
-class ApplyRotaryEmbQKV_(torch.autograd.Function):
-    """
-    ApplyRotaryEmbQKV_
-    """
+def rotary_embed(x1, x2, cos, sin, conj):
+    x1, x2 = x1.float(), x2.float()
+    if conj:
+        x1, x2 = x1 * cos + x2 * sin, x1 * sin + x2 * cos
+    else:
+        x1, x2 = x1 * cos - x2 * sin, x1 * sin + x2 * cos
+    return x1, x2
+
+
+class LegacyApplyRotaryEmbQKV_(torch.autograd.Function):
+
     @staticmethod
-    def forward(ctx, qkv, cos, sin, cos_k=None, sin_k=None):
+    def forward(ctx, qkv, cos, sin, cos_k=None, sin_k=None, interleaved=False):
         """
-        qkv: (total, 3, nheads, headdim)
+        qkv: (batch_size, seqlen, 3, nheads, headdim)
         cos, sin: (seqlen, rotary_dim / 2)
         cos_k, sin_k: (seqlen, rotary_dim / 2), optional
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
+            1st half and 2nd half (GPT-NeoX style).
        rotary_dim must be <= headdim
        Apply rotary embedding *inplace* to the first rotary_dim of q and k.
        """
-        _, three, _, headdim = qkv.shape
+        batch, seqlen, three, nheads, headdim = qkv.shape
         assert three == 3
         rotary_seqlen, rotary_dim = cos.shape
         rotary_dim *= 2
         assert rotary_dim <= headdim
+        assert seqlen <= rotary_seqlen
         cos_k = cos if cos_k is None else cos_k
         sin_k = sin if sin_k is None else sin_k
-        assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen,
-                                                           rotary_dim // 2)
-        q1, q2 = qkv[:, 0, :, :rotary_dim].chunk(2, dim=-1)
-        rotary_emb.apply_rotary(q1, q2, rearrange(cos, "s d -> s 1 d"),
-                                rearrange(sin, "s d -> s 1 d"), q1, q2, False)
-        k1, k2 = qkv[:, 1, :, :rotary_dim].chunk(2, dim=-1)
-        rotary_emb.apply_rotary(k1, k2, rearrange(cos_k, "s d -> s 1 d"),
-                                rearrange(sin_k, "s d -> s 1 d"), k1, k2,
-                                False)
+        assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
+        q_ro = qkv[:, :, 0, :, :rotary_dim]
+        q1, q2 = q_ro.chunk(2, dim=-1) if not interleaved else (q_ro[..., ::2], q_ro[..., 1::2])
+        # rotary_emb.apply_rotary(q1, q2, rearrange(cos[:seqlen], 's d -> s 1 d'),
+        #                         rearrange(sin[:seqlen], 's d -> s 1 d'), q1, q2, False)
+        q1, q2 = rotary_embed(q1, q2, rearrange(cos[:seqlen], 's d -> s 1 d'), rearrange(sin[:seqlen], 's d -> s 1 d'), False)
+        qkv[:, :, 0, :, :rotary_dim] = torch.cat([q1, q2], dim=-1)
+        k_ro = qkv[:, :, 1, :, :rotary_dim]
+        k1, k2 = k_ro.chunk(2, dim=-1) if not interleaved else (k_ro[..., ::2], k_ro[..., 1::2])
+        # rotary_emb.apply_rotary(k1, k2, rearrange(cos_k[:seqlen], 's d -> s 1 d'),
+        #                         rearrange(sin_k[:seqlen], 's d -> s 1 d'), k1, k2, False)
+        k1, k2 = rotary_embed(k1, k2, rearrange(cos_k[:seqlen], 's d -> s 1 d'), rearrange(sin_k[:seqlen], 's d -> s 1 d'), False)
+        qkv[:, :, 1, :, :rotary_dim] = torch.cat([k1, k2], dim=-1)
         ctx.save_for_backward(cos, sin, cos_k, sin_k)
+        ctx.interleaved = interleaved
         return qkv
 
     @staticmethod
     def backward(ctx, dqkv):
         cos, sin, cos_k, sin_k = ctx.saved_tensors
+        _, seqlen, _, _, headdim = dqkv.shape
         rotary_dim = cos.shape[-1]
         rotary_dim *= 2
-        dq1, dq2 = dqkv[:, 0, :, :rotary_dim].chunk(2, dim=-1)
-        rotary_emb.apply_rotary(dq1, dq2, rearrange(cos, "s d -> s 1 d"),
-                                rearrange(sin, "s d -> s 1 d"), dq1, dq2, True)
-        dk1, dk2 = dqkv[:, 1, :, :rotary_dim].chunk(2, dim=-1)
-        rotary_emb.apply_rotary(dk1, dk2, rearrange(cos_k, "s d -> s 1 d"),
-                                rearrange(sin_k, "s d -> s 1 d"), dk1, dk2,
-                                True)
-        return dqkv, None, None, None, None
+        dq_ro = dqkv[:, :, 0, :, :rotary_dim]
+        dq1, dq2 = (dq_ro.chunk(2, dim=-1) if not ctx.interleaved
+                    else (dq_ro[..., ::2], dq_ro[..., 1::2]))
+        rotary_emb.apply_rotary(dq1, dq2, rearrange(cos[:seqlen], 's d -> s 1 d'),
+                                rearrange(sin[:seqlen], 's d -> s 1 d'), dq1, dq2, True)
+        dk_ro = dqkv[:, :, 1, :, :rotary_dim]
+        dk1, dk2 = (dk_ro.chunk(2, dim=-1) if not ctx.interleaved
+                    else (dk_ro[..., ::2], dk_ro[..., 1::2]))
+        rotary_emb.apply_rotary(dk1, dk2, rearrange(cos_k[:seqlen], 's d -> s 1 d'),
+                                rearrange(sin_k[:seqlen], 's d -> s 1 d'), dk1, dk2, True)
+        return dqkv, None, None, None, None, None
 
 
 class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
@@ -120,23 +137,6 @@ class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
         self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
         self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
 
-    def forward(self,
-                qkv: torch.Tensor,
-                indexes=0) -> Tuple[torch.Tensor, torch.Tensor]:
-        self._update_cos_sin_cache(qkv, indexes)
-        if self.scale is None:
-            return apply_rotary_emb_qkv_(qkv, self._cos_cached[indexes],
-                                         self._sin_cached[indexes]).to(
-                                             qkv.dtype)
-        else:
-            return apply_rotary_emb_qkv_(
-                qkv,
-                self._cos_cached[indexes],
-                self._sin_cached[indexes],
-                self._cos_k_cached[indexes],
-                self._sin_k_cached[indexes],
-            ).to(qkv.dtype)
-
     def eval_forward(self, qkv, seqlen_offset=0):
         """
         seqlen_offset: can be used in generation where the qkv being passed in is only the last
@@ -157,7 +157,6 @@ class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
         )
 
 
-apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
 legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
 
 
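Note: the rewritten backward above still calls rotary_emb.apply_rotary even though the import was removed, so only the forward / inference path is free of the extension; autograd through this op would raise a NameError. For reference, a hedged sketch of what a pure-PyTorch gradient for the half rotation could look like (illustrative only, not part of commit 8662ae5; the transposed rotation negates the sin term on the second half, whereas the committed conj=True branch of rotary_embed appears to keep it positive):

import torch
from einops import rearrange

def rotary_grad_half(dq_ro, cos, sin, seqlen):
    # Hypothetical gradient of the GPT-NeoX-style half rotation.
    # dq_ro: (batch, seqlen, nheads, rotary_dim); cos/sin: (rotary_seqlen, rotary_dim // 2)
    dq1, dq2 = dq_ro.chunk(2, dim=-1)
    cos_b = rearrange(cos[:seqlen], "s d -> s 1 d")
    sin_b = rearrange(sin[:seqlen], "s d -> s 1 d")
    # transposed (inverse) rotation: note the negated sin term on the second half
    return torch.cat([dq1 * cos_b + dq2 * sin_b,
                      -dq1 * sin_b + dq2 * cos_b], dim=-1)

Inference-only use is unaffected: legacy_apply_rotary_embed_qkv(qkv, cos, sin) now runs without flash-attn or the rotary_emb CUDA extension installed.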