import math
from typing import Optional, Tuple

import torch
from torch import nn
import torch.utils.checkpoint
import torch.nn.functional as F

from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    rotate_half,
    apply_rotary_pos_emb,
    repeat_kv,
)
import types

import transformers
from einops import rearrange
from flash_attn import __version__ as flash_attn_version
from flash_attn.bert_padding import pad_input, unpad_input
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func

__all__ = ["enable_llama_pos_shift_attention"]


def apply_rotary_pos_emb_single(x, cos, sin, position_ids):
    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    x_embed = (x * cos) + (rotate_half(x) * sin)
    return x_embed
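

# Note: unlike `apply_rotary_pos_emb`, which rotates queries and keys with the
# same position_ids, this helper rotates a single [bsz, num_heads, seq_len, head_dim]
# tensor. The forward passes below rely on this to give queries and cached keys
# *different* position ids, as described by the `### Shift Pos` comments.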


def llama_pos_shift_attention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    bsz, q_len, _ = hidden_states.size()

    if self.config.pretraining_tp > 1:
        key_value_slicing = (
            self.num_key_value_heads * self.head_dim
        ) // self.config.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

        query_states = [
            F.linear(hidden_states, query_slices[i])
            for i in range(self.config.pretraining_tp)
        ]
        query_states = torch.cat(query_states, dim=-1)

        key_states = [
            F.linear(hidden_states, key_slices[i])
            for i in range(self.config.pretraining_tp)
        ]
        key_states = torch.cat(key_states, dim=-1)

        value_states = [
            F.linear(hidden_states, value_slices[i])
            for i in range(self.config.pretraining_tp)
        ]
        value_states = torch.cat(value_states, dim=-1)
    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    ### Shift Pos: query pos is min(cache_size, idx)
    # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    query_states = apply_rotary_pos_emb_single(query_states, cos, sin, position_ids)
    ###

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    ### Shift Pos: key pos is the pos in cache
    key_position_ids = torch.arange(kv_seq_len, device=position_ids.device).unsqueeze(0)
    key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
    ###

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
        self.head_dim
    )

    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.size()}"
        )

    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )
        attn_weights = attn_weights + attention_mask

    # Softmax in fp16 to save memory (the stock HF implementation upcasts to fp32 here).
    attn_weights = nn.functional.softmax(
        attn_weights, dim=-1, dtype=torch.float16  # original: torch.float32
    ).to(query_states.dtype)
    attn_output = torch.matmul(attn_weights, value_states)

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )

    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    if self.config.pretraining_tp > 1:
        attn_output = attn_output.split(
            self.hidden_size // self.config.pretraining_tp, dim=2
        )
        o_proj_slices = self.o_proj.weight.split(
            self.hidden_size // self.config.pretraining_tp, dim=1
        )
        attn_output = sum(
            [
                F.linear(attn_output[i], o_proj_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
        )
    else:
        attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


def llama_pos_shift_attention_forward_flashattn(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    ### Shift Pos: query pos is min(cache_size, idx)
    # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    query_states = apply_rotary_pos_emb_single(query_states, cos, sin, position_ids)
    ###

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    ### Shift Pos: key pos is the pos in cache
    key_position_ids = torch.arange(kv_seq_len, device=position_ids.device).unsqueeze(0)
    key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
    ###

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if past_key_value is None:
        # FlashAttention path: pack q/k/v and run the varlen kernel.
        qkv = torch.stack(
            [query_states, key_states, value_states], dim=2
        )  # [bsz, nh, 3, q_len, hd]
        qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
        # All positions are attended; use hidden_states.device so this also
        # works when attention_mask is None.
        key_padding_mask = torch.full(
            (bsz, q_len), True, dtype=torch.bool, device=hidden_states.device
        )
        nheads = qkv.shape[-2]
        x = rearrange(qkv, "b s three h d -> b s (three h d)")
        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
        x_unpad = rearrange(
            x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
        )
        output_unpad = flash_attn_varlen_qkvpacked_func(
            x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(
            pad_input(
                rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
            ),
            "b s (h d) -> b s h d",
            h=nheads,
        )
        output = output.reshape(bsz, q_len, self.num_heads, self.head_dim)
        attn_output = self.o_proj(rearrange(output, "b s h d -> b s (h d)"))
        attn_weights = None
    else:
        # Eager attention path (taken whenever a KV cache tuple is being returned).
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
            self.head_dim
        )

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # Softmax in fp16 to save memory (the stock HF implementation upcasts to fp32 here).
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float16  # original: torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(
                self.hidden_size // self.config.pretraining_tp, dim=2
            )
            o_proj_slices = self.o_proj.weight.split(
                self.hidden_size // self.config.pretraining_tp, dim=1
            )
            attn_output = sum(
                [
                    F.linear(attn_output[i], o_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ]
            )
        else:
            attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


def enable_llama_pos_shift_attention(model, use_flash_attn=True):
    for name, module in reversed(model._modules.items()):
        if len(list(module.children())) > 0:
            # Recurse into child modules, propagating the flash-attn choice
            # (the original call dropped use_flash_attn, so nested modules
            # always fell back to the default).
            enable_llama_pos_shift_attention(module, use_flash_attn)

        if isinstance(module, LlamaAttention):
            model._modules[name].forward = types.MethodType(
                llama_pos_shift_attention_forward_flashattn
                if use_flash_attn
                else llama_pos_shift_attention_forward,
                model._modules[name],
            )
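

# ------------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# Assumptions: a transformers version whose LlamaAttention matches the forward
# signatures patched above (tuple-style past_key_value, rotary_emb(..., seq_len=...)),
# and the checkpoint name below is only a placeholder for any Llama model.
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )

    # Patch every LlamaAttention module in place. Pass use_flash_attn=False to
    # use the eager attention path when flash-attn is unavailable.
    enable_llama_pos_shift_attention(model, use_flash_attn=False)

    inputs = tokenizer("The quick brown fox", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(out[0], skip_special_tokens=True))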