import math
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union

import transformers.models
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2RotaryEmbedding,
    Qwen2ForCausalLM,
    Qwen2MLP,
    Qwen2RMSNorm,
    apply_rotary_pos_emb,
    repeat_kv,
    _prepare_4d_causal_attention_mask_with_cache_position,
)
from transformers.utils import logging
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.cache_utils import Cache, StaticCache, SlidingWindowCache
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config

from .utils import InferTaskCode

logger = logging.get_logger(__name__)

_GPU_QWEN_TORCH_COMPILE = True


# ===================================================================
# =============================Attention=============================
# ===================================================================
class GPUQwen2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self.rotary_emb = Qwen2RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    # Adapted from Qwen2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # NOTE: RoPE is computed for the full (max cache) sequence length so torch.compile always sees static shapes
        cos, sin = self.rotary_emb(value_states, seq_len=past_key_value.get_max_length())
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : past_key_value.get_max_length()]

        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
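
# Shape flow through GPUQwen2Attention.forward, for reference (B = batch, S = q_len,
# H = num_heads, H_kv = num_key_value_heads, D = head_dim, T = static cache max length):
#   q/k/v proj + view:  (B, S, hidden) -> (B, H, S, D) for queries, (B, H_kv, S, D) for keys/values
#   cache update:       with a StaticCache, update() returns the full preallocated (B, H_kv, T, D) buffers
#   repeat_kv:          (B, H_kv, T, D) -> (B, H, T, D), duplicating each KV head num_key_value_groups times
#   SDPA output:        (B, H, S, D) -> transpose/view -> (B, S, hidden) -> o_proj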


# ===================================================================
# =============================Layer=================================
# ===================================================================
class GPUQwen2DecoderLayer(nn.Module):
    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        self.self_attn = GPUQwen2Attention(config, layer_idx)
        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs
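
# GPUQwen2DecoderLayer keeps Qwen2's pre-norm residual layout:
#   hidden = hidden + SelfAttn(RMSNorm(hidden)); hidden = hidden + MLP(RMSNorm(hidden))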


# ===================================================================
# ========================Qwen2ForCausalLM===========================
# ===================================================================
class InferQwen2ForCausalLM(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.compile_forward = (
            torch.compile(self.simplify_forward, dynamic=False, fullgraph=True)
            if _GPU_QWEN_TORCH_COMPILE
            else self.simplify_forward
        )
        self.text_phase = True

    # NOTE: Rewrites the original Qwen2ForCausalLM.forward. Compiling the original function directly
    # (e.g. with torchair) fails when it returns a CausalLMOutputWithPast, so this variant returns the
    # raw decoder outputs and the uncompiled wrapper builds the output dataclass.
    def simplify_forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs
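
    # `self.compile_forward` wraps `simplify_forward` with dynamic=False and fullgraph=True, so it is
    # only reached for decode steps, where every tensor shape is pinned by the static KV cache; the
    # variable-length prefill (and any call with do_compile=False) goes through the eager version.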

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        do_compile: bool = True,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        if past_key_values is not None:
            past_key_values.training = False

        if input_ids is not None:
            if self.text_phase:
                inputs_embeds = self.model.embed_tokens(input_ids)
            else:
                inputs_embeds = self.speech_token_emded(input_ids)
            # 151645 is Qwen2's `<|im_end|>` token id: once it appears, switch to the speech phase
            if torch.isin(input_ids, 151645).any():
                self.text_phase = False
            input_ids = None

        if (inputs_embeds is not None and cache_position[0] == 0) or not do_compile:
            # prefill branch
            outputs = self.simplify_forward(
                input_ids,
                attention_mask,
                position_ids,
                past_key_values,
                inputs_embeds,
                labels,
                use_cache,
                output_attentions,
                output_hidden_states,
                return_dict,
                cache_position,
            )
        else:
            # decoding
            outputs = self.compile_forward(
                input_ids,
                attention_mask,
                position_ids,
                past_key_values,
                inputs_embeds,
                labels,
                use_cache,
                output_attentions,
                output_hidden_states,
                return_dict,
                cache_position,
            )

        last_hidden_states = outputs.last_hidden_state
        if self.text_phase:
            logits = self.lm_head(last_hidden_states)
        else:
            logits = self.speech_head(last_hidden_states)
        logits = logits.float()

        return CausalLMOutputWithPast(
            loss=None,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        """
        Mainly adds static cache support (expands the attention mask to 4D during decoding for StaticCache).
        """
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead"`,
                # as otherwise the input `position_ids` would have various stride during the decoding.
                # Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case,
                # `position_ids` is already contiguous but with varying stride which retriggers a capture.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            # NOTE: same rationale as for `position_ids` above: clone for torch.compile and CUDA graphs
            input_ids = input_ids.clone(memory_format=torch.contiguous_format)
            model_inputs = {"input_ids": input_ids}

        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
            if inputs_embeds is not None and cache_position[0] == 0:
                # prefill phase, inputs_embeds has shape (B, S, H)
                batch_size, sequence_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
                device = inputs_embeds.device
            else:
                # decoding phase, input_ids has shape (B, S)
                batch_size, sequence_length = input_ids.shape
                device = input_ids.device

            dtype = self.lm_head.weight.dtype
            min_dtype = torch.finfo(dtype).min

            if (inputs_embeds is not None and inputs_embeds.ndim == 2) or (input_ids is not None and input_ids.size(-1) == 1):
                # we only expand the attention mask in decoding mode
                attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
                    attention_mask,
                    sequence_length=sequence_length,
                    target_length=past_key_values.get_max_length(),
                    dtype=dtype,
                    device=device,
                    min_dtype=min_dtype,
                    cache_position=cache_position,
                    batch_size=batch_size,
                )
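                # After expansion the mask has shape (batch, 1, sequence_length, max_cache_len) with
                # `min_dtype` at masked positions, so every decode step hands the compiled graph a
                # fixed-size tensor instead of a mask that grows with the generated sequence.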

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                # fall back to the `forward` default if the caller did not pass `do_compile`
                "do_compile": kwargs.get("do_compile", True),
            }
        )
        return model_inputs


# ===================================================================
print("========================= DO Qwen2 PATCH ===========================")
# ===================================================================
transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel._supports_static_cache = True  # enable static cache
transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GPUQwen2DecoderLayer
transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = InferQwen2ForCausalLM
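

# A minimal usage sketch, not part of the patch itself. Assumptions: "Qwen/Qwen2-0.5B-Instruct" is a
# placeholder checkpoint; the speech branch (`speech_token_emded` / `speech_head`) is attached by the
# surrounding project, so a plain checkpoint only exercises the text phase; `do_compile=True` is
# forwarded because `prepare_inputs_for_generation` reads it from the generation kwargs, and
# `cache_implementation="static"` lets `generate` allocate the StaticCache the compiled decode path
# relies on. Run via `python -m <package>.<module>` since the file uses a relative import.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    model_name = "Qwen/Qwen2-0.5B-Instruct"  # hypothetical checkpoint, for illustration only
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = InferQwen2ForCausalLM.from_pretrained(model_name, torch_dtype="auto")
    model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

    inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=32,
        cache_implementation="static",  # preallocate a StaticCache with fixed shapes
        do_compile=True,                # consumed by prepare_inputs_for_generation / forward
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))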