InnoMegrez2-Preview / modeling_megrez_moe.py

update readme and model file

bc9be8f about 1 month ago

46.7 kB

	# coding=utf-8
	# Copyright 2025 Infini-AI and The HuggingFace Inc. team. All rights reserved.
	#
	# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
	# and OPT implementations in this library. It has been modified from its
	# original forms to accommodate minor architectural differences compared
	# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""PyTorch Megrez model."""
	import math
	import warnings
	from typing import List, Optional, Tuple, Union

	import numpy as np
	import torch
	import torch.distributed as dist
	import torch.nn.functional as F
	from torch import nn
	from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, DynamicCache
	from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
	from transformers.modeling_outputs import (BaseModelOutputWithPast, CausalLMOutputWithPast,
	SequenceClassifierOutputWithPast)
	from transformers.modeling_utils import PreTrainedModel
	from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
	from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
	from transformers.utils import (add_start_docstrings, add_start_docstrings_to_model_forward, logging,
	replace_return_docstrings)
	from transformers.utils.import_utils import is_torch_fx_available

	from .configuration_megrez_moe import MegrezMoeConfig

	# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
	# It means that the function will not be traced through and simply appear as a node in the graph.
	# NOTE(review): indentation was flattened in this copy of the file. Presumably only `import torch.fx`
	# belongs under the torch-version check, and the `torch.fx.wrap` rebinding under the FX-availability
	# check -- confirm against the upstream file.
	if is_torch_fx_available():
	if not is_torch_greater_or_equal_than_1_13:
	import torch.fx

	_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


	# Module-level logger obtained through the transformers logging helper.
	logger = logging.get_logger(__name__)

	# Configuration class name handed to `replace_return_docstrings` on the causal-LM forward.
	_CONFIG_FOR_DOC = "MegrezMoeConfig"


	class MegrezMoeRMSNorm(nn.Module):
	    """Root-mean-square normalisation with a learned per-feature scale.

	    MegrezMoeRMSNorm is equivalent to T5LayerNorm: the input is divided by the
	    root of its mean square over the last dimension (no mean subtraction, no bias).
	    """

	    def __init__(self, hidden_size, eps=1e-6):
	        super().__init__()
	        # Scale starts at one so the freshly built layer is a pure normalisation.
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.variance_epsilon = eps

	    def forward(self, hidden_states):
	        """Normalise `hidden_states` over its last dimension and apply the learned scale."""
	        original_dtype = hidden_states.dtype
	        # The statistics are computed in float32 regardless of the input precision.
	        upcast = hidden_states.to(torch.float32)
	        mean_square = upcast.pow(2).mean(-1, keepdim=True)
	        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
	        normalised = (upcast * inverse_rms).to(original_dtype)
	        return self.weight * normalised


	# Register the custom norm class in transformers' ALL_LAYERNORM_LAYERS list.
	# NOTE(review): presumably so library utilities that special-case layer norms also cover this class -- confirm.
	ALL_LAYERNORM_LAYERS.append(MegrezMoeRMSNorm)


	class MegrezMoeMLP(nn.Module):
	    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

	    Args:
	        config: model configuration; supplies `hidden_act` and the default sizes.
	        hidden_size: input/output width, defaults to `config.hidden_size`.
	        intermediate_size: inner width, defaults to `config.intermediate_size`.
	    """

	    def __init__(self, config, hidden_size=None, intermediate_size=None):
	        super().__init__()
	        self.config = config

	        # Explicit sizes win; the configuration is only consulted for missing ones.
	        if hidden_size is None:
	            hidden_size = config.hidden_size
	        if intermediate_size is None:
	            intermediate_size = config.intermediate_size
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size

	        # All three projections are bias-free.
	        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
	        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
	        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
	        self.act_fn = ACT2FN[config.hidden_act]

	    def forward(self, x):
	        """Apply the gated projection to `x` and map it back to the hidden width."""
	        gate = self.act_fn(self.gate_proj(x))
	        gated = gate * self.up_proj(x)
	        return self.down_proj(gated)


	class MoEGate(nn.Module):
	    """Router that assigns each token to `top_k` routed experts.

	    The gate scores every token against a learned `(n_routed_experts, hidden_size)`
	    weight matrix, turns the logits into probabilities and picks the experts with
	    the highest probability, either globally (`"greedy"`) or restricted to the
	    best `topk_group` expert groups (`"group_limited_greedy"`).

	    Configuration fields read: `num_experts_per_tok`, `n_routed_experts`,
	    `routed_scaling_factor`, `scoring_func`, `aux_loss_alpha`, `seq_aux`,
	    `topk_method`, `n_group`, `topk_group`, `norm_topk_prob`, `hidden_size`.
	    """

	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        self.top_k = config.num_experts_per_tok
	        self.n_routed_experts = config.n_routed_experts
	        self.routed_scaling_factor = config.routed_scaling_factor
	        self.scoring_func = config.scoring_func
	        self.alpha = config.aux_loss_alpha
	        self.seq_aux = config.seq_aux
	        self.topk_method = config.topk_method
	        self.n_group = config.n_group
	        self.topk_group = config.topk_group

	        # topk selection algorithm
	        self.norm_topk_prob = config.norm_topk_prob
	        self.gating_dim = config.hidden_size
	        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
	        self.reset_parameters()

	    def reset_parameters(self) -> None:
	        """(Re-)initialise the routing matrix with Kaiming-uniform values."""
	        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))

	    def forward(self, hidden_states):
	        """Route a batch of tokens.

	        Args:
	            hidden_states: tensor unpacked as `(bsz, seq_len, hidden)`.

	        Returns:
	            `(topk_idx, topk_weight, aux_loss)` where the first two have shape
	            `(bsz * seq_len, top_k)` and `aux_loss` is a scalar tensor, or `None`
	            outside training or when `aux_loss_alpha` is not positive.

	        Raises:
	            NotImplementedError: for an unsupported `scoring_func` or `topk_method`.
	        """
	        bsz, seq_len, h = hidden_states.shape

	        # Gating scores are computed in float32 regardless of the activation dtype.
	        hidden_states = hidden_states.view(-1, h)
	        logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
	        if self.scoring_func == "softmax":
	            scores = logits.softmax(dim=-1, dtype=torch.float32)
	        else:
	            raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")

	        topk_weight, topk_idx = self._select_topk(scores, bsz * seq_len)

	        # Either renormalise the selected weights to sum to one or apply the fixed scaling factor.
	        if self.top_k > 1 and self.norm_topk_prob:
	            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
	            topk_weight = topk_weight / denominator
	        else:
	            topk_weight = topk_weight * self.routed_scaling_factor

	        aux_loss = None
	        if self.training and self.alpha > 0.0:
	            aux_loss = self._aux_loss(scores, topk_idx, bsz, seq_len)
	        return topk_idx, topk_weight, aux_loss

	    def _select_topk(self, scores, num_tokens):
	        """Return `(topk_weight, topk_idx)` for `scores` of shape `(num_tokens, n_routed_experts)`."""
	        if self.topk_method == "greedy":
	            return torch.topk(scores, k=self.top_k, dim=-1, sorted=False)

	        if self.topk_method == "group_limited_greedy":
	            # A group is ranked by its best expert; only experts of the winning groups stay eligible.
	            group_scores = scores.view(num_tokens, self.n_group, -1).max(dim=-1).values  # [n, n_group]
	            group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]  # [n, top_k_group]
	            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
	            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
	            score_mask = (
	                group_mask.unsqueeze(-1)
	                .expand(num_tokens, self.n_group, self.n_routed_experts // self.n_group)
	                .reshape(num_tokens, -1)
	            )  # [n, e]
	            tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
	            return torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)

	        # Previously an unknown method fell through and crashed later with UnboundLocalError.
	        raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}")

	    def _aux_loss(self, scores, topk_idx, bsz, seq_len):
	        """Expert-level load-balancing loss, scaled by `aux_loss_alpha`."""
	        topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
	        if self.seq_aux:
	            # Per-sequence variant: expert usage counts and mean scores are taken per batch row.
	            scores_for_seq_aux = scores.view(bsz, seq_len, -1)
	            ce = torch.zeros(bsz, self.n_routed_experts, device=scores.device)
	            ce.scatter_add_(
	                1,
	                topk_idx_for_aux_loss,
	                torch.ones(bsz, seq_len * self.top_k, device=scores.device),
	            ).div_(seq_len * self.top_k / self.n_routed_experts)
	            return (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha

	        # Batch-level variant: fraction of assignments per expert times its mean score.
	        mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts)
	        ce = mask_ce.float().mean(0)
	        Pi = scores.mean(0)
	        fi = ce * self.n_routed_experts
	        return (Pi * fi).sum() * self.alpha


	class AddAuxiliaryLoss(torch.autograd.Function):
	    """
	    Identity on the activations that also feeds a gradient into an auxiliary loss.

	    The forward pass returns `x` untouched; the backward pass passes the incoming
	    gradient through to `x` and, if the auxiliary loss required grad, hands it a
	    gradient of one.
	    """

	    @staticmethod
	    def forward(ctx, x, loss):
	        # The auxiliary loss must be a single value.
	        assert loss.numel() == 1
	        ctx.required_aux_loss = loss.requires_grad
	        ctx.dtype = loss.dtype
	        return x

	    @staticmethod
	    def backward(ctx, grad_output):
	        if not ctx.required_aux_loss:
	            return grad_output, None
	        unit_grad = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
	        return grad_output, unit_grad


	class MegrezMoeMoE(nn.Module):
	    """
	    A mixed expert module containing shared experts.

	    Tokens are routed by a `MoEGate` to `num_experts_per_tok` routed experts; when
	    `config.n_shared_experts` is set, the output of one additional shared MLP is
	    added for every token.

	    Args:
	        config: model configuration.
	        layer_number: index of the decoder layer this module belongs to.
	        init_experts: when False no routed experts are created (`self.experts` is
	            None) and they have to be supplied through `set_experts` before use.
	    """

	    def __init__(self, config, layer_number, init_experts: bool = True):
	        super().__init__()
	        self.layer_number = layer_number
	        self.config = config
	        self.num_experts_per_tok = config.num_experts_per_tok

	        if hasattr(config, "ep_size") and config.ep_size > 1:
	            # Expert parallelism: this rank only materialises its own contiguous slice of experts.
	            assert config.ep_size == dist.get_world_size()
	            self.ep_size = config.ep_size
	            self.experts_per_rank = config.n_routed_experts // config.ep_size
	            self.ep_rank = dist.get_rank()
	            if init_experts:
	                self.experts = nn.ModuleList(
	                    [
	                        (
	                            MegrezMoeMLP(config, intermediate_size=config.moe_intermediate_size)
	                            if i >= self.ep_rank * self.experts_per_rank
	                            and i < (self.ep_rank + 1) * self.experts_per_rank
	                            else None
	                        )
	                        for i in range(config.n_routed_experts)
	                    ]
	                )
	            else:
	                self.experts = None
	        else:
	            self.ep_size = 1
	            self.experts_per_rank = config.n_routed_experts
	            self.ep_rank = 0
	            if init_experts:
	                self.experts = nn.ModuleList(
	                    [
	                        MegrezMoeMLP(config, intermediate_size=config.moe_intermediate_size)
	                        for i in range(config.n_routed_experts)
	                    ]
	                )
	            else:
	                self.experts = None

	        self.gate = MoEGate(config)
	        if config.n_shared_experts is not None:
	            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
	            self.shared_experts = MegrezMoeMLP(config=config, intermediate_size=intermediate_size)

	    def set_experts(self, experts):
	        """Attach (or, with None, detach) the routed experts used by this module."""
	        self.experts = experts

	    def forward(self, hidden_states, pre_gate_hidden_states=None):
	        """Apply the routed (and optional shared) experts.

	        Args:
	            hidden_states: activations fed to the experts; the output has the same shape.
	            pre_gate_hidden_states: optional tensor the gate routes on instead of
	                `hidden_states`.
	        """
	        identity = hidden_states
	        orig_shape = hidden_states.shape
	        gate_input = hidden_states if pre_gate_hidden_states is None else pre_gate_hidden_states
	        topk_idx, topk_weight, aux_loss = self.gate(gate_input)

	        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
	        flat_topk_idx = topk_idx.view(-1)
	        if self.training:
	            # One copy of every token per selected expert, in the same order as `flat_topk_idx`.
	            hidden_states = hidden_states.repeat_interleave(self.num_experts_per_tok, dim=0)
	            y = torch.empty_like(hidden_states)
	            for i, expert in enumerate(self.experts):
	                selected = flat_topk_idx == i
	                y[selected] = expert(hidden_states[selected])
	            # Weighted sum over the experts of each token.
	            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
	            y = y.to(hidden_states.dtype).view(*orig_shape)
	            # The gate returns no auxiliary loss when `aux_loss_alpha` is not positive.
	            if aux_loss is not None:
	                y = AddAuxiliaryLoss.apply(y, aux_loss)
	        else:
	            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
	        if self.config.n_shared_experts is not None:
	            shared_out = self.shared_experts(identity)
	            y = y + shared_out
	        return y

	    @torch.no_grad()
	    def moe_infer(self, x, topk_ids, topk_weight):
	        """Inference-time expert dispatch without gradient tracking.

	        Tokens are sorted by expert id so each expert runs once on a contiguous
	        slice; with `ep_size > 1` the slices are exchanged between ranks with
	        all-to-all before and after the expert computation.

	        Args:
	            x: flattened tokens, indexed along dim 0 by token.
	            topk_ids: `(num_tokens, top_k)` expert indices.
	            topk_weight: `(num_tokens, top_k)` routing weights.

	        Returns:
	            The weighted sum of expert outputs, one row per token.
	        """
	        # How many tokens each expert receives.
	        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
	        cnts.scatter_(1, topk_ids, 1)
	        tokens_per_expert = cnts.sum(dim=0)
	        # Order the (token, slot) pairs by expert id; integer division recovers the token row.
	        idxs = topk_ids.view(-1).argsort()
	        sorted_tokens = x[idxs // topk_ids.shape[1]]
	        sorted_tokens_shape = sorted_tokens.shape
	        if self.ep_size > 1:
	            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
	            tokens_per_expert_group = tokens_per_expert.new_empty(tokens_per_expert.shape[0])
	            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
	            output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum(1).cpu().numpy().tolist()
	            gathered_tokens = sorted_tokens.new_empty(
	                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
	            )
	            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
	            dist.all_to_all(
	                list(gathered_tokens.split(output_splits)),
	                list(sorted_tokens.split(input_split_sizes)),
	            )
	            tokens_per_expert_post_gather = tokens_per_expert_group.view(self.ep_size, self.experts_per_rank).sum(dim=0)
	            # Label every received row with its local expert id, then sort rows by that id.
	            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
	            s = 0
	            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
	                gatherd_idxs[s : s + k] = i % self.experts_per_rank
	                s += k
	            gatherd_idxs = gatherd_idxs.argsort()
	            sorted_tokens = gathered_tokens[gatherd_idxs]
	            tokens_per_expert = tokens_per_expert_post_gather
	        tokens_per_expert = tokens_per_expert.cpu().numpy()

	        outputs = []
	        start_idx = 0
	        for i, num_tokens in enumerate(tokens_per_expert):
	            end_idx = start_idx + num_tokens
	            if num_tokens == 0:
	                continue
	            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
	            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
	            expert_out = expert(tokens_for_this_expert)
	            outputs.append(expert_out)
	            start_idx = end_idx

	        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
	        if self.ep_size > 1:
	            # Undo the local sort and send every result back to the rank that owns the token.
	            new_x = torch.empty_like(outs)
	            new_x[gatherd_idxs] = outs
	            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
	            dist.all_to_all(
	                list(gathered_tokens.split(input_split_sizes)),
	                list(new_x.split(output_splits)),
	            )
	            outs = gathered_tokens

	        # Scatter back to the original (token, slot) order and combine with the routing weights.
	        new_x = torch.empty_like(outs)
	        new_x[idxs] = outs
	        final_out = (
	            new_x.view(*topk_ids.shape, -1)
	            .type(topk_weight.dtype)
	            .mul_(topk_weight.unsqueeze(dim=-1))
	            .sum(dim=1)
	            .type(new_x.dtype)
	        )
	        return final_out


	# Copied from transformers.models.llama.modeling_llama.repeat_kv
	def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
	    """
	    Repeat every key/value head `n_rep` times, matching torch.repeat_interleave(x, dim=1, repeats=n_rep).

	    Maps (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
	    With `n_rep == 1` the input tensor itself is returned.
	    """
	    bsz, kv_heads, seq_len, dim_per_head = hidden_states.shape
	    if n_rep == 1:
	        return hidden_states
	    # Add a repeat axis right after the head axis, broadcast along it, then fold it into the heads.
	    with_repeat_axis = hidden_states.unsqueeze(2)
	    broadcast = with_repeat_axis.expand(bsz, kv_heads, n_rep, seq_len, dim_per_head)
	    return broadcast.reshape(bsz, kv_heads * n_rep, seq_len, dim_per_head)


	class MegrezMoeDecoderLayer(nn.Module):
	    """One decoder layer: self-attention followed by a dense MLP or a mixture-of-experts block.

	    A layer uses the MoE block when `config.n_routed_experts` is set, its index is at
	    least `config.first_k_dense_replace` and the index is a multiple of
	    `config.moe_layer_freq`. Routed experts are only created in every
	    `config.experts_shared_frequency`-th layer counted from `first_k_dense_replace`;
	    the other MoE layers are built without experts.

	    With `config.pre_gate` enabled, the layer input and output carry two tensors
	    concatenated along dim 0: the previous layer's post-attention-norm states (used
	    for routing) followed by the regular hidden states.
	    """

	    def __init__(self, config: MegrezMoeConfig, layer_idx: int):
	        super().__init__()
	        self.config = config
	        self.layer_number = layer_idx

	        self.experts_shared = (
	            config.experts_shared_frequency is not None and layer_idx >= self.config.first_k_dense_replace
	        )

	        self.pre_gate = config.pre_gate

	        self.hidden_size = config.hidden_size

	        is_moe = (
	            config.n_routed_experts is not None
	            and layer_idx >= config.first_k_dense_replace
	            and layer_idx % config.moe_layer_freq == 0
	        )

	        # `experts_shared_frequency` may be None (see `experts_shared` above); without sharing
	        # every MoE layer owns its experts. The unguarded modulo used to raise TypeError here.
	        if config.experts_shared_frequency is None:
	            init_experts = True
	        else:
	            init_experts = (layer_idx - config.first_k_dense_replace) % config.experts_shared_frequency == 0
	        self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx)
	        self.mlp = MegrezMoeMoE(config, layer_idx, init_experts) if is_moe else MegrezMoeMLP(config)
	        self.input_layernorm = MegrezMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	        self.post_attention_layernorm = MegrezMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor]] = None,
	        output_attentions: Optional[bool] = False,
	        use_cache: Optional[bool] = False,
	        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
	        **kwargs,
	    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
	        """
	        Args:
	            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
	            attention_mask (`torch.FloatTensor`, optional):
	                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
	                query_sequence_length, key_sequence_length)` if default attention is used.
	            output_attentions (`bool`, optional):
	                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
	                returned tensors for more detail.
	            use_cache (`bool`, optional):
	                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
	                (see `past_key_values`).
	            past_key_value (`Tuple(torch.FloatTensor)`, optional): cached past key and value projection states

	        Returns:
	            A tuple `(hidden_states,)`, followed by the attention weights when `output_attentions` is set.
	        """

	        if self.pre_gate and self.layer_number >= self.config.first_k_dense_replace:
	            # The input stacks [routing states, hidden states] along dim 0; take them apart.
	            # NOTE(review): this assumes the previous layer concatenated its output (see the end of this
	            # method), which only lines up when `first_k_dense_replace` is 1 -- confirm for other values.
	            hidden_states = torch.split(hidden_states, hidden_states.shape[0] // 2, dim=0)
	            pre_gate_hidden_states = hidden_states[0]
	            hidden_states = hidden_states[1]
	        else:
	            pre_gate_hidden_states = None

	        if "padding_mask" in kwargs:
	            warnings.warn(
	                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
	            )

	        residual = hidden_states
	        hidden_states = self.input_layernorm(hidden_states)

	        # Self Attention
	        hidden_states, self_attn_weights = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_value=past_key_value,
	            output_attentions=output_attentions,
	            use_cache=use_cache,
	            position_embeddings=position_embeddings,
	            **kwargs,
	        )
	        hidden_states = residual + hidden_states

	        # Fully Connected
	        residual = hidden_states
	        hidden_states = self.post_attention_layernorm(hidden_states)
	        post_attention_layernorm_hidden_states = hidden_states
	        if isinstance(self.mlp, MegrezMoeMoE):
	            hidden_states = self.mlp(hidden_states, pre_gate_hidden_states=pre_gate_hidden_states)
	        else:
	            hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states
	        # The normalised states of this layer are what the next layer routes on.
	        pre_gate_hidden_states = post_attention_layernorm_hidden_states

	        if self.pre_gate and self.layer_number < self.config.num_hidden_layers - 1:
	            hidden_states = torch.cat([pre_gate_hidden_states, hidden_states], dim=0)

	        outputs = (hidden_states,)

	        if output_attentions:
	            outputs += (self_attn_weights,)

	        return outputs


	# Shared class-level docstring text; passed to `add_start_docstrings` on the model classes in this file.
	MegrezMoe_START_DOCSTRING = r"""
	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`MegrezMoeConfig`]):
	Model configuration class with all the parameters of the model. Initializing with a config file does not
	load the weights associated with the model, only the configuration. Check out the
	[`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""


	@add_start_docstrings(
	    "The bare MegrezMoe Model outputting raw hidden-states without any specific head on top.",
	    MegrezMoe_START_DOCSTRING,
	)
	class MegrezMoePreTrainedModel(PreTrainedModel):
	    """Common base of the MegrezMoe models: class-level library flags plus weight initialisation."""

	    config_class = MegrezMoeConfig
	    base_model_prefix = "model"
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["MegrezMoeDecoderLayer"]
	    _skip_keys_device_placement = "past_key_values"
	    _supports_flash_attn_2 = True
	    _supports_cache_class = True

	    def _init_weights(self, module):
	        """Initialise linear and embedding weights from N(0, `config.initializer_range`).

	        Linear biases and the embedding row at `padding_idx` are zeroed; every
	        other module type is left untouched.
	        """
	        init_std = self.config.initializer_range

	        if isinstance(module, nn.Linear):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            bias = module.bias
	            if bias is not None:
	                bias.data.zero_()
	            return

	        if isinstance(module, nn.Embedding):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            pad_row = module.padding_idx
	            if pad_row is not None:
	                module.weight.data[pad_row].zero_()


	# Shared description of the forward arguments; passed to `add_start_docstrings_to_model_forward`
	# on the `forward` methods in this file.
	MegrezMoe_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
	Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
	it.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)
	attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
	`past_key_values`).

	If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
	and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
	information on the default strategy.

	- 1 indicates the head is not masked,
	- 0 indicates the head is masked.
	position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
	config.n_positions - 1]`.

	[What are position IDs?](../glossary#position-ids)
	past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, optional):
	Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
	blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
	returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

	Two formats are allowed:
	- a [`~cache_utils.Cache`] instance;
	- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
	shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
	cache format.

	The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
	legacy cache format will be returned.

	If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
	have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
	of shape `(batch_size, sequence_length)`.
	inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
	is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
	model's internal embedding lookup matrix.
	use_cache (`bool`, optional):
	If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
	`past_key_values`).
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""


	@add_start_docstrings(
	    "The bare MegrezMoe Model outputting raw hidden-states without any specific head on top.",
	    MegrezMoe_START_DOCSTRING,
	)
	class MegrezMoeModel(MegrezMoePreTrainedModel):
	    """
	    Transformer decoder consisting of config.num_hidden_layers layers. Each layer is a [`MegrezMoeDecoderLayer`]

	    MoE layers that were built without routed experts borrow the `experts` of the layer
	    that owns them for the duration of their forward call.

	    Args:
	        config: MegrezMoeConfig
	    """

	    def __init__(self, config: MegrezMoeConfig):
	        super().__init__(config)
	        self.padding_idx = config.pad_token_id
	        self.vocab_size = config.vocab_size

	        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
	        self.rotary_emb = LlamaRotaryEmbedding(config=config)
	        self.layers = nn.ModuleList(
	            [MegrezMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
	        )
	        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
	        self.norm = MegrezMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	        self.gradient_checkpointing = False
	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        return self.embed_tokens

	    def set_input_embeddings(self, value):
	        self.embed_tokens = value

	    def _expert_owner_index(self, layer_idx: int) -> int:
	        """Return the index of the layer whose routed experts `layer_idx` uses.

	        Layers below `first_k_dense_replace`, and every layer when
	        `experts_shared_frequency` is None, use their own MLP.
	        """
	        first_moe = self.config.first_k_dense_replace
	        share_every = self.config.experts_shared_frequency
	        if share_every is None or layer_idx < first_moe:
	            return layer_idx
	        return (layer_idx - first_moe) // share_every * share_every + first_moe

	    def _run_layer(self, layer_idx, decoder_layer, *args, **kwargs):
	        """Call `decoder_layer`, lending it the shared experts for the duration of the call."""
	        owner_idx = self._expert_owner_index(layer_idx)
	        if owner_idx == layer_idx:
	            return decoder_layer(*args, **kwargs)
	        decoder_layer.mlp.set_experts(self.layers[owner_idx].mlp.experts)
	        try:
	            return decoder_layer(*args, **kwargs)
	        finally:
	            # Always detach again so the experts stay registered under their owning layer only.
	            decoder_layer.mlp.set_experts(None)

	    @add_start_docstrings_to_model_forward(MegrezMoe_INPUTS_DOCSTRING)
	    def forward(
	        self,
	        input_ids: torch.LongTensor = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        **flash_attn_kwargs,
	    ) -> Union[Tuple, BaseModelOutputWithPast]:
	        # `return_dict` is accepted only so that it does not end up in `flash_attn_kwargs` (and from
	        # there in every decoder layer); a `BaseModelOutputWithPast` is returned regardless of its value.
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        use_cache = use_cache if use_cache is not None else self.config.use_cache

	        # retrieve input_ids and inputs_embeds
	        if input_ids is not None and inputs_embeds is not None:
	            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
	        elif input_ids is not None:
	            batch_size, seq_length = input_ids.shape[:2]
	        elif inputs_embeds is not None:
	            batch_size, seq_length = inputs_embeds.shape[:2]
	        else:
	            raise ValueError("You have to specify either input_ids or inputs_embeds")

	        if self.gradient_checkpointing and self.training:
	            if use_cache:
	                logger.warning_once(
	                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
	                )
	                use_cache = False

	        past_key_values_length = 0
	        if use_cache:
	            # Legacy tuple caches (or None) are converted to a DynamicCache.
	            use_legacy_cache = not isinstance(past_key_values, Cache)
	            if use_legacy_cache:
	                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
	            past_key_values_length = past_key_values.get_usable_length(seq_length)

	        if position_ids is None:
	            device = input_ids.device if input_ids is not None else inputs_embeds.device
	            position_ids = torch.arange(
	                past_key_values_length,
	                seq_length + past_key_values_length,
	                dtype=torch.long,
	                device=device,
	            )
	            position_ids = position_ids.unsqueeze(0)

	        if inputs_embeds is None:
	            inputs_embeds = self.embed_tokens(input_ids)
	        if self._use_flash_attention_2:
	            # 2d mask is passed through the layers
	            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
	        else:
	            # 4d mask is passed through the layers
	            attention_mask = _prepare_4d_causal_attention_mask(
	                attention_mask,
	                (batch_size, seq_length),
	                inputs_embeds,
	                past_key_values_length,
	            )

	        # embed positions
	        hidden_states = inputs_embeds

	        # decoder layers
	        all_hidden_states = () if output_hidden_states else None
	        all_self_attns = () if output_attentions else None

	        position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)
	        for layer_idx, decoder_layer in enumerate(self.layers):
	            if output_hidden_states:
	                all_hidden_states += (hidden_states,)

	            if self.gradient_checkpointing and self.training:
	                # The experts are attached inside the checkpointed callable: checkpointing calls it again
	                # during backward, after the forward-time attachment has already been undone.
	                def _checkpointed_layer(*args, _idx=layer_idx, _layer=decoder_layer, **kwargs):
	                    return self._run_layer(_idx, _layer, *args, **kwargs)

	                layer_outputs = self._gradient_checkpointing_func(
	                    _checkpointed_layer,
	                    hidden_states,
	                    attention_mask,
	                    position_ids,
	                    past_key_values,
	                    output_attentions,
	                    use_cache,
	                    position_embeddings,
	                    **flash_attn_kwargs,
	                )
	            else:
	                layer_outputs = self._run_layer(
	                    layer_idx,
	                    decoder_layer,
	                    hidden_states,
	                    attention_mask=attention_mask,
	                    position_ids=position_ids,
	                    past_key_value=past_key_values,
	                    output_attentions=output_attentions,
	                    use_cache=use_cache,
	                    position_embeddings=position_embeddings,
	                    **flash_attn_kwargs,
	                )
	            hidden_states = layer_outputs[0]

	            if output_attentions:
	                all_self_attns += (layer_outputs[1],)

	        hidden_states = self.norm(hidden_states)
	        # add hidden states from the last decoder layer
	        if output_hidden_states:
	            all_hidden_states += (hidden_states,)

	        return BaseModelOutputWithPast(
	            last_hidden_state=hidden_states,
	            past_key_values=past_key_values,
	            hidden_states=all_hidden_states,
	            attentions=all_self_attns,
	        )


	class MegrezMoeForCausalLM(MegrezMoePreTrainedModel):
	_tied_weights_keys = ["lm_head.weight"]

	# Build the causal-LM wrapper: a MegrezMoeModel backbone plus a bias-free linear head that maps
	# hidden states of width config.hidden_size to config.vocab_size logits.
	def __init__(self, config):
	super().__init__(config)
	self.model = MegrezMoeModel(config)
	self.vocab_size = config.vocab_size
	self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	# Initialize weights and apply final processing
	self.post_init()

	# Return the token-embedding module of the wrapped backbone.
	def get_input_embeddings(self):
	return self.model.embed_tokens

	# Replace the token-embedding module of the wrapped backbone with `value`.
	def set_input_embeddings(self, value):
	self.model.embed_tokens = value

	# Return the output projection (`lm_head`) that produces the vocabulary logits.
	def get_output_embeddings(self):
	return self.lm_head

	# Replace the output projection (`lm_head`) with `new_embeddings`.
	def set_output_embeddings(self, new_embeddings):
	self.lm_head = new_embeddings

	# Replace the wrapped backbone (`self.model`) with `decoder`.
	def set_decoder(self, decoder):
	self.model = decoder

	# Return the wrapped backbone (`self.model`).
	def get_decoder(self):
	return self.model

	@add_start_docstrings_to_model_forward(MegrezMoe_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
	def forward(
	    self,
	    input_ids: torch.LongTensor = None,
	    attention_mask: Optional[torch.Tensor] = None,
	    position_ids: Optional[torch.LongTensor] = None,
	    past_key_values: Optional[List[torch.FloatTensor]] = None,
	    inputs_embeds: Optional[torch.FloatTensor] = None,
	    labels: Optional[torch.LongTensor] = None,
	    use_cache: Optional[bool] = None,
	    output_attentions: Optional[bool] = None,
	    output_hidden_states: Optional[bool] = None,
	    return_dict: Optional[bool] = None,
	) -> Union[Tuple, CausalLMOutputWithPast]:
	    r"""
	    Args:
	        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
	            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
	            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

	    Returns:

	    Example:

	    ```python
	    >>> from transformers import AutoTokenizer, MegrezMoeForCausalLM

	    >>> model = MegrezMoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
	    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

	    >>> prompt = "Hey, are you conscious? Can you talk to me?"
	    >>> inputs = tokenizer(prompt, return_tensors="pt")

	    >>> # Generate
	    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
	    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
	    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
	    ```"""
	    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	    output_hidden_states = (
	        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	    )
	    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
	    # `return_dict` is deliberately not forwarded: the backbone always returns a
	    # `BaseModelOutputWithPast`, and keyword arguments it does not name are passed on to every
	    # decoder layer (and to the gradient-checkpointing call).
	    outputs = self.model(
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        position_ids=position_ids,
	        past_key_values=past_key_values,
	        inputs_embeds=inputs_embeds,
	        use_cache=use_cache,
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	    )

	    hidden_states = outputs[0]
	    logits = self.lm_head(hidden_states)
	    # The logits are returned (and the loss computed) in float32.
	    logits = logits.float()

	    loss = None
	    if labels is not None:
	        # Shift so that tokens < n predict n
	        shift_logits = logits[..., :-1, :].contiguous()
	        shift_labels = labels[..., 1:].contiguous()
	        # Flatten the tokens
	        loss_fct = CrossEntropyLoss()
	        shift_logits = shift_logits.view(-1, self.config.vocab_size)
	        shift_labels = shift_labels.view(-1)
	        # Enable model parallelism
	        shift_labels = shift_labels.to(shift_logits.device)
	        loss = loss_fct(shift_logits, shift_labels)

	    if not return_dict:
	        output = (logits,) + outputs[1:]
	        return (loss,) + output if loss is not None else output

	    return CausalLMOutputWithPast(
	        loss=loss,
	        logits=logits,
	        past_key_values=outputs.past_key_values,
	        hidden_states=outputs.hidden_states,
	        attentions=outputs.attentions,
	    )

	def prepare_inputs_for_generation(
	    self,
	    input_ids,
	    past_key_values=None,
	    attention_mask=None,
	    inputs_embeds=None,
	    **kwargs,
	):
	    """
	    Assemble the keyword arguments for a single decoding step.

	    Tokens already covered by `past_key_values` are trimmed from `input_ids`, `position_ids` are derived from
	    `attention_mask` when the caller did not provide them, and `inputs_embeds` is forwarded only while nothing
	    has been cached yet.

	    Args:
	        input_ids: Token ids; sliced along dimension 1 (the sequence axis).
	        past_key_values: `None`, a `Cache` instance, or a legacy nested tuple whose first tensor carries the
	            cached length in dimension 2.
	        attention_mask: Optional mask sliced along dimension 1; entries equal to 0 are treated as masked
	            when position ids are built.
	        inputs_embeds: Optional embeddings used instead of `input_ids` on the first step.
	        **kwargs: Only `position_ids` and `use_cache` are read.

	    Returns:
	        dict: Either `input_ids` or `inputs_embeds`, together with `position_ids`, `past_key_values`,
	        `use_cache` and `attention_mask`.
	    """
	    # Number of tokens the cache has already consumed; stays 0 when there is no cache at all.
	    past_length = 0

	    if past_key_values is not None:
	        if isinstance(past_key_values, Cache):
	            cache_length = past_key_values.get_seq_length()
	            # `seen_tokens` is a deprecated Cache attribute that can be missing or None depending on the
	            # installed transformers release; fall back to the cached sequence length in that case.
	            seen_tokens = getattr(past_key_values, "seen_tokens", None)
	            past_length = cache_length if seen_tokens is None else seen_tokens
	            max_cache_length = past_key_values.get_max_cache_shape()
	            # Caches without a size limit may report None or a non-positive sentinel. Treat both as
	            # "unbounded": slicing the mask with `-max_cache_length` would otherwise drop valid columns.
	            if max_cache_length is not None and max_cache_length <= 0:
	                max_cache_length = None
	        else:
	            # Legacy tuple format: the cached length is dimension 2 of the first layer's first tensor.
	            cache_length = past_length = past_key_values[0][0].shape[2]
	            max_cache_length = None

	        # Keep only the unprocessed tokens:
	        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting
	        # where some of the inputs are exclusively passed as part of the cache (e.g. when passing
	        # inputs_embeds as input).
	        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
	            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
	        # 2 - If past_length is smaller than the length of input_ids, then input_ids holds all input tokens
	        # and the already-processed prefix can be discarded.
	        elif past_length < input_ids.shape[1]:
	            input_ids = input_ids[:, past_length:]
	        # 3 - Otherwise (past_length >= input_ids.shape[1]), assume input_ids only has unprocessed tokens.

	        # If we are about to go beyond the maximum cache length, crop the input attention mask.
	        if (
	            max_cache_length is not None
	            and attention_mask is not None
	            and cache_length + input_ids.shape[1] > max_cache_length
	        ):
	            attention_mask = attention_mask[:, -max_cache_length:]

	    position_ids = kwargs.get("position_ids", None)
	    if attention_mask is not None and position_ids is None:
	        # Create position_ids on the fly for batch generation: running count of attended tokens minus one,
	        # with masked positions overwritten by the placeholder value 1.
	        position_ids = attention_mask.long().cumsum(-1) - 1
	        position_ids.masked_fill_(attention_mask == 0, 1)
	        if past_key_values:
	            position_ids = position_ids[:, -input_ids.shape[1] :]

	    # `inputs_embeds` are only used in the first generation step. The decision is based on whether anything
	    # has been cached yet rather than on `past_key_values is None`, because a generation loop may hand over
	    # an empty Cache object (not None) on the very first step.
	    if inputs_embeds is not None and past_length == 0:
	        model_inputs = {"inputs_embeds": inputs_embeds}
	    else:
	        model_inputs = {"input_ids": input_ids}

	    model_inputs.update(
	        {
	            "position_ids": position_ids,
	            "past_key_values": past_key_values,
	            "use_cache": kwargs.get("use_cache"),
	            "attention_mask": attention_mask,
	        }
	    )
	    return model_inputs

	@staticmethod
	def _reorder_cache(past_key_values, beam_idx):
	    """
	    Re-index every cached tensor along dimension 0 according to `beam_idx`.

	    Args:
	        past_key_values: Iterable of per-layer iterables of tensors.
	        beam_idx: Index tensor passed to `index_select`; it is moved to each cached tensor's device first.

	    Returns:
	        tuple: One tuple of re-indexed tensors per layer, in the original layer order.
	    """

	    def pick_beams(state):
	        # The index tensor must live on the same device as the tensor it selects from.
	        return state.index_select(0, beam_idx.to(state.device))

	    return tuple(tuple(pick_beams(state) for state in layer_past) for layer_past in past_key_values)


	@add_start_docstrings(
	"""
	The MegrezMoe Model transformer with a sequence classification head on top (linear layer).

	[`MegrezMoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
	(e.g. GPT-2) do.

	Since it does classification on the last token, it requires to know the position of the last token. If a
	`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
	no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
	padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
	each row of the batch).
	""",
	MegrezMoe_START_DOCSTRING,
	)
	class MegrezMoeForSequenceClassification(MegrezMoePreTrainedModel):
	    def __init__(self, config):
	        super().__init__(config)
	        self.num_labels = config.num_labels
	        # Backbone followed by a bias-free linear head mapping hidden states to one score per label.
	        self.model = MegrezMoeModel(config)
	        self.score = nn.Linear(in_features=config.hidden_size, out_features=self.num_labels, bias=False)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the backbone's token embedding module."""
	        embeddings = self.model.embed_tokens
	        return embeddings

	    def set_input_embeddings(self, value):
	        """Replace the backbone's token embedding module with `value`."""
	        self.model.embed_tokens = value

	    @add_start_docstrings_to_model_forward(MegrezMoe_INPUTS_DOCSTRING)
	    def forward(
	        self,
	        input_ids: torch.LongTensor = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
	            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
	            config.num_labels - 1]`. Unless `config.problem_type` is already set, it is inferred on the first
	            call with labels: `config.num_labels == 1` selects a regression loss (Mean-Square loss);
	            `config.num_labels > 1` with integer labels selects a single-label classification loss
	            (Cross-Entropy); anything else selects a multi-label loss (binary cross-entropy with logits).
	        """
	        if return_dict is None:
	            return_dict = self.config.use_return_dict

	        transformer_outputs = self.model(
	            input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )
	        # Score every position; a single position per row is picked below.
	        logits = self.score(transformer_outputs[0])

	        batch_source = input_ids if input_ids is not None else inputs_embeds
	        batch_size = batch_source.shape[0]

	        pad_token_id = self.config.pad_token_id
	        if pad_token_id is None and batch_size != 1:
	            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

	        if pad_token_id is not None and input_ids is not None:
	            # Position just before the first padding token of each row. A row without any padding token
	            # yields argmax 0, i.e. index -1, which selects the last position.
	            first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
	            sequence_lengths = (first_pad - 1).to(logits.device)
	        else:
	            # Without a pad token, or with only `inputs_embeds`, padding cannot be located: use the last position.
	            sequence_lengths = -1

	        row_index = torch.arange(batch_size, device=logits.device)
	        pooled_logits = logits[row_index, sequence_lengths]

	        loss = None
	        if labels is not None:
	            labels = labels.to(logits.device)

	            # Infer the problem type once and store it on the config for later calls.
	            if self.config.problem_type is None:
	                if self.num_labels == 1:
	                    self.config.problem_type = "regression"
	                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
	                    self.config.problem_type = "single_label_classification"
	                else:
	                    self.config.problem_type = "multi_label_classification"

	            problem_type = self.config.problem_type
	            if problem_type == "regression":
	                mse = MSELoss()
	                if self.num_labels == 1:
	                    loss = mse(pooled_logits.squeeze(), labels.squeeze())
	                else:
	                    loss = mse(pooled_logits, labels)
	            elif problem_type == "single_label_classification":
	                cross_entropy = CrossEntropyLoss()
	                flat_logits = pooled_logits.view(-1, self.num_labels)
	                loss = cross_entropy(flat_logits, labels.view(-1))
	            elif problem_type == "multi_label_classification":
	                bce = BCEWithLogitsLoss()
	                loss = bce(pooled_logits, labels)

	        if return_dict:
	            return SequenceClassifierOutputWithPast(
	                loss=loss,
	                logits=pooled_logits,
	                past_key_values=transformer_outputs.past_key_values,
	                hidden_states=transformer_outputs.hidden_states,
	                attentions=transformer_outputs.attentions,
	            )

	        # Tuple output: pooled logits followed by the remaining backbone outputs, with the loss first if present.
	        output = (pooled_logits,) + transformer_outputs[1:]
	        return output if loss is None else ((loss,) + output)