mistral-nanotron / modeling_mistral.py

update all

aa5ff8c 8 months ago

No virus

52.1 kB

	# coding=utf-8
	# Copyright 2018 HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" PyTorch Mistral model.
	"""
	from typing import Dict, Optional, Union
	import inspect

	import torch
	from flash_attn import bert_padding
	from flash_attn.flash_attn_interface import (
	flash_attn_varlen_func,
	flash_attn_with_kvcache,
	)
	from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
	from nanotron import distributed as dist
	from nanotron import logging
	from nanotron.config import ParallelismArgs, RecomputeGranularity
	from nanotron.generation.generate_store import AttachableStore
	from nanotron.logging import log_rank
	from nanotron.models import NanotronModel
	from nanotron.nn.layer_norm import TritonRMSNorm
	from nanotron.parallel import ParallelContext
	from nanotron.parallel.parameters import NanotronParameter
	from nanotron.parallel.pipeline_parallel.block import (
	PipelineBlock,
	TensorPointer,
	)
	from nanotron.parallel.pipeline_parallel.p2p import P2P
	from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy
	from nanotron.parallel.tensor_parallel.nn import (
	TensorParallelColumnLinear,
	TensorParallelEmbedding,
	TensorParallelLinearMode,
	TensorParallelRowLinear,
	)
	from nanotron.random import RandomStates
	from nanotron.utils import checkpoint_method
	from nanotron.nn.activations import ACT2FN
	from torch import nn

	from config_mistral_7b import MistralConfig

	logger = logging.get_logger(__name__)

	_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_varlen_func).parameters)


	class RotaryEmbedding(nn.Module):
	def __init__(self, dim: int, end: int, theta: float = 10000.0):
	super().__init__()
	assert dim % 2 == 0
	self.dim = dim
	self.end = end
	self.theta = theta
	# TODO @nouamane: Figure out why we can't set `DTypeInvariantTensor` ...
	# TODO @thomasw21: Complex buffers break DDP, instead we store float and view them as complex
	self.freqs_cis: torch.Tensor
	self._initialized_buffer = False

	def init_rotary_embeddings(self):
	if self._initialized_buffer is True:
	# Buffer if already initialized
	return
	self.register_buffer(
	"freqs_cis",
	torch.empty(self.end, self.dim // 2, 2, dtype=torch.float, device="cuda"),
	persistent=False,
	)
	assert self.freqs_cis.device.type == "cuda"
	# TODO @nouamane: One we figure out how to do the DTypeInvariantTensor, this can be removed and changed to an assert
	if self.freqs_cis.dtype != torch.float:
	self.freqs_cis = self.freqs_cis.to(torch.float)
	assert self.freqs_cis.dtype == torch.float
	freqs = 1.0 / (
	self.theta
	** (torch.arange(0, self.dim, 2, dtype=torch.float, device="cuda")[: (self.dim // 2)] / self.dim)
	)
	t = torch.arange(self.end, device="cuda")
	freqs = torch.outer(t, freqs).float()
	complex_freqs = torch.polar(torch.ones_like(freqs), freqs)
	freqs = torch.view_as_real(complex_freqs)
	self.freqs_cis.copy_(freqs)
	self._initialized_buffer = True

	def forward(
	self,
	x: torch.Tensor, # [batch_size, seq_length, num_heads, d_qk]
	position_ids: Optional[torch.LongTensor], # [batch_size, seq_length]
	):
	batch_size, seq_length, num_heads, inner_dim = x.shape
	while (
	position_ids is not None and position_ids[-1, -1] >= self.end
	) or seq_length >= self.end: # TODO @nouamane: check if this causes cpu-gpu sync
	self.end *= 2
	self._initialized_buffer = False
	if self._initialized_buffer is False:
	# print(f"Initializing rotary embeddings with end={self.end}")
	self.init_rotary_embeddings()
	dtype = x.dtype
	assert inner_dim % 2 == 0
	x = x.view(
	batch_size, seq_length, num_heads, inner_dim // 2, 2
	) # [batch_size, q_length, num_heads, inner_dim]
	if x.dtype == torch.bfloat16:
	x = x.float()
	complex_x = torch.view_as_complex(x) # [batch_size, q_length, num_heads, inner_dim // 2]
	if position_ids is None:
	freqs_cis = self.freqs_cis[None, :seq_length, None, :]
	else:
	# TODO(kunhao): Should None follow the num_heads dimension?
	if position_ids[-1, -1] < 0 or position_ids[-1, -1] >= self.end: # Quick test hopefully
	raise ValueError(f"Position ids must be in the range [0, {self.end}), but got {position_ids}")
	freqs_cis = self.freqs_cis[position_ids][:, :, None, :]
	complex_freqs = torch.view_as_complex(freqs_cis)
	x_out = torch.view_as_real(complex_x * complex_freqs).view(batch_size, seq_length, num_heads, inner_dim)
	return x_out.type(dtype)


	class GLUActivation(nn.Module):
	def __init__(self, act_fn_name: str):
	super().__init__()
	self.act = ACT2FN[act_fn_name]

	def forward(self, merged_states: torch.Tensor):
	gate_states, up_states = torch.split(merged_states, merged_states.shape[-1] // 2, dim=-1)
	return self.act(gate_states) * up_states


	class MLP(nn.Module):
	def __init__(
	self,
	config: MistralConfig,
	parallel_config: Optional[ParallelismArgs],
	tp_pg: dist.ProcessGroup,
	):
	super().__init__()

	# TODO @thomasw21: refactor so that we store that default in a single place.
	tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE
	tp_linear_async_communication = (
	parallel_config.tp_linear_async_communication if parallel_config is not None else False
	)

	gate_up_contiguous_chunks = (
	config.intermediate_size, # shape of gate_linear
	config.intermediate_size, # shape of up_linear
	)
	self.gate_up_proj = TensorParallelColumnLinear(
	config.hidden_size,
	2 * config.intermediate_size,
	pg=tp_pg,
	mode=tp_mode,
	bias=False,
	async_communication=tp_linear_async_communication,
	contiguous_chunks=gate_up_contiguous_chunks,
	)

	self.down_proj = TensorParallelRowLinear(
	config.intermediate_size,
	config.hidden_size,
	pg=tp_pg,
	mode=tp_mode,
	bias=False,
	async_communication=tp_linear_async_communication and tp_mode is TensorParallelLinearMode.REDUCE_SCATTER,
	)
	# TODO @nouamane: why can't we torch.jit.script GLUActivation?
	self.split_silu_mul = GLUActivation(config.hidden_act)

	def forward(self, hidden_states): # [seq_length, batch_size, hidden_dim]
	merged_states = self.gate_up_proj(hidden_states)
	hidden_states = self.down_proj(self.split_silu_mul(merged_states))
	return {"hidden_states": hidden_states}


	class CoreAttention(nn.Module):
	def __init__(self, config: MistralConfig, parallel_config: Optional[ParallelismArgs], layer_idx: int):
	super().__init__()
	# TODO @thomasw21: GPT has a weird `d_kv` config which I'm guessing is essentically a `d_qkv`
	assert (
	config.hidden_size % config.num_attention_heads == 0
	), f"Hidden size {config.hidden_size} must be divisible by number of attention heads {config.num_attention_heads}."
	self.d_qk = config.hidden_size // config.num_attention_heads
	self.d_v = config.hidden_size // config.num_attention_heads
	self.dropout = config.attn_pdrop

	self.checkpoint_attention = False # Because flash_attn already does checkpointing

	if config.sliding_window_size is not None:
	assert (
	_flash_supports_window_size
	), "Current version of flash-attn doesn't support sliding window: `pip install flash-attn>=2.3`"
	self.sliding_window_size = config.sliding_window_size # if layer_idx not in config.global_attn_layers else None

	@checkpoint_method(attr_name="checkpoint_attention")
	def forward(
	self,
	query_states: torch.Tensor, # [batch_size * q_length, num_heads, inner_dim]
	key_states: torch.Tensor, # [batch_size * kv_length, 1, inner_dim]
	value_states: torch.Tensor, # [batch_size * kv_length, 1, inner_dim]
	q_sequence_mask: torch.Tensor, # torch.BoolTensor [batch_size, q_length] (can be broadcasted to that size)
	kv_sequence_mask: torch.Tensor, # torch.BoolTensor [batch_size, kv_length] (can be broadcasted to that size)
	):
	# TODO @thomasw21: Compute once, instead of computing for each layers.
	cu_seqlens_q = torch.zeros((q_sequence_mask.shape[0] + 1), dtype=torch.int32, device=query_states.device)
	cu_seqlens_k = torch.zeros((kv_sequence_mask.shape[0] + 1), dtype=torch.int32, device=query_states.device)
	torch.cumsum(q_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32, out=cu_seqlens_q[1:])
	torch.cumsum(kv_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32, out=cu_seqlens_k[1:])

	# TODO(kunhao): flash attn's causal means that the query can only attend to the keys before it. This is not
	# what we want if we are using kv cache. This is a hack as we always have q_length == 1 when using kv cache.
	causal = False if q_sequence_mask.shape[1] == 1 else True
	attn_output = flash_attn_varlen_func(
	q=query_states,
	k=key_states,
	v=value_states,
	cu_seqlens_q=cu_seqlens_q,
	cu_seqlens_k=cu_seqlens_k,
	max_seqlen_q=q_sequence_mask.shape[1],
	max_seqlen_k=kv_sequence_mask.shape[1],
	dropout_p=self.dropout if self.training else 0.0,
	softmax_scale=None, # defaults to 1/sqrt(d_qk)
	causal=causal,
	window_size=(self.sliding_window_size - 1, 0) if self.sliding_window_size is not None else (-1, -1),
	return_attn_probs=False,
	)

	return attn_output


	def pad_to_right(tensor, mask, new_tensor=None):
	"""Transform a left-padded tensor into a right-padded tensor. (Useful for prefilling key/value states)
	Args:
	tensor: (batch_size, seqlen, d1, d2)
	mask: (batch_size, seqlen)
	new_tensor: (batch_size, new_tensor_seqlen, d1, d2)
	Returns:
	new_tensor: (batch_size, new_tensor_seqlen, d1, d2)
	right_padded_mask: (batch_size, seqlen)
	"""
	# First, we need to find the number of padding for each row
	unpad_seqlens = mask.sum(1)
	# Then, we need to find the maximum length of the tensor
	max_seqlen = mask.shape[1]
	# We can then create the indices to select the padded values
	# The indices are the same for each row
	indices = torch.arange(max_seqlen, device=mask.device)
	# We can then create the mask for the padded values
	right_padded_mask = indices < unpad_seqlens[:, None]
	# We select the useful values
	useful_values = tensor[mask]
	# We create the new tensor (if not provided)
	new_tensor = torch.zeros_like(tensor) if new_tensor is None else new_tensor
	# We fill the new tensor with the useful values
	new_tensor[:, : right_padded_mask.shape[1], :, :][right_padded_mask] = useful_values
	return new_tensor, right_padded_mask


	class CausalSelfAttention(nn.Module, AttachableStore):
	def __init__(
	self,
	config: MistralConfig,
	parallel_config: Optional[ParallelismArgs],
	tp_pg: dist.ProcessGroup,
	layer_idx: int,
	):
	super().__init__()
	# Tensor parallel considerations: We split tensors along head dimension
	assert (
	config.num_attention_heads % tp_pg.size() == 0
	), f"Number of attention heads ({config.num_attention_heads}) must be divisible by TP size ({tp_pg.size()})."
	try:
	assert (
	config.num_key_value_heads % tp_pg.size() == 0
	), f"Number of key/value heads ({config.num_key_value_heads}) must be divisible by TP size ({tp_pg.size()})."
	except AttributeError:
	log_rank(
	"WARNING: num_key_value_heads not defined, assuming it is equal to num_attention_heads",
	logger=logger,
	level=logging.WARNING,
	rank=0,
	)
	# If num_key_value_heads is not defined, we assume that it is equal to num_attention_heads
	config.num_key_value_heads = config.num_attention_heads
	assert (
	config.num_attention_heads % config.num_key_value_heads == 0
	), f"Number of attention heads ({config.num_attention_heads}) must be divisible by number of key/value heads ({config.num_key_value_heads})."
	self.n_local_q_heads = config.num_attention_heads // tp_pg.size()
	self.n_local_kv_heads = config.num_key_value_heads // tp_pg.size()
	self.n_repeats = config.num_attention_heads // config.num_key_value_heads
	self.is_gqa = config.num_attention_heads != config.num_key_value_heads # Whether we are using GQA or not
	self.d_qk = config.hidden_size // config.num_attention_heads
	self.d_v = config.hidden_size // config.num_attention_heads
	self.d_model = config.hidden_size

	# TODO @thomasw21: refactor so that we store that default in a single place.
	tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE
	tp_linear_async_communication = (
	parallel_config.tp_linear_async_communication if parallel_config is not None else False
	)

	# build the slice config for self.qkv for save/load
	# shard are done within the contiguous chunk
	qkv_contiguous_chunks = (
	config.num_attention_heads * self.d_qk, # shape of q
	config.num_key_value_heads * self.d_qk, # shape of k
	config.num_key_value_heads * self.d_qk, # shape of v
	)
	self.qkv_proj = TensorParallelColumnLinear(
	self.d_model,
	config.num_attention_heads * self.d_qk + 2 * config.num_key_value_heads * self.d_qk,
	pg=tp_pg,
	mode=tp_mode,
	bias=False,
	async_communication=tp_linear_async_communication,
	contiguous_chunks=qkv_contiguous_chunks,
	)
	# TODO(kunhao): We want to have only one version per device and not one version per layer.
	self.rotary_embedding = RotaryEmbedding(
	dim=self.d_qk,
	end=config.max_position_embeddings,
	theta=config.rope_theta
	)

	# NOTE: Only supported for training (TODO(fmom): position_ids not supported yet)
	self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, base=config.rope_theta, interleaved=True)

	self.o_proj = TensorParallelRowLinear(
	config.num_attention_heads * self.d_qk,
	self.d_model,
	pg=tp_pg,
	mode=tp_mode,
	bias=False,
	async_communication=tp_linear_async_communication,
	)

	self.attention = CoreAttention(
	config,
	parallel_config=parallel_config,
	layer_idx=layer_idx,
	)

	self.prefill_kv_len = (
	config.max_position_embeddings
	) # TODO @nouamane: compute based on free memory, because in rope we can surpass max_position_embeddings

	def forward(
	self,
	hidden_states, # [seq_length, batch_size, hidden_size]
	sequence_mask, # [batch_size, seq_length]
	):
	qkv_states = self.qkv_proj(
	hidden_states
	) # [seq_length, batch_size, n_local_q_heads * d_qk + 2 * n_local_kv_heads * d_qk]
	q_length, batch_size, _ = qkv_states.shape

	if self.is_gqa:
	query_states, key_states, value_states = torch.split(
	qkv_states,
	[
	self.n_local_q_heads * self.d_qk,
	self.n_local_kv_heads * self.d_qk,
	self.n_local_kv_heads * self.d_qk,
	],
	dim=-1,
	)

	query_states = (
	query_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_q_heads, self.d_qk)
	)
	key_states = (
	key_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk)
	)
	value_states = (
	value_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk)
	)
	else:
	query_states, key_states, value_states = (
	qkv_states.view(q_length, batch_size, 3, self.n_local_q_heads, self.d_qk)
	.permute(2, 1, 0, 3, 4)
	.contiguous()
	) # [3, batch_size, seq_length, n_local_q_heads, d_qk]

	store = self.get_local_store()
	if store is not None: # Inference case
	# Double check that we use store only at inference time
	assert key_states.requires_grad is False
	assert value_states.requires_grad is False
	if "position_offsets" in store:
	old_position_offsets = store["position_offsets"]
	position_ids = old_position_offsets[:, None] + sequence_mask
	else:
	position_ids = torch.cumsum(sequence_mask, dim=-1, dtype=torch.int32) - 1
	position_offsets = position_ids[:, -1]

	# Compute rotary embeddings
	# Note: keep track of old rotary embedding end to check if we need to enlarge k_cache and v_cache
	old_rotary_embed_end = self.rotary_embedding.end
	query_states = self.rotary_embedding(query_states, position_ids=position_ids)
	key_states = self.rotary_embedding(key_states, position_ids=position_ids)

	if "key" not in store:
	# First inference iteration (Prefill)
	# TODO @nouamane: support custom masking
	# assert that [ False, False, False, False, True, True, True, True, True, True] is accepted
	# but [ False, False, False, False, True, True, False, False, True, True] is not (can't mask in the middle of sequence)
	assert ~(
	sequence_mask[:, :-1] & (~sequence_mask[:, 1:]) # True is never followed by False
	).any(), "Can't mask in the middle of sequence, please make sure that pads are at the left of the sequence if existing"

	# preallocate k_cache, v_cache to self.prefill_kv_len
	k_cache = torch.zeros(
	(
	batch_size,
	self.prefill_kv_len,
	self.n_local_kv_heads,
	self.d_qk,
	),
	dtype=query_states.dtype,
	device=query_states.device,
	)
	v_cache = torch.zeros(
	(batch_size, self.prefill_kv_len, self.n_local_kv_heads, self.d_v),
	dtype=query_states.dtype,
	device=query_states.device,
	)
	# Remove pad tokens from key_states and concatenate samples in key_unpad
	# cu_seqlens_k is the cumulative sequence lengths of key_states
	(query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(
	query_states,
	sequence_mask,
	)
	(key_unpad, indices_k, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(
	key_states, sequence_mask
	)
	(value_unpad, _, _, _) = bert_padding.unpad_input(value_states, sequence_mask)

	output_unpad = flash_attn_varlen_func(
	q=query_unpad, # (total_q, n_local_q_heads, d_qk)
	k=key_unpad, # (total_kv, n_local_kv_heads, d_qk)
	v=value_unpad, # (total_kv, n_local_kv_heads, d_v)
	cu_seqlens_q=cu_seqlens_q,
	cu_seqlens_k=cu_seqlens_k,
	max_seqlen_q=max_seqlen_q,
	max_seqlen_k=max_seqlen_k,
	dropout_p=0.0,
	softmax_scale=None,
	causal=True, # True in prefill phase, False in subsequent phases
	return_attn_probs=False,
	) # (total_unpadded, n_local_q_heads, d_v)

	attention_output = bert_padding.pad_input(
	output_unpad, indices_q, batch_size, q_length
	) # (batch_size, q_length, n_local_q_heads, d_v)

	pad_to_right(key_states, sequence_mask, new_tensor=k_cache)
	pad_to_right(value_states, sequence_mask, new_tensor=v_cache)

	else:
	# Pull pre-computed key/value states
	# Subsequent inference iterations (q_length=1)
	k_cache = store["key"]
	v_cache = store["value"]

	# NOTE(fmom): According to flash_attn_with_kvcache, "If you pass in k / v, you must make sure that the cache is large enough to hold the new values"
	# Since rotary embedding has changed (to enable larger context), we need to enlarge k_cache and v_cache
	if self.rotary_embedding.end > old_rotary_embed_end:
	k_cache = torch.cat(
	[
	k_cache,
	torch.zeros(
	(
	batch_size,
	self.rotary_embedding.end - old_rotary_embed_end,
	self.n_local_kv_heads,
	self.d_qk,
	),
	dtype=query_states.dtype,
	device=query_states.device,
	),
	],
	dim=1,
	)

	v_cache = torch.cat(
	[
	v_cache,
	torch.zeros(
	(
	batch_size,
	self.rotary_embedding.end - old_rotary_embed_end,
	self.n_local_kv_heads,
	self.d_v,
	),
	dtype=query_states.dtype,
	device=query_states.device,
	),
	],
	dim=1,
	)

	assert (
	k_cache.shape[1] == self.rotary_embedding.end
	), f"Cache size {k_cache.shape[1]} is smaller than rotary embedding end {self.rotary_embedding.end}"
	assert (
	v_cache.shape[1] == self.rotary_embedding.end
	), f"Cache size {v_cache.shape[1]} is smaller than rotary embedding end {self.rotary_embedding.end}"

	# [batch_size, seq_length, num_heads, d_qk]
	query_states = query_states.view(
	batch_size, q_length, self.n_local_q_heads, self.d_qk
	) # [batch_size, q_length, self.n_heads, d_qk]
	kv_length = key_states.shape[1]
	key_states = key_states.view(
	batch_size, kv_length, self.n_local_kv_heads, self.d_qk
	) # [batch_size, kv_length, self.n_heads, d_qk]
	value_states = value_states.view(
	batch_size, kv_length, self.n_local_kv_heads, self.d_v
	) # [batch_size, kv_length, self.n_heads, d_v]

	attention_output = flash_attn_with_kvcache(
	query_states,
	k_cache,
	v_cache,
	key_states,
	value_states,
	rotary_cos=None,
	rotary_sin=None,
	# TODO @nouamane: seems like this doesnt help to indicate padding in (for first iteration it's just 0)
	cache_seqlens=position_offsets.contiguous(),
	softmax_scale=None,
	causal=True,
	rotary_interleaved=False, # GPT-NeoX style
	)

	store.update(
	{
	"key": k_cache, # flash-attn has updated with new key_states using cache_seqlens
	"value": v_cache,
	"position_offsets": position_offsets,
	}
	)

	else: # Training case
	# Apply rotary embeddings to query/key states
	# NOTE: The layout is different from models/mistral.py which is [batch_size, num_heads, seq_length, d_qk]
	# Here it is, [batch_size, seq_length, num_heads, d_qk]
	# [2, batch_size, seq_length, num_heads, d_qk]
	key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
	# [batch_size, seq_length, 2, num_heads, d_qk]
	key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
	query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
	# [batch_size, seq_length, num_heads, d_qk]
	key_states, value_states = torch.split(key_value_states, 1, dim=2)

	q_sequence_mask = sequence_mask
	kv_sequence_mask = sequence_mask

	kv_length = key_states.shape[1]
	# [batch_size, seq_length, num_heads, d_qk]
	# Shaping for use in `flash-attn` version of flash-attn: `flash_attn_unpadded_func`
	query_states = query_states.view(
	batch_size * q_length, self.n_local_q_heads, self.d_qk
	) # [batch_size * q_length, self.n_heads, d_qk]

	key_states = key_states.view(
	batch_size * kv_length, self.n_local_kv_heads, self.d_qk
	) # [batch_size * kv_length, self.n_heads, d_qk]
	value_states = value_states.view(
	batch_size * kv_length, self.n_local_kv_heads, self.d_v
	) # [batch_size * kv_length, self.n_heads, d_v]

	attention_output = self.attention(
	query_states=query_states,
	key_states=key_states,
	value_states=value_states,
	q_sequence_mask=q_sequence_mask,
	kv_sequence_mask=kv_sequence_mask,
	)

	attention_output = (
	attention_output.contiguous().view(batch_size, q_length, self.n_local_q_heads * self.d_v).transpose(0, 1)
	)
	output = self.o_proj(attention_output)

	return {"hidden_states": output, "sequence_mask": sequence_mask}


	class MistralDecoderLayer(nn.Module):
	def __init__(
	self,
	config: MistralConfig,
	parallel_config: Optional[ParallelismArgs],
	tp_pg: dist.ProcessGroup,
	layer_idx: int,
	):
	super().__init__()
	self.input_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.attn = CausalSelfAttention(
	config=config,
	parallel_config=parallel_config,
	tp_pg=tp_pg,
	layer_idx=layer_idx,
	)

	self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.mlp = MLP(config=config, parallel_config=parallel_config, tp_pg=tp_pg)

	def forward(
	self,
	hidden_states: Union[torch.Tensor, TensorPointer],
	sequence_mask: Union[torch.Tensor, TensorPointer],
	) -> Dict[str, Union[torch.Tensor, TensorPointer]]:
	residual = hidden_states
	hidden_states = self.input_layernorm(hidden_states)

	output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
	hidden_states = output["hidden_states"]
	hidden_states = hidden_states + residual

	residual = hidden_states
	hidden_states = self.post_attention_layernorm(hidden_states)
	hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
	hidden_states = hidden_states + residual

	return {
	"hidden_states": hidden_states,
	"sequence_mask": output["sequence_mask"],
	}


	class Embedding(nn.Module, AttachableStore):
	def __init__(self, tp_pg: dist.ProcessGroup, config: MistralConfig, parallel_config: Optional[ParallelismArgs]):
	super().__init__()
	self.token_embedding = TensorParallelEmbedding(
	num_embeddings=config.vocab_size,
	embedding_dim=config.hidden_size,
	padding_idx=config.pad_token_id,
	pg=tp_pg,
	mode=parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE,
	)
	self.pg = tp_pg

	def forward(self, input_ids: torch.Tensor, input_mask: torch.Tensor): # [batch_size, seq_length]
	store = self.get_local_store()
	if store is not None:
	if "past_length" in store:
	past_length = store["past_length"]
	else:
	past_length = torch.zeros(1, dtype=torch.long, device=input_ids.device).expand(input_ids.shape[0])

	cumsum_mask = input_mask.cumsum(-1, dtype=torch.long)
	# Store new past_length in store
	store["past_length"] = past_length + cumsum_mask[:, -1]

	# Format input in `[seq_length, batch_size]` to support high TP with low batch_size
	input_ids = input_ids.transpose(0, 1)
	input_embeds = self.token_embedding(input_ids)
	return {"input_embeds": input_embeds}


	class MistralModel(nn.Module):
	"""Build pipeline graph"""

	def __init__(
	self,
	config: MistralConfig,
	parallel_context: ParallelContext,
	parallel_config: Optional[ParallelismArgs],
	):
	super().__init__()

	# Declare all the nodes
	self.p2p = P2P(parallel_context.pp_pg, device=torch.device("cuda"))
	self.config = config
	self.parallel_config = parallel_config
	self.parallel_context = parallel_context
	self.tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE
	tp_linear_async_communication = (
	parallel_config.tp_linear_async_communication if parallel_config is not None else False
	)

	self.token_position_embeddings = PipelineBlock(
	p2p=self.p2p,
	module_builder=Embedding,
	module_kwargs={
	"tp_pg": parallel_context.tp_pg,
	"config": config,
	"parallel_config": parallel_config,
	},
	module_input_keys={"input_ids", "input_mask"},
	module_output_keys={"input_embeds"},
	)

	self.decoder = nn.ModuleList(
	[
	PipelineBlock(
	p2p=self.p2p,
	module_builder=MistralDecoderLayer,
	module_kwargs={
	"config": config,
	"parallel_config": parallel_config,
	"tp_pg": parallel_context.tp_pg,
	"layer_idx": layer_idx,
	},
	module_input_keys={"hidden_states", "sequence_mask"},
	module_output_keys={"hidden_states", "sequence_mask"},
	)
	for layer_idx in range(config.num_hidden_layers)
	]
	)

	self.final_layer_norm = PipelineBlock(
	p2p=self.p2p,
	module_builder=TritonRMSNorm,
	module_kwargs={"hidden_size": config.hidden_size, "eps": config.rms_norm_eps},
	module_input_keys={"input"},
	module_output_keys={"hidden_states"},
	) # TODO

	self.lm_head = PipelineBlock(
	p2p=self.p2p,
	# Understand that this means that we return sharded logits that are going to need to be gathered
	module_builder=TensorParallelColumnLinear,
	module_kwargs={
	"in_features": config.hidden_size,
	"out_features": config.vocab_size,
	"pg": parallel_context.tp_pg,
	"bias": False,
	# TODO @thomasw21: refactor so that we store that default in a single place.
	"mode": self.tp_mode,
	"async_communication": tp_linear_async_communication,
	},
	module_input_keys={"x"},
	module_output_keys={"logits"},
	)

	self.cast_to_fp32 = PipelineBlock(
	p2p=self.p2p,
	module_builder=lambda: lambda x: x.float(),
	module_kwargs={},
	module_input_keys={"x"},
	module_output_keys={"output"},
	)

	def forward(
	self,
	input_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length]
	input_mask: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length]
	):
	return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]

	def forward_with_hidden_states(
	self,
	input_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length]
	input_mask: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length]
	):
	# all tensors are optional as most ranks don't need anything from the dataloader.

	output = self.token_position_embeddings(input_ids=input_ids, input_mask=input_mask)

	hidden_encoder_states = {
	"hidden_states": output["input_embeds"],
	"sequence_mask": input_mask,
	}
	for encoder_block in self.decoder:
	hidden_encoder_states = encoder_block(**hidden_encoder_states)

	hidden_states = self.final_layer_norm(input=hidden_encoder_states["hidden_states"])["hidden_states"]

	sharded_logits = self.lm_head(x=hidden_states)["logits"]

	fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]

	return fp32_sharded_logits, hidden_states

	def get_block_compute_costs(self):
	"""Computes the compute cost of each block in the model so that we can do a better job of load balancing."""
	model_config = self.config
	d_ff = model_config.intermediate_size
	d_qkv = model_config.hidden_size // model_config.num_attention_heads
	block_compute_costs = {
	# CausalSelfAttention (qkv proj + attn out) + MLP
	MistralDecoderLayer: 4 * model_config.num_attention_heads * d_qkv * model_config.hidden_size
	+ 3 * d_ff * model_config.hidden_size,
	# This is the last lm_head
	TensorParallelColumnLinear: model_config.vocab_size * model_config.hidden_size,
	}
	return block_compute_costs

	def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch_size):
	"""Get flops per second for a given model"""
	world_size = self.parallel_context.world_pg.size()
	try:
	num_key_values_heads = self.config.num_key_value_heads
	except AttributeError:
	num_key_values_heads = self.config.num_attention_heads

	model_flops, hardware_flops = get_flops(
	num_layers=self.config.num_hidden_layers,
	hidden_size=self.config.hidden_size,
	num_heads=self.config.num_attention_heads,
	num_key_value_heads=num_key_values_heads,
	vocab_size=self.config.vocab_size,
	ffn_hidden_size=self.config.intermediate_size,
	seq_len=sequence_length,
	batch_size=global_batch_size,
	recompute_granularity=self.parallel_config.recompute_granularity,
	)

	model_flops_per_s = model_flops / (iteration_time_in_sec * world_size * 1e12)
	hardware_flops_per_s = hardware_flops / (iteration_time_in_sec * world_size * 1e12)
	return model_flops_per_s, hardware_flops_per_s


	@torch.jit.script
	def masked_mean(loss, label_mask, dtype):
	# type: (Tensor, Tensor, torch.dtype) -> Tensor
	return (loss * label_mask).sum(dtype=dtype) / label_mask.sum()


	class Loss(nn.Module):
	def __init__(self, tp_pg: dist.ProcessGroup):
	super().__init__()
	self.tp_pg = tp_pg

	def forward(
	self,
	sharded_logits: torch.Tensor, # [seq_length, batch_size, logits]
	label_ids: torch.Tensor, # [batch_size, seq_length]
	label_mask: torch.Tensor, # [batch_size, seq_length]
	) -> Dict[str, torch.Tensor]:
	# Megatron by defaults cast everything in fp32. `--f16-lm-cross-entropy` is an option you can use to keep current precision.
	# https://github.com/NVIDIA/Megatron-LM/blob/f267e6186eae1d6e2055b412b00e2e545a8e896a/megatron/model/gpt_model.py#L38
	loss = sharded_cross_entropy(
	sharded_logits, label_ids.transpose(0, 1).contiguous(), group=self.tp_pg, dtype=torch.float
	).transpose(0, 1)
	# TODO @thomasw21: It's unclear what kind of normalization we want to do.
	loss = masked_mean(loss, label_mask, dtype=torch.float)
	# I think indexing causes a sync we don't actually want
	# loss = loss[label_mask].sum()
	return {"loss": loss}


	class MistralForTraining(NanotronModel):
	def __init__(
	self,
	config: MistralConfig,
	parallel_context: ParallelContext,
	parallel_config: Optional[ParallelismArgs],
	random_states: Optional[RandomStates] = None,
	):
	super().__init__()
	import warnings

	self.model = MistralModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config)
	self.loss = PipelineBlock(
	p2p=self.model.p2p,
	module_builder=Loss,
	module_kwargs={"tp_pg": parallel_context.tp_pg},
	module_input_keys={
	"sharded_logits",
	"label_ids",
	"label_mask",
	},
	module_output_keys={"loss"},
	)
	self.parallel_context = parallel_context
	self.config = config
	self.parallel_config = parallel_config

	def forward(
	self,
	input_ids: Union[torch.Tensor, TensorPointer],
	input_mask: Union[torch.Tensor, TensorPointer],
	label_ids: Union[torch.Tensor, TensorPointer],
	label_mask: Union[torch.Tensor, TensorPointer],
	) -> Dict[str, Union[torch.Tensor, TensorPointer]]:
	sharded_logits = self.model(
	input_ids=input_ids,
	input_mask=input_mask,
	)
	loss = self.loss(
	sharded_logits=sharded_logits,
	label_ids=label_ids,
	label_mask=label_mask,
	)["loss"]
	return {"loss": loss}

	@torch.no_grad()
	def init_model_randomly(self, init_method, scaled_init_method):
	"""Initialize model parameters randomly.
	Args:
	init_method (callable): Used for embedding/position/qkv weight in attention/first layer weight of mlp/ /lm_head/
	scaled_init_method (callable): Used for o weight in attention/second layer weight of mlp/

	Note:
	Layernorm weight all 0 or 1 depending on `apply_layernorm_1p`
	"""
	model = self
	initialized_parameters = set()
	# Handle tensor parallelism
	module_id_to_prefix = {id(module): f"{module_name}." for module_name, module in model.named_modules()}
	# Fix the root_model
	module_id_to_prefix[id(model)] = ""

	for module_name, module in model.named_modules():
	if isinstance(module, TensorParallelColumnLinear):
	# Somehow Megatron-LM does something super complicated, https://github.com/NVIDIA/Megatron-LM/blob/2360d732a399dd818d40cbe32828f65b260dee11/megatron/core/tensor_parallel/layers.py#L96
	# What it does:
	# - instantiate a buffer of the `full size` in fp32
	# - run init method on it
	# - shard result to get only a specific shard
	# Instead I'm lazy and just going to run init_method, since they are scalar independent
	assert {"weight"} == {name for name, _ in module.named_parameters()} or {"weight"} == {
	name for name, _ in module.named_parameters()
	}
	for param_name, param in module.named_parameters():
	assert isinstance(param, NanotronParameter)
	if param.is_tied:
	tied_info = param.get_tied_info()
	full_param_name = tied_info.get_full_name_from_module_id_to_prefix(
	module_id_to_prefix=module_id_to_prefix
	)
	else:
	full_param_name = f"{module_name}.{param_name}"

	if full_param_name in initialized_parameters:
	# Already initialized
	continue

	if "weight" == param_name:
	init_method(param)
	elif "bias" == param_name:
	param.zero_()
	else:
	raise ValueError(f"Who the fuck is {param_name}?")

	assert full_param_name not in initialized_parameters
	initialized_parameters.add(full_param_name)
	elif isinstance(module, TensorParallelRowLinear):
	# Somehow Megatron-LM does something super complicated, https://github.com/NVIDIA/Megatron-LM/blob/2360d732a399dd818d40cbe32828f65b260dee11/megatron/core/tensor_parallel/layers.py#L96
	# What it does:
	# - instantiate a buffer of the `full size` in fp32
	# - run init method on it
	# - shard result to get only a specific shard
	# Instead I'm lazy and just going to run init_method, since they are scalar independent
	assert {"weight"} == {name for name, _ in module.named_parameters()} or {"weight"} == {
	name for name, _ in module.named_parameters()
	}
	for param_name, param in module.named_parameters():
	assert isinstance(param, NanotronParameter)
	if param.is_tied:
	tied_info = param.get_tied_info()
	full_param_name = tied_info.get_full_name_from_module_id_to_prefix(
	module_id_to_prefix=module_id_to_prefix
	)
	else:
	full_param_name = f"{module_name}.{param_name}"

	if full_param_name in initialized_parameters:
	# Already initialized
	continue

	if "weight" == param_name:
	scaled_init_method(param)
	elif "bias" == param_name:
	param.zero_()
	else:
	raise ValueError(f"Who the fuck is {param_name}?")

	assert full_param_name not in initialized_parameters
	initialized_parameters.add(full_param_name)
	elif isinstance(module, TritonRMSNorm):
	assert {"weight"} == {name for name, _ in module.named_parameters()}
	for param_name, param in module.named_parameters():
	assert isinstance(param, NanotronParameter)
	if param.is_tied:
	tied_info = param.get_tied_info()
	full_param_name = tied_info.get_full_name_from_module_id_to_prefix(
	module_id_to_prefix=module_id_to_prefix
	)
	else:
	full_param_name = f"{module_name}.{param_name}"

	if full_param_name in initialized_parameters:
	# Already initialized
	continue

	if "weight" == param_name:
	# TODO @thomasw21: Sometimes we actually want 0
	param.fill_(1)
	elif "bias" == param_name:
	param.zero_()
	else:
	raise ValueError(f"Who the fuck is {param_name}?")

	assert full_param_name not in initialized_parameters
	initialized_parameters.add(full_param_name)
	elif isinstance(module, TensorParallelEmbedding):
	# TODO @thomasw21: Handle tied embeddings
	# Somehow Megatron-LM does something super complicated, https://github.com/NVIDIA/Megatron-LM/blob/2360d732a399dd818d40cbe32828f65b260dee11/megatron/core/tensor_parallel/layers.py#L96
	# What it does:
	# - instantiate a buffer of the `full size` in fp32
	# - run init method on it
	# - shard result to get only a specific shard
	# Instead I'm lazy and just going to run init_method, since they are scalar independent
	assert {"weight"} == {name for name, _ in module.named_parameters()}

	assert isinstance(module.weight, NanotronParameter)
	if module.weight.is_tied:
	tied_info = module.weight.get_tied_info()
	full_param_name = tied_info.get_full_name_from_module_id_to_prefix(
	module_id_to_prefix=module_id_to_prefix
	)
	else:
	full_param_name = f"{module_name}.weight"

	if full_param_name in initialized_parameters:
	# Already initialized
	continue

	init_method(module.weight)
	assert full_param_name not in initialized_parameters
	initialized_parameters.add(full_param_name)

	assert initialized_parameters == {
	param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
	if param.is_tied
	else name
	for name, param in model.named_parameters()
	}, f"Somehow the initialized set of parameters don't match:\n - Expected: { {name for name, _ in model.named_parameters()} }\n - Got: {initialized_parameters}"

	def get_block_compute_costs(self):
	"""Computes the compute cost of each block in the model so that we can do a better job of load balancing."""
	return self.model.get_block_compute_costs()

	def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch_size):
	"""Get flops per second for a given model"""
	return self.model.get_flops_per_sec(iteration_time_in_sec, sequence_length, global_batch_size)


	def get_flops(
	num_layers,
	hidden_size,
	num_heads,
	vocab_size,
	seq_len,
	kv_channels=None,
	ffn_hidden_size=None,
	batch_size=1,
	recompute_granularity=None,
	glu_activation=False,
	):
	"""Counts flops in an decoder-only model
	Args:
	num_layers: number of decoder layers
	hidden_size: hidden size of the model
	num_heads: number of heads in the model
	num_key_value_heads: number of key/value heads in the model
	ffn_hidden_size: hidden size of the FFN
	vocab_size: size of the vocabulary
	seq_len: sequence length of the decoder
	batch_size: batch size
	recompute_granularity: Activation recomputation method. Either None, FULL or SELECTIVE. Check Megatron-LM docs for more info.
	Returns:
	model_flops: flops in the model (should be independent of the hardware and model implementation)
	hardware_flops: flops in the hardware (actual flops performed on the hardware). Check 6.3 in https://arxiv.org/pdf/2205.05198.pdf
	"""
	if kv_channels is None:
	assert hidden_size % num_heads == 0
	kv_channels = hidden_size // num_heads
	if ffn_hidden_size is None:
	ffn_hidden_size = 4 * hidden_size

	# In the following we mark the reduced dimension with parentheses
	# decoder
	# self attention (MQA)
	## q projection
	decoder_q_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * num_heads * kv_channels
	## kv projection, shared across heads
	decoder_kv_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * 2 * kv_channels
	## qk logits
	decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * seq_len
	### SWA (sliding window attention / local attention)
	# window_size = 4096
	# decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * window_size
	## v logits
	decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) * kv_channels
	# decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (window_size) * kv_channels
	## attn out
	decoder_attn_out_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * hidden_size
	# FF
	## 1st layer
	decoder_ffn_1_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
	if glu_activation:
	# 3 matmuls instead of 2 in FFN
	# ref. https://arxiv.org/pdf/2002.05202.pdf
	# Used for example in T5 v1.1
	decoder_ffn_1_flops_fwd = 4 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
	## 2nd layer
	decoder_ffn_2_flops_fwd = 2 * num_layers * batch_size * seq_len * (ffn_hidden_size) * hidden_size

	decoder_flops_fwd = (
	decoder_q_proj_flops_fwd
	+ decoder_kv_proj_flops_fwd
	+ decoder_qk_logits_flops_fwd
	+ decoder_v_logits_flops_fwd
	+ decoder_attn_out_flops_fwd
	+ decoder_ffn_1_flops_fwd
	+ decoder_ffn_2_flops_fwd
	)

	# lm head
	lm_head_flops_fwd = 2 * batch_size * seq_len * (hidden_size) * vocab_size

	# the bwd pass requires double the flops in case of matmuls to calculate the gradients with respect to
	# both input and weight tensors
	model_flops = 3 * (decoder_flops_fwd + lm_head_flops_fwd) # 1 for fwd + 2 for bwd

	if recompute_granularity is None:
	hardware_flops = model_flops
	elif recompute_granularity is RecomputeGranularity.FULL:
	# Note: we don't recompute lm head activs
	hardware_flops = model_flops + decoder_flops_fwd # + activ recomputation
	elif recompute_granularity is RecomputeGranularity.SELECTIVE:
	# all terms with s^2 are flops that are recomputed
	# ref. appendix A: https://arxiv.org/pdf/2205.05198.pdf
	recomputed_decoder_flops = decoder_qk_logits_flops_fwd + decoder_v_logits_flops_fwd
	hardware_flops = model_flops + recomputed_decoder_flops
	else:
	raise ValueError("recompute_granularity must be one of 'full' or 'selective'")

	return model_flops, hardware_flops