from typing import List

import torch
import torch.distributed as dist
from megatron.core import mpu, parallel_state
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Function
from torch.distributed import broadcast, get_process_group_ranks
from transformer_engine.pytorch.jit import no_torch_dynamo
from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
from transformer_engine.pytorch.module.rmsnorm import RMSNorm as RMSNormTE
from transformer_engine.pytorch.module.rmsnorm import _RMSNorm

from cosmos_predict1.utils import log

def get_batch_on_this_cp_rank(inputs):
    """Slice a batch input along the sequence dimension into multiple chunks,
    which are parallelized across GPUs in a context parallel group.
    """
    cp_size = parallel_state.get_context_parallel_world_size()

    if cp_size > 1:
        cp_rank = mpu.get_context_parallel_rank()
        seq_dim = 1
        # Split the sequence into 2 * cp_size chunks. Each rank keeps chunk `cp_rank`
        # and its mirror chunk `2 * cp_size - cp_rank - 1` (zigzag assignment), which
        # balances the attention workload across ranks.
        inputs = inputs.view(
            *inputs.shape[0:seq_dim],
            2 * cp_size,
            inputs.shape[seq_dim] // (2 * cp_size),
            *inputs.shape[(seq_dim + 1) :],
        )
        index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True).cuda(
            non_blocking=True
        )
        inputs = inputs.index_select(seq_dim, index)
        # Merge the two selected chunks back into a single sequence dimension.
        inputs = inputs.view(*inputs.shape[0:seq_dim], -1, *inputs.shape[(seq_dim + 2) :])

    return inputs
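

# A minimal single-process sketch (not part of the original module) that mirrors the zigzag
# chunk selection above on CPU tensors, assuming a sequence length divisible by 2 * cp_size.
# The helper name `_example_cp_zigzag_split` is illustrative only; the real function above
# additionally requires CUDA and an initialized context parallel group.
def _example_cp_zigzag_split(x: torch.Tensor, cp_size: int, cp_rank: int) -> torch.Tensor:
    seq_dim = 1
    # [B, S, ...] -> [B, 2 * cp_size, S // (2 * cp_size), ...]
    x = x.view(*x.shape[:seq_dim], 2 * cp_size, x.shape[seq_dim] // (2 * cp_size), *x.shape[seq_dim + 1 :])
    # Keep chunk `cp_rank` and its mirror chunk `2 * cp_size - cp_rank - 1`.
    index = torch.tensor([cp_rank, 2 * cp_size - cp_rank - 1])
    x = x.index_select(seq_dim, index)
    # Merge the two kept chunks: the result has sequence length S // cp_size.
    return x.view(*x.shape[:seq_dim], -1, *x.shape[seq_dim + 2 :])


# For example, with cp_size=2 a [B, 8, D] batch keeps positions [0, 1, 6, 7] on rank 0 and
# [2, 3, 4, 5] on rank 1, each of shape [B, 4, D].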
def gather_batch_from_cp_ranks(outputs):
    """
    Gather and reconstruct the full batch from chunks distributed across GPUs in a context parallel group.
    """
    cp_size = parallel_state.get_context_parallel_world_size()
    cp_rank = mpu.get_context_parallel_rank()

    if cp_size > 1:
        seq_dim = 1
        try:
            # Each rank holds two zigzag chunks; expose them as a separate dimension.
            chunk_size = outputs.shape[seq_dim] // 2
            outputs = outputs.view(*outputs.shape[:seq_dim], 2, chunk_size, *outputs.shape[seq_dim + 1 :])

            gathered_chunks = [torch.zeros_like(outputs) for _ in range(cp_size)]

            dist.barrier()
            dist.all_gather(gathered_chunks, outputs, group=parallel_state.get_context_parallel_group())
            dist.barrier()

            # Undo the zigzag assignment: rank i contributed chunks i and 2 * cp_size - 1 - i.
            reordered_chunks = [None] * (2 * cp_size)
            for i in range(cp_size):
                reordered_chunks[i] = gathered_chunks[i].select(seq_dim, 0)
                reordered_chunks[2 * cp_size - 1 - i] = gathered_chunks[i].select(seq_dim, 1)

            outputs = torch.cat(reordered_chunks, dim=seq_dim)
        except Exception as e:
            log.info(f"[Rank {cp_rank}] Error in gather_batch_from_cp_ranks: {str(e)}")
            raise

    return outputs
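

# A hedged round-trip sketch (not in the original module): under an initialized Megatron
# context parallel group, with `full_batch` on the current CUDA device and a sequence length
# divisible by 2 * cp_size, scattering with get_batch_on_this_cp_rank and gathering with
# gather_batch_from_cp_ranks should reproduce the original tensor on every rank. The helper
# name `_example_cp_round_trip` is illustrative only.
def _example_cp_round_trip(full_batch: torch.Tensor) -> None:
    local_chunk = gather_batch_input = get_batch_on_this_cp_rank(full_batch)  # [B, S / cp_size, ...]
    reconstructed = gather_batch_from_cp_ranks(gather_batch_input)            # [B, S, ...] on every rank
    assert torch.equal(reconstructed, full_batch)
    del local_chunk  # local shard no longer needed after reconstruction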
def broadcast_data_batch_in_tp_cp_group(data_batch):
    """
    Broadcast the data batch across the tensor model parallel and context parallel groups.
    """
    keys = sorted(data_batch.keys())
    tp_size = parallel_state.get_tensor_model_parallel_world_size()
    cp_size = parallel_state.get_context_parallel_world_size()
    tp_group = parallel_state.get_tensor_model_parallel_group() if tp_size > 1 else None
    cp_group = parallel_state.get_context_parallel_group() if cp_size > 1 else None
    tp_ranks = get_process_group_ranks(tp_group) if tp_size > 1 else None
    cp_ranks = get_process_group_ranks(cp_group) if cp_size > 1 else None
    if tp_size > 1 or cp_size > 1:
        for key in keys:
            tensor = data_batch[key]
            if isinstance(tensor, torch.Tensor):
                tensor = tensor.contiguous()
                # Write the (possibly copied) contiguous tensor back so the in-place
                # broadcast result is visible to the caller through data_batch.
                data_batch[key] = tensor
                # Broadcast in-place from the lowest global rank of each group.
                if tp_size > 1:
                    broadcast(tensor, min(tp_ranks), group=tp_group)
                if cp_size > 1:
                    broadcast(tensor, min(cp_ranks), group=cp_group)
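

# A hedged usage sketch (not in the original module): in a typical training step, every rank
# broadcasts the loaded batch so TP/CP replicas see identical inputs, then slices its own CP
# shard. The helper name `_example_broadcast_then_split` and the key "video" are illustrative.
def _example_broadcast_then_split(data_batch: dict, key: str = "video") -> dict:
    broadcast_data_batch_in_tp_cp_group(data_batch)          # in-place broadcast per tensor key
    data_batch[key] = get_batch_on_this_cp_rank(data_batch[key])
    return data_batch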
def allreduce_layernorm_grads(model: List[torch.nn.Module], tensor_model_parallel_size: int, sequence_parallel: bool):
    """
    All-reduce layernorm grads (for sequence parallelism).

    Note:
        - We skip QK normalization layers and the last normalization layer of the Transformer,
          since we use AllReduceBWDRMSNormTE for those layers, which already applies the all-reduce
          in the backward pass.
        - TransformerEngine's LayerNormLinear and LayerNormMLP modules have `*.layer_norm_weight`
          parameters that must also be all-reduced in the backward pass, so this function covers them.
    """
    if tensor_model_parallel_size > 1 and sequence_parallel:
        grads = []
        for model_chunk in model:
            for name, param in model_chunk.named_parameters():
                if not param.requires_grad:
                    continue
                if name.endswith(".layer_norm_weight"):
                    grad = param.grad
                    if grad is not None:
                        grads.append(grad.data)

        if grads:
            # Flatten into one buffer, all-reduce across the TP group, then copy back.
            coalesced = _flatten_dense_tensors(grads)
            torch.distributed.all_reduce(coalesced, group=parallel_state.get_tensor_model_parallel_group())
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
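

# A hedged usage sketch (not in the original module): with sequence parallelism enabled, call
# allreduce_layernorm_grads after the backward pass and before the optimizer step so the
# replicated `layer_norm_weight` gradients agree across the tensor parallel group. `loss`,
# `net`, and `optimizer` are illustrative placeholders.
def _example_sp_grad_sync(loss: torch.Tensor, net: torch.nn.Module, optimizer: torch.optim.Optimizer) -> None:
    loss.backward()
    allreduce_layernorm_grads(
        [net],
        tensor_model_parallel_size=parallel_state.get_tensor_model_parallel_world_size(),
        sequence_parallel=True,
    )
    optimizer.step()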
def sync_1d_parameters(model: torch.nn.Module, process_group=None):
    """
    Synchronize LayerNorm parameters (1D) across ranks by performing an all-reduce with the mean operation.

    LayerNorm parameters are identified by having ndim == 1.
    Note: if parameters other than LayerNorm are 1D, they will also be synchronized.

    Args:
        model (torch.nn.Module): The model containing the LayerNorm parameters.
        process_group (optional): The process group for the all-reduce.
            If None, the default process group is used.
    """
    if not torch.distributed.is_initialized():
        return

    for name, param in model.named_parameters():
        if param.ndim == 1 and param.requires_grad:
            torch.distributed.all_reduce(param.data, op=torch.distributed.ReduceOp.AVG, group=process_group)
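

# A hedged usage sketch (not in the original module): periodically averaging the 1D (norm)
# parameters keeps replicated copies from drifting apart numerically, for example every N
# steps across the tensor parallel group. `step`, `net`, and `every_n_steps` are illustrative.
def _example_periodic_norm_sync(step: int, net: torch.nn.Module, every_n_steps: int = 100) -> None:
    if step % every_n_steps == 0:
        sync_1d_parameters(net, process_group=parallel_state.get_tensor_model_parallel_group())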
class AllReduceBWD(Function):
    """
    Custom autograd Function that performs an all-reduce operation during the backward pass.

    Args:
        tensor (Tensor): The input tensor.
        process_group: The process group to perform the all-reduce operation in.

    Returns:
        Tensor: The input tensor in the forward pass, and the all-reduced gradient in the backward pass.
    """

    @staticmethod
    def forward(ctx, tensor, process_group):
        ctx.process_group = process_group
        return tensor

    @staticmethod
    def backward(ctx, grad_output):
        # Sum the incoming gradient across the process group; the second return value
        # corresponds to `process_group`, which needs no gradient.
        dist.all_reduce(grad_output, group=ctx.process_group)
        return grad_output, None
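

# A hedged usage sketch (not in the original module): wrapping a replicated parameter with
# AllReduceBWD.apply leaves the forward value unchanged but sums its gradient over the given
# process group in the backward pass. `weight`, `x`, and `group` are illustrative placeholders.
def _example_allreduce_bwd(weight: torch.nn.Parameter, x: torch.Tensor, group) -> None:
    y = x * AllReduceBWD.apply(weight, group)
    y.sum().backward()
    # weight.grad now holds the sum of the local gradients from every rank in `group`.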
class AllReduceBWDRMSNormTE(RMSNormTE):
    """
    A custom RMSNorm layer that applies an all-reduce operation during the backward pass.
    Used in tensor parallel training with Transformer Engine.

    Args:
        hidden_size (int): The size of the hidden dimension.
        process_group: Megatron Core's process group.
        **kwargs: Additional arguments to be passed to RMSNormTE.
    """

    def __init__(self, hidden_size, process_group, **kwargs):
        super().__init__(hidden_size, **kwargs)
        self.process_group = process_group

    @no_torch_dynamo()
    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        """RMSNorm FWD"""
        TransformerEngineBaseModule.set_activation_dtype(self, inp)

        if torch.is_grad_enabled():
            fwd_fn = _RMSNorm.apply
            args = []
        else:
            fwd_fn = _RMSNorm.forward
            args = [None]

        # Wrap the weight with AllReduceBWD so its gradient is all-reduced across
        # `process_group` during the backward pass.
        args += (
            inp,
            AllReduceBWD.apply(self.weight, self.process_group),
            self.eps,
            self.fwd_rmsnorm_sm_margin,
            self.bwd_rmsnorm_sm_margin,
            self.inf_rmsnorm_sm_margin,
            self.zero_centered_gamma,
            torch.is_grad_enabled(),
            self.activation_dtype,
        )

        return fwd_fn(*args)
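

# A hedged usage sketch (not in the original module): constructing the gradient-synchronizing
# RMSNorm for a weight that is replicated across the tensor parallel group. `hidden_size` and
# the eps value are illustrative; RMSNormTE's own keyword arguments pass through via **kwargs.
def _example_build_tp_rmsnorm(hidden_size: int = 4096) -> AllReduceBWDRMSNormTE:
    return AllReduceBWDRMSNormTE(
        hidden_size,
        process_group=parallel_state.get_tensor_model_parallel_group(),
        eps=1e-6,
    )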