from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple

import torch
import torch.distributed as dist
import torch.utils._pytree as pytree
from torch._subclasses import FakeTensorMode
from torch.distributed._spmd.data_parallel import (
    DataParallelStyle,
    partition_data_parallel,
)
from torch.distributed._spmd.distribute import _convert_to_distributed, Schema
from torch.distributed._tensor import DeviceMesh, Placement, Replicate, Shard
from torch.fx import GraphModule


class ParallelMode(ABC):
    """
    Basic Parallel Mode interface. Each parallelism pattern should implement
    this interface to describe how to partition and compile the graph in the
    SPMD compiler.
    """

    @abstractmethod
    def partition(
        self,
        gm: GraphModule,
        model: torch.nn.Module,
        optimizer: Optional[torch.optim.Optimizer],
        params_and_buffers: Dict[str, Any],
        named_states: Dict[str, Any],
        args: Tuple[Any, ...],
        kwargs: Dict[str, Any],
    ) -> GraphModule:
        """
        Partition a single-device graph into a distributed graph.

        TODO(@wanchaol): some of these arguments are not necessary for
        partitioning; remove the unnecessary ones later.
        """
        raise NotImplementedError()

    @abstractmethod
    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
        """
        Transform and compile a distributed graph with a set of graph
        transformation and optimization passes for each parallel mode.

        The returned result should be a compiled executable graph in
        the distributed environment.
        """
        # TODO: add more necessary arguments to this interface.
        raise NotImplementedError()
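

# Illustrative sketch only (not part of this module's API): a trivial
# ParallelMode implementation that satisfies the interface above by keeping
# the traced single-device graph as-is and running no extra passes.
#
#     class NoOpParallelMode(ParallelMode):
#         def partition(
#             self, gm, model, optimizer, params_and_buffers,
#             named_states, args, kwargs,
#         ):
#             return gm  # leave the single-device graph untouched
#
#         def transform_and_compile(self, gm):
#             return gm  # no graph transformation/optimization passes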


class DataParallel(ParallelMode):
    """Data Parallelism mode."""

    def __init__(
        self,
        parallel_style: str = "replicate",
        *,
        input_batch_dim: int = 0,
        custom_passes: Optional[Callable[[GraphModule], GraphModule]] = None,
    ):
        """
        DataParallel mode that partitions the model and graph into data
        parallel style parallelism (i.e. DDP/FSDP/ZeRO-3). It currently
        supports three different parallel styles: "replicate", "fully_shard",
        and "default". See :class:`DataParallelStyle` for more details.

        Args:
            parallel_style (str): parallel style to use. Currently supports
                "replicate", "fully_shard", and "default".

        Keyword args:
            input_batch_dim (int): the batch dimension of the input tensor.
                default: 0
            custom_passes (Callable[[GraphModule], GraphModule], optional):
                A custom callable that overrides the default graph
                transformation and optimization passes.
        """
        if parallel_style == "replicate":
            self.parallel_style = DataParallelStyle.REPLICATE
        elif parallel_style == "fully_shard":
            self.parallel_style = DataParallelStyle.FULLY_SHARD
        elif parallel_style == "default":
            self.parallel_style = DataParallelStyle.DEFAULT
        else:
            raise RuntimeError(f"Unknown parallel style: {parallel_style}")

        # TODO: what if the user passes in an incorrect `input_batch_dim`?
        # How should we detect that and do proper error handling?
        self.input_batch_dim = input_batch_dim

        if custom_passes is not None:
            self._gm_passes: Callable[[GraphModule], GraphModule] = custom_passes
        else:
            # TODO: add a few default passes here.
            self._gm_passes = lambda gm: gm

    def partition(
        self,
        gm: GraphModule,
        model: torch.nn.Module,
        optimizer: Optional[torch.optim.Optimizer],
        params_and_buffers: Dict[str, Any],
        named_states: Dict[str, Any],
        args: Tuple[Any, ...],
        kwargs: Dict[str, Any],
    ) -> GraphModule:
        # TODO: figure out a way to avoid the explicit "cuda" mesh.
        mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()))

        gm = partition_data_parallel(
            gm,
            model,
            optimizer,
            params_and_buffers,
            named_states,
            args,
            kwargs,
            mesh,
            self.parallel_style,
            self.input_batch_dim,
        )
        return gm

    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
        """Optimize a distributed graph with a set of optimization passes."""
        # TODO: add more necessary arguments to this interface.
        return self._gm_passes(gm)
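

# Example (illustrative usage sketch, assuming `gm` is a traced training-step
# GraphModule, a distributed process group and CUDA devices are already
# initialized, and `model`, `optimizer`, `params_and_buffers`, `named_states`,
# `args`, and `kwargs` stand in for the values the SPMD compiler collects when
# tracing):
#
#     dp_mode = DataParallel("fully_shard", input_batch_dim=0)
#     distributed_gm = dp_mode.partition(
#         gm, model, optimizer, params_and_buffers, named_states, args, kwargs
#     )
#     distributed_gm = dp_mode.transform_and_compile(distributed_gm)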


class DTensorExpandMode(ParallelMode):
    """
    The DTensor Expand mode. It replicates the parameters and shards the
    inputs to represent DDP-like behavior; it's currently a transient mode
    before we move to the new data parallel expansion.
    """

    def __init__(
        self, custom_passes: Optional[Callable[[GraphModule], GraphModule]] = None
    ):
        self._placements_override: Dict[int, List[Placement]] = {}
        if custom_passes is not None:
            self._gm_passes: Callable[[GraphModule], GraphModule] = custom_passes
        else:
            # TODO: add a few default passes here.
            self._gm_passes = lambda gm: gm

    def partition(
        self,
        gm: GraphModule,
        model: torch.nn.Module,
        optimizer: Optional[torch.optim.Optimizer],
        params_and_buffers: Dict[str, Any],
        named_states: Dict[str, Any],
        args: Tuple[Any, ...],
        kwargs: Dict[str, Any],
    ) -> GraphModule:
        flat_args = pytree.arg_tree_leaves(*args, **kwargs)

        mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()).cuda())
        shard_schema: Schema = Schema(mesh=mesh, placements=[Shard(0)])
        # FIXME: allow other sharding schemas
        replicate_schema: Schema = Schema(mesh=mesh, placements=[Replicate()])

        inps, schemas = [], []

        for p in pytree.tree_leaves(params_and_buffers):
            assert isinstance(p, torch.Tensor), f"expecting Tensor but got {type(p)}"
            inps.append(p)
            schemas.append(replicate_schema)

        for o in pytree.tree_leaves(named_states):
            if isinstance(o, torch.Tensor):
                inps.append(o)
                schemas.append(replicate_schema)
            else:
                inps.append(torch.empty(0))
                schemas.append(replicate_schema)

        for a in flat_args:
            if isinstance(a, torch.Tensor):
                inps.append(a)
                if id(a) in self._placements_override:
                    schemas.append(
                        Schema(mesh=mesh, placements=self._placements_override[id(a)])
                    )
                else:
                    schemas.append(shard_schema)
            else:
                # Create a dummy tensor and schema for non-tensor inputs for
                # the purpose of dtensor expansion. Non-tensor inputs are
                # guaranteed unused in dispatcher graphs produced by make_fx.
                # However, we still need to respect them so that tensor inputs
                # match with their placeholders.
                inps.append(torch.empty(0))
                schemas.append(shard_schema)

        with FakeTensorMode(allow_non_fake_inputs=True):
            fake_inps = [torch.empty_like(inp) for inp in inps]

        return _convert_to_distributed(
            gm, fake_inps, schemas, default_mesh=mesh, _allow_partial=False
        )[0]

    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
        """
        Transform and compile a distributed graph with a set of graph
        transformation and optimization passes for the DTensor fallback
        parallel mode.
        """
        # TODO: move the transformation passes to this function.
        return self._gm_passes(gm)
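

# Example (illustrative usage sketch, same assumptions as the DataParallel
# example above): DTensorExpandMode replicates parameters and optimizer states
# and shards tensor inputs along dim 0 by default, so using it looks like:
#
#     fallback_mode = DTensorExpandMode()
#     distributed_gm = fallback_mode.partition(
#         gm, model, optimizer, params_and_buffers, named_states, args, kwargs
#     )
#     distributed_gm = fallback_mode.transform_and_compile(distributed_gm)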