import numpy as np
import io
import os
import time
from collections import defaultdict, deque
import datetime

import torch
import torch.distributed as dist


def optimizer_to(optim, device):
    # Move every tensor held in the optimizer state (and its gradient, if any)
    # to the given device, e.g. after loading a checkpoint that was saved on CPU.
    for param in optim.state.values():
        if isinstance(param, torch.Tensor):
            param.data = param.data.to(device)
            if param._grad is not None:
                param._grad.data = param._grad.data.to(device)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    subparam.data = subparam.data.to(device)
                    if subparam._grad is not None:
                        subparam._grad.data = subparam._grad.data.to(device)
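
# Illustrative usage (a minimal sketch; 'ckpt.pth' and the optimizer are placeholders):
#     checkpoint = torch.load('ckpt.pth', map_location='cpu')
#     optimizer.load_state_dict(checkpoint['optimizer'])
#     optimizer_to(optimizer, 'cuda')   # move the loaded optimizer state onto the GPU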


class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)
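
# Illustrative usage (the values below are made up):
#     loss_meter = SmoothedValue(window_size=20)
#     loss_meter.update(0.73)
#     loss_meter.update(0.69)
#     print(loss_meter)             # "median (global_avg)" per the default fmt
#     print(loss_meter.global_avg)  # running average over everything seen so far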


class MetricLogger(object):
    def __init__(self, delimiter="\t", accelerator=None):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter
        self.accelerator = accelerator

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def global_avg(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {:.4f}".format(name, meter.global_avg)
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        if torch.cuda.is_available():
            log_msg.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(log_msg)
        MB = 1024.0 * 1024.0

        if self.accelerator is not None:
            print_func = self.accelerator.print
        else:
            print_func = print
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print_func(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print_func(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print_func('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))
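
# Illustrative training-loop usage (data_loader and the loss computation are placeholders):
#     metric_logger = MetricLogger(delimiter="  ")
#     for batch in metric_logger.log_every(data_loader, print_freq=50, header='Epoch [0]'):
#         loss = ...                      # hypothetical forward/backward step
#         metric_logger.update(loss=loss)
#     metric_logger.synchronize_between_processes()
#     print("Averaged stats:", metric_logger.global_avg())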


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def compute_acc(logits, label, reduction='mean'):
    ret = (torch.argmax(logits, dim=1) == label).float()
    if reduction == 'none':
        return ret.detach()
    elif reduction == 'mean':
        return ret.mean().item()


def compute_n_params(model, return_str=True):
    tot = 0
    for p in model.parameters():
        w = 1
        for x in p.shape:
            w *= x
        tot += w
    if return_str:
        if tot >= 1e6:
            return '{:.1f}M'.format(tot / 1e6)
        else:
            return '{:.1f}K'.format(tot / 1e3)
    else:
        return tot
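
# Illustrative usage (any nn.Module works; torchvision is only an assumed example):
#     model = torchvision.models.resnet18()
#     print(compute_n_params(model))                     # e.g. "11.7M"
#     print(compute_n_params(model, return_str=False))   # raw parameter count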


def setup_for_distributed(is_master):
    """
    Disables printing when not in the master process.
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)
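
# Illustrative usage: only rank 0 writes the checkpoint (names and path are placeholders):
#     save_on_master({'model': model.state_dict(),
#                     'optimizer': optimizer.state_dict(),
#                     'epoch': epoch}, 'checkpoint.pth')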


def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
        print(args.gpu, os.environ['SLURM_LOCALID'], os.environ['SLURM_JOB_NODELIST'], os.environ['SLURM_STEP_GPUS'])
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('world_size', args.world_size, 'gpu', args.gpu, 'dist_url:', args.dist_url)
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    print("init")
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


def init_distributed_mode_multinodes(args):
    import hostlist
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
        print('slurm')
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    hostnames = hostlist.expand_hostlist(os.environ['SLURM_JOB_NODELIST'])
    os.environ['MASTER_ADDR'] = hostnames[0]
    gpu_ids = os.environ['SLURM_STEP_GPUS'].split(",")

    print(os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    args.dist_url = 'tcp://' + os.environ['MASTER_ADDR'] + ':' + os.environ['MASTER_PORT']

    print('world_size', args.world_size, 'gpu', args.gpu)
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)

    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)


def init_distributed_mode_multinodes_jz(args):
    import hostlist
    if args.jean_zay:
        hostnames = hostlist.expand_hostlist(os.environ['SLURM_JOB_NODELIST'])
        os.environ['MASTER_ADDR'] = hostnames[0]

        print(os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'], os.environ['SLURM_PROCID'], os.environ['SLURM_NTASKS'], os.environ['SLURM_LOCALID'])
        args.gpu = int(os.environ['SLURM_LOCALID'])
        args.rank = int(os.environ['SLURM_PROCID'])
        args.world_size = int(os.environ['SLURM_NTASKS'])
        args.dist_url = 'env://' + os.environ['MASTER_ADDR'] + ':' + os.environ['MASTER_PORT']

        print('jean zay')
    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
        print('slurm')
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('world_size', args.world_size, 'gpu', args.gpu, 'rank', args.rank)
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)

    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


"""
This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""

import functools
import logging
import pickle


_LOCAL_PROCESS_GROUP = None
"""
A torch process group which only includes processes that are on the same machine
as the current process. This variable is set when processes are spawned by
`launch()` in "engine/launch.py".
"""

# Note: the helpers below re-define get_world_size / get_rank / is_main_process;
# being defined later, these typed versions are the ones in effect at import time.


def get_world_size() -> int:
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()


def get_rank() -> int:
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()


def get_local_rank() -> int:
    """
    Returns:
        The rank of the current process within the local (per-machine) process group.
    """
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    assert _LOCAL_PROCESS_GROUP is not None
    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)


def get_local_size() -> int:
    """
    Returns:
        The size of the per-machine process group,
        i.e. the number of processes per machine.
    """
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)


def is_main_process() -> bool:
    return get_rank() == 0


def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training.
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()


@functools.lru_cache()
def _get_global_gloo_group():
    """
    Return a process group based on the gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD


def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor


def _pad_to_largest_tensor(tensor, group):
    """
    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    world_size = dist.get_world_size(group=group)
    assert (
        world_size >= 1
    ), "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor(
        [tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=tensor.device)
        for _ in range(world_size)
    ]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]

    max_size = max(size_list)

    # Pad with zeros so every rank contributes a tensor of the same length;
    # the true sizes in size_list are used to trim the padding after gathering.
    if local_size != max_size:
        padding = torch.zeros(
            (max_size - local_size,), dtype=torch.uint8, device=tensor.device
        )
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor


def all_gather(data, group=None):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors).
    Args:
        data: any picklable object
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.
    Returns:
        list[data]: list of data gathered from each rank
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)

    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    # Receive the padded byte tensors from all ranks.
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
        for _ in size_list
    ]
    dist.all_gather(tensor_list, tensor, group=group)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
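
# Illustrative usage: gather per-rank results into one list on every rank
# (results_on_this_rank is a placeholder for any picklable object, e.g. a list):
#     all_results = all_gather(results_on_this_rank)
#     if is_main_process():
#         merged = [item for part in all_results for item in part]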


def gather(data, dst=0, group=None):
    """
    Run gather on arbitrary picklable data (not necessarily tensors).
    Args:
        data: any picklable object
        dst (int): destination rank
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.
    Returns:
        list[data]: on dst, a list of data gathered from each rank. Otherwise,
            an empty list.
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group=group) == 1:
        return [data]
    rank = dist.get_rank(group=group)

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)

    if rank == dst:
        max_size = max(size_list)
        tensor_list = [
            torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
            for _ in size_list
        ]
        dist.gather(tensor, tensor_list, dst=dst, group=group)

        data_list = []
        for size, tensor in zip(size_list, tensor_list):
            buffer = tensor.cpu().numpy().tobytes()[:size]
            data_list.append(pickle.loads(buffer))
        return data_list
    else:
        dist.gather(tensor, [], dst=dst, group=group)
        return []


def shared_random_seed():
    """
    Returns:
        int: a random number that is the same across all workers.
            If workers need a shared RNG, they can use this shared seed to
            create one.
    All workers must call this function, otherwise it will deadlock.
    """
    ints = np.random.randint(2 ** 31)
    all_ints = all_gather(ints)
    return all_ints[0]
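
# Illustrative usage: build one RNG shared by every process (sketch only):
#     seed = shared_random_seed()         # same integer on all ranks
#     rng = np.random.RandomState(seed)   # each rank constructs an identical RNG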


def reduce_dict(input_dict, average=True):
    """
    Reduce the values in the dictionary from all processes so that process with rank
    0 has the reduced results.
    Args:
        input_dict (dict): inputs to be reduced. (values not necessarily tensors).
        average (bool): whether to do average or sum
    Returns:
        a dict with the same keys as input_dict, after reduction.
    """

    world_size = get_world_size()
    if world_size < 2:
        return input_dict

    with torch.no_grad():
        # Move (or wrap) every value onto the GPU so a single reduce can be used.
        input_dict_cuda_vals = {}
        for k, v in input_dict.items():
            if type(v) == torch.Tensor:
                input_dict_cuda_vals[k] = v.to('cuda')
            else:
                input_dict_cuda_vals[k] = torch.tensor(v, device='cuda')

        names = []
        values = []
        # Sort the keys so the stacking order is consistent across processes.
        for k, v in sorted(input_dict_cuda_vals.items()):
            names.append(k)
            values.append(v)
        values = torch.stack(values, dim=0)
        dist.reduce(values, dst=0)

        if dist.get_rank() == 0 and average:
            # Only rank 0 holds the reduced sum; turn it into an average there.
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
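
# Illustrative usage inside a training loop (the loss names are placeholders):
#     loss_dict = {'loss_cls': loss_cls, 'loss_box': loss_box}
#     loss_dict_reduced = reduce_dict(loss_dict)   # rank 0 receives the averaged losses
#     if is_main_process():
#         print({k: v.item() for k, v in loss_dict_reduced.items()})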