import builtins
import datetime

import torch
import torch.distributed as dist
from torch import inf


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)
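
# Minimal usage sketch (the checkpoint dict and file name are placeholders,
# not defined in this module); only rank 0 writes to disk:
#
#   save_on_master({'model': model.state_dict()}, 'checkpoint.pth')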


def setup_for_distributed(is_master):
    """
    Disable printing when not in the master process.
    """
    builtin_print = builtins.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        force = force or (get_world_size() > 8)
        if is_master or force:
            now = datetime.datetime.now().time()
            builtin_print('[{}] '.format(now), end='')  # print with time stamp
            builtin_print(*args, **kwargs)

    builtins.print = print
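
# Usage sketch (assumes the process group was initialized elsewhere, e.g. via
# dist.init_process_group); after this call only rank 0 prints by default:
#
#   setup_for_distributed(is_main_process())
#   print('logged on rank 0 only')
#   print('logged on every rank', force=True)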


def all_reduce_mean(x):
    world_size = get_world_size()
    if world_size > 1:
        x_reduce = torch.tensor(x).cuda()
        # x_reduce = x.clone()
        dist.all_reduce(x_reduce)
        x_reduce /= world_size
        return x_reduce.item()
    else:
        return x
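
# Sketch of typical use in a training loop (assumes CUDA and an initialized
# process group; with a single process the value is returned unchanged):
#
#   loss_value = loss.item()
#   loss_value_reduced = all_reduce_mean(loss_value)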


def gather_tensors(tensor):
    # Gather tensors across all processes
    gather_list = [torch.zeros_like(tensor) for _ in range(dist.get_world_size())]
    dist.all_gather(gather_list, tensor)
    # return torch.cat(gather_list, dim=0)
    # Stack tensors along a new dimension (each GPU result along dimension 0)
    tensor_gather = torch.stack(gather_list, dim=0)
    # Transpose to get them in the correct order (swap dimensions 0 and 1)
    tensor_gather = tensor_gather.transpose(0, 1)
    # Reshape back into a 2D tensor with the correct order
    tensor_gather = tensor_gather.reshape(-1, tensor.size(-1))
    return tensor_gather
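
# Sketch: gathering per-rank features of shape (batch, dim) into one tensor of
# shape (world_size * batch, dim), with samples interleaved across ranks
# (assumes an initialized process group and identical shapes on every rank,
# which all_gather requires; gradients are not propagated through the gather):
#
#   feats = model(images)              # (batch, dim) on each rank
#   all_feats = gather_tensors(feats.contiguous())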


def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return torch.tensor(0.)
    device = parameters[0].grad.device
    if norm_type == inf:
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
    return total_norm


class NativeScalerWithGradNormCount:
    state_dict_key = "amp_scaler"

    def __init__(self):
        self._scaler = torch.cuda.amp.GradScaler()

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if update_grad:
            if clip_grad is not None:
                assert parameters is not None
                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
            else:
                self._scaler.unscale_(optimizer)
                norm = get_grad_norm_(parameters)
            self._scaler.step(optimizer)
            self._scaler.update()
        else:
            norm = None
        return norm

    def state_dict(self):
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        self._scaler.load_state_dict(state_dict)
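
# Sketch of a mixed-precision training step using the scaler above (model,
# optimizer, samples and the clip value are placeholders, not part of this
# module; update_grad=False can be used for gradient accumulation):
#
#   loss_scaler = NativeScalerWithGradNormCount()
#   with torch.cuda.amp.autocast():
#       loss = model(samples)
#   grad_norm = loss_scaler(loss, optimizer, clip_grad=1.0,
#                           parameters=model.parameters(), update_grad=True)
#   optimizer.zero_grad()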