# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch import nn

from fairseq.distributed import utils


class TPUDistributedDataParallel(nn.Module):
    """Thin DistributedDataParallel-style wrapper for TPU training.

    Unlike torch.nn.parallel.DistributedDataParallel, gradients are not
    synchronized automatically during the backward pass; the trainer must
    call :func:`all_reduce_grads` explicitly after backward.
    """

    def __init__(self, module, process_group):
        super().__init__()
        self.module = module
        self.process_group = process_group
        self.world_size = utils.get_world_size(self.process_group)

    def forward(self, *inputs, **kwargs):
        return self.module(*inputs, **kwargs)

    def all_reduce_grads(self):
        gradients = []
        for p in self.parameters():
            if not p.requires_grad:
                continue
            if p.grad is None:
                # Parameters that received no gradient (e.g., unused in this
                # forward pass) contribute zeros, so every replica all-reduces
                # a tensor of the same shape.
                p.grad = torch.zeros_like(p)
            if p.grad.requires_grad:
                raise RuntimeError(
                    "TPUDistributedDataParallel only works with gradients that don't "
                    "require grad"
                )
            gradients.append(p.grad)

        # Imported lazily so the module can be imported without torch_xla
        # installed.
        import torch_xla.core.xla_model as xm

        xm.all_reduce(
            'sum',
            gradients,
            scale=1. / self.world_size,
            groups=self.process_group[1],
        )
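

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the upstream module): how this
# wrapper might be driven inside a TPU training step. `MyModel`, `loader`, and
# `process_group` are hypothetical placeholders; `process_group` is assumed to
# be the tuple-style TPU group produced by fairseq.distributed.utils, whose
# second element is forwarded to xm.all_reduce above.
#
#   import torch_xla.core.xla_model as xm
#
#   device = xm.xla_device()
#   model = TPUDistributedDataParallel(MyModel().to(device), process_group)
#   optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
#
#   for batch in loader:
#       optimizer.zero_grad()
#       loss = model(batch.to(device)).sum()
#       loss.backward()
#       model.all_reduce_grads()  # average gradients across TPU replicas
#       optimizer.step()
#       xm.mark_step()  # materialize the lazily traced XLA graph
# ---------------------------------------------------------------------------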