glenn-jocher committed
Commit 19d03a9 · unverified · 1 Parent: 4e65052

Remove DDP process group timeout (#4422)

Files changed (2)
  1. train.py +1 -1
  2. utils/torch_utils.py +2 -2
train.py CHANGED
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
 
     # Train
     if not opt.evolve:
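Dropping the explicit timeout=timedelta(seconds=60) means init_process_group falls back to PyTorch's default process-group timeout (30 minutes at the time of this commit), so collectives are no longer aborted after one minute of waiting. Below is a minimal sketch of the same initialization pattern, assuming a torchrun-style launch that sets RANK, WORLD_SIZE and LOCAL_RANK; the variable names and structure are illustrative, not copied from train.py.

# Minimal sketch of the DDP setup pattern touched above (assumes torchrun
# has exported RANK, WORLD_SIZE and LOCAL_RANK for each process).
import os

import torch
import torch.distributed as dist

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # -1 means single-GPU / CPU run

if LOCAL_RANK != -1:
    torch.cuda.set_device(LOCAL_RANK)
    device = torch.device('cuda', LOCAL_RANK)
    # No timeout argument: the process group uses PyTorch's default
    # (30 minutes) rather than the removed 60 s cap.
    dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

# ... build the model, wrap it in DistributedDataParallel, train ...

if LOCAL_RANK != -1:
    dist.destroy_process_group()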
utils/torch_utils.py CHANGED
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])
 
 
 def init_torch_seeds(seed=0):
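For context, the two barriers above sit inside the torch_distributed_zero_first context manager: non-master ranks block at the first barrier while rank 0 runs the wrapped block (typically dataset preparation), then rank 0 reaches the second barrier and releases them; the new device_ids argument pins each NCCL barrier to that rank's GPU. A self-contained sketch of the context manager after this change follows; the prepare_dataset() call in the usage comment is a hypothetical placeholder.

# Sketch of the full context manager the hunk above modifies.
from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Make all ranks wait while the local master (rank 0) runs the wrapped block first."""
    if local_rank not in [-1, 0]:
        dist.barrier(device_ids=[local_rank])  # non-masters wait here until rank 0 is done
    yield
    if local_rank == 0:
        dist.barrier(device_ids=[0])  # rank 0 releases the waiting ranks


# Usage (prepare_dataset is hypothetical):
# with torch_distributed_zero_first(LOCAL_RANK):
#     dataset = prepare_dataset()  # only rank 0 does the work; other ranks reuse its result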