glenn-jocher committed
Commit 19d03a9 · unverified · 1 Parent: 4e65052

Remove DDP process group timeout (#4422)

Files changed (2)
  1. train.py +1 -1
  2. utils/torch_utils.py +2 -2
train.py CHANGED
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
 
     # Train
     if not opt.evolve:
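Dropping the explicit timeout=timedelta(seconds=60) means init_process_group falls back to PyTorch's default process-group timeout (30 minutes at the time of this commit), so collectives are no longer aborted after one minute of waiting. Below is a minimal sketch of the same initialization pattern, assuming a torchrun-style launch that sets RANK, WORLD_SIZE and LOCAL_RANK; the variable names and structure are illustrative, not copied from train.py.

# Minimal sketch of the DDP setup pattern touched above (assumes torchrun
# has exported RANK, WORLD_SIZE and LOCAL_RANK for each process).
import os

import torch
import torch.distributed as dist

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # -1 means single-GPU / CPU run

if LOCAL_RANK != -1:
    torch.cuda.set_device(LOCAL_RANK)
    device = torch.device('cuda', LOCAL_RANK)
    # No timeout argument: the process group uses PyTorch's default
    # (30 minutes) rather than the removed 60 s cap.
    dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

# ... build the model, wrap it in DistributedDataParallel, train ...

if LOCAL_RANK != -1:
    dist.destroy_process_group()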
utils/torch_utils.py CHANGED
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])
 
 
 def init_torch_seeds(seed=0):
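For context, the two barriers above sit inside the torch_distributed_zero_first context manager: non-master ranks block at the first barrier while rank 0 runs the wrapped block (typically dataset preparation), then rank 0 reaches the second barrier and releases them; the new device_ids argument pins each NCCL barrier to that rank's GPU. A self-contained sketch of the context manager after this change follows; the prepare_dataset() call in the usage comment is a hypothetical placeholder.

# Sketch of the full context manager the hunk above modifies.
from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Make all ranks wait while the local master (rank 0) runs the wrapped block first."""
    if local_rank not in [-1, 0]:
        dist.barrier(device_ids=[local_rank])  # non-masters wait here until rank 0 is done
    yield
    if local_rank == 0:
        dist.barrier(device_ids=[0])  # rank 0 releases the waiting ranks


# Usage (prepare_dataset is hypothetical):
# with torch_distributed_zero_first(LOCAL_RANK):
#     dataset = prepare_dataset()  # only rank 0 does the work; other ranks reuse its result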