Remove DDP process group timeout (#4422)
Changed files:
- train.py (+1 -1)
- utils/torch_utils.py (+2 -2)
train.py
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

     # Train
     if not opt.evolve:
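
Note: with the explicit timeout argument removed, dist.init_process_group falls back to PyTorch's default collective timeout (the 30-minute default_pg_timeout in the PyTorch releases current at the time), so one-time setup on rank 0 that takes longer than 60 seconds no longer aborts the other ranks. Below is a minimal sketch of the DDP initialization this hunk touches, assuming the process is started by the torch.distributed launcher (which sets LOCAL_RANK); it mirrors the variable names in train.py but is illustrative only:

    import os

    import torch
    import torch.distributed as dist

    LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # set by the torch.distributed launcher

    if LOCAL_RANK != -1:
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        # No timeout argument: the process group now uses PyTorch's default timeout
        # instead of the previous hard-coded 60 s.
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

Launched, for example, with: python -m torch.distributed.run --nproc_per_node 2 train.py
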
utils/torch_utils.py
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])


 def init_torch_seeds(seed=0):
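
Note: torch_distributed_zero_first is a context manager that parks every non-master rank at a barrier while local rank 0 performs one-time work (for example, checking or downloading a dataset), then has rank 0 hit a second barrier to release them. Passing device_ids pins the barrier to each process's own GPU, which is only valid for the NCCL backend. A short sketch of the patched helper together with a typical call site follows; the with-block body is a hypothetical example, not part of this change:

    from contextlib import contextmanager

    import torch.distributed as dist


    @contextmanager
    def torch_distributed_zero_first(local_rank: int):
        # Non-master ranks wait here until local rank 0 finishes its one-time work.
        if local_rank not in [-1, 0]:
            dist.barrier(device_ids=[local_rank])
        yield
        # Rank 0 reaches this barrier only after its own work, releasing the waiting ranks.
        if local_rank == 0:
            dist.barrier(device_ids=[0])


    # Typical usage: only local rank 0 prepares the dataset, the other ranks reuse its result.
    # with torch_distributed_zero_first(LOCAL_RANK):
    #     dataset = prepare_dataset(...)  # hypothetical one-time setup step
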