glenn-jocher committed on
Commit
50a9828
1 Parent(s): bb5ebc2

DDP `torch.jit.trace()` `--sync-bn` fix (#4615)

Browse files

* Remove assert

* debug0

* trace=not opt.sync

* sync to sync_bn fix

* Cleanup

Files changed (2) hide show
  1. train.py +1 -2
  2. utils/loggers/__init__.py +5 -4
train.py CHANGED
@@ -333,7 +333,7 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
333
  mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
334
  pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
335
  f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
336
- callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots)
337
  # end batch ------------------------------------------------------------------------------------------------
338
 
339
  # Scheduler
@@ -499,7 +499,6 @@ def main(opt):
499
  assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
500
  assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
501
  assert not opt.evolve, '--evolve argument is not compatible with DDP training'
502
- assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
503
  torch.cuda.set_device(LOCAL_RANK)
504
  device = torch.device('cuda', LOCAL_RANK)
505
  dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
 
333
  mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
334
  pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
335
  f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
336
+ callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots, opt.sync_bn)
337
  # end batch ------------------------------------------------------------------------------------------------
338
 
339
  # Scheduler
 
499
  assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
500
  assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
501
  assert not opt.evolve, '--evolve argument is not compatible with DDP training'
 
502
  torch.cuda.set_device(LOCAL_RANK)
503
  device = torch.device('cuda', LOCAL_RANK)
504
  dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
utils/loggers/__init__.py CHANGED
@@ -69,13 +69,14 @@ class Loggers():
69
  if self.wandb:
70
  self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]})
71
 
72
- def on_train_batch_end(self, ni, model, imgs, targets, paths, plots):
73
  # Callback runs on train batch end
74
  if plots:
75
  if ni == 0:
76
- with warnings.catch_warnings():
77
- warnings.simplefilter('ignore') # suppress jit trace warning
78
- self.tb.add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
 
79
  if ni < 3:
80
  f = self.save_dir / f'train_batch{ni}.jpg' # filename
81
  Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
 
69
  if self.wandb:
70
  self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]})
71
 
72
+ def on_train_batch_end(self, ni, model, imgs, targets, paths, plots, sync_bn):
73
  # Callback runs on train batch end
74
  if plots:
75
  if ni == 0:
76
+ if not sync_bn: # tb.add_graph() --sync known issue https://github.com/ultralytics/yolov5/issues/3754
77
+ with warnings.catch_warnings():
78
+ warnings.simplefilter('ignore') # suppress jit trace warning
79
+ self.tb.add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
80
  if ni < 3:
81
  f = self.save_dir / f'train_batch{ni}.jpg' # filename
82
  Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()