Ayush Chaurasia glenn-jocher committed on
Commit
ca290dc
1 Parent(s): c8c5ef3

Weights & Biases (W&B) Feature Addition (#1235)

Browse files

* Add wandb metric logging and bounding box debugging

* Improve formatting, readability

* Remove multiple paths for init, improve formatting

* Add wandb params

* Remove typecasting in bbox coordinates and reformat

* Cleanup

* add wandb to requirements.txt

* minor updates to test.py

* general reorg

* reduce --log-imgs to 10

* clean wandb import

* reverse wandb import assert

* add except AssertionError to try import

* move wandb init to all global ranks

* replace print() with logger.info()

* replace print() with logger.info()

* move wandb.init() bug fix

* project PosixPath to basename bug fix

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>

Files changed (3) hide show
  1. requirements.txt +3 -0
  2. test.py +23 -2
  3. train.py +35 -13
requirements.txt CHANGED
@@ -13,6 +13,9 @@ torch>=1.6.0
13
  torchvision>=0.7.0
14
  tqdm>=4.41.0
15
 
 
 
 
16
  # coco ----------------------------------------
17
  # pycocotools>=2.0
18
 
 
13
  torchvision>=0.7.0
14
  tqdm>=4.41.0
15
 
16
+ # logging -------------------------------------
17
+ # wandb
18
+
19
  # coco ----------------------------------------
20
  # pycocotools>=2.0
21
 
test.py CHANGED
@@ -33,7 +33,9 @@ def test(data,
33
  save_dir=Path(''), # for saving images
34
  save_txt=False, # for auto-labelling
35
  save_conf=False,
36
- plots=True):
 
 
37
  # Initialize/load model and set device
38
  training = model is not None
39
  if training: # called by train.py
@@ -77,6 +79,13 @@ def test(data,
77
  iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
78
  niou = iouv.numel()
79
 
 
 
 
 
 
 
 
80
  # Dataloader
81
  if not training:
82
  img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img
@@ -91,7 +100,7 @@ def test(data,
91
  s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
92
  p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
93
  loss = torch.zeros(3, device=device)
94
- jdict, stats, ap, ap_class = [], [], [], []
95
  for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
96
  img = img.to(device, non_blocking=True)
97
  img = img.half() if half else img.float() # uint8 to fp16/32
@@ -139,6 +148,14 @@ def test(data,
139
  with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
140
  f.write(('%g ' * len(line) + '\n') % line)
141
 
 
 
 
 
 
 
 
 
142
  # Clip boxes to image bounds
143
  clip_coords(pred, (height, width))
144
 
@@ -196,6 +213,10 @@ def test(data,
196
  f = save_dir / f'test_batch{batch_i}_pred.jpg'
197
  plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions
198
 
 
 
 
 
199
  # Compute statistics
200
  stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
201
  if len(stats) and stats[0].any():
 
33
  save_dir=Path(''), # for saving images
34
  save_txt=False, # for auto-labelling
35
  save_conf=False,
36
+ plots=True,
37
+ log_imgs=0): # number of logged images
38
+
39
  # Initialize/load model and set device
40
  training = model is not None
41
  if training: # called by train.py
 
79
  iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
80
  niou = iouv.numel()
81
 
82
+ # Logging
83
+ log_imgs = min(log_imgs, 100) # ceil
84
+ try:
85
+ import wandb # Weights & Biases
86
+ except ImportError:
87
+ log_imgs = 0
88
+
89
  # Dataloader
90
  if not training:
91
  img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img
 
100
  s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
101
  p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
102
  loss = torch.zeros(3, device=device)
103
+ jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
104
  for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
105
  img = img.to(device, non_blocking=True)
106
  img = img.half() if half else img.float() # uint8 to fp16/32
 
148
  with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
149
  f.write(('%g ' * len(line) + '\n') % line)
150
 
151
+ # W&B logging
152
+ if len(wandb_images) < log_imgs:
153
+ bbox_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
154
+ "class_id": int(cls),
155
+ "scores": {"class_score": conf},
156
+ "domain": "pixel"} for *xyxy, conf, cls in pred.clone().tolist()]
157
+ wandb_images.append(wandb.Image(img[si], boxes={"predictions": {"box_data": bbox_data}}))
158
+
159
  # Clip boxes to image bounds
160
  clip_coords(pred, (height, width))
161
 
 
213
  f = save_dir / f'test_batch{batch_i}_pred.jpg'
214
  plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions
215
 
216
+ # W&B logging
217
+ if wandb_images:
218
+ wandb.log({"outputs": wandb_images})
219
+
220
  # Compute statistics
221
  stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
222
  if len(stats) and stats[0].any():
train.py CHANGED
@@ -33,7 +33,7 @@ from utils.torch_utils import ModelEMA, select_device, intersect_dicts
33
  logger = logging.getLogger(__name__)
34
 
35
 
36
- def train(hyp, opt, device, tb_writer=None):
37
  logger.info(f'Hyperparameters {hyp}')
38
  log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory
39
  wdir = log_dir / 'weights' # weights directory
@@ -118,6 +118,11 @@ def train(hyp, opt, device, tb_writer=None):
118
  scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
119
  # plot_lr_scheduler(optimizer, scheduler, epochs)
120
 
 
 
 
 
 
121
  # Resume
122
  start_epoch, best_fitness = 0, 0.0
123
  if pretrained:
@@ -317,7 +322,8 @@ def train(hyp, opt, device, tb_writer=None):
317
  single_cls=opt.single_cls,
318
  dataloader=testloader,
319
  save_dir=log_dir,
320
- plots=epoch == 0 or final_epoch) # plot first and last
 
321
 
322
  # Write
323
  with open(results_file, 'a') as f:
@@ -325,14 +331,16 @@ def train(hyp, opt, device, tb_writer=None):
325
  if len(opt.name) and opt.bucket:
326
  os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
327
 
328
- # Tensorboard
329
- if tb_writer:
330
- tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss
331
- 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
332
- 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss
333
- 'x/lr0', 'x/lr1', 'x/lr2'] # params
334
- for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
335
- tb_writer.add_scalar(tag, x, epoch)
 
 
336
 
337
  # Update best mAP
338
  fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
@@ -347,7 +355,8 @@ def train(hyp, opt, device, tb_writer=None):
347
  'best_fitness': best_fitness,
348
  'training_results': f.read(),
349
  'model': ema.ema,
350
- 'optimizer': None if final_epoch else optimizer.state_dict()}
 
351
 
352
  # Save last, best and delete
353
  torch.save(ckpt, last)
@@ -403,7 +412,9 @@ if __name__ == '__main__':
403
  parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
404
  parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
405
  parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
 
406
  parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
 
407
  opt = parser.parse_args()
408
 
409
  # Set DDP variables
@@ -452,12 +463,23 @@ if __name__ == '__main__':
452
  # Train
453
  logger.info(opt)
454
  if not opt.evolve:
455
- tb_writer = None
456
  if opt.global_rank in [-1, 0]:
 
457
  logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
458
  tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0
459
 
460
- train(hyp, opt, device, tb_writer)
 
 
 
 
 
 
 
 
 
 
461
 
462
  # Evolve hyperparameters (optional)
463
  else:
 
33
  logger = logging.getLogger(__name__)
34
 
35
 
36
+ def train(hyp, opt, device, tb_writer=None, wandb=None):
37
  logger.info(f'Hyperparameters {hyp}')
38
  log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory
39
  wdir = log_dir / 'weights' # weights directory
 
118
  scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
119
  # plot_lr_scheduler(optimizer, scheduler, epochs)
120
 
121
+ # Logging
122
+ if wandb and wandb.run is None:
123
+ id = ckpt.get('wandb_id') if 'ckpt' in locals() else None
124
+ wandb_run = wandb.init(config=opt, resume="allow", project=os.path.basename(log_dir), id=id)
125
+
126
  # Resume
127
  start_epoch, best_fitness = 0, 0.0
128
  if pretrained:
 
322
  single_cls=opt.single_cls,
323
  dataloader=testloader,
324
  save_dir=log_dir,
325
+ plots=epoch == 0 or final_epoch, # plot first and last
326
+ log_imgs=opt.log_imgs)
327
 
328
  # Write
329
  with open(results_file, 'a') as f:
 
331
  if len(opt.name) and opt.bucket:
332
  os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
333
 
334
+ # Log
335
+ tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss
336
+ 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
337
+ 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss
338
+ 'x/lr0', 'x/lr1', 'x/lr2'] # params
339
+ for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
340
+ if tb_writer:
341
+ tb_writer.add_scalar(tag, x, epoch) # tensorboard
342
+ if wandb:
343
+ wandb.log({tag: x}) # W&B
344
 
345
  # Update best mAP
346
  fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
 
355
  'best_fitness': best_fitness,
356
  'training_results': f.read(),
357
  'model': ema.ema,
358
+ 'optimizer': None if final_epoch else optimizer.state_dict(),
359
+ 'wandb_id': wandb_run.id if wandb else None}
360
 
361
  # Save last, best and delete
362
  torch.save(ckpt, last)
 
412
  parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
413
  parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
414
  parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
415
+ parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100')
416
  parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
417
+
418
  opt = parser.parse_args()
419
 
420
  # Set DDP variables
 
463
  # Train
464
  logger.info(opt)
465
  if not opt.evolve:
466
+ tb_writer, wandb = None, None # init loggers
467
  if opt.global_rank in [-1, 0]:
468
+ # Tensorboard
469
  logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
470
  tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0
471
 
472
+ # W&B
473
+ try:
474
+ import wandb
475
+
476
+ assert os.environ.get('WANDB_DISABLED') != 'true'
477
+ logger.info("Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'")
478
+ except (ImportError, AssertionError):
479
+ opt.log_imgs = 0
480
+ logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)")
481
+
482
+ train(hyp, opt, device, tb_writer, wandb)
483
 
484
  # Evolve hyperparameters (optional)
485
  else: