Weights & Biases (W&B) Feature Addition (#1235)
Browse files
* Add wandb metric logging and bounding box debugging
* Improve formatting, readability
* Remove multiple paths for init, improve formatting
* Add wandb params
* Remove typecasting in bbox coordinates and reformat
* Cleanup
* add wandb to requirements.txt
* minor updates to test.py
* general reorg
* reduce --log-imgs to 10
* clean wandb import
* reverse wandb import assert
* add except AssertionError to try import
* move wandb init to all global ranks
* replace print() with logger.info()
* replace print() with logger.info()
* move wandb.init() bug fix
* project PosixPath to basename bug fix
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
- requirements.txt +3 -0
- test.py +23 -2
- train.py +35 -13
requirements.txt
CHANGED
@@ -13,6 +13,9 @@ torch>=1.6.0
|
|
13 |
torchvision>=0.7.0
|
14 |
tqdm>=4.41.0
|
15 |
|
|
|
|
|
|
|
16 |
# coco ----------------------------------------
|
17 |
# pycocotools>=2.0
|
18 |
|
|
|
13 |
torchvision>=0.7.0
|
14 |
tqdm>=4.41.0
|
15 |
|
16 |
+
# logging -------------------------------------
|
17 |
+
# wandb
|
18 |
+
|
19 |
# coco ----------------------------------------
|
20 |
# pycocotools>=2.0
|
21 |
|
test.py
CHANGED
@@ -33,7 +33,9 @@ def test(data,
|
|
33 |
save_dir=Path(''), # for saving images
|
34 |
save_txt=False, # for auto-labelling
|
35 |
save_conf=False,
|
36 |
-
plots=True
|
|
|
|
|
37 |
# Initialize/load model and set device
|
38 |
training = model is not None
|
39 |
if training: # called by train.py
|
@@ -77,6 +79,13 @@ def test(data,
|
|
77 |
iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
|
78 |
niou = iouv.numel()
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
# Dataloader
|
81 |
if not training:
|
82 |
img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img
|
@@ -91,7 +100,7 @@ def test(data,
|
|
91 |
s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
|
92 |
p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
|
93 |
loss = torch.zeros(3, device=device)
|
94 |
-
jdict, stats, ap, ap_class = [], [], [], []
|
95 |
for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
|
96 |
img = img.to(device, non_blocking=True)
|
97 |
img = img.half() if half else img.float() # uint8 to fp16/32
|
@@ -139,6 +148,14 @@ def test(data,
|
|
139 |
with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
|
140 |
f.write(('%g ' * len(line) + '\n') % line)
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
# Clip boxes to image bounds
|
143 |
clip_coords(pred, (height, width))
|
144 |
|
@@ -196,6 +213,10 @@ def test(data,
|
|
196 |
f = save_dir / f'test_batch{batch_i}_pred.jpg'
|
197 |
plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions
|
198 |
|
|
|
|
|
|
|
|
|
199 |
# Compute statistics
|
200 |
stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
|
201 |
if len(stats) and stats[0].any():
|
|
|
33 |
save_dir=Path(''), # for saving images
|
34 |
save_txt=False, # for auto-labelling
|
35 |
save_conf=False,
|
36 |
+
plots=True,
|
37 |
+
log_imgs=0): # number of logged images
|
38 |
+
|
39 |
# Initialize/load model and set device
|
40 |
training = model is not None
|
41 |
if training: # called by train.py
|
|
|
79 |
iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
|
80 |
niou = iouv.numel()
|
81 |
|
82 |
+
# Logging
|
83 |
+
log_imgs = min(log_imgs, 100) # ceil
|
84 |
+
try:
|
85 |
+
import wandb # Weights & Biases
|
86 |
+
except ImportError:
|
87 |
+
log_imgs = 0
|
88 |
+
|
89 |
# Dataloader
|
90 |
if not training:
|
91 |
img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img
|
|
|
100 |
s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
|
101 |
p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
|
102 |
loss = torch.zeros(3, device=device)
|
103 |
+
jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
|
104 |
for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
|
105 |
img = img.to(device, non_blocking=True)
|
106 |
img = img.half() if half else img.float() # uint8 to fp16/32
|
|
|
148 |
with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
|
149 |
f.write(('%g ' * len(line) + '\n') % line)
|
150 |
|
151 |
+
# W&B logging
|
152 |
+
if len(wandb_images) < log_imgs:
|
153 |
+
bbox_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
|
154 |
+
"class_id": int(cls),
|
155 |
+
"scores": {"class_score": conf},
|
156 |
+
"domain": "pixel"} for *xyxy, conf, cls in pred.clone().tolist()]
|
157 |
+
wandb_images.append(wandb.Image(img[si], boxes={"predictions": {"box_data": bbox_data}}))
|
158 |
+
|
159 |
# Clip boxes to image bounds
|
160 |
clip_coords(pred, (height, width))
|
161 |
|
|
|
213 |
f = save_dir / f'test_batch{batch_i}_pred.jpg'
|
214 |
plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions
|
215 |
|
216 |
+
# W&B logging
|
217 |
+
if wandb_images:
|
218 |
+
wandb.log({"outputs": wandb_images})
|
219 |
+
|
220 |
# Compute statistics
|
221 |
stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
|
222 |
if len(stats) and stats[0].any():
|
train.py
CHANGED
@@ -33,7 +33,7 @@ from utils.torch_utils import ModelEMA, select_device, intersect_dicts
|
|
33 |
logger = logging.getLogger(__name__)
|
34 |
|
35 |
|
36 |
-
def train(hyp, opt, device, tb_writer=None):
|
37 |
logger.info(f'Hyperparameters {hyp}')
|
38 |
log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory
|
39 |
wdir = log_dir / 'weights' # weights directory
|
@@ -118,6 +118,11 @@ def train(hyp, opt, device, tb_writer=None):
|
|
118 |
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
|
119 |
# plot_lr_scheduler(optimizer, scheduler, epochs)
|
120 |
|
|
|
|
|
|
|
|
|
|
|
121 |
# Resume
|
122 |
start_epoch, best_fitness = 0, 0.0
|
123 |
if pretrained:
|
@@ -317,7 +322,8 @@ def train(hyp, opt, device, tb_writer=None):
|
|
317 |
single_cls=opt.single_cls,
|
318 |
dataloader=testloader,
|
319 |
save_dir=log_dir,
|
320 |
-
plots=epoch == 0 or final_epoch
|
|
|
321 |
|
322 |
# Write
|
323 |
with open(results_file, 'a') as f:
|
@@ -325,14 +331,16 @@ def train(hyp, opt, device, tb_writer=None):
|
|
325 |
if len(opt.name) and opt.bucket:
|
326 |
os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
|
327 |
|
328 |
-
#
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
tb_writer.add_scalar(tag, x, epoch)
|
|
|
|
|
336 |
|
337 |
# Update best mAP
|
338 |
fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
|
@@ -347,7 +355,8 @@ def train(hyp, opt, device, tb_writer=None):
|
|
347 |
'best_fitness': best_fitness,
|
348 |
'training_results': f.read(),
|
349 |
'model': ema.ema,
|
350 |
-
'optimizer': None if final_epoch else optimizer.state_dict()
|
|
|
351 |
|
352 |
# Save last, best and delete
|
353 |
torch.save(ckpt, last)
|
@@ -403,7 +412,9 @@ if __name__ == '__main__':
|
|
403 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
404 |
parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
|
405 |
parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
|
|
|
406 |
parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
|
|
|
407 |
opt = parser.parse_args()
|
408 |
|
409 |
# Set DDP variables
|
@@ -452,12 +463,23 @@ if __name__ == '__main__':
|
|
452 |
# Train
|
453 |
logger.info(opt)
|
454 |
if not opt.evolve:
|
455 |
-
tb_writer = None
|
456 |
if opt.global_rank in [-1, 0]:
|
|
|
457 |
logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
|
458 |
tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0
|
459 |
|
460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
|
462 |
# Evolve hyperparameters (optional)
|
463 |
else:
|
|
|
33 |
logger = logging.getLogger(__name__)
|
34 |
|
35 |
|
36 |
+
def train(hyp, opt, device, tb_writer=None, wandb=None):
|
37 |
logger.info(f'Hyperparameters {hyp}')
|
38 |
log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory
|
39 |
wdir = log_dir / 'weights' # weights directory
|
|
|
118 |
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
|
119 |
# plot_lr_scheduler(optimizer, scheduler, epochs)
|
120 |
|
121 |
+
# Logging
|
122 |
+
if wandb and wandb.run is None:
|
123 |
+
id = ckpt.get('wandb_id') if 'ckpt' in locals() else None
|
124 |
+
wandb_run = wandb.init(config=opt, resume="allow", project=os.path.basename(log_dir), id=id)
|
125 |
+
|
126 |
# Resume
|
127 |
start_epoch, best_fitness = 0, 0.0
|
128 |
if pretrained:
|
|
|
322 |
single_cls=opt.single_cls,
|
323 |
dataloader=testloader,
|
324 |
save_dir=log_dir,
|
325 |
+
plots=epoch == 0 or final_epoch, # plot first and last
|
326 |
+
log_imgs=opt.log_imgs)
|
327 |
|
328 |
# Write
|
329 |
with open(results_file, 'a') as f:
|
|
|
331 |
if len(opt.name) and opt.bucket:
|
332 |
os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
|
333 |
|
334 |
+
# Log
|
335 |
+
tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss
|
336 |
+
'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
|
337 |
+
'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss
|
338 |
+
'x/lr0', 'x/lr1', 'x/lr2'] # params
|
339 |
+
for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
|
340 |
+
if tb_writer:
|
341 |
+
tb_writer.add_scalar(tag, x, epoch) # tensorboard
|
342 |
+
if wandb:
|
343 |
+
wandb.log({tag: x}) # W&B
|
344 |
|
345 |
# Update best mAP
|
346 |
fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
|
|
|
355 |
'best_fitness': best_fitness,
|
356 |
'training_results': f.read(),
|
357 |
'model': ema.ema,
|
358 |
+
'optimizer': None if final_epoch else optimizer.state_dict(),
|
359 |
+
'wandb_id': wandb_run.id if wandb else None}
|
360 |
|
361 |
# Save last, best and delete
|
362 |
torch.save(ckpt, last)
|
|
|
412 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
413 |
parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
|
414 |
parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
|
415 |
+
parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100')
|
416 |
parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
|
417 |
+
|
418 |
opt = parser.parse_args()
|
419 |
|
420 |
# Set DDP variables
|
|
|
463 |
# Train
|
464 |
logger.info(opt)
|
465 |
if not opt.evolve:
|
466 |
+
tb_writer, wandb = None, None # init loggers
|
467 |
if opt.global_rank in [-1, 0]:
|
468 |
+
# Tensorboard
|
469 |
logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
|
470 |
tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0
|
471 |
|
472 |
+
# W&B
|
473 |
+
try:
|
474 |
+
import wandb
|
475 |
+
|
476 |
+
assert os.environ.get('WANDB_DISABLED') != 'true'
|
477 |
+
logger.info("Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'")
|
478 |
+
except (ImportError, AssertionError):
|
479 |
+
opt.log_imgs = 0
|
480 |
+
logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)")
|
481 |
+
|
482 |
+
train(hyp, opt, device, tb_writer, wandb)
|
483 |
|
484 |
# Evolve hyperparameters (optional)
|
485 |
else:
|