Spaces:

xiang-wuu
/

yolov5

Runtime error

App Files Files Community

Ayush Chaurasia

glenn-jocher commited on Oct 31, 2020

Commit

ca290dc

unverified ·

1 Parent(s): c8c5ef3

Weights & Biases (W&B) Feature Addition (#1235)

Browse files

* Add wandb metric logging and bounding box debugging

* Improve formatting, readability

* Remove mutliple path for init, improve formatting

* Add wandb params

* Remove typecasting in bbox coordinates and reformat

* Cleanup

* add wandb to requirements.txt

* minor updates to test.py

* general reorg

* reduce --log-imgs to 10

* clean wandb import

* reverse wandb import assert

* add except AssertionError to try import

* move wandb init to all global ranks

* replace print() with logger.info()

* replace print() with logger.info()

* move wandb.init() bug fix

* project PosixPath to basename bug fix

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>

Files changed (3) hide show

requirements.txt +3 -0
test.py +23 -2
train.py +35 -13

requirements.txt CHANGED Viewed

@@ -13,6 +13,9 @@ torch>=1.6.0
 torchvision>=0.7.0
 tqdm>=4.41.0
 # coco ----------------------------------------
 # pycocotools>=2.0

 torchvision>=0.7.0
 tqdm>=4.41.0
+# logging -------------------------------------
+# wandb
 # coco ----------------------------------------
 # pycocotools>=2.0

test.py CHANGED Viewed

@@ -33,7 +33,9 @@ def test(data,
          save_dir=Path(''),  # for saving images
          save_txt=False,  # for auto-labelling
          save_conf=False,
-         plots=True):
     # Initialize/load model and set device
     training = model is not None
     if training:  # called by train.py
@@ -77,6 +79,13 @@ def test(data,
     iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
     niou = iouv.numel()
     # Dataloader
     if not training:
         img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
@@ -91,7 +100,7 @@ def test(data,
     s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
     p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
     loss = torch.zeros(3, device=device)
-    jdict, stats, ap, ap_class = [], [], [], []
     for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
         img = img.to(device, non_blocking=True)
         img = img.half() if half else img.float()  # uint8 to fp16/32
@@ -139,6 +148,14 @@ def test(data,
                     with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
                         f.write(('%g ' * len(line) + '\n') % line)
             # Clip boxes to image bounds
             clip_coords(pred, (height, width))
@@ -196,6 +213,10 @@ def test(data,
             f = save_dir / f'test_batch{batch_i}_pred.jpg'
             plot_images(img, output_to_target(output, width, height), paths, str(f), names)  # predictions
     # Compute statistics
     stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
     if len(stats) and stats[0].any():

          save_dir=Path(''),  # for saving images
          save_txt=False,  # for auto-labelling
          save_conf=False,
+         plots=True,
+         log_imgs=0):  # number of logged images
     # Initialize/load model and set device
     training = model is not None
     if training:  # called by train.py
     iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
     niou = iouv.numel()
+    # Logging
+    log_imgs = min(log_imgs, 100)  # ceil
+    try:
+        import wandb  # Weights & Biases
+    except ImportError:
+        log_imgs = 0
     # Dataloader
     if not training:
         img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
     s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
     p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
     loss = torch.zeros(3, device=device)
+    jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
     for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
         img = img.to(device, non_blocking=True)
         img = img.half() if half else img.float()  # uint8 to fp16/32
                     with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
                         f.write(('%g ' * len(line) + '\n') % line)
+            # W&B logging
+            if len(wandb_images) < log_imgs:
+                bbox_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
+                              "class_id": int(cls),
+                              "scores": {"class_score": conf},
+                              "domain": "pixel"} for *xyxy, conf, cls in pred.clone().tolist()]
+                wandb_images.append(wandb.Image(img[si], boxes={"predictions": {"box_data": bbox_data}}))
             # Clip boxes to image bounds
             clip_coords(pred, (height, width))
             f = save_dir / f'test_batch{batch_i}_pred.jpg'
             plot_images(img, output_to_target(output, width, height), paths, str(f), names)  # predictions
+    # W&B logging
+    if wandb_images:
+        wandb.log({"outputs": wandb_images})
     # Compute statistics
     stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
     if len(stats) and stats[0].any():

train.py CHANGED Viewed

@@ -33,7 +33,7 @@ from utils.torch_utils import ModelEMA, select_device, intersect_dicts
 logger = logging.getLogger(__name__)
-def train(hyp, opt, device, tb_writer=None):
     logger.info(f'Hyperparameters {hyp}')
     log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
     wdir = log_dir / 'weights'  # weights directory
@@ -118,6 +118,11 @@ def train(hyp, opt, device, tb_writer=None):
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
     # plot_lr_scheduler(optimizer, scheduler, epochs)
     # Resume
     start_epoch, best_fitness = 0, 0.0
     if pretrained:
@@ -317,7 +322,8 @@ def train(hyp, opt, device, tb_writer=None):
                                                  single_cls=opt.single_cls,
                                                  dataloader=testloader,
                                                  save_dir=log_dir,
-                                                 plots=epoch == 0 or final_epoch)  # plot first and last
             # Write
             with open(results_file, 'a') as f:
@@ -325,14 +331,16 @@ def train(hyp, opt, device, tb_writer=None):
             if len(opt.name) and opt.bucket:
                 os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
-            # Tensorboard
-            if tb_writer:
-                tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
-                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
-                        'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
-                        'x/lr0', 'x/lr1', 'x/lr2']  # params
-                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
-                    tb_writer.add_scalar(tag, x, epoch)
             # Update best mAP
             fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
@@ -347,7 +355,8 @@ def train(hyp, opt, device, tb_writer=None):
                             'best_fitness': best_fitness,
                             'training_results': f.read(),
                             'model': ema.ema,
-                            'optimizer': None if final_epoch else optimizer.state_dict()}
                 # Save last, best and delete
                 torch.save(ckpt, last)
@@ -403,7 +412,9 @@ if __name__ == '__main__':
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
     parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
     opt = parser.parse_args()
     # Set DDP variables
@@ -452,12 +463,23 @@ if __name__ == '__main__':
     # Train
     logger.info(opt)
     if not opt.evolve:
-        tb_writer = None
         if opt.global_rank in [-1, 0]:
             logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
             tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
-        train(hyp, opt, device, tb_writer)
     # Evolve hyperparameters (optional)
     else:

 logger = logging.getLogger(__name__)
+def train(hyp, opt, device, tb_writer=None, wandb=None):
     logger.info(f'Hyperparameters {hyp}')
     log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
     wdir = log_dir / 'weights'  # weights directory
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
     # plot_lr_scheduler(optimizer, scheduler, epochs)
+    # Logging
+    if wandb and wandb.run is None:
+        id = ckpt.get('wandb_id') if 'ckpt' in locals() else None
+        wandb_run = wandb.init(config=opt, resume="allow", project=os.path.basename(log_dir), id=id)
     # Resume
     start_epoch, best_fitness = 0, 0.0
     if pretrained:
                                                  single_cls=opt.single_cls,
                                                  dataloader=testloader,
                                                  save_dir=log_dir,
+                                                 plots=epoch == 0 or final_epoch,  # plot first and last
+                                                 log_imgs=opt.log_imgs)
             # Write
             with open(results_file, 'a') as f:
             if len(opt.name) and opt.bucket:
                 os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
+            # Log
+            tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
+                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
+                    'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
+                    'x/lr0', 'x/lr1', 'x/lr2']  # params
+            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
+                if tb_writer:
+                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
+                if wandb:
+                    wandb.log({tag: x})  # W&B
             # Update best mAP
             fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
                             'best_fitness': best_fitness,
                             'training_results': f.read(),
                             'model': ema.ema,
+                            'optimizer': None if final_epoch else optimizer.state_dict(),
+                            'wandb_id': wandb_run.id if wandb else None}
                 # Save last, best and delete
                 torch.save(ckpt, last)
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
+    parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100')
     parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
     opt = parser.parse_args()
     # Set DDP variables
     # Train
     logger.info(opt)
     if not opt.evolve:
+        tb_writer, wandb = None, None  # init loggers
         if opt.global_rank in [-1, 0]:
+            # Tensorboard
             logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
             tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
+            # W&B
+            try:
+                import wandb
+                assert os.environ.get('WANDB_DISABLED') != 'true'
+                logger.info("Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'")
+            except (ImportError, AssertionError):
+                opt.log_imgs = 0
+                logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)")
+        train(hyp, opt, device, tb_writer, wandb)
     # Evolve hyperparameters (optional)
     else: