glenn-jocher committed
Commit 8056fe2
Parent: 61b5733

hyperparameter evolution bug fix (#566)

Files changed (1): train.py (+41 -21)
train.py CHANGED
@@ -16,8 +16,7 @@ from utils.datasets import *
 from utils.utils import *
 
 # Hyperparameters
-hyp = {'optimizer': 'SGD',  # ['Adam', 'SGD', ...] from torch.optim
-       'lr0': 0.01,  # initial learning rate (SGD=1E-2, Adam=1E-3)
+hyp = {'lr0': 0.01,  # initial learning rate (SGD=1E-2, Adam=1E-3)
        'momentum': 0.937,  # SGD momentum/Adam beta1
        'weight_decay': 5e-4,  # optimizer weight decay
        'giou': 0.05,  # GIoU loss gain
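Note: dropping the string-valued 'optimizer' key (the choice moves to a new --adam CLI flag below) leaves hyp all-numeric, which the evolution code relies on when it scales every value by a mutation factor. A minimal illustrative sketch, with made-up factors, of why a string entry would break that arithmetic:

# Sketch (not from the commit): evolution multiplies every hyp value by a
# per-key factor, which fails on a string such as the removed 'optimizer'.
hyp = {'lr0': 0.01, 'momentum': 0.937}           # numeric-only, as in the new code
factors = {'lr0': 1.2, 'momentum': 0.95}         # hypothetical mutation factors
mutated = {k: hyp[k] * factors[k] for k in hyp}  # works only if all values are numbers
print(mutated)  # {'lr0': 0.012, 'momentum': 0.89015...}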
@@ -41,7 +40,7 @@ hyp = {'optimizer': 'SGD',  # ['Adam', 'SGD', ...] from torch.optim
        'mixup': 0.0}  # image mixup (probability)
 
 
-def train(hyp, tb_writer, opt, device):
+def train(hyp, opt, device, tb_writer=None):
     print(f'Hyperparameters {hyp}')
     log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution'  # run directory
     wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
@@ -102,7 +101,7 @@ def train(hyp, tb_writer, opt, device):
             else:
                 pg0.append(v)  # all else
 
-    if hyp['optimizer'] == 'Adam':
+    if opt.adam:
         optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
     else:
         optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
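For context, a standalone sketch of the flag-driven selection above; model and use_adam are stand-ins for the model and opt.adam in train.py:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)     # placeholder model
use_adam = False             # stands in for opt.adam (the new CLI flag)
lr0, momentum = 0.01, 0.937  # hyp['lr0'], hyp['momentum']

if use_adam:
    # Adam's beta1 plays a role similar to SGD momentum, hence the reuse.
    optimizer = optim.Adam(model.parameters(), lr=lr0, betas=(momentum, 0.999))
else:
    optimizer = optim.SGD(model.parameters(), lr=lr0, momentum=momentum, nesterov=True)
print(type(optimizer).__name__)  # SGD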
@@ -279,7 +278,7 @@ def train(hyp, tb_writer, opt, device):
                     imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
 
             # Autocast
-            with amp.autocast():
+            with amp.autocast(enabled=cuda):
                 # Forward
                 pred = model(imgs)
 
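A minimal sketch of the autocast change; it assumes cuda is a bool computed earlier in train.py (here derived from torch.cuda.is_available()), so mixed precision becomes a clean no-op on CPU-only machines:

import torch
from torch.cuda import amp

cuda = torch.cuda.is_available()
device = torch.device('cuda' if cuda else 'cpu')
x = torch.randn(8, 8, device=device)

with amp.autocast(enabled=cuda):  # disabled cleanly when cuda is False
    y = x @ x
print(y.dtype)  # torch.float16 under CUDA autocast, torch.float32 on CPU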
@@ -402,11 +401,11 @@ if __name__ == '__main__':
     parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
     parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
     parser.add_argument('--epochs', type=int, default=300)
-    parser.add_argument('--batch-size', type=int, default=16, help="Total batch size for all gpus.")
+    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
     parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
     parser.add_argument('--rect', action='store_true', help='rectangular training')
     parser.add_argument('--resume', nargs='?', const='get_last', default=False,
-                        help='resume from given path/to/last.pt, or most recent run if blank.')
+                        help='resume from given path/last.pt, or most recent run if blank')
     parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
     parser.add_argument('--notest', action='store_true', help='only test final epoch')
     parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
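The --resume tri-state relies on argparse nargs='?' semantics: flag absent gives default=False, a bare --resume gives const='get_last' (resume the most recent run), and an explicit value gives that checkpoint path. A runnable sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--resume', nargs='?', const='get_last', default=False)

print(parser.parse_args([]).resume)                       # False
print(parser.parse_args(['--resume']).resume)             # get_last
print(parser.parse_args(['--resume', 'last.pt']).resume)  # last.pt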
@@ -418,6 +417,7 @@ if __name__ == '__main__':
     parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
     parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
     parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
+    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     opt = parser.parse_args()
@@ -445,30 +445,52 @@ if __name__ == '__main__':
     if opt.local_rank != -1:
         assert torch.cuda.device_count() > opt.local_rank
         torch.cuda.set_device(opt.local_rank)
-        device = torch.device("cuda", opt.local_rank)
+        device = torch.device('cuda', opt.local_rank)
         dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
         opt.world_size = dist.get_world_size()
-        assert opt.batch_size % opt.world_size == 0, "Batch size is not a multiple of the number of devices given!"
+        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
         opt.batch_size = opt.total_batch_size // opt.world_size
 
     print(opt)
 
     # Train
     if not opt.evolve:
+        tb_writer = None
         if opt.local_rank in [-1, 0]:
             print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
             tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name))
-        else:
-            tb_writer = None
 
-        train(hyp, tb_writer, opt, device)
+        train(hyp, opt, device, tb_writer)
 
     # Evolve hyperparameters (optional)
     else:
-        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
+        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
+        meta = {'lr0': (1, 1e-5, 1e-2),  # initial learning rate (SGD=1E-2, Adam=1E-3)
+                'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
+                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
+                'giou': (1, 0.02, 0.2),  # GIoU loss gain
+                'cls': (1, 0.2, 4.0),  # cls loss gain
+                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
+                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
+                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
+                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
+                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
+                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
+                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
+                'hsv_s': (1, 0.0, 0.8),  # image HSV-Saturation augmentation (fraction)
+                'hsv_v': (1, 0.0, 0.8),  # image HSV-Value augmentation (fraction)
+                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
+                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
+                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
+                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
+                'perspective': (1, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
+                'flipud': (0, 0.0, 1.0),  # image flip up-down (probability)
+                'fliplr': (1, 0.0, 1.0),  # image flip left-right (probability)
+                'mixup': (1, 0.0, 1.0)}  # image mixup (probability)
 
-        tb_writer = None
+        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
         opt.notest, opt.nosave = True, True  # only test/save final epoch
+        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
         if opt.bucket:
             os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket)  # download evolve.txt if exists
 
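Each meta triple is (mutation gain, lower limit, upper limit): a gain of 0 freezes a key (here 'iou_t' and 'flipud'), and the limits bound the mutated value. An illustrative sketch with a two-key subset:

# Sketch of how the meta triples are consumed: index 0 is the mutation
# gain (0 freezes the key), indices 1-2 are the clip bounds.
import numpy as np

meta = {'lr0': (1, 1e-5, 1e-2), 'iou_t': (0, 0.1, 0.7)}
g = np.array([x[0] for x in meta.values()])  # gains, as in the diff
print(g)                                     # [1 0] -> 'iou_t' never mutates

hyp = {'lr0': 0.5, 'iou_t': 0.2}             # hypothetical mutated values
for k, v in meta.items():
    hyp[k] = float(np.clip(hyp[k], v[1], v[2]))
print(hyp)                                   # {'lr0': 0.01, 'iou_t': 0.2}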
@@ -490,8 +512,8 @@ if __name__ == '__main__':
                 mp, s = 0.9, 0.2  # mutation probability, sigma
                 npr = np.random
                 npr.seed(int(time.time()))
-                g = np.array([1, 1, 1, 1, 1, 1, 1, 0, .1, 1, 0, 1, 1, 1, 1, 1, 1, 1])  # gains
-                ng = len(g)
+                g = np.array([x[0] for x in meta.values()])  # gains 0-1
+                ng = len(meta)
                 v = np.ones(ng)
                 while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                     v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
@@ -499,13 +521,11 @@ if __name__ == '__main__':
                     hyp[k] = x[i + 7] * v[i]  # mutate
 
             # Clip to limits
-            keys = ['lr0', 'iou_t', 'momentum', 'weight_decay', 'hsv_s', 'hsv_v', 'translate', 'scale', 'fl_gamma']
-            limits = [(1e-5, 1e-2), (0.00, 0.70), (0.60, 0.98), (0, 0.001), (0, .9), (0, .9), (0, .9), (0, .9), (0, 3)]
-            for k, v in zip(keys, limits):
-                hyp[k] = np.clip(hyp[k], v[0], v[1])
+            for k, v in meta.items():
+                hyp[k] = np.clip(hyp[k], v[1], v[2])
 
             # Train mutation
-            results = train(hyp.copy(), tb_writer, opt, device)
+            results = train(hyp.copy(), opt, device)
 
             # Write mutation results
             print_mutation(hyp, results, opt.bucket)
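Putting the evolve changes together, a self-contained sketch of one mutation step under the new scheme; the parent row is a stand-in for a line of evolve.txt, whose first 7 columns hold results (hence the i + 7 offset in train.py):

import time
import numpy as np

meta = {'lr0': (1, 1e-5, 1e-2), 'momentum': (0.1, 0.6, 0.98)}
hyp = {'lr0': 0.01, 'momentum': 0.937}
parent = np.array([0.0] * 7 + [0.01, 0.937])  # 7 result cols + hyp cols

mp, s = 0.9, 0.2  # mutation probability, sigma
npr = np.random
npr.seed(int(time.time()))
g = np.array([x[0] for x in meta.values()])   # per-key gains 0-1
ng = len(meta)
v = np.ones(ng)
while all(v == 1):  # mutate until a change occurs (prevent duplicates)
    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
for i, k in enumerate(hyp.keys()):
    hyp[k] = parent[i + 7] * v[i]             # mutate

for k, v in meta.items():                     # clip to (lower, upper) limits
    hyp[k] = float(np.clip(hyp[k], v[1], v[2]))
print(hyp)

With tb_writer now defaulting to None in train(), evolution presumably runs single-GPU as python train.py --evolve, since DDP is asserted out above.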
 