glenn-jocher
committed on
Commit
·
8056fe2
1
Parent(s):
61b5733
hyperparameter evolution bug fix (#566)
Browse files
train.py
CHANGED
@@ -16,8 +16,7 @@ from utils.datasets import *
|
|
16 |
from utils.utils import *
|
17 |
|
18 |
# Hyperparameters
|
19 |
-
hyp = {'optimizer': 'SGD', # ['Adam', 'SGD', ...] from torch.optim
|
20 |
-
'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3)
|
21 |
'momentum': 0.937, # SGD momentum/Adam beta1
|
22 |
'weight_decay': 5e-4, # optimizer weight decay
|
23 |
'giou': 0.05, # GIoU loss gain
|
@@ -41,7 +40,7 @@ hyp = {'optimizer': 'SGD', # ['Adam', 'SGD', ...] from torch.optim
|
|
41 |
'mixup': 0.0} # image mixup (probability)
|
42 |
|
43 |
|
44 |
-
def train(hyp, tb_writer, opt, device):
|
45 |
print(f'Hyperparameters {hyp}')
|
46 |
log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory
|
47 |
wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory
|
@@ -102,7 +101,7 @@ def train(hyp, tb_writer, opt, device):
|
|
102 |
else:
|
103 |
pg0.append(v) # all else
|
104 |
|
105 |
-
if
|
106 |
optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
|
107 |
else:
|
108 |
optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
|
@@ -279,7 +278,7 @@ def train(hyp, tb_writer, opt, device):
|
|
279 |
imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
|
280 |
|
281 |
# Autocast
|
282 |
-
with amp.autocast():
|
283 |
# Forward
|
284 |
pred = model(imgs)
|
285 |
|
@@ -402,11 +401,11 @@ if __name__ == '__main__':
|
|
402 |
parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
|
403 |
parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
|
404 |
parser.add_argument('--epochs', type=int, default=300)
|
405 |
-
parser.add_argument('--batch-size', type=int, default=16, help=
|
406 |
parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
|
407 |
parser.add_argument('--rect', action='store_true', help='rectangular training')
|
408 |
parser.add_argument('--resume', nargs='?', const='get_last', default=False,
|
409 |
-
help='resume from given path/
|
410 |
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
|
411 |
parser.add_argument('--notest', action='store_true', help='only test final epoch')
|
412 |
parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
|
@@ -418,6 +417,7 @@ if __name__ == '__main__':
|
|
418 |
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
|
419 |
parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
|
420 |
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
|
|
|
421 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
422 |
parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
|
423 |
opt = parser.parse_args()
|
@@ -445,30 +445,52 @@ if __name__ == '__main__':
|
|
445 |
if opt.local_rank != -1:
|
446 |
assert torch.cuda.device_count() > opt.local_rank
|
447 |
torch.cuda.set_device(opt.local_rank)
|
448 |
-
device = torch.device(
|
449 |
dist.init_process_group(backend='nccl', init_method='env://') # distributed backend
|
450 |
opt.world_size = dist.get_world_size()
|
451 |
-
assert opt.batch_size % opt.world_size == 0,
|
452 |
opt.batch_size = opt.total_batch_size // opt.world_size
|
453 |
|
454 |
print(opt)
|
455 |
|
456 |
# Train
|
457 |
if not opt.evolve:
|
|
|
458 |
if opt.local_rank in [-1, 0]:
|
459 |
print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
|
460 |
tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name))
|
461 |
-
else:
|
462 |
-
tb_writer = None
|
463 |
|
464 |
-
train(hyp,
|
465 |
|
466 |
# Evolve hyperparameters (optional)
|
467 |
else:
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
-
|
471 |
opt.notest, opt.nosave = True, True # only test/save final epoch
|
|
|
472 |
if opt.bucket:
|
473 |
os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
|
474 |
|
@@ -490,8 +512,8 @@ if __name__ == '__main__':
|
|
490 |
mp, s = 0.9, 0.2 # mutation probability, sigma
|
491 |
npr = np.random
|
492 |
npr.seed(int(time.time()))
|
493 |
-
g = np.array([
|
494 |
-
ng = len(
|
495 |
v = np.ones(ng)
|
496 |
while all(v == 1): # mutate until a change occurs (prevent duplicates)
|
497 |
v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
|
@@ -499,13 +521,11 @@ if __name__ == '__main__':
|
|
499 |
hyp[k] = x[i + 7] * v[i] # mutate
|
500 |
|
501 |
# Clip to limits
|
502 |
-
|
503 |
-
|
504 |
-
for k, v in zip(keys, limits):
|
505 |
-
hyp[k] = np.clip(hyp[k], v[0], v[1])
|
506 |
|
507 |
# Train mutation
|
508 |
-
results = train(hyp.copy(),
|
509 |
|
510 |
# Write mutation results
|
511 |
print_mutation(hyp, results, opt.bucket)
|
|
|
16 |
from utils.utils import *
|
17 |
|
18 |
# Hyperparameters
|
19 |
+
hyp = {'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3)
|
|
|
20 |
'momentum': 0.937, # SGD momentum/Adam beta1
|
21 |
'weight_decay': 5e-4, # optimizer weight decay
|
22 |
'giou': 0.05, # GIoU loss gain
|
|
|
40 |
'mixup': 0.0} # image mixup (probability)
|
41 |
|
42 |
|
43 |
+
def train(hyp, opt, device, tb_writer=None):
|
44 |
print(f'Hyperparameters {hyp}')
|
45 |
log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory
|
46 |
wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory
|
|
|
101 |
else:
|
102 |
pg0.append(v) # all else
|
103 |
|
104 |
+
if opt.adam:
|
105 |
optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
|
106 |
else:
|
107 |
optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
|
|
|
278 |
imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
|
279 |
|
280 |
# Autocast
|
281 |
+
with amp.autocast(enabled=cuda):
|
282 |
# Forward
|
283 |
pred = model(imgs)
|
284 |
|
|
|
401 |
parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
|
402 |
parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
|
403 |
parser.add_argument('--epochs', type=int, default=300)
|
404 |
+
parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
|
405 |
parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
|
406 |
parser.add_argument('--rect', action='store_true', help='rectangular training')
|
407 |
parser.add_argument('--resume', nargs='?', const='get_last', default=False,
|
408 |
+
help='resume from given path/last.pt, or most recent run if blank')
|
409 |
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
|
410 |
parser.add_argument('--notest', action='store_true', help='only test final epoch')
|
411 |
parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
|
|
|
417 |
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
|
418 |
parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
|
419 |
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
|
420 |
+
parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
|
421 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
422 |
parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
|
423 |
opt = parser.parse_args()
|
|
|
445 |
if opt.local_rank != -1:
|
446 |
assert torch.cuda.device_count() > opt.local_rank
|
447 |
torch.cuda.set_device(opt.local_rank)
|
448 |
+
device = torch.device('cuda', opt.local_rank)
|
449 |
dist.init_process_group(backend='nccl', init_method='env://') # distributed backend
|
450 |
opt.world_size = dist.get_world_size()
|
451 |
+
assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
|
452 |
opt.batch_size = opt.total_batch_size // opt.world_size
|
453 |
|
454 |
print(opt)
|
455 |
|
456 |
# Train
|
457 |
if not opt.evolve:
|
458 |
+
tb_writer = None
|
459 |
if opt.local_rank in [-1, 0]:
|
460 |
print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
|
461 |
tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name))
|
|
|
|
|
462 |
|
463 |
+
train(hyp, opt, device, tb_writer)
|
464 |
|
465 |
# Evolve hyperparameters (optional)
|
466 |
else:
|
467 |
+
# Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
|
468 |
+
meta = {'lr0': (1, 1e-5, 1e-2), # initial learning rate (SGD=1E-2, Adam=1E-3)
|
469 |
+
'momentum': (0.1, 0.6, 0.98), # SGD momentum/Adam beta1
|
470 |
+
'weight_decay': (1, 0.0, 0.001), # optimizer weight decay
|
471 |
+
'giou': (1, 0.02, 0.2), # GIoU loss gain
|
472 |
+
'cls': (1, 0.2, 4.0), # cls loss gain
|
473 |
+
'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight
|
474 |
+
'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels)
|
475 |
+
'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight
|
476 |
+
'iou_t': (0, 0.1, 0.7), # IoU training threshold
|
477 |
+
'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold
|
478 |
+
'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5)
|
479 |
+
'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction)
|
480 |
+
'hsv_s': (1, 0.0, 0.8), # image HSV-Saturation augmentation (fraction)
|
481 |
+
'hsv_v': (1, 0.0, 0.8), # image HSV-Value augmentation (fraction)
|
482 |
+
'degrees': (1, 0.0, 45.0), # image rotation (+/- deg)
|
483 |
+
'translate': (1, 0.0, 0.9), # image translation (+/- fraction)
|
484 |
+
'scale': (1, 0.0, 0.9), # image scale (+/- gain)
|
485 |
+
'shear': (1, 0.0, 10.0), # image shear (+/- deg)
|
486 |
+
'perspective': (1, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001
|
487 |
+
'flipud': (0, 0.0, 1.0), # image flip up-down (probability)
|
488 |
+
'fliplr': (1, 0.0, 1.0), # image flip left-right (probability)
|
489 |
+
'mixup': (1, 0.0, 1.0)} # image mixup (probability)
|
490 |
|
491 |
+
assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
|
492 |
opt.notest, opt.nosave = True, True # only test/save final epoch
|
493 |
+
# ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
|
494 |
if opt.bucket:
|
495 |
os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
|
496 |
|
|
|
512 |
mp, s = 0.9, 0.2 # mutation probability, sigma
|
513 |
npr = np.random
|
514 |
npr.seed(int(time.time()))
|
515 |
+
g = np.array([x[0] for x in meta.values()]) # gains 0-1
|
516 |
+
ng = len(meta)
|
517 |
v = np.ones(ng)
|
518 |
while all(v == 1): # mutate until a change occurs (prevent duplicates)
|
519 |
v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
|
|
|
521 |
hyp[k] = x[i + 7] * v[i] # mutate
|
522 |
|
523 |
# Clip to limits
|
524 |
+
for k, v in meta.items():
|
525 |
+
hyp[k] = np.clip(hyp[k], v[1], v[2])
|
|
|
|
|
526 |
|
527 |
# Train mutation
|
528 |
+
results = train(hyp.copy(), opt, device)
|
529 |
|
530 |
# Write mutation results
|
531 |
print_mutation(hyp, results, opt.bucket)
|