glenn-jocher committed on
Commit
379396e
1 Parent(s): c4addd7

Yaml constructor posixpath --resume bug fix (#1390)

Browse files

* resume fix for yaml constructor posixpath error

* fix update

* remove weights/ dir backup

Files changed (1) hide show
  1. train.py +4 -6
train.py CHANGED
@@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
37
  def train(hyp, opt, device, tb_writer=None, wandb=None):
38
  logger.info(f'Hyperparameters {hyp}')
39
  save_dir, epochs, batch_size, total_batch_size, weights, rank = \
40
- opt.save_dir, opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
41
 
42
  # Directories
43
  wdir = save_dir / 'weights'
@@ -143,7 +143,6 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
143
  start_epoch = ckpt['epoch'] + 1
144
  if opt.resume:
145
  assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
146
- shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}') # save previous weights
147
  if epochs < start_epoch:
148
  logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
149
  (weights, ckpt['epoch'], epochs))
@@ -431,9 +430,8 @@ if __name__ == '__main__':
431
  # Resume
432
  if opt.resume: # resume an interrupted run
433
  ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
434
- opt.save_dir = Path(ckpt).parent.parent # runs/train/exp
435
  assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
436
- with open(opt.save_dir / 'opt.yaml') as f:
437
  opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
438
  opt.cfg, opt.weights, opt.resume = '', ckpt, True
439
  logger.info('Resuming training from %s' % ckpt)
@@ -443,7 +441,7 @@ if __name__ == '__main__':
443
  assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
444
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
445
  opt.name = 'evolve' if opt.evolve else opt.name
446
- opt.save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # increment run
447
 
448
  # DDP mode
449
  device = select_device(opt.device, batch_size=opt.batch_size)
@@ -517,7 +515,7 @@ if __name__ == '__main__':
517
  assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
518
  opt.notest, opt.nosave = True, True # only test/save final epoch
519
  # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
520
- yaml_file = opt.save_dir / 'hyp_evolved.yaml' # save best result here
521
  if opt.bucket:
522
  os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
523
 
 
37
  def train(hyp, opt, device, tb_writer=None, wandb=None):
38
  logger.info(f'Hyperparameters {hyp}')
39
  save_dir, epochs, batch_size, total_batch_size, weights, rank = \
40
+ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
41
 
42
  # Directories
43
  wdir = save_dir / 'weights'
 
143
  start_epoch = ckpt['epoch'] + 1
144
  if opt.resume:
145
  assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
 
146
  if epochs < start_epoch:
147
  logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
148
  (weights, ckpt['epoch'], epochs))
 
430
  # Resume
431
  if opt.resume: # resume an interrupted run
432
  ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
 
433
  assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
434
+ with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
435
  opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
436
  opt.cfg, opt.weights, opt.resume = '', ckpt, True
437
  logger.info('Resuming training from %s' % ckpt)
 
441
  assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
442
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
443
  opt.name = 'evolve' if opt.evolve else opt.name
444
+ opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok) # increment run
445
 
446
  # DDP mode
447
  device = select_device(opt.device, batch_size=opt.batch_size)
 
515
  assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
516
  opt.notest, opt.nosave = True, True # only test/save final epoch
517
  # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
518
+ yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here
519
  if opt.bucket:
520
  os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
521