glenn-jocher committed on
Commit
7180b22
1 Parent(s): 73cf75f

DDP Multi-GPU --resume bug fix (#1810)

Browse files
Files changed (1) hide show
  1. train.py +2 -1
train.py CHANGED
@@ -472,9 +472,10 @@ if __name__ == '__main__':
472
  if opt.resume: # resume an interrupted run
473
  ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
474
  assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
 
475
  with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
476
  opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
477
- opt.cfg, opt.weights, opt.resume = '', ckpt, True
478
  logger.info('Resuming training from %s' % ckpt)
479
  else:
480
  # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
 
472
  if opt.resume: # resume an interrupted run
473
  ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
474
  assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
475
+ apriori = opt.global_rank, opt.local_rank
476
  with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
477
  opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
478
+ opt.cfg, opt.weights, opt.resume, opt.global_rank, opt.local_rank = '', ckpt, True, *apriori # reinstate
479
  logger.info('Resuming training from %s' % ckpt)
480
  else:
481
  # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')