glenn-jocher
committed on
Commit
•
7180b22
1
Parent(s):
73cf75f
DDP Multi-GPU --resume bug fix (#1810)
Browse files
train.py
CHANGED
@@ -472,9 +472,10 @@ if __name__ == '__main__':
|
|
472 |
if opt.resume: # resume an interrupted run
|
473 |
ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
|
474 |
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
|
|
|
475 |
with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
|
476 |
opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
|
477 |
-
opt.cfg, opt.weights, opt.resume = '', ckpt, True
|
478 |
logger.info('Resuming training from %s' % ckpt)
|
479 |
else:
|
480 |
# opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
|
|
|
472 |
if opt.resume: # resume an interrupted run
|
473 |
ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
|
474 |
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
|
475 |
+
apriori = opt.global_rank, opt.local_rank
|
476 |
with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
|
477 |
opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
|
478 |
+
opt.cfg, opt.weights, opt.resume, opt.global_rank, opt.local_rank = '', ckpt, True, *apriori # reinstate
|
479 |
logger.info('Resuming training from %s' % ckpt)
|
480 |
else:
|
481 |
# opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
|