File size: 1,198 Bytes
92894b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
import os
import shlex
import sys
from pathlib import Path

import torch
import yaml
FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH

port = 0  # --master_port; bumped before every DDP launch so concurrent runs get distinct ports
path = Path("").resolve()  # search root is the current working directory
for last in path.rglob("*/**/last.pt"):
    # Only the "optimizer" entry is inspected here, so load on CPU instead of
    # restoring tensors onto whatever device the checkpoint was saved from.
    ckpt = torch.load(last, map_location="cpu")
    if ckpt["optimizer"] is None:
        continue  # checkpoint carries no optimizer state, nothing to resume

    # Load opt.yaml (expected two directories above last.pt)
    with open(last.parent.parent / "opt.yaml", errors="ignore") as f:
        opt = yaml.safe_load(f)

    # Get device count
    # str() guards against YAML parsing an unquoted device id as an int, and
    # empty entries are dropped because "".split(",") yields [""] rather than [].
    d = [x for x in str(opt["device"]).split(",") if x.strip()]  # devices
    # No explicit devices means "use whatever is visible": fall back to the GPU count
    # so that --nproc_per_node below is never 0.
    nd = len(d) or torch.cuda.device_count()  # number of devices
    ddp = nd > 1  # distributed data parallel

    target = shlex.quote(str(last))  # keep paths with spaces/metacharacters intact in the shell
    if ddp:  # multi-GPU
        port += 1
        cmd = f"python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {target}"
    else:  # single-GPU
        cmd = f"python train.py --resume {target}"

    cmd += " > /dev/null 2>&1 &"  # discard output and run in the background
    print(cmd)
    os.system(cmd)
|