# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py

import os
import shlex
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH


def _device_ids(device):
    """Split an opt.yaml ``device`` value into its non-empty comma-separated entries.

    Args:
        device: Raw ``device`` value loaded from opt.yaml. May be a string such as
            ``"0,1"``, an empty string, ``None`` or a bare integer.

    Returns:
        List of stripped device tokens; empty when no device is named.
    """
    text = "" if device is None else str(device)
    # "".split(",") is [""], so empty tokens must be dropped to get a true count
    return [token.strip() for token in text.split(",") if token.strip()]


port = 0  # --master_port, incremented before every DDP launch so each run gets its own port
path = Path("").resolve()  # search root is the current working directory
for last in path.rglob("*/**/last.pt"):
    # Only the "optimizer" entry is inspected, so keep the tensors on the CPU
    ckpt = torch.load(last, map_location="cpu")
    if ckpt["optimizer"] is None:
        continue  # skip checkpoints that carry no optimizer state

    # Load opt.yaml (read from two directory levels above last.pt)
    with open(last.parent.parent / "opt.yaml", errors="ignore") as f:
        opt = yaml.safe_load(f)

    # Get device count
    nd = len(_device_ids(opt["device"]))  # number of explicitly listed devices
    # NOTE(review): the previous check also tested `nd == 0 and torch.cuda.device_count() > 1`,
    # but "".split(",") is [""], so nd was never 0 and that branch never ran (it would have
    # produced `--nproc_per_node 0`). An empty device therefore keeps resuming as a single
    # process, exactly as it did in practice. Confirm against train.py before enabling DDP here.
    ddp = nd > 1  # distributed data parallel

    # Quote the path so spaces or shell metacharacters cannot break or alter the command
    target = shlex.quote(str(last))
    if ddp:  # multi-GPU
        port += 1
        cmd = f"python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {target}"
    else:  # single-GPU
        cmd = f"python train.py --resume {target}"

    cmd += " > /dev/null 2>&1 &"  # discard all output and run the command in the background
    print(cmd)
    os.system(cmd)