|
""" distributed launcher adopted from torch.distributed.launch |
|
usage example: https://github.com/facebookresearch/maskrcnn-benchmark |
|
This enables using multiprocessing for each spawned process (as they are treated as main processes) |
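
Illustrative invocation (a sketch only: launch.py, train.py and --batch_size
are placeholder names; the launcher appends --distributed, --launched,
--n_GPUs and --rank to the forwarded arguments, so the training script must
accept those flags):

    python launch.py --n_GPUs 4 train.py --batch_size 16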
|
""" |
|
import sys
import subprocess
from argparse import ArgumentParser, REMAINDER
|
|
def parse_args():
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    parser.add_argument('--n_GPUs', type=int, default=1,
                        help='the number of GPUs for training')

    parser.add_argument('training_script', type=str,
                        help="The full path to the single-GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # REMAINDER collects every remaining token so it can be forwarded
    # verbatim to the training script.
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
|
|
|
def main():
    args = parse_args()

    # Launch one copy of the training script per GPU; the rank is passed on
    # the command line so each child can set up its own distributed process.
    processes = []
    for rank in range(args.n_GPUs):
        cmd = [sys.executable, args.training_script]
        cmd.extend(args.training_script_args)

        cmd += ['--distributed', 'True']
        cmd += ['--launched', 'True']
        cmd += ['--n_GPUs', str(args.n_GPUs)]
        cmd += ['--rank', str(rank)]

        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for all workers and report the failing worker's own command line
    # (process.args) rather than whichever cmd the loop built last.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)
|
|
|
if __name__ == "__main__": |
|
main() |
|
|