|
""" distributed launcher adopted from torch.distributed.launch |
|
usage example: https://github.com/facebookresearch/maskrcnn-benchmark |
|
This enables using multiprocessing for each spawned process (as they are treated as main processes) |
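
Illustrative invocation (a sketch only: launch.py, train.py and --batch_size
are placeholder names; the launcher appends --distributed, --launched,
--n_GPUs and --rank to the forwarded arguments, so the training script must
accept those flags):

    python launch.py --n_GPUs 4 train.py --batch_size 16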
|
""" |
|
import sys
import subprocess
from argparse import ArgumentParser, REMAINDER
|
|
def parse_args():
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    parser.add_argument('--n_GPUs', type=int, default=1,
                        help='the number of GPUs for training')

    parser.add_argument('training_script', type=str,
                        help="The full path to the single-GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # REMAINDER collects every remaining token so it can be forwarded
    # verbatim to the training script.
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
|
|
|
def main():
    args = parse_args()

    # Launch one copy of the training script per GPU; the rank is passed on
    # the command line so each child can set up its own distributed process.
    processes = []
    for rank in range(args.n_GPUs):
        cmd = [sys.executable, args.training_script]
        cmd.extend(args.training_script_args)

        cmd += ['--distributed', 'True']
        cmd += ['--launched', 'True']
        cmd += ['--n_GPUs', str(args.n_GPUs)]
        cmd += ['--rank', str(rank)]

        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for all workers and report the failing worker's own command line
    # (process.args) rather than whichever cmd the loop built last.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)
|
|
|
if __name__ == "__main__": |
|
main() |
|
|