Spaces:

tobiasc
/

conex

Build error

App Files Files Community

conex / espnet2 /bin /launch.py

tobiasc

Initial commit

ad16788 over 2 years ago

raw

history blame contribute delete

13.1 kB

	#!/usr/bin/env python3
	import argparse
	import logging
	import os
	from pathlib import Path
	import shlex
	import shutil
	import subprocess
	import sys
	import uuid

	from espnet.utils.cli_utils import get_commandline_args
	from espnet2.utils.types import str2bool
	from espnet2.utils.types import str_or_none


	def get_parser():
	parser = argparse.ArgumentParser(
	description="Launch distributed process with appropriate options. ",
	formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	)
	parser.add_argument(
	"--cmd",
	help="The path of cmd script of Kaldi: run.pl. queue.pl, or slurm.pl",
	default="utils/run.pl",
	)
	parser.add_argument(
	"--log",
	help="The path of log file used by cmd",
	default="run.log",
	)
	parser.add_argument(
	"--max_num_log_files",
	help="The maximum number of log-files to be kept",
	default=1000,
	)
	parser.add_argument(
	"--ngpu", type=int, default=1, help="The number of GPUs per node"
	)
	egroup = parser.add_mutually_exclusive_group()
	egroup.add_argument("--num_nodes", type=int, default=1, help="The number of nodes")
	egroup.add_argument(
	"--host",
	type=str,
	default=None,
	help="Directly specify the host names. The job are submitted via SSH. "
	"Multiple host names can be specified by splitting by comma. e.g. host1,host2"
	" You can also the device id after the host name with ':'. e.g. "
	"host1:0:2:3,host2:0:2. If the device ids are specified in this way, "
	"the value of --ngpu is ignored.",
	)
	parser.add_argument(
	"--envfile",
	type=str_or_none,
	default="path.sh",
	help="Source the shell script before executing command. "
	"This option is used when --host is specified.",
	)

	parser.add_argument(
	"--multiprocessing_distributed",
	type=str2bool,
	default=True,
	help="Distributed method is used when single-node mode.",
	)
	parser.add_argument(
	"--master_port",
	type=int,
	default=None,
	help="Specify the port number of master"
	"Master is a host machine has RANK0 process.",
	)
	parser.add_argument(
	"--master_addr",
	type=str,
	default=None,
	help="Specify the address s of master. "
	"Master is a host machine has RANK0 process.",
	)
	parser.add_argument(
	"--init_file_prefix",
	type=str,
	default=".dist_init_",
	help="The file name prefix for init_file, which is used for "
	"'Shared-file system initialization'. "
	"This option is used when --port is not specified",
	)
	parser.add_argument("args", type=str, nargs="+")
	return parser


	def main(cmd=None):
	logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
	logging.basicConfig(level=logging.INFO, format=logfmt)
	logging.info(get_commandline_args())

	parser = get_parser()
	args = parser.parse_args(cmd)
	args.cmd = shlex.split(args.cmd)

	if args.host is None and shutil.which(args.cmd[0]) is None:
	raise RuntimeError(
	f"The first args of --cmd should be a script path. e.g. utils/run.pl: "
	f"{args.cmd[0]}"
	)

	# Specify init_method:
	# See: https://pytorch.org/docs/stable/distributed.html#initialization
	if args.host is None and args.num_nodes <= 1:
	# Automatically set init_method if num_node=1
	init_method = None
	else:
	if args.master_port is None:
	# Try "shared-file system initialization" if master_port is not specified
	# Give random name to avoid reusing previous file
	init_file = args.init_file_prefix + str(uuid.uuid4())
	init_file = Path(init_file).absolute()
	Path(init_file).parent.mkdir(exist_ok=True, parents=True)
	init_method = ["--dist_init_method", f"file://{init_file}"]
	else:
	init_method = ["--dist_master_port", str(args.master_port)]

	# This can be omitted if slurm mode
	if args.master_addr is not None:
	init_method += ["--dist_master_addr", args.master_addr]
	elif args.host is not None:
	init_method += [
	"--dist_master_addr",
	args.host.split(",")[0].split(":")[0],
	]

	# Log-rotation
	for i in range(args.max_num_log_files - 1, -1, -1):
	if i == 0:
	p = Path(args.log)
	pn = p.parent / (p.stem + ".1" + p.suffix)
	else:
	_p = Path(args.log)
	p = _p.parent / (_p.stem + f".{i}" + _p.suffix)
	pn = _p.parent / (_p.stem + f".{i + 1}" + _p.suffix)

	if p.exists():
	if i == args.max_num_log_files - 1:
	p.unlink()
	else:
	shutil.move(p, pn)

	processes = []
	# Submit command via SSH
	if args.host is not None:
	hosts = []
	ids_list = []
	# e.g. args.host = "host1:0:2,host2:0:1"
	for host in args.host.split(","):
	# e.g host = "host1:0:2"
	sps = host.split(":")
	host = sps[0]
	if len(sps) > 1:
	ids = [int(x) for x in sps[1:]]
	else:
	ids = list(range(args.ngpu))
	hosts.append(host)
	ids_list.append(ids)

	world_size = sum(max(len(x), 1) for x in ids_list)
	logging.info(f"{len(hosts)}nodes with world_size={world_size} via SSH")

	if args.envfile is not None:
	env = f"source {args.envfile}"
	else:
	env = ""

	if args.log != "-":
	Path(args.log).parent.mkdir(parents=True, exist_ok=True)
	f = Path(args.log).open("w", encoding="utf-8")
	else:
	# Output to stdout/stderr
	f = None

	rank = 0
	for host, ids in zip(hosts, ids_list):
	ngpu = 1 if len(ids) > 0 else 0
	ids = ids if len(ids) > 0 else ["none"]

	for local_rank in ids:
	cmd = (
	args.args
	+ [
	"--ngpu",
	str(ngpu),
	"--multiprocessing_distributed",
	"false",
	"--local_rank",
	str(local_rank),
	"--dist_rank",
	str(rank),
	"--dist_world_size",
	str(world_size),
	]
	+ init_method
	)
	if ngpu == 0:
	# Gloo supports both GPU and CPU mode.
	# See: https://pytorch.org/docs/stable/distributed.html
	cmd += ["--dist_backend", "gloo"]

	heredoc = f"""<< EOF
	set -euo pipefail
	cd {os.getcwd()}
	{env}
	{" ".join([c if len(c) != 0 else "''" for c in cmd])}
	EOF
	"""

	# FIXME(kamo): The process will be alive
	# even if this program is stopped because we don't set -t here,
	# i.e. not assigning pty,
	# and the program is not killed when SSH connection is closed.
	process = subprocess.Popen(
	["ssh", host, "bash", heredoc],
	stdout=f,
	stderr=f,
	)

	processes.append(process)

	rank += 1

	# If Single node
	elif args.num_nodes <= 1:
	if args.ngpu > 1:
	if args.multiprocessing_distributed:
	# NOTE:
	# If multiprocessing_distributed=true,
	# -> Distributed mode, which is multi-process and Multi-GPUs.
	# and TCP initializetion is used if single-node case:
	# e.g. init_method="tcp://localhost:20000"
	logging.info(f"single-node with {args.ngpu}gpu on distributed mode")
	else:
	# NOTE:
	# If multiprocessing_distributed=false
	# -> "DataParallel" mode, which is single-process
	# and Multi-GPUs with threading.
	# See:
	# https://discuss.pytorch.org/t/why-torch-nn-parallel-distributeddataparallel-runs-faster-than-torch-nn-dataparallel-on-single-machine-with-multi-gpu/32977/2
	logging.info(f"single-node with {args.ngpu}gpu using DataParallel")

	# Using cmd as it is simply
	cmd = (
	args.cmd
	# arguments for ${cmd}
	+ ["--gpu", str(args.ngpu), args.log]
	# arguments for *_train.py
	+ args.args
	+ [
	"--ngpu",
	str(args.ngpu),
	"--multiprocessing_distributed",
	str(args.multiprocessing_distributed),
	]
	)
	process = subprocess.Popen(cmd)
	processes.append(process)

	elif Path(args.cmd[0]).name == "run.pl":
	raise RuntimeError("run.pl doesn't support submitting to the other nodes.")

	elif Path(args.cmd[0]).name == "ssh.pl":
	raise RuntimeError("Use --host option instead of ssh.pl")

	# If Slurm
	elif Path(args.cmd[0]).name == "slurm.pl":
	logging.info(f"{args.num_nodes}nodes and {args.ngpu}gpu-per-node using srun")
	cmd = (
	args.cmd
	# arguments for ${cmd}
	+ [
	"--gpu",
	str(args.ngpu),
	"--num_threads",
	str(max(args.ngpu, 1)),
	"--num_nodes",
	str(args.num_nodes),
	args.log,
	"srun",
	# Inherit all enviroment variable from parent process
	"--export=ALL",
	]
	# arguments for *_train.py
	+ args.args
	+ [
	"--ngpu",
	str(args.ngpu),
	"--multiprocessing_distributed",
	"true",
	"--dist_launcher",
	"slurm",
	]
	+ init_method
	)
	if args.ngpu == 0:
	# Gloo supports both GPU and CPU mode.
	# See: https://pytorch.org/docs/stable/distributed.html
	cmd += ["--dist_backend", "gloo"]
	process = subprocess.Popen(cmd)
	processes.append(process)

	else:
	# This pattern can also works with Slurm.

	logging.info(f"{args.num_nodes}nodes and {args.ngpu}gpu-per-node using mpirun")
	cmd = (
	args.cmd
	# arguments for ${cmd}
	+ [
	"--gpu",
	str(args.ngpu),
	"--num_threads",
	str(max(args.ngpu, 1)),
	# Make sure scheduler setting, i.e. conf/queue.conf
	# so that --num_nodes requires 1process-per-node
	"--num_nodes",
	str(args.num_nodes),
	args.log,
	"mpirun",
	# -np option can be omitted with Torque/PBS
	"-np",
	str(args.num_nodes),
	]
	# arguments for *_train.py
	+ args.args
	+ [
	"--ngpu",
	str(args.ngpu),
	"--multiprocessing_distributed",
	"true",
	"--dist_launcher",
	"mpi",
	]
	+ init_method
	)
	if args.ngpu == 0:
	# Gloo supports both GPU and CPU mode.
	# See: https://pytorch.org/docs/stable/distributed.html
	cmd += ["--dist_backend", "gloo"]
	process = subprocess.Popen(cmd)
	processes.append(process)

	logging.info(f"log file: {args.log}")

	failed = False
	while any(p.returncode is None for p in processes):
	for process in processes:
	# If any process is failed, try to kill the other processes too
	if failed and process.returncode is not None:
	process.kill()
	else:
	try:
	process.wait(0.5)
	except subprocess.TimeoutExpired:
	pass

	if process.returncode is not None and process.returncode != 0:
	failed = True

	for process in processes:
	if process.returncode != 0:
	print(
	subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd),
	file=sys.stderr,
	)
	p = Path(args.log)
	if p.exists():
	with p.open() as f:
	lines = list(f)
	raise RuntimeError(
	f"\n################### The last 1000 lines of {args.log} "
	f"###################\n" + "".join(lines[-1000:])
	)
	else:
	raise RuntimeError


	if __name__ == "__main__":
	main()