#!/usr/bin/env bash
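#
# Launch a distributed training job with torchrun, either under Slurm or on
# the local machine. A sketch of the expected invocation (flag values and the
# config path below are hypothetical examples, not part of this repo):
#   sbatch --nodes=2 --gres=gpu:8 this_script.sh slurm 2 8 tasks/pretrain.py --cfg config.yaml
#   bash this_script.sh local 1 4 tasks/pretrain.py --cfg config.yaml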

mode=$1    # "slurm" or "local"
nnodes=$2  # number of nodes
ngpus=$3   # GPUs (worker processes) per node
cmd=("${@:4}")  # the command to run, e.g. tasks/pretrain.py ...

if [[ "$mode" == "slurm" ]]; then # slurm
	master_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
	all_nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
	echo "All nodes used: ${all_nodes}"
	echo "Master node: ${master_node}"

	# Resolve the master node's IP address for the c10d rendezvous endpoint.
	head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$master_node" hostname --ip-address | awk '{print $1}')
	# head_node_ip=$master_node
	rdzv_endpoint="${head_node_ip}:${MASTER_PORT:-40000}"
	bin="srun"

else # local
	rdzv_endpoint="${MASTER_ADDR:-localhost}:${MASTER_PORT:-40000}"
	bin=""
fi
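# At this point, assuming the defaults above, rdzv_endpoint looks like
# "<master-ip>:40000" under Slurm or "localhost:40000" locally, and bin is
# "srun" or empty respectively.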

echo "PYTHONPATH: ${PYTHONPATH}"
which_python=$(which python)
echo "which python: ${which_python}"
export PYTHONPATH=${PYTHONPATH}:${which_python}
export PYTHONPATH=${PYTHONPATH}:.
echo "PYTHONPATH: ${PYTHONPATH}"

# Run the command under torchrun (prefixed with srun when on Slurm).
$bin torchrun --nnodes=$nnodes \
	--nproc_per_node=$ngpus \
	--rdzv_backend=c10d \
	--rdzv_endpoint=${rdzv_endpoint} \
	"${cmd[@]}"
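# For the local example at the top of this script, the line above expands to
# roughly (hypothetical values):
#   torchrun --nnodes=1 --nproc_per_node=4 --rdzv_backend=c10d \
#       --rdzv_endpoint=localhost:40000 tasks/pretrain.py --cfg config.yaml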

echo "Finish at dir: ${PWD}"