Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,101 Bytes
2d9a728 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
#!/usr/bin/env bash
# Launch a (possibly multi-node) torchrun job, either under Slurm or locally.
#
# Usage: <this-script> <mode> <nnodes> <ngpus> <cmd...>
#   mode    "slurm" or "local"
#   nnodes  number of nodes
#   ngpus   processes (GPUs) per node
#   cmd...  training entry point and its args, e.g. tasks/pretrain.py ...
#
# Environment:
#   MASTER_PORT          rendezvous port (default 40000)
#   MASTER_ADDR          rendezvous host in local mode (default localhost)
#   SLURM_JOB_NODELIST   required in slurm mode (set by Slurm)
set -euo pipefail

mode=$1
nnodes=$2
ngpus=$3
cmd=("${@:4}")  # keep as an array so arguments containing spaces survive

use_srun=0
if [[ "$mode" == "slurm" ]]; then
  # Use the first node of the allocation as the c10d rendezvous host.
  master_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
  all_nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
  echo "All nodes used: ${all_nodes}"
  echo "Master node ${master_node}"
  # Resolve the master node's IP; awk keeps only the first address reported.
  head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$master_node" hostname --ip-address | awk '{print $1}')
  # head_node_ip=$master_node
  rdzv_endpoint="${head_node_ip}:${MASTER_PORT:-40000}"
  use_srun=1
else # local
  rdzv_endpoint="${MASTER_ADDR:-localhost}:${MASTER_PORT:-40000}"
fi

echo "PYTHONPATH: ${PYTHONPATH:-}"
which_python=$(command -v python)
echo "which python: ${which_python}"
# NOTE(review): the original appends the python *binary path* to PYTHONPATH,
# which is almost certainly a no-op for imports; kept for compatibility.
export PYTHONPATH="${PYTHONPATH:-}:${which_python}"
export PYTHONPATH="${PYTHONPATH}:."
echo "PYTHONPATH: ${PYTHONPATH}"

# Build the launch command as an array (avoids word-splitting pitfalls),
# prefixing with srun only in slurm mode.
run=(torchrun --nnodes="$nnodes" \
  --nproc_per_node="$ngpus" \
  --rdzv_backend=c10d \
  --rdzv_endpoint="${rdzv_endpoint}" \
  "${cmd[@]}")
if (( use_srun )); then
  run=(srun "${run[@]}")
fi
"${run[@]}"

echo "Finish at dir: ${PWD}"
############### ======> Your training scripts [END]
|