Spaces:
Running
on
Zero
Running
on
Zero
File size: 335 Bytes
2d9a728 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
#!/bin/bash
#
# Launch torchrun inside a SLURM allocation, using the first host of the job's
# node list as the rendezvous endpoint.
#
# Usage: <this script> [torchrun options] <training script> [script args...]
#        Every argument is forwarded to torchrun unchanged.
# Env:   SLURM_JOB_NODELIST  node list, expanded via `scontrol show hostnames`
#        RDZV_PORT           optional rendezvous port override (default: 10069)
# Exit:  1 if no host name could be resolved; otherwise torchrun's exit status.

# The endpoint port is a fixed value rather than one derived from $RANDOM:
# every bash process draws its own $RANDOM sequence, so separate copies of this
# script could not agree on a randomly chosen port.
RDZV_PORT="${RDZV_PORT:-10069}"

# Expand the node list once; one host name per array element.
mapfile -t ALL_NODES < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
MASTER_NODE="${ALL_NODES[0]}"

# Without a master host the endpoint would be ":<port>"; fail with a clear
# message instead of handing torchrun a malformed endpoint.
if [[ -z "${MASTER_NODE}" ]]; then
  printf 'error: no hosts resolved from SLURM_JOB_NODELIST=%s\n' \
    "${SLURM_JOB_NODELIST}" >&2
  exit 1
fi

printf '%s\n' "All nodes used:"
# "${ALL_NODES[*]}" joins the host names with spaces on a single line.
printf '%s\n' "${ALL_NODES[*]}"
printf '%s\n' "Master node:"
printf '%s\n' "${MASTER_NODE}"
printf '%s\n' "Args:"
printf '%s\n' "$*"

# "$@" keeps each argument as its own word, so arguments containing spaces or
# glob characters reach torchrun exactly as they were passed in.
torchrun --rdzv_endpoint="${MASTER_NODE}:${RDZV_PORT}" "$@"
|