# Provenance: glm / ds_pretrain_nvidia.sh
# Author: guoqiang wang
# Commit: 1635bab ("Upload ds_pretrain_nvidia.sh")
#!/bin/bash
#
# Launch pretrain_glm.py through the DeepSpeed launcher and tee its combined
# stdout/stderr into a timestamped log file under ../logs.
#
# Usage: ds_pretrain_nvidia.sh <config-script>
#
# <config-script> is sourced by this launcher. It is expected to define
# `gpt_options` (the arguments appended after pretrain_glm.py) and, because it
# is sourced after the defaults below, it may override NUM_WORKERS,
# NUM_GPUS_PER_WORKER, MP_SIZE and MASTER_PORT.
#
# Exit status: 2 on usage error, 1 if the config script is unreadable or the
# log directory cannot be created, otherwise the status of the launched
# pipeline (non-zero if either deepspeed or tee fails).

if [[ $# -lt 1 ]]; then
  printf 'Usage: %s <config-script>\n' "${0##*/}" >&2
  exit 2
fi
if [[ ! -r "$1" ]]; then
  printf 'error: cannot read config script: %s\n' "$1" >&2
  exit 1
fi

# Change for multinode config
NUM_WORKERS=32
NUM_GPUS_PER_WORKER=3
# MP_SIZE is not referenced in this script; presumably the sourced config
# reads it when building gpt_options -- confirm against the config files.
# shellcheck disable=SC2034
MP_SIZE=1
# Pick a random port so concurrent launches are unlikely to collide.
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

# shellcheck disable=SC1090  # config path is only known at run time
source "$1"

DATESTR=$(date +"%m-%d-%H-%M")
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
HOST_FILE_PATH="./config/hostfile"

# tee cannot create missing parent directories, so make sure the log
# directory exists before launching anything.
log_dir="../logs"
if ! mkdir -p -- "${log_dir}"; then
  printf 'error: cannot create log directory: %s\n' "${log_dir}" >&2
  exit 1
fi

# The command is kept as a string and run through eval so that any shell
# quoting embedded in gpt_options by the config script is honoured, and so
# that OPTIONS_NCCL is parsed as environment assignments for deepspeed.
run_cmd="${OPTIONS_NCCL} deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_glm.py ${gpt_options} 2>&1 | tee ${log_dir}/log-${DATESTR}.txt"
# Flatten embedded newlines so a multi-line gpt_options still forms a single
# command once the string is passed to eval quoted (no premature globbing).
run_cmd=${run_cmd//$'\n'/ }
printf '%s\n' "${run_cmd}"

# Enabled only after the config has been sourced, so it cannot change how
# pipelines inside the config behave. Without it the pipeline status would be
# tee's, hiding a failed training run.
set -o pipefail
eval "${run_cmd}"
status=$?

# Turn tracing off in case the sourced config enabled it.
set +x
exit "${status}"