guoqiang wang
committed on
Commit
•
1635bab
1
Parent(s):
1278882
Upload ds_pretrain_nvidia.sh
Browse files- ds_pretrain_nvidia.sh +21 -0
ds_pretrain_nvidia.sh
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#! /bin/bash
|
2 |
+
|
3 |
+
# Change for multinode config
|
4 |
+
|
5 |
+
# Number of nodes in the job; handed to deepspeed as --num_nodes below.
NUM_WORKERS=32
|
6 |
+
# GPUs used on each node; handed to deepspeed as --num_gpus below.
NUM_GPUS_PER_WORKER=3
|
7 |
+
# Not referenced anywhere else in this script; presumably read by the config
# file sourced from $1 -- TODO confirm.
MP_SIZE=1
|
8 |
+
# Draw one random port from 10000-65535 for deepspeed's --master_port.
MASTER_PORT="$(shuf -i 10000-65535 -n 1)"
|
9 |
+
|
10 |
+
# Load the run configuration passed as the first argument. The expansion is
# quoted so a path containing spaces or glob characters reaches `source`
# intact. gpt_options, used in the launch command, is presumably defined by
# this file -- verify against the config scripts.
source "$1"
|
11 |
+
# Timestamp (month-day-hour-minute) used in the log file name.
DATESTR="$(date '+%m-%d-%H-%M')"
|
12 |
+
|
13 |
+
# NCCL settings written as VAR=value words; they are placed in front of the
# deepspeed command so that, once run_cmd is eval'd, they act as per-command
# environment assignments.
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
|
14 |
+
# Hostfile handed to deepspeed via --hostfile; the path is relative to the
# directory the script is run from.
HOST_FILE_PATH="./config/hostfile"
|
15 |
+
|
16 |
+
# Make sure the directory that tee writes the log file into exists; -p keeps
# this a no-op when it is already there.
mkdir -p ../logs
|
17 |
+
# Assemble the whole launch pipeline as one string: the NCCL environment
# assignments, the deepspeed launcher with its port/node/GPU/hostfile settings,
# the training script with ${gpt_options}, and a tee of combined stdout/stderr
# into a timestamped log file. gpt_options is not set in this script;
# presumably the file sourced from $1 defines it -- verify.
run_cmd="${OPTIONS_NCCL} deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_glm.py ${gpt_options} 2>&1 | tee ../logs/log-${DATESTR}.txt"
|
18 |
+
# Print the command before running it. Quoted so it is shown verbatim, with no
# word splitting or glob expansion altering what is displayed.
echo "${run_cmd}"
|
19 |
+
# eval is needed here: run_cmd carries leading VAR=value assignments, a
# redirection and a pipe, which only take effect when the string is re-parsed
# by the shell.
# NOTE(review): the expansion is unquoted, so the string is word-split and
# glob-expanded before eval joins the words back together with spaces.
eval ${run_cmd}
|
20 |
+
|
21 |
+
# Turn command tracing off. No `set -x` appears earlier in this script, so this
# only has an effect if tracing was enabled elsewhere (e.g. by the sourced
# config) -- TODO confirm whether it is still needed.
set +x
|