guoqiang wang committed on
Commit
1635bab
1 Parent(s): 1278882

Upload ds_pretrain_nvidia.sh

Browse files
Files changed (1) hide show
  1. ds_pretrain_nvidia.sh +21 -0
ds_pretrain_nvidia.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#! /bin/bash
#
# Launch pretrain_glm.py through the DeepSpeed launcher.
#
# Usage: ds_pretrain_nvidia.sh <config-script>
#
#   <config-script>  Shell fragment that is sourced below. It is expected to
#                    define `gpt_options`, the argument string appended to
#                    pretrain_glm.py (this script never assigns it itself).
#
# Output of the launcher (stdout and stderr) is shown on the terminal and
# also written to ../logs/log-<MM-DD-HH-MM>.txt.
#
# Strict mode (set -euo pipefail) is intentionally not enabled: the sourced
# config is outside this file and may rely on the default shell options.

# Change for multinode config

NUM_WORKERS=32
NUM_GPUS_PER_WORKER=3
# Not referenced in this script; it is set before `source` so the config can
# read it. NOTE(review): presumably consumed by the sourced config - confirm.
MP_SIZE=1
# Random port in 10000-65535 for --master_port.
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

# Refuse to launch without a readable config: otherwise `gpt_options` would be
# empty and the job would still start on every node.
if [[ $# -lt 1 ]]; then
  printf 'Usage: %s <config-script>\n' "${0##*/}" >&2
  exit 2
fi
if [[ ! -r "$1" ]]; then
  printf '%s: cannot read config script: %s\n' "${0##*/}" "$1" >&2
  exit 1
fi

# shellcheck source=/dev/null
source "$1"
DATESTR=$(date +"%m-%d-%H-%M")

# Environment assignments placed in front of the deepspeed command.
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
HOST_FILE_PATH="./config/hostfile"

# tee does not create missing directories, so make sure the log directory
# exists before launching; otherwise the run log would be silently lost.
RUN_LOG_DIR="../logs"
if ! mkdir -p -- "${RUN_LOG_DIR}"; then
  printf '%s: cannot create log directory: %s\n' "${0##*/}" "${RUN_LOG_DIR}" >&2
  exit 1
fi

run_cmd="${OPTIONS_NCCL} deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_glm.py ${gpt_options} 2>&1 | tee ${RUN_LOG_DIR}/log-${DATESTR}.txt"

# Show the exact command before running it.
printf '%s\n' "${run_cmd}"

# eval is required because run_cmd carries leading VAR=value assignments, a
# redirection and a pipe, none of which survive plain variable expansion.
# The expansion is deliberately left unquoted: word splitting turns any
# newlines inside gpt_options into spaces before eval re-parses the line
# (quoting would make them command separators). The config script is trusted
# input; do not point this launcher at a file you have not reviewed.
# shellcheck disable=SC2086
eval ${run_cmd}

# Turning tracing off is a no-op unless the caller enabled it. As the last
# command it also makes the script exit 0.
# NOTE(review): the launcher's exit status is therefore not propagated to the
# caller - confirm whether schedulers depend on that before changing it.
set +x