Muennighoff
committed on
Commit
•
44f990d
1
Parent(s):
9149d87
Update sbatch_8b7_178b_25b_jz_tmp.sh
Browse files
sbatch_8b7_178b_25b_jz_tmp.sh
CHANGED
@@ -28,7 +28,10 @@ source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
|
|
28 |
GPUS_PER_NODE=8
|
29 |
NNODES=$SLURM_NNODES
|
30 |
|
31 |
-
|
|
|
|
|
|
|
32 |
|
33 |
|
34 |
|
@@ -171,7 +174,24 @@ echo $CMD
|
|
171 |
|
172 |
echo "START $SLURM_JOBID: $(date)"
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
# bash launch_srun.sh $CMD
|
175 |
-
srun --label launch.sh $CMD
|
176 |
|
177 |
echo "END $SLURM_JOBID: $(date)"
|
|
|
28 |
GPUS_PER_NODE=8
|
29 |
NNODES=$SLURM_NNODES
|
30 |
|
31 |
+
TRAIN_DATA_PATH=train55boscar.txt
|
32 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_55B_text_document"
|
33 |
+
VALID_DATA_PATH=val.txt
|
34 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
35 |
|
36 |
|
37 |
|
|
|
174 |
|
175 |
echo "START $SLURM_JOBID: $(date)"
|
176 |
|
177 |
+
|
178 |
+
### JZ ###
|
179 |
+
export LAUNCHER="python -u -m torch.distributed.run \
|
180 |
+
--nproc_per_node $GPUS_PER_NODE \
|
181 |
+
--nnodes $NNODES \
|
182 |
+
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
|
183 |
+
--rdzv_backend c10d \
|
184 |
+
--max_restarts 0 \
|
185 |
+
--tee 3 \
|
186 |
+
"
|
187 |
+
SRUN_ARGS=" \
|
188 |
+
--wait=60 \
|
189 |
+
--kill-on-bad-exit=1 \
|
190 |
+
"
|
191 |
+
srun $SRUN_ARGS --jobid $SLURM_JOBID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_log.txt
|
192 |
+
|
193 |
+
### LUMI ###
|
194 |
# bash launch_srun.sh $CMD
|
195 |
+
# srun --label launch.sh $CMD
|
196 |
|
197 |
echo "END $SLURM_JOBID: $(date)"
|