Muennighoff committed on
Commit
44f990d
1 Parent(s): 9149d87

Update sbatch_8b7_178b_25b_jz_tmp.sh

Browse files
Files changed (1) hide show
  1. sbatch_8b7_178b_25b_jz_tmp.sh +22 -2
sbatch_8b7_178b_25b_jz_tmp.sh CHANGED
@@ -28,7 +28,10 @@ source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
28
  GPUS_PER_NODE=8
29
  NNODES=$SLURM_NNODES
30
 
31
- DATA_PATH="XXXX"
 
 
 
32
 
33
 
34
 
@@ -171,7 +174,24 @@ echo $CMD
171
 
172
  echo "START $SLURM_JOBID: $(date)"
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  # bash launch_srun.sh $CMD
175
- srun --label launch.sh $CMD
176
 
177
  echo "END $SLURM_JOBID: $(date)"
 
28
  GPUS_PER_NODE=8
29
  NNODES=$SLURM_NNODES
30
 
31
+ TRAIN_DATA_PATH=train55boscar.txt
32
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_55B_text_document"
33
+ VALID_DATA_PATH=val.txt
34
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
35
 
36
 
37
 
 
174
 
175
  echo "START $SLURM_JOBID: $(date)"
176
 
177
+
178
+ ### JZ ###
179
+ export LAUNCHER="python -u -m torch.distributed.run \
180
+ --nproc_per_node $GPUS_PER_NODE \
181
+ --nnodes $NNODES \
182
+ --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
183
+ --rdzv_backend c10d \
184
+ --max_restarts 0 \
185
+ --tee 3 \
186
+ "
187
+ SRUN_ARGS=" \
188
+ --wait=60 \
189
+ --kill-on-bad-exit=1 \
190
+ "
191
+ srun $SRUN_ARGS --jobid $SLURM_JOBID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_log.txt
192
+
193
+ ### LUMI ###
194
  # bash launch_srun.sh $CMD
195
+ # srun --label launch.sh $CMD
196
 
197
  echo "END $SLURM_JOBID: $(date)"