Muennighoff
committed on
Commit
•
37bbf74
1
Parent(s):
7a006aa
Upload srun_83m.sh
Browse files - srun_83m.sh +5 -21
srun_83m.sh
CHANGED
@@ -1,19 +1,5 @@
|
|
1 |
-
#!/bin/bash
|
2 |
-
#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
|
3 |
-
#SBATCH --nodes=8
|
4 |
-
#SBATCH --ntasks-per-node=1
|
5 |
-
#SBATCH --cpus-per-task=32
|
6 |
-
#SBATCH --mem=256G
|
7 |
-
#SBATCH -p eap
|
8 |
-
#SBATCH -t 2-0:00:00
|
9 |
-
#SBATCH --gpus-per-node=mi250:8
|
10 |
-
#SBATCH --exclusive=user
|
11 |
-
#SBATCH --hint=nomultithread
|
12 |
-
#SBATCH --account=project_462000119
|
13 |
-
#SBATCH -o logs/%j.out
|
14 |
-
#SBATCH -e logs/%j.err
|
15 |
-
|
16 |
SLURM_JOB_NUM_NODES=32
|
|
|
17 |
|
18 |
# if run without sbatch, invoke here
|
19 |
if [ -z $SLURM_JOB_ID ]; then
|
@@ -28,18 +14,16 @@ set -euo pipefail
|
|
28 |
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
29 |
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
30 |
|
31 |
-
KILL_SWITCH_PATH=kill-switch
|
32 |
-
CHECKPOINT_PATH=
|
33 |
-
TENSORBOARD_PATH=
|
34 |
-
# Start from scratch
|
35 |
-
rm -rf "$CHECKPOINT_PATH" "$TENSORBOARD_PATH"
|
36 |
mkdir -p $CHECKPOINT_PATH
|
37 |
mkdir -p $TENSORBOARD_PATH
|
38 |
|
39 |
# Data
|
40 |
VOCAB_FILE="gpt2/vocab.json"
|
41 |
MERGE_FILE="gpt2/merges.txt"
|
42 |
-
DATA_PATH="/scratch/project_462000119/data/pile/megatron_data"
|
43 |
|
44 |
PP_SIZE=1
|
45 |
TP_SIZE=1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
SLURM_JOB_NUM_NODES=32
|
2 |
+
VARIANT=83m
|
3 |
|
4 |
# if run without sbatch, invoke here
|
5 |
if [ -z $SLURM_JOB_ID ]; then
|
|
|
14 |
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
15 |
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
16 |
|
17 |
+
KILL_SWITCH_PATH=kill-switch-$VARIANT
|
18 |
+
CHECKPOINT_PATH=checkpoints_$VARIANT
|
19 |
+
TENSORBOARD_PATH=tensorboard_$VARIANT
|
|
|
|
|
20 |
mkdir -p $CHECKPOINT_PATH
|
21 |
mkdir -p $TENSORBOARD_PATH
|
22 |
|
23 |
# Data
|
24 |
VOCAB_FILE="gpt2/vocab.json"
|
25 |
MERGE_FILE="gpt2/merges.txt"
|
26 |
+
DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
|
27 |
|
28 |
PP_SIZE=1
|
29 |
TP_SIZE=1
|