#!/bin/bash
#SBATCH --exclude=nid005159
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH -p small-g
#SBATCH -t 2-0:00:00
#SBATCH --gpus-per-node=mi250:1
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err
# If run directly rather than via sbatch, submit this script as a batch job and exit.
if [ -z "${SLURM_JOB_ID:-}" ]; then
mkdir -p logs
sbatch "$0"
exit
fi
set -euo pipefail
# symlink logs/latest_eval.out and logs/latest_eval.err
ln -f -s "$SLURM_JOB_ID.out" logs/latest_eval.out
ln -f -s "$SLURM_JOB_ID.err" logs/latest_eval.err
# Data: candidate checkpoints. Assignments below override one another,
# so only the last uncommented CHECKPOINT_PATH/VARIANT pair takes effect.
#CHECKPOINT_PATH=/scratch/project_462000119/muennighoff/nov-2022-optimization/checkpoints/global_step10
#VARIANT=global_step10
CHECKPOINT_PATH=lm1-220m/global_step14324
VARIANT=lm1-220m
CHECKPOINT_PATH=lm1-220m-7b5-oscar/global_step14324
VARIANT=lm1-220m-7b5-oscar
#CHECKPOINT_PATH=lm1-280m/global_step11269
#VARIANT=lm1-280m-5b9
#CHECKPOINT_PATH=lm1-280m-5b9-oscar/global_step11269
#VARIANT=lm1-280m-5b9-oscar
CHECKPOINT_PATH=lm1-1b1-21b-oscar/global_step39672
VARIANT=lm1-1b1-21b-oscar
#CHECKPOINT_PATH=lm1-1b1-21b/global_step39672
#VARIANT=lm1-1b1-21b
#CHECKPOINT_PATH=lm1-2b8-55b-oscar/global_step52452
#VARIANT=lm1-2b8-55b-oscar
#CHECKPOINT_PATH=lm1-2b8-55b/global_step52452
#VARIANT=lm1-2b8-55b
#CHECKPOINT_PATH=lm1-3b9-77b/global_step73814
#VARIANT=lm1-3b9-77b
#CHECKPOINT_PATH=lm1-1b1-21b-c4/global_step39672
#VARIANT=lm1-1b1-21b-c4
# tensorboard_2b855b11bc4 tensorboard_2b855b14bc4 tensorboard_2b855b18bc4 tensorboard_2b855b28bc4 tensorboard_2b855b9bc4
#2b855b50c4py 2b855b60c4py 2b855b70c4py 2b855b80c4py 2b855b90c4py
VARIANT=2b855b70c4py
CHECKPOINT_PATH=lm1-2b8-55b-c4py/$VARIANT/global_step52452
#2b855b11bc4 2b855b14bc4 2b855b18bc4 2b855b28bc4 2b855b9bc4
#VARIANT=2b855b9boscar
#CHECKPOINT_PATH=lm1-2b8-55b-oscar-repetitions/$VARIANT/global_step52452
#VARIANT=realtasky
#CHECKPOINT_PATH=checkpoints_2b855brealtasky/global_step52452
#CHECKPOINT_PATH=lm1-2b8-55b-c4-repetitions/2b855b55bc4/global_step52452
CHECKPOINT_PATH=checkpoints_2b855b55bc4ul2valfast/global_step52452
VARIANT=ul2valfast
CHECKPOINT_PATH=lm2-2b8-55b-c4-new/global_step52452
VARIANT=ul2new
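# Fail fast if the selected checkpoint directory does not exist.
if [ ! -d "$CHECKPOINT_PATH" ]; then
    echo "Checkpoint path not found: $CHECKPOINT_PATH" >&2
    exit 1
fi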
export HF_DATASETS_OFFLINE=1
export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache
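# With HF_DATASETS_OFFLINE=1 the harness can only read datasets already present
# in HF_DATASETS_CACHE. A hypothetical one-time warm-up, run on a node with
# network access, might look like:
#   HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache \
#     python -c "from datasets import load_dataset; load_dataset('super_glue', 'copa')"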
VOCAB_FILE="gpt2/vocab.json"
MERGE_FILE="gpt2/merges.txt"
PP_SIZE=1
TP_SIZE=1
# Unlike the training MICRO_BATCH_SIZE, no optimizer state is held in memory,
# so a larger batch size fits. Make it as big as possible without OOM,
# but leave some GPU memory headroom.
EVAL_MICRO_BATCH_SIZE=1
MICRO_BS_MULTIPLIER=1
# Model parameters
SEQ_LEN=2048
# Dummy arguments: Megatron requires these flags on the command line, but the
# evaluation script overrides them with the values stored in the checkpoint,
# so -1 placeholders are fine here.
MEGATRON_REQUIRED_ARGS=" \
--num-layers -1 \
--hidden-size -1 \
--num-attention-heads -1 \
--seq-length -1 \
--max-position-embeddings -1 \
"
ZERO_STAGE=0
mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
cat <<EOF > "$DS_CONFIG_PATH"
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOF
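# Optional sanity check (assumes python is on PATH at this point): uncomment to
# fail fast, via set -e, if the generated DeepSpeed config is not valid JSON.
# python -c "import json; json.load(open('$DS_CONFIG_PATH'))"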
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config $DS_CONFIG_PATH \
--zero-stage $ZERO_STAGE \
"
CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH \
--results_path $VARIANT-results.json \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--micro-batch-size $EVAL_MICRO_BATCH_SIZE \
--no-load-optim \
--no-load-rng \
--bf16 \
--inference \
--seq-length $SEQ_LEN \
--task_list anli_r1,anli_r2,anli_r3,cb,copa,hellaswag,rte,winogrande,storycloze_2016,boolq,arc_easy,arc_challenge,sciq,piqa \
--intermed_results \
--adaptive_seq_len \
--add_denoiser \
--micro_bs_multiplier $MICRO_BS_MULTIPLIER \
$MEGATRON_REQUIRED_ARGS \
$DEEPSPEED_ARGS \
"
echo "$CMD"
echo "START $SLURM_JOBID: $(date)"
srun --label launch.sh $CMD
echo "END $SLURM_JOBID: $(date)"