#!/bin/bash
#SBATCH --exclude=nid005159
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=256G
#SBATCH -p small-g
#SBATCH -t 2-0:00:00
#SBATCH --gpus-per-node=mi250:0
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

# if run without sbatch, invoke here
if [ -z "${SLURM_JOB_ID:-}" ]; then
    mkdir -p logs
    sbatch "$0"
    exit
fi

set -euo pipefail

# symlink logs/latest_eval.out and logs/latest_eval.err
ln -f -s $SLURM_JOB_ID.out logs/latest_eval.out
ln -f -s $SLURM_JOB_ID.err logs/latest_eval.err

source /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/bin/activate
echo "START TIME: $(date)"

# defining the right environment variables
export HF_DATASETS_OFFLINE=1
export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache

# Converted transformer checkpoint
# cd /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/bigscience/lm-evaluation-harness

CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/global_step80108
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b84boscar/global_step80108
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/global_step84877
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/global_step84877
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/global_step84877
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/global_step84877
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/global_step84877
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/global_step84877
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/global_step84877
)
CKPTSX=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b84bc4seed1/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b84bc4seed2/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b84bc4seed3/global_step80108
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b84bc4seed4/global_step80108
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b1b25c4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b4bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b9bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b11bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b14bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b18bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b28bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b55bc4/global_step52452
)
CKPTSX=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b1b25oscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b4boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b9boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b11boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b14boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b18boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b28boscar/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscar-repetitions/2b855b55boscar/global_step52452
)
CKPTSX=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b55bc4seed1/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b55bc4seed2/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b55bc4seed3/global_step52452
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b14bc4/global_step52452
)
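
# NOTE: each CKPTS / CKPTSX assignment in this file overwrites the previous one, so
# only the last CKPTS assignment below feeds the submission loop at the bottom (the
# CKPTSX arrays are never referenced and just park alternative groups). To evaluate
# several groups in one pass, one could accumulate them instead; a hypothetical
# sketch, not how this script is currently set up:
# ALL_CKPTS+=( "${CKPTS[@]}" )   # append the currently selected group
# CKPTS=( "${ALL_CKPTS[@]}" )    # then hand the combined list to the loop below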
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b1b25c4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b4bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b9bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b11bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b14bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b18bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b28bc4/global_step52452
/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4-repetitions/2b855b55bc4/global_step52452
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/dec-2022-ul2/lm4-2b8-55b-c4/global_step52452
)
CKPTS=(
/pfs/lustrep4/scratch/project_462000119/muennighoff/dec-2022-ul2/checkpoints_2b855b55bc4ul2ndfixnew/global_step52452
)

FEWSHOT_CONFIGS=( 0 1 2 3 4 5 )

TOKENIZER=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2

# Iterate through all combinations of model ckpt & fewshot config and submit one job per pair
for ((i=0; i<${#CKPTS[@]}; i++)); do
    for ((j=0; j<${#FEWSHOT_CONFIGS[@]}; j++)); do
        #echo "sbatch --export=CKPT=${CKPTS[$i]},FEWSHOT_CONFIG=${FEWSHOT_CONFIGS[$j]},DATASET=${DATASETS[$k]} eval.sh"
        MODEL_CKPT=${CKPTS[$i]}
        # e.g. .../2b855b55bc4/global_step52452 -> MODEL_CKPT_NO_STEP=.../2b855b55bc4, MODEL_NAME=2b855b55bc4
        MODEL_CKPT_NO_STEP=${MODEL_CKPT%/*}
        MODEL_NAME=${MODEL_CKPT_NO_STEP##*/}
        mkdir -p $MODEL_CKPT_NO_STEP/evaluation/rankeval_r_denoiser
        #mv $MODEL_CKPT_NO_STEP/evaluation/$MODEL_NAME\_${FEWSHOT_CONFIGS[$j]}.* $MODEL_CKPT_NO_STEP/evaluation/rankeval/
        OUTPUT_PATH=$MODEL_CKPT_NO_STEP/evaluation/rankeval_r_denoiser/$MODEL_NAME\_${FEWSHOT_CONFIGS[$j]}.json

        eval_script="./eval_$i-$j.slurm"
        cat <<EOT > $eval_script
#!/bin/bash
#SBATCH --exclude=nid005159
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=256G
#SBATCH -p small-g
#SBATCH -t 2-0:00:00
#SBATCH --gpus-per-node=mi250:1
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

export HF_DATASETS_OFFLINE=1
export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache

VOCAB_FILE="gpt2/vocab.json"
MERGE_FILE="gpt2/merges.txt"

PP_SIZE=1
TP_SIZE=1
# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS
# make as big as it can fit into gpu w/o OOM, but not too close to 100%
EVAL_MICRO_BATCH_SIZE=1
MICRO_BS_MULTIPLIER=1

# Model parameters
SEQ_LEN=2048

# Dummy arguments
MEGATRON_REQUIRED_ARGS=" \
    --num-layers -1 \
    --hidden-size -1 \
    --num-attention-heads -1 \
    --seq-length -1 \
    --max-position-embeddings -1 \
"

ZERO_STAGE=0
mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/\$SLURM_JOB_ID.json"
cat <<EOF > "\$DS_CONFIG_PATH"
{
    "train_micro_batch_size_per_gpu": 1,
    "train_batch_size": 1,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": \$ZERO_STAGE
    },
    "bf16": {
        "enabled": true
    },
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
EOF

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config \$DS_CONFIG_PATH \
    --zero-stage \$ZERO_STAGE \
"

CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate_prefix_ul2.py \
    --load $MODEL_CKPT \
    --results_path $OUTPUT_PATH \
    --tensor-model-parallel-size \$TP_SIZE \
    --pipeline-model-parallel-size \$PP_SIZE \
    --vocab-file \$VOCAB_FILE \
    --merge-file \$MERGE_FILE \
    --micro-batch-size \$EVAL_MICRO_BATCH_SIZE \
    --no-load-optim \
    --no-load-rng \
    --bf16 \
    --inference \
    --seq-length \$SEQ_LEN \
    --task_list anli_r1,anli_r2,anli_r3,cb,copa,hellaswag,rte,winogrande,storycloze_2016,boolq,arc_easy,arc_challenge,sciq,piqa \
    --intermed_results \
    --adaptive_seq_len \
    --micro_bs_multiplier \$MICRO_BS_MULTIPLIER \
    --fewshots ${FEWSHOT_CONFIGS[$j]} \
    --prefix \
    \$MEGATRON_REQUIRED_ARGS \
    \$DEEPSPEED_ARGS \
    "

echo "\$CMD"

echo "START \$SLURM_JOBID: \$(date)"
srun --label launch.sh \$CMD
echo "END \$SLURM_JOBID: \$(date)"
EOT
        sbatch $eval_script
        # Sleep for a bit to avoid hitting the job submission limit
        sleep 0.1
    done
done
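
# Optional post-hoc check (a minimal sketch, not part of the submission flow above):
# once the generated jobs finish, the result JSONs land under
# <ckpt_dir>/evaluation/rankeval_r_denoiser/. The "results" key below is an
# assumption based on the usual lm-evaluation-harness output layout and may differ
# for this fork's evaluate_prefix_ul2.py.
# for f in "${CKPTS[0]%/*}"/evaluation/rankeval_r_denoiser/*.json; do
#     echo "== $f =="
#     python -c 'import json,sys; print(json.dumps(json.load(open(sys.argv[1])).get("results", {}), indent=2))' "$f"
# done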