#!/bin/bash
#SBATCH --job-name=alpaca-7 # Specify job name
#SBATCH --partition=pgpu # Specify partition name
#SBATCH --mem=0 # Use entire memory of node
#SBATCH --gres=gpu:8 # Generic resources; 8 GPUs
#SBATCH --exclusive # Do not share node
#SBATCH --time=48:00:00 # Set a limit on the total run time
#SBATCH --output=logs_alp-7.o%j # File name for standard output
#SBATCH --error=errors_alp-7.e%j # File name for standard error output
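
# Submit this script to the scheduler with `sbatch`, e.g.:
#   sbatch train_alpaca_7b.slurm
# (the file name here is illustrative; use whatever name this script is saved under)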
cd /path/to/gitrepo
# activate conda environment
source /home/user/miniconda3/etc/profile.d/conda.sh
conda activate medalpaca
# it is recommended to set the Hugging Face cache dir manually, as the model files are huge
export HF_HOME="/path/to/your/hfcache"
# feel free to adapt the command below to run the training
# in 8-bit with LoRA, fp16 with LoRA, or bf16 with FSDP
# (an example 8-bit LoRA variant is sketched after the main command)
torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
    --model 'decapoda-research/llama-7b-hf' \
    --data_path 'medical_meadow_small.json' \
    --output_dir './lora-alpaca-7b' \
    --train_in_8bit False \
    --use_lora False \
    --bf16 True \
    --tf32 True \
    --fp16 False \
    --gradient_checkpointing True \
    --global_batch_size 256 \
    --per_device_batch_size 4 \
    --wandb_project 'medalpaca' \
    --use_wandb False
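
# Example variant (a sketch, not part of the original script): the same run
# in 8-bit with LoRA, which needs far less GPU memory. Only flags already
# used above are changed; the '-8bit' output dir name is illustrative.
# torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
#     --model 'decapoda-research/llama-7b-hf' \
#     --data_path 'medical_meadow_small.json' \
#     --output_dir './lora-alpaca-7b-8bit' \
#     --train_in_8bit True \
#     --use_lora True \
#     --bf16 False \
#     --tf32 True \
#     --fp16 True \
#     --gradient_checkpointing True \
#     --global_batch_size 256 \
#     --per_device_batch_size 4 \
#     --wandb_project 'medalpaca' \
#     --use_wandb False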