#!/bin/bash
#SBATCH --partition=A100
#SBATCH --nodes=1
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
#SBATCH --job-name=your_job_name
#SBATCH --account=your_account_name

module load openmpi
module load cuda/11.8

# NCCL / EFA (libfabric) settings for multi-GPU, multi-node communication
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
export PYTHONWARNINGS="ignore"
export CXX=g++

# Activate the Python environment that holds the training dependencies
source /path/to/your/python/environment/bin/activate

# Use the first node in the allocation as the rendezvous host for distributed training
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
export MASTER_PORT=33751
export PYTHONPATH=./StableWurst

echo "r$SLURM_NODEID master: $MASTER_ADDR"
echo "r$SLURM_NODEID Launching python script"

cd /path/to/your/directory
rm -f dist_file  # remove a leftover dist_file from a previous run, if present
srun python3 train/train_c_lora.py configs/training/finetune_c_3b_lora.yaml
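
# Submitting the job: a minimal sketch, assuming the script above is saved as
# train_c_lora.sbatch (the filename is only an example; adjust the paths,
# partition, and account placeholders first):
#
#   sbatch train_c_lora.sbatch
#
# The queued job can then be checked with `squeue -u $USER`, and stdout/stderr
# land in the Slurm output file for the job.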