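The following Slurm batch script brings up a Ray cluster across the allocated nodes and then launches the sweep from the head node: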
```sh
#!/bin/bash
#SBATCH --job-name=trlx-sweep
#SBATCH --account=trlx
#SBATCH --partition=a100-cu117
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=0
#SBATCH --output=%j
#SBATCH --exclusive

# NCCL and EFA settings for multi-node communication
export NCCL_DEBUG=WARN
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
# export CUDA_LAUNCH_BLOCKING=1

# The first node in the allocation acts as the Ray head node
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)

cd "$TRLX"
source "$TRLX/venv-with-pinned-ray/bin/activate"

# Start the Ray head on this node
ray start --head --port=6379 &

# Start a Ray worker on every other node in the allocation
# (plain assignment here: bash arrays cannot be exported)
HOSTNAMES=($HOSTNAMES)
for node in "${HOSTNAMES[@]:1}"; do
    echo "Starting ray worker @ $node"
    srun --nodes=1 --ntasks=1 -w "$node" ray start --address "$MASTER_ADDR:6379" --block &
done

# Give the workers time to join, then verify the cluster
sleep 10
ray status

# Total GPU count across the 2-node allocation
NUM_GPUS=16
python -m trlx.sweep -y --config configs/sweeps/ppo_sweep.yml --accelerate_config configs/accelerate/zero2-bf16.yaml --num_gpus $NUM_GPUS examples/ppo_sentiments.py
# or, to sweep ILQL instead:
# python -m trlx.sweep -y --config configs/sweeps/ilql_sweep.yml --default_config configs/ilql_config.yml --accelerate_config configs/accelerate/zero2-bf16.yaml --num_gpus $NUM_GPUS examples/ilql_sentiments.py
```
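Assuming the script above is saved as `sweep.sbatch` (the filename is arbitrary), submitting and monitoring it is a standard Slurm workflow:

```sh
# Submit the job; Slurm allocates the two exclusive nodes requested above
sbatch sweep.sbatch

# Watch the queue until the job starts
squeue -u "$USER"

# Because of `#SBATCH --output=%j`, all output lands in a file named after
# the numeric job id, in the directory the job was submitted from
tail -f <job-id>
```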
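Note that `NUM_GPUS=16` has to be kept in sync with the `--nodes` count by hand. A minimal sketch that derives it from the Slurm environment instead, assuming the cluster populates `SLURM_GPUS_ON_NODE`:

```sh
# Hypothetical alternative: derive the GPU count from the allocation rather
# than hardcoding it. SLURM_GPUS_ON_NODE is typically only set when GPUs are
# requested explicitly (e.g. --gres=gpu:8), so check your cluster first.
NUM_GPUS=$(( SLURM_JOB_NUM_NODES * SLURM_GPUS_ON_NODE ))
echo "Sweeping across $NUM_GPUS GPUs"
```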