#!/bin/bash
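# Bring up a Ray cluster across the Slurm allocation and launch a trlx hyperparameter
# sweep (PPO sentiments example by default; an ILQL variant is commented out at the bottom).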
#SBATCH --job-name=trlx-sweep
#SBATCH --account=trlx
#SBATCH --partition=a100-cu117
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=0
#SBATCH --output=%j
#SBATCH --exclusive
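# NCCL / libfabric (AWS EFA) networking settings for multi-node communication.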
export NCCL_DEBUG=WARN
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
# export CUDA_LAUNCH_BLOCKING=1
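# Resolve the allocated hostnames; the first node acts as the Ray head / master.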
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
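# Assumes $TRLX points at the trlx checkout and contains the pinned-ray virtualenv.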
cd $TRLX
source $TRLX/venv-with-pinned-ray/bin/activate
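# Start the Ray head on this node, then attach one Ray worker per remaining node via srun.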
ray start --head --port=6379 &
export HOSTNAMES=($HOSTNAMES)  # re-split the newline-separated list into a bash array
for node in "${HOSTNAMES[@]:1}"; do
    echo "Starting ray worker @ $node"
    srun --nodes=1 --ntasks=1 -w "$node" ray start --address $MASTER_ADDR:6379 --block &
done
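# Give the workers a few seconds to register with the head, then report cluster status.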
sleep 10
ray status
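# Optional sketch (not in the original script): poll the Ray Python API until every
# allocated node has joined, instead of relying on a fixed sleep. Assumes `ray` is
# importable from the activated venv and that ray.nodes() lists one entry per node.
# until [ "$(python -c "import ray, logging; ray.init(address='auto', logging_level=logging.ERROR); print(sum(n['Alive'] for n in ray.nodes()))" 2>/dev/null)" -ge "${#HOSTNAMES[@]}" ]; do
#     sleep 5
# done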
NUM_GPUS=16  # total GPUs across the 2-node allocation (assumes 8 GPUs per node)
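# Run the PPO sweep across the Ray cluster using the zero2 bf16 accelerate config.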
python -m trlx.sweep -y --config configs/sweeps/ppo_sweep.yml --accelerate_config configs/accelerate/zero2-bf16.yaml --num_gpus $NUM_GPUS examples/ppo_sentiments.py
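# Alternative: run the ILQL sweep on the same task instead (uncomment to use).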
# python -m trlx.sweep -y --config configs/sweeps/ilql_sweep.yml --default_config configs/ilql_config.yml --accelerate_config configs/accelerate/zero2-bf16.yaml --num_gpus $NUM_GPUS examples/ilql_sentiments.py