HPSv2 / configs /controller.sh
tgxs002's picture
init
54199b6
raw
history blame
2.09 kB
exp=${1:-'test'}
gpu=${2:-'1'}
type=${3:-'local'} # choose slurm if you are running on a cluster with slurm scheduler
if [ "$type" == 'local' ]; then
extra_args=${@:4:99}
else
quotatype=${4:-'auto'} # for slurm
partition=${5:-'1'} # for slurm
extra_args=${@:6:99}
quotatype=spot
partition=YOUR_PARTITION
extra_args=${@:4:99}
fi
name=${name/#configs/logs}
name=${name//.sh//$exp}
work_dir="${name}"
now=$(date +"%Y%m%d_%H%M%S")
mkdir -p $work_dir
ncpu='4'
if [ "$quotatype" == 'reserved_normal' ]; then
quotatype='reserved --phx-priority=${gpu} normal'
fi
if [ "$type" == 'local' ]; then
ava_path=/mnt/afs/xswu/datasets/AVA/images
local_data_path=/mnt/afs/xswu/datasets/preference
local_ava_path=/mnt/afs/xswu/datasets/AVA
local_simulacra_path=/mnt/afs/xswu/datasets/simulacra
local_region_path=/mnt/afs/xswu/datasets/regional_dataset
local_ranking_path=/mnt/afs/xswu/datasets/HPDv2
local_benchmark_path=/mnt/afs/xswu/datasets/benchmark
local_ImageReward_path=/mnt/afs/xswu/datasets/ImageReward
local_pap_path=/mnt/afs/xswu/datasets/PAP
header="torchrun --nproc_per_node=${gpu} --nnodes=1 --max_restarts=3 -m src.training.main "
else
data_path=s3://preference_images/
ava_path=s3://AVA/
simulacra_path=s3://simulacra/
region_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/regional_dataset/
local_data_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/human_preference
local_ava_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/AVA
local_simulacra_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/simulacra
local_region_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/regional_dataset
local_ranking_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/ranking_dataset
local_benchmark_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/benchmark
local_ImageReward_path=/mnt/lustre/wuxiaoshi1.vendor/datasets/ImageReward
header="srun --async --partition=$partition -n${gpu} --mpi=pmi2 --gres=gpu:$gpu --ntasks-per-node=${gpu} --quotatype=$quotatype \
--job-name=$exp --cpus-per-task=$ncpu --kill-on-bad-exit=1 -o local.out python -m src.training.main "
fi