#!/bin/bash
#SBATCH --partition=g40423
#SBATCH --job-name=testopenclip
#SBATCH --nodes 30
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --output=%x_%j.out
#SBATCH --comment=laion
#SBATCH --open-mode=append
#SBATCH --exclusive
#
# SLURM batch script: launches open_clip's `training.main` through srun on
# the allocation requested above (30 nodes x 8 tasks per node).
#
# NOTE: the #SBATCH directives must each stay on their own line, ahead of the
# first executable command, or sbatch will not pick them up.
#
# Usage: sbatch <this-script>

module load openmpi
module load cuda/11.7

# Rendezvous endpoint for the distributed job: the host running this batch
# script, on a fixed port.
# Assignment is kept separate from `export` so a failing `hostname` is not
# masked by export's own (always zero) exit status.
MASTER_ADDR=$(hostname)
export MASTER_ADDR
export MASTER_PORT=12802

# NCCL / libfabric (EFA provider) / Open MPI settings.
# NOTE(review): values are carried over unchanged; confirm them against the
# cluster's current networking guidance before tuning.
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1
export NCCL_DEBUG=info
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# Abort instead of launching the whole allocation from the wrong directory.
cd /admin/home-mitchellw/open_clip/src || {
  printf 'error: cannot cd to %s\n' /admin/home-mitchellw/open_clip/src >&2
  exit 1
}

# Append the source tree to PYTHONPATH; the ${VAR:+...} form avoids creating a
# leading empty entry when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/admin/home-mitchellw/open_clip/src"

# Run name; passed to the trainer via --name.
# "bs90k" presumably refers to the global batch: 30 nodes * 8 tasks * 375
# (--batch-size) = 90,000 -- assumes --batch-size is per task; verify.
EXP_NAME="test-B-32-laion5b-lr1e-3-bs90k"

# The braces in --train-data sit inside double quotes, so bash passes them
# through literally; they are left for the data loader to expand.
srun --comment laion --cpu_bind=v --accel-bind=gn python -m training.main \
  --save-frequency 1 \
  --train-data="pipe:aws s3 cp s3://s-datasets/laion5b/{laion2B-data/{000000..231349}.tar,laion2B-multi-data/{000000..226687}.tar,laion1B-nolang-data/{000000..127231}.tar} -" \
  --train-num-samples 135646078 \
  --dataset-type webdataset \
  --dataset-resampled \
  --warmup 2000 \
  --batch-size=375 \
  --epochs=97 \
  --lr 1e-3 \
  --workers=8 \
  --report-to wandb \
  --name "${EXP_NAME}" \
  --logs /scratch/logs/ \
  --model ViT-B-32 \
  --seed 0 \
  --ddp-static-graph \
  --local-loss \
  --gather-with-grad \
  --grad-checkpointing \
  --precision amp_bfloat16 \
  --wandb-project-name open_clip6 \
  --resume "latest" \
  --remote-sync s3://s-laion/mitchellw/logs