#!/bin/bash
# Sequential benchmark runner - runs models one at a time to avoid disk quota issues
#
# Submit with sbatch; the directives below request one GPU, 64G of memory,
# 4 CPUs and a 12-hour wall clock. "%j" in the log paths expands to the
# SLURM job id; the logs/ directory is created by the script body below.
#SBATCH --job-name=iconoclast-seq
#SBATCH --output=logs/iconoclast-seq-%j.out
#SBATCH --error=logs/iconoclast-seq-%j.err
#SBATCH --time=12:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=4
# Fail fast: abort on any command error, on unset variables, and on a
# failure in any stage of a pipeline.
set -euo pipefail
# ---------------------------------------------------------------------------
# Locations — constants for the lifetime of the job, hence readonly.
# ---------------------------------------------------------------------------
readonly PERSIST_ROOT="/common/users/$USER/iconoclast_ilabs"  # per-user persistent storage root
readonly SITE_PACKAGES="$PERSIST_ROOT/python312-site"         # pre-staged Python deps
readonly SYS_PY="/common/system/venv/python312/bin/python"    # system Python 3.12 interpreter
readonly PROJECT_ROOT="$HOME/iconoclast"                      # checkout to stage from

# Benchmarks to run sequentially. Each entry is "<config template>|<run name>";
# the loop below splits on '|' and copies the template to config.toml.
readonly MODELS=(
  "config.gemma2_2b.benchmark.rutgers.toml|gemma2-2b-seq"
  "config.mistral_7b.benchmark.rutgers.toml|mistral-7b-seq"
  "config.phi4_mini.benchmark.rutgers.toml|phi4-mini-seq"
  "config.stablelm2_1p6b.benchmark.rutgers.toml|stablelm2-1p6b-seq"
  "config.yi_1p5_9b.benchmark.rutgers.toml|yi-1p5-9b-seq"
  "config.falcon3_7b.benchmark.rutgers.toml|falcon3-7b-seq"
  "config.olmo2_1b.benchmark.rutgers.toml|olmo2-1b-seq"
)
cd "$PROJECT_ROOT"
mkdir -p logs

# SECURITY: take the Hugging Face token from the environment instead of
# committing a real value to this file. The placeholder is kept only as a
# fallback so existing invocations behave exactly as before.
export HF_TOKEN="${HF_TOKEN:-YOUR_HF_TOKEN_HERE}"

# Per-run directory suffix. Defaults to "local" so the script can also be
# exercised outside a SLURM allocation (set -u would otherwise abort on the
# unset SLURM_JOB_ID).
job_id="${SLURM_JOB_ID:-local}"

failed=0
for entry in "${MODELS[@]}"; do
  # Each entry is "<config template>|<run name>".
  IFS='|' read -r config run_name <<< "$entry"

  echo ""
  echo "============================================================"
  echo " STARTING: $run_name"
  echo " CONFIG: $config"
  echo " TIME: $(date)"
  echo "============================================================"

  # Per-run staging and cache roots, isolated by run name + job id.
  JOB_ROOT="$PERSIST_ROOT/job-stage/$run_name-$job_id"
  CACHE_ROOT="$PERSIST_ROOT/job-cache/$run_name-$job_id"
  # ${var:?} aborts rather than letting rm -rf operate on an empty path.
  rm -rf -- "${JOB_ROOT:?}" "${CACHE_ROOT:?}"
  mkdir -p "$JOB_ROOT"
  mkdir -p "$CACHE_ROOT"/{hf,hub,transformers,datasets,xdg-cache,xdg-state}

  # Stage the project into the job root (just like run_rutgers_ilabs.slurm does).
  rsync -a \
    --exclude '.venv' \
    --exclude '__pycache__' \
    --exclude 'logs' \
    --exclude '.pytest_cache' \
    "$PROJECT_ROOT"/ "$JOB_ROOT"/

  # Copy the config template to config.toml (this is the critical step!)
  cp "$JOB_ROOT/$config" "$JOB_ROOT/config.toml"
  cd "$JOB_ROOT"

  # Environment for this run: project code on PYTHONPATH, every HF cache
  # pointed at the per-run cache root so the cleanup below frees the quota.
  export PYTHONPATH="$JOB_ROOT/src:$SITE_PACKAGES"
  export HF_HUB_ENABLE_HF_TRANSFER=1
  export PYTHONUNBUFFERED=1
  export TOKENIZERS_PARALLELISM=false
  export USE_TF=0
  export USE_FLAX=0
  export ICONOCLAST_EXIT_AFTER_OPTIMIZATION=true
  export XDG_CACHE_HOME="$CACHE_ROOT/xdg-cache"
  export XDG_STATE_HOME="$CACHE_ROOT/xdg-state"
  export HF_HOME="$CACHE_ROOT/hf"
  export HF_DATASETS_CACHE="$CACHE_ROOT/datasets"
  export TRANSFORMERS_CACHE="$CACHE_ROOT/transformers"
  export HUGGINGFACE_HUB_CACHE="$CACHE_ROOT/hub"
  export ICONOCLAST_STUDY_CHECKPOINT_DIR="$PERSIST_ROOT/checkpoints/$run_name"
  export ICONOCLAST_RESIDUAL_PLOT_PATH="$PERSIST_ROOT/plots/$run_name"
  export ICONOCLAST_CONFIG_TEMPLATE="$config"
  export ICONOCLAST_RUN_NAME="$run_name"

  echo " stage: $JOB_ROOT"
  echo " cache: $CACHE_ROOT"

  # Run the benchmark. A failure is recorded (and reported with its real
  # exit code) but must not stop the remaining models.
  rc=0
  "$SYS_PY" -c "from iconoclast.main import main; main()" || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo " FAILED: $run_name (exit code $rc)"
    failed=$((failed + 1))
  fi

  # Clean up model cache AND staging to free disk quota for next model
  echo " Cleaning up for $run_name..."
  cd "$PROJECT_ROOT"
  rm -rf -- "${JOB_ROOT:?}" "${CACHE_ROOT:?}"
  echo " Done with $run_name at $(date)"
done

echo ""
echo "============================================================"
echo " ALL SEQUENTIAL BENCHMARKS COMPLETE"
echo " FAILURES: $failed"
echo " TIME: $(date)"
echo "============================================================"

# Propagate failures to SLURM so the job state reflects reality instead of
# always reporting success.
[ "$failed" -eq 0 ] || exit 1