#!/usr/bin/env bash
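# Benchmark TriLM GGUF models with llama.cpp: download the source models,
# build llama.cpp, produce each quantized variant, then time them with
# llama-bench at several thread counts on CPU (and on GPU when an NVIDIA
# card is detected).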
set -eux

cd "$(dirname "$0")"

MODEL_DIR="bench-TriLMs-models"
LLAMA_CPP_PATH="."
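# Model sizes (billions of parameters) and the quantization types under test;
# gputypes is the subset re-run on GPU (TQ1_0 and BF16 are left out there).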
sizes=("1.5" "2.4" "3.9")
types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
gputypes=("TQ2_0" "Q4_K_M" "Q8_0" "F16")

function gather_models() {
  echo "Gather the models"
  if [ ! -d "$MODEL_DIR" ]; then
    mkdir -p -- "$MODEL_DIR"
  fi
  (
    cd "$MODEL_DIR"
    for sz in "${sizes[@]}"; do
      filename="TriLM_${sz}B_Unpacked-TQ1_0-F16.gguf"
      if [ ! -f "$filename" ]; then
        wget "https://huggingface.co/compilade/quant-tests/resolve/main/${filename}"
      fi
    done
  )
}

function build_llama_cpp() {
  echo "Build llama.cpp with flags: $*"

  (
    cd -- "$LLAMA_CPP_PATH"
    if [ -d build ]; then
      rm -rf build
    fi
    mkdir build
    cd build
    cmake .. "$@"
    make -j llama-bench llama-quantize
  )
}

function quantize() {
  echo "Make all model types we'll test"
  (
    for sz in "${sizes[@]}"; do
      for ty in "${types[@]}"; do
        filenames=("$MODEL_DIR"/TriLM_"${sz}"B_Unpacked-{TQ1_0-F16,"$ty"}.gguf)
        if [ ! -f "${filenames[1]}" ]; then
          "$LLAMA_CPP_PATH"/build/bin/llama-quantize --allow-requantize "${filenames[@]}" "$ty"
        fi
      done
    done
  )
}

function bench() {
  echo "Test each model one by one for different numbers of threads"

  for sz in "${sizes[@]}"; do
    for ty in "$@"; do
      for th in 1 2 4 8; do
        {
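          # each run prints a JSON array of results; the comma after it
          # separates successive runs in the output file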
          "$LLAMA_CPP_PATH"/build/bin/llama-bench -v -m "${MODEL_DIR}/TriLM_${sz}B_Unpacked-${ty}.gguf" -t "${th}" -p 512 -n 128 -r 4 -o json
          printf "%s\n" ","
        }
      done
    done
  done
}
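
# Each bench run appends a JSON array plus a trailing comma line, so the
# result file is not one valid JSON document by itself. A minimal
# post-processing sketch (assumes jq is available; not run by this script):
#
#   { printf '['; sed '$d' "$resultFile"; printf ']'; } | jq add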

function bench_cpu() {
  bench "${types[@]}" >> "$1"
}

function bench_gpu() {
  bench "${gputypes[@]}" >> "$1"
}

currentTime="$(date +'%s')"
resultFile="results-${currentTime}.json"
infoFile="results-${currentTime}-info.txt"
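# Save host CPU details (and, later, the quantized model sizes) for reference.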
lscpu > "$infoFile"

gather_models
build_llama_cpp -DGGML_NATIVE=ON -DGGML_CPU=ON
quantize

echo "---" >> "$infoFile"
ls -go "$MODEL_DIR" >> "$infoFile"

bench_cpu "$resultFile"

if [ -x "$(command -v nvidia-smi)" ]; then
  echo "GPU detected, benchmark with it too."
  build_llama_cpp -DGGML_NATIVE=ON -DGGML_CUDA=ON -DGGML_CUDA_F16=ON
  bench_gpu "$resultFile"
fi