#! /bin/bash
################################################################################
# Shell script that starts a copy of Ollama with a base model plus all the
# available LoRA adapters in this repository.
#
# Converts `.safetensors` to `.gguf` for running in Ollama.
# The target environment is GitHub Actions and other CPU-only systems.
# To keep the intermediate files, remove the `rm` command at the end.
#
# To run this script:
# 1. Install an appropriate build of Ollama for your machine.
#    See https://docs.ollama.com/linux#manual-install; for example, on Linux:
#    ```
#    curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
#      | sudo tar zx -C /usr
#    ```
# 2. Start the Ollama server on port 55555:
#    ```
#    OLLAMA_HOST="localhost:55555" ollama serve &
#    ```
#    See https://docs.ollama.com/faq#how-do-i-configure-ollama-server for how to
#    set the port on other operating systems.
# 3. Download the intrinsics library by running:
#    hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
# 4. Edit the constants BASE_MODEL_NAME and BASE_MODEL_ORG below as needed.
# 5. Run this script from the root of your local copy of rag-intrinsics-lib.
################################################################################
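# Optional hardening (an addition, not part of the original script): stop at the
# first failed command so that a bad download or conversion does not cascade into
# later steps. Uncomment to enable.
# set -euo pipefail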
echo "Setup"
OLLAMA_HOST="localhost:55555"
OLLAMA_DIR="_ollama"
mkdir -p $OLLAMA_DIR
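# Optional sanity check (an assumption about the intended setup, not part of the
# original script): make sure an Ollama server is already listening on
# $OLLAMA_HOST before downloading and converting anything; `ollama list` fails
# if it cannot reach the server.
OLLAMA_HOST=$OLLAMA_HOST ollama list > /dev/null \
  || { echo "No Ollama server reachable at $OLLAMA_HOST"; exit 1; }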
echo ""
echo "Download base model"
MODEL_DIR="$OLLAMA_DIR/models"
mkdir -p $MODEL_DIR
OLLAMA_MODEL_NAME=granite3.3:8b # Quantized model on Ollama
BASE_MODEL_NAME=granite-3.3-8b-instruct
BASE_MODEL_NAME_UPPER=granite-3.3-8B-instruct # llama.cpp output
BASE_MODEL_ORG=ibm-granite
pip install huggingface_hub  # provides the `hf` CLI used below
hf download $BASE_MODEL_ORG/$BASE_MODEL_NAME --local-dir $MODEL_DIR/$BASE_MODEL_NAME
echo ""
echo "Clone llama.cpp and install dependencies"
LLAMA_CPP_FOLDER_NAME="$OLLAMA_DIR/llama.cpp"
git clone --single-branch --branch master https://github.com/ggml-org/llama.cpp.git $LLAMA_CPP_FOLDER_NAME
pip install -r $LLAMA_CPP_FOLDER_NAME/requirements/requirements-convert_hf_to_gguf.txt
pip install -r $LLAMA_CPP_FOLDER_NAME/requirements/requirements-convert_hf_to_gguf_update.txt
pip install -r $LLAMA_CPP_FOLDER_NAME/requirements/requirements-convert_lora_to_gguf.txt
echo ""
OUTTYPE="q8_0"
OUTTYPE_UPPER="Q8_0" # llama.cpp output
echo "Convert base model to GGUF: $OUTTYPE"
python $LLAMA_CPP_FOLDER_NAME/convert_hf_to_gguf.py $MODEL_DIR/$BASE_MODEL_NAME --outtype $OUTTYPE
MODEL_GGUF=$(realpath "$MODEL_DIR/$BASE_MODEL_NAME/$BASE_MODEL_NAME_UPPER-$OUTTYPE_UPPER.gguf")
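# Optional check (an addition for robustness): confirm that the conversion above
# produced the expected GGUF file; the file name is derived from how llama.cpp
# names its output and may change across llama.cpp versions.
if [ ! -f "$MODEL_GGUF" ]; then
    echo "Expected base model GGUF not found: $MODEL_GGUF"
    exit 1
fi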
echo ""
echo "Convert LoRA adapters to GGUF"
# Find every LoRA adapter directory for the chosen base model (paths look like
# <intrinsic>/lora/<base model>); "cut -c 3-" strips the leading "./".
LORA_DIRS=$( find . -name "$BASE_MODEL_NAME" -path "*/lora/$BASE_MODEL_NAME*" | sort | cut -c 3- )
for LORA_DIR in $LORA_DIRS; do
LORA_GGUF="$LORA_DIR/$BASE_MODEL_NAME_UPPER-$OUTTYPE_UPPER-LoRA.gguf"
# Convert only if this adapter has not already been converted.
if [ ! -f "$LORA_GGUF" ]; then
python $LLAMA_CPP_FOLDER_NAME/convert_lora_to_gguf.py $LORA_DIR --base $MODEL_DIR/$BASE_MODEL_NAME --outtype $OUTTYPE
fi
done
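# Purely informational (an addition to the original flow): list the adapter GGUF
# files that now exist, so a failed conversion is easy to spot before Modelfiles
# are created.
find . -name "*-$OUTTYPE_UPPER-LoRA.gguf" | sort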
echo ""
echo "Create Modelfiles and Ollama models"
MODELFILE_DIR="$OLLAMA_DIR/Modelfiles"
mkdir -p $MODELFILE_DIR
for LORA_DIR in $LORA_DIRS; do
LORA_GGUF=$(realpath "$LORA_DIR/$BASE_MODEL_NAME_UPPER-$OUTTYPE_UPPER-LoRA.gguf")
# Flatten the adapter path into a unique Modelfile name by replacing "/" with "_".
MODEL_FILE=$LORA_DIR
MODELFILE=${MODEL_FILE//\//_}
MODELFILE=${MODELFILE//../"Modelfile"}
MODELFILEPATH=$MODELFILE_DIR/$MODELFILE
# The first component of the adapter path is the intrinsic's name; use it as the
# name of the Ollama model.
LORA_NAME=$(echo "$LORA_DIR" | cut -d "/" -f 1)
echo ""
echo "Creating $LORA_NAME | $MODELFILEPATH"
# Option 1: build on the locally converted GGUF base model (uncomment to use).
# printf "FROM $MODEL_GGUF\nADAPTER $LORA_GGUF\n" > $MODELFILEPATH
# printf "FROM $MODEL_GGUF\nADAPTER $LORA_GGUF\n"
# Option 2 (default): build on the quantized model from the Ollama library.
printf "FROM $OLLAMA_MODEL_NAME\nADAPTER $LORA_GGUF\n" > $MODELFILEPATH
printf "FROM $OLLAMA_MODEL_NAME\nADAPTER $LORA_GGUF\n"
echo ""
OLLAMA_HOST=$OLLAMA_HOST ollama create $LORA_NAME -f $MODELFILEPATH
done
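echo ""
# Informational (an addition, not in the original script): show the models now
# registered with the Ollama server; there should be one entry per adapter
# created above.
echo "Models registered with Ollama"
OLLAMA_HOST=$OLLAMA_HOST ollama list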
echo ""
echo "Clean up"
echo "rm -rf $OLLAMA_DIR"
rm -rf $OLLAMA_DIR
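# Example follow-up (illustrative only; replace <intrinsic-name> with the name of
# one of the adapter directories in this repository):
#   OLLAMA_HOST="localhost:55555" ollama run <intrinsic-name> "your prompt here"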