In [None]:
# @title # ⚡ Imat-AutoGGUF

# @markdown Made by https://huggingface.co/Virt-io

# @markdown Adapted from LazyMergekit (https://github.com/mlabonne/llm-course) to work with Imatrix

# @markdown

# @markdown The `token` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.

# @markdown ---

# @markdown ### ⚡ Quantization parameters
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # @param {type:"string"}
IMATRIX_OPTION = 'Imatrix' # @param ["Imatrix", "Imatrix-RP", "Imatrix-RP-Extended"]

# Calibration text file for each dropdown choice, relative to the working
# directory (inside the Google-Colab-Imatrix-GGUF checkout).
IMATRIX_FILES = {
    "Imatrix": "Google-Colab-Imatrix-GGUF/Imatrix/imatrix.txt",
    "Imatrix-RP": "Google-Colab-Imatrix-GGUF/Imatrix/imatrix-with-rp-data.txt",
    "Imatrix-RP-Extended": "Google-Colab-Imatrix-GGUF/Imatrix/imatrix-rp-extended.txt",
}

# Fail early with a readable message instead of hitting a NameError on IMATRIX
# when the option was edited by hand to something the table does not know.
if IMATRIX_OPTION not in IMATRIX_FILES:
    raise ValueError(
        f"Unknown IMATRIX_OPTION {IMATRIX_OPTION!r}; "
        f"expected one of {sorted(IMATRIX_FILES)}"
    )
IMATRIX = IMATRIX_FILES[IMATRIX_OPTION]
print(IMATRIX)

QUANTIZATION_METHODS = "IQ4_NL, Q8_0" # @param {type:"string"}
# Turn the comma-separated form value into a list of method names. Spaces are
# stripped, and empty entries (trailing or doubled commas) are dropped so no
# blank method name reaches the quantize step.
QUANTIZATION_METHODS = [
    name for name in QUANTIZATION_METHODS.replace(" ", "").split(",") if name
]

# @markdown ---

# @markdown ### 🤗 Hugging Face Hub
# Hub account (user or organisation) that will own the uploaded GGUF repo.
username = "Virt-io" # @param {type:"string"}
# Name of the Colab secret holding the access token, not the token itself.
token = "HF_TOKEN" # @param {type:"string"}

# Repository part of "owner/repo"; used below as the local model folder name
# and as the stem of the generated file names.
MODEL_NAME = MODEL_ID.rpartition("/")[2]

# Fetch llama.cpp; the follow-up pull brings an already-present checkout up to date.
# NOTE(review): this tracks the latest upstream revision, so the script and
# binary names used later in this cell depend on what upstream currently ships.
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull

# Download the model repository from the Hugging Face Hub into ./<repo name>.
# git-lfs is initialised first, presumably so the large weight files are
# fetched rather than left as pointers - confirm for the chosen model.
!git lfs install
!git clone https://huggingface.co/{MODEL_ID}

# Download the imatrix calibration text files that IMATRIX points into
!git clone https://huggingface.co/Virt-io/Google-Colab-Imatrix-GGUF

# Install the python dependencies listed for llama.cpp's conversion scripts
# (nothing here restarts or reloads the runtime).
!pip install -r llama.cpp/requirements/requirements-convert.txt

# Build llama.cpp from a clean tree with 16 parallel make jobs. The LLAMA_*
# variables are handed to make as environment settings.
# NOTE(review): they look like CUDA/LTO build switches - confirm they are still
# honoured by the checked-out Makefile.
!cd llama.cpp && make clean && LLAMA_CUDA=1 LLAMA_LTO=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=4 LLAMA_CUDA_KQUANTS_ITER=2 LLAMA_CUDA_F16=1 LLAMA_CUDA_DMMV_F16=1 make -j16

# Convert the downloaded model folder to an fp16 GGUF, written inside that same
# folder under the lower-cased model name.
# NOTE(review): llama.cpp is taken at its latest revision, so confirm that
# convert.py, imatrix and quantize still exist under these names and paths.
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.gguf"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

# Run imatrix on the fp16 model with the selected calibration text (IMATRIX);
# the output path is the fp16 path plus the chosen option name and ".dat".
# NOTE(review): -ngl / -c / -b are presumably GPU layer count, context size and
# batch size - confirm against the imatrix tool's help output.
imat_dat = f"{fp16}.{IMATRIX_OPTION}.dat"
!./llama.cpp/imatrix -ngl 100 -c 512 -b 512 --model {fp16} -f {IMATRIX} -o {imat_dat}

# Quantize the model for each method in the QUANTIZATION_METHODS list,
# passing the imatrix data file produced above to every run.
for method in QUANTIZATION_METHODS:
 # The output file name carries the upper-cased method; the method itself is
 # passed to quantize exactly as it was entered in the form.
 qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
 !./llama.cpp/quantize --imatrix {imat_dat} {fp16} {qtype} {method}

In [None]:
# @markdown Upload to HF
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
from google.colab import userdata, runtime

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
 repo_id = f"{username}/{MODEL_NAME}-GGUF",
 repo_type="model",
 exist_ok=True,
 token=hf_token
)

# Upload gguf files
api.upload_folder(
 folder_path=MODEL_NAME,
 repo_id=f"{username}/{MODEL_NAME}-GGUF",
 allow_patterns=["*.gguf", "*.fp16.gguf", "*.dat", "*.md"],
 token=hf_token
)

# Kill runtime
runtime.unassign()