GGUF-Quantization-Script / gguf-imat.py
Lewdiculous's picture
Add checks for F16.gguf and imatrix.dat, as well make HF model removal optional by default.
7df3c68 verified
raw
history blame
8.33 kB
import os
import requests
import zipfile
import subprocess
import shutil
from huggingface_hub import snapshot_download
# Clone or update the llama.cpp repository with --depth 1
def clone_or_update_llama_cpp():
print("Preparing...")
base_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(base_dir)
if not os.path.exists("llama.cpp"):
subprocess.run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp"])
else:
os.chdir("llama.cpp")
subprocess.run(["git", "pull"])
os.chdir(base_dir)
print("The 'llama.cpp' repository is ready.")
# Download and extract the latest release of llama.cpp Windows binaries
def download_llama_release():
base_dir = os.path.dirname(os.path.abspath(__file__))
dl_dir = os.path.join(base_dir, "bin", "dl")
if not os.path.exists(dl_dir):
os.makedirs(dl_dir)
os.chdir(dl_dir)
latest_release_url = "https://github.com/ggerganov/llama.cpp/releases/latest"
response = requests.get(latest_release_url)
if response.status_code == 200:
latest_release_tag = response.url.split("/")[-1]
download_url = f"https://github.com/ggerganov/llama.cpp/releases/download/{latest_release_tag}/llama-{latest_release_tag}-bin-win-cuda-cu12.2.0-x64.zip"
response = requests.get(download_url)
if response.status_code == 200:
with open(f"llama-{latest_release_tag}-bin-win-cuda-cu12.2.0-x64.zip", "wb") as f:
f.write(response.content)
with zipfile.ZipFile(f"llama-{latest_release_tag}-bin-win-cuda-cu12.2.0-x64.zip", "r") as zip_ref:
zip_ref.extractall(os.path.join(base_dir, "bin"))
print("Downloading latest 'llama.cpp' prebuilt Windows binaries...")
print("Download and extraction completed successfully.")
return latest_release_tag
else:
print("Failed to download the release file.")
else:
print("Failed to fetch the latest release information.")
# Download and extract the Cuda .dll resources if they aren't present in the bin folder
def download_cudart_if_necessary(latest_release_tag):
base_dir = os.path.dirname(os.path.abspath(__file__))
cudart_dl_dir = os.path.join(base_dir, "bin", "dl")
if not os.path.exists(cudart_dl_dir):
os.makedirs(cudart_dl_dir)
cudart_zip_file = os.path.join(cudart_dl_dir, "cudart-llama-bin-win-cu12.2.0-x64.zip")
cudart_extracted_files = ["cublas64_12.dll", "cublasLt64_12.dll", "cudart64_12.dll"]
# Check if all required files exist
if all(os.path.exists(os.path.join(base_dir, "bin", file)) for file in cudart_extracted_files):
print("Cuda resources already exist. Skipping download.")
else:
cudart_download_url = f"https://github.com/ggerganov/llama.cpp/releases/download/{latest_release_tag}/cudart-llama-bin-win-cu12.2.0-x64.zip"
response = requests.get(cudart_download_url)
if response.status_code == 200:
with open(cudart_zip_file, "wb") as f:
f.write(response.content)
with zipfile.ZipFile(cudart_zip_file, "r") as zip_ref:
zip_ref.extractall(os.path.join(base_dir, "bin"))
print("Preparing 'cuda' resources...")
print("Download and extraction of cudart completed successfully.")
else:
print("Failed to download the cudart release file.")
# Ask for user input to download or fetch from cache the specified model repository if it doesn't exist
def download_model_repo():
base_dir = os.path.dirname(os.path.abspath(__file__))
models_dir = os.path.join(base_dir, "models")
if not os.path.exists(models_dir):
os.makedirs(models_dir)
model_id = input("Enter the model ID to download (e.g., huggingface/transformers): ")
model_name = model_id.split("/")[-1]
model_dir = os.path.join(models_dir, model_name)
# Check if the model repository already exists
if os.path.exists(model_dir):
print("Model repository already exists. Using existing repository.")
# If the model already exists, prompt the user if they want to delete the model directory
delete_model_dir = input("Remove HF model folder after converting original model to GGUF? (yes/no) (default: no): ").strip().lower()
# Convert the existing model to GGUF F16 format and generate imatrix.dat
convert_model_to_gguf_f16(base_dir, model_dir, model_name, delete_model_dir)
else:
revision = input("Enter the revision (branch, tag, or commit) to download (default: main): ") or "main"
# Ask the user if they want to remove the HF model folder after conversion
delete_model_dir = input("Remove HF model folder after converting original model to GGUF? (yes/no) (default: no): ").strip().lower()
print("Downloading model repository...")
snapshot_download(repo_id=model_id, local_dir=model_dir, revision=revision)
print("Model repository downloaded successfully.")
# Convert the downloaded model to GGUF F16 format and generate imatrix.dat
convert_model_to_gguf_f16(base_dir, model_dir, model_name, delete_model_dir)
# Convert the downloaded model to GGUF F16 format
def convert_model_to_gguf_f16(base_dir, model_dir, model_name, delete_model_dir):
convert_script = os.path.join(base_dir, "llama.cpp", "convert.py")
gguf_dir = os.path.join(base_dir, "models", f"{model_name}-GGUF")
gguf_model_path = os.path.join(gguf_dir, f"{model_name}-F16.gguf")
if not os.path.exists(gguf_dir):
os.makedirs(gguf_dir)
# Check if F16 file already exists
if not os.path.exists(gguf_model_path):
# Execute the conversion command
subprocess.run(["python", convert_script, model_dir, "--outfile", gguf_model_path, "--outtype", "f16"])
# Delete the original model directory under conditions
if delete_model_dir == 'yes' or delete_model_dir == 'y':
shutil.rmtree(model_dir)
print(f"Original model directory '{model_dir}' deleted.")
else:
print(f"Original model directory '{model_dir}' was not deleted. You can remove it manually.")
# Check if imatrix.dat exists within gguf_dir
imatrix_exe = os.path.join(base_dir, "bin", "imatrix.exe")
imatrix_output = os.path.join(gguf_dir, "imatrix.dat")
imatrix_txt = os.path.join(base_dir, "imatrix", "imatrix.txt")
if not os.path.exists(imatrix_output):
# Execute the imatrix command
subprocess.run([imatrix_exe, "-m", gguf_model_path, "-f", imatrix_txt, "-ngl", "13"], cwd=gguf_dir)
# Move the imatrix.dat file to the GGUF folder
shutil.move(os.path.join(gguf_dir, "imatrix.dat"), gguf_dir)
print("imatrix.dat generated successfully.")
else:
print("Skipping imatrix generation as imatrix.dat already exists.")
else:
print("Skipping model conversion as F16 file already exists.")
# Quantize the models
quantize_models(base_dir, model_name)
# Quantize models with different options
def quantize_models(base_dir, model_name):
gguf_dir = os.path.join(base_dir, "models", f"{model_name}-GGUF")
f16_gguf_path = os.path.join(gguf_dir, f"{model_name}-F16.gguf")
quantization_options = [
"IQ3_M", "IQ3_XXS",
"Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS",
"Q5_K_M", "Q5_K_S",
"Q6_K",
"Q8_0"
]
for quant_option in quantization_options:
quantized_gguf_name = f"{model_name}-{quant_option}-imat.gguf"
quantized_gguf_path = os.path.join(gguf_dir, quantized_gguf_name)
quantize_command = os.path.join(base_dir, "bin", "quantize.exe")
imatrix_path = os.path.join(gguf_dir, "imatrix.dat")
subprocess.run([quantize_command, "--imatrix", imatrix_path,
f16_gguf_path, quantized_gguf_path, quant_option], cwd=gguf_dir)
print(f"Model quantized with {quant_option} option.")
# Main function - Steps
def main():
clone_or_update_llama_cpp()
latest_release_tag = download_llama_release()
download_cudart_if_necessary(latest_release_tag)
download_model_repo()
print("Finished preparing resources.")
if __name__ == "__main__":
main()