| """ |
| export_gguf_windows.py β Merge LoRA adapters and export to GGUF on Windows. |
| |
| Pipeline: |
| 1. Load base model + LoRA adapters via Unsloth |
| 2. Merge LoRA into weights, save 16-bit safetensors (HF format) |
| 3. Download convert_hf_to_gguf.py from llama.cpp (if not cached) |
| 4. Convert merged model β F16 GGUF |
| 5. Quantize F16 GGUF β Q4_K_M via llama_cpp.llama_model_quantize |
| 6. Update Modelfile to point at the Q4_K_M GGUF |
| |
| Usage (from project root): |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 7b |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 0.5b --push |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| import io |
| import os |
|
|
| if sys.platform == "win32": |
| sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") |
| sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") |
|
|
| os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1") |
| os.environ.setdefault("TORCH_COMPILE_DISABLE", "1") |
|
|
| |
# Imported first for its side effects: Unsloth patches transformers/PEFT at
# import time, so it must precede any other transformers usage.
import unsloth  # noqa: F401
import transformers.utils.hub
import transformers.tokenization_utils_base


def _noop(*args, **kwargs):
    """Stand-in for ``list_repo_templates`` that reports no chat templates.

    NOTE(review): presumably stubs out a Hugging Face Hub network call that
    fails offline / on Windows — confirm before removing the patch.
    """
    return []


transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop
|
|
| import argparse |
| import subprocess |
| import urllib.request |
| from pathlib import Path |
|
|
| |
# ---------------------------------------------------------------------------
# Command-line interface.  Options are declared as data so the full flag set
# is visible at a glance; behavior matches calling add_argument() directly.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="Merge LoRA + export GGUF on Windows")

_CLI_OPTIONS = [
    ("--model", dict(default="7b", choices=["0.5b", "1.5b", "3b", "7b", "8b"],
                     help="Which fine-tuned model to export (default: 7b)")),
    ("--quant", dict(default="q4_k_m", choices=["f16", "q4_k_m", "q5_k_m", "q8_0"],
                     help="Output quantisation (default: q4_k_m)")),
    ("--push", dict(action="store_true",
                    help="Push GGUF to HF Hub after export")),
    ("--skip-merge", dict(action="store_true",
                          help="Skip merge if merged/ dir already exists")),
    ("--skip-quant", dict(action="store_true",
                          help="Skip quantisation, keep F16 GGUF only")),
]
for _flag, _opts in _CLI_OPTIONS:
    parser.add_argument(_flag, **_opts)

args = parser.parse_args()
|
|
| |
# Per-model export profile: the 4-bit base checkpoint id, the HF Hub repo the
# GGUF is pushed to, and the max sequence length used when reloading for the
# merge step.
_PROFILES = {
    "0.5b": {"base_id": "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
             "hf_repo": "RayMelius/soci-agent-q4", "seq_len": 2048},
    "1.5b": {"base_id": "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
             "hf_repo": "RayMelius/soci-agent-1b5", "seq_len": 2048},
    "3b": {"base_id": "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
           "hf_repo": "RayMelius/soci-agent-3b", "seq_len": 2048},
    "7b": {"base_id": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
           "hf_repo": "RayMelius/soci-agent-7b", "seq_len": 512},
    "8b": {"base_id": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
           "hf_repo": "RayMelius/soci-agent-8b", "seq_len": 512},
}

PROFILE = _PROFILES[args.model]
HF_REPO = PROFILE["hf_repo"]
SEQ_LEN = PROFILE["seq_len"]
|
|
| TRAIN_DIR = Path("data/training") |
| MODEL_DIR = TRAIN_DIR / args.model |
| LORA_DIR = MODEL_DIR / "lora_adapters" |
| MERGED_DIR = MODEL_DIR / "merged" |
| GGUF_DIR = MODEL_DIR / "gguf" |
| CONVERT_CACHE = TRAIN_DIR / "_llama_convert" |
|
|
| GGUF_DIR.mkdir(parents=True, exist_ok=True) |
| CONVERT_CACHE.mkdir(parents=True, exist_ok=True) |
|
|
| if not LORA_DIR.exists() or not any(LORA_DIR.iterdir()): |
| print(f"[ERROR] No LoRA adapters found at {LORA_DIR}") |
| print(f" Run: python scripts/finetune_local.py --base-model {args.model}") |
| sys.exit(1) |
|
|
| |
| print(f"\n=== Step 1: Merge LoRA adapters ({args.model}) ===") |
|
|
| if args.skip_merge and MERGED_DIR.exists() and any(MERGED_DIR.glob("*.safetensors")): |
| print(f" Skipping merge β {MERGED_DIR} already exists.") |
| else: |
| from unsloth import FastLanguageModel |
|
|
| print(f" Loading {LORA_DIR} ...") |
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name = str(LORA_DIR), |
| max_seq_length = SEQ_LEN, |
| dtype = None, |
| load_in_4bit = True, |
| ) |
|
|
| print(f" Merging LoRA and saving 16-bit weights to {MERGED_DIR} ...") |
| model.save_pretrained_merged( |
| str(MERGED_DIR), |
| tokenizer, |
| save_method = "merged_16bit", |
| ) |
| print(f" Merged model saved.") |
|
|
| |
| |
| |
| print(f"\n=== Step 2: Prepare llama.cpp convert script ===") |
|
|
| LLAMA_REPO = CONVERT_CACHE / "llama.cpp" |
| CONVERT_SCRIPT = LLAMA_REPO / "convert_hf_to_gguf.py" |
| LLAMA_GGUF_PY = LLAMA_REPO / "gguf-py" |
|
|
| if LLAMA_REPO.exists() and CONVERT_SCRIPT.exists(): |
| print(f" Repo cached at {LLAMA_REPO} β pulling latest ...") |
| subprocess.run(["git", "-C", str(LLAMA_REPO), "pull", "--ff-only", "-q"], check=False) |
| else: |
| print(f" Cloning llama.cpp (shallow) into {LLAMA_REPO} ...") |
| subprocess.check_call([ |
| "git", "clone", "--depth=1", "--filter=blob:none", |
| "https://github.com/ggml-org/llama.cpp.git", |
| str(LLAMA_REPO), |
| ]) |
| print(f" Installing llama.cpp gguf-py + convert dependencies ...") |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", |
| str(LLAMA_GGUF_PY)]) |
| reqs = LLAMA_REPO / "requirements" / "requirements-convert_hf_to_gguf.txt" |
| if reqs.exists(): |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-r", str(reqs)]) |
|
|
| |
# Environment for the conversion subprocess: prepend llama.cpp's bundled
# gguf-py sources to PYTHONPATH so convert_hf_to_gguf.py imports the gguf
# package matching the checked-out script, ahead of any pip-installed copy.
_convert_env = os.environ.copy()
_convert_env["PYTHONPATH"] = str(LLAMA_GGUF_PY / "src") + os.pathsep + _convert_env.get("PYTHONPATH", "")


print(f"  Convert script: {CONVERT_SCRIPT}")
|
|
| |
| print(f"\n=== Step 3: Convert to F16 GGUF ===") |
|
|
| GGUF_F16 = GGUF_DIR / f"{args.model}-f16.gguf" |
|
|
| if GGUF_F16.exists(): |
| print(f" Already exists: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)") |
| else: |
| cmd = [ |
| sys.executable, str(CONVERT_SCRIPT), |
| str(MERGED_DIR), |
| "--outfile", str(GGUF_F16), |
| "--outtype", "f16", |
| ] |
| print(f" Running: {' '.join(cmd)}") |
| result = subprocess.run(cmd, capture_output=False, env=_convert_env) |
| if result.returncode != 0: |
| print(f"[ERROR] Conversion failed (exit {result.returncode})") |
| sys.exit(1) |
| print(f" F16 GGUF: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)") |
|
|
| |
# CLI quant name → llama.cpp ``llama_ftype`` enum value (see llama.h).
# BUG FIX: "f16" previously mapped to 4, which is LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16;
# LLAMA_FTYPE_MOSTLY_F16 is 1.  (The f16 path currently skips quantisation, so
# the wrong value was latent rather than triggered.)
QUANT_TYPE_MAP = {
    "f16": 1,       # LLAMA_FTYPE_MOSTLY_F16
    "q8_0": 7,      # LLAMA_FTYPE_MOSTLY_Q8_0
    "q4_k_m": 15,   # LLAMA_FTYPE_MOSTLY_Q4_K_M
    "q5_k_m": 17,   # LLAMA_FTYPE_MOSTLY_Q5_K_M
}
|
|
if args.skip_quant or args.quant == "f16":
    # Nothing to quantise — ship the full-precision file as the final artefact.
    GGUF_FINAL = GGUF_F16
    print("\n=== Step 4: Skipping quantisation (using F16) ===")
else:
    print(f"\n=== Step 4: Quantise → {args.quant.upper()} ===")
    GGUF_FINAL = GGUF_DIR / f"{args.model}-{args.quant}.gguf"

    if GGUF_FINAL.exists():
        print(f"  Already exists: {GGUF_FINAL} ({GGUF_FINAL.stat().st_size / 1e6:.0f} MB)")
    else:
        # Quantise in-process via llama-cpp-python's binding of
        # llama_model_quantize() rather than shelling out to llama-quantize.
        import ctypes
        import llama_cpp

        ftype = QUANT_TYPE_MAP[args.quant]
        params = llama_cpp.llama_model_quantize_default_params()
        params.ftype = ftype
        params.nthread = 4               # fixed modest thread count
        params.allow_requantize = False  # refuse already-quantised input

        print(f"  Quantising {GGUF_F16.name} → {GGUF_FINAL.name} ...")
        ret = llama_cpp.llama_model_quantize(
            str(GGUF_F16).encode(),
            str(GGUF_FINAL).encode(),
            ctypes.byref(params),
        )
        if ret != 0:
            # Drop any partial output so a re-run does not skip quantisation.
            if GGUF_FINAL.exists():
                GGUF_FINAL.unlink()
            print(f"[ERROR] Quantisation failed (return code {ret})")
            sys.exit(1)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f"  {args.quant.upper()} GGUF: {GGUF_FINAL} ({mb:.0f} MB)")
|
|
| |
| print(f"\n=== Step 5: Update Modelfile ===") |
|
|
| modelfile_path = Path("Modelfile") |
| if modelfile_path.exists(): |
| content = modelfile_path.read_text(encoding="utf-8") |
| |
| gguf_rel = GGUF_FINAL.as_posix() |
| new_from = f"FROM ./{gguf_rel}" |
|
|
| lines = content.splitlines() |
| updated = [] |
| inserted = False |
| for line in lines: |
| stripped = line.strip() |
| if stripped.startswith("FROM ") and not stripped.startswith("#"): |
| |
| updated.append(f"#{line}") |
| if not inserted: |
| updated.append(new_from) |
| inserted = True |
| else: |
| updated.append(line) |
| if not inserted: |
| updated.insert(0, new_from) |
|
|
| modelfile_path.write_text("\n".join(updated) + "\n", encoding="utf-8") |
| print(f" Modelfile updated: FROM β ./{gguf_rel}") |
| else: |
| print(f" [WARN] Modelfile not found β skipping update") |
|
|
| |
if args.push:
    print(f"\n=== Step 6: Push GGUF to {HF_REPO} ===")
    # Token discovery: python-dotenv if available, then the environment, then
    # a best-effort manual .env parse so the push works without dotenv.
    try:
        from dotenv import load_dotenv; load_dotenv()
    except ImportError:
        pass
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_TOKEN:
        env_file = Path(".env")
        if env_file.exists():
            for line in env_file.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    # Accept HF_TOKEN=..., HF_TOKEN="..." or HF_TOKEN='...'.
                    HF_TOKEN = line.split("=", 1)[1].strip().strip("\"'")

    if not HF_TOKEN:
        print("  [WARN] No HF_TOKEN — skipping push. Set HF_TOKEN in .env or env var.")
    else:
        from huggingface_hub import login, HfApi
        login(token=HF_TOKEN, add_to_git_credential=False)
        api = HfApi()
        # exist_ok so repeated exports don't fail on an already-created repo.
        api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f"  Uploading {GGUF_FINAL.name} ({mb:.0f} MB)...")
        api.upload_file(
            path_or_fileobj = str(GGUF_FINAL),
            path_in_repo = GGUF_FINAL.name,
            repo_id = HF_REPO,
            repo_type = "model",
        )
        print(f"  Done: https://huggingface.co/{HF_REPO}/blob/main/{GGUF_FINAL.name}")
|
|
| |
| print(f""" |
| === Export complete === |
| GGUF : {GGUF_FINAL} |
| Size : {GGUF_FINAL.stat().st_size / 1e6:.0f} MB |
| |
| To use with Ollama: |
| ollama create soci-agent -f Modelfile |
| ollama run soci-agent |
| |
| Or for {args.model}: |
| ollama create soci-agent-{args.model} -f Modelfile |
| set OLLAMA_MODEL=soci-agent-{args.model} |
| set SOCI_PROVIDER=ollama |
| """) |
|
|