indialcristi committed
Commit: 97a961c
1 Parent(s): a9192b0

Upload folder using huggingface_hub

Browse files:
- .gitattributes +2 -0
- gguf-model-4-f16.bin +3 -0
- llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- llama.cpp/convert-baichuan-hf-to-gguf.py +310 -0
- llama.cpp/convert-bloom-hf-to-gguf.py +238 -0
- llama.cpp/convert-falcon-hf-to-gguf.py +250 -0
- llama.cpp/convert-gptneox-hf-to-gguf.py +212 -0
- llama.cpp/convert-llama-ggml-to-gguf.py +451 -0
- llama.cpp/convert-mpt-hf-to-gguf.py +218 -0
- llama.cpp/convert-persimmon-to-gguf.py +130 -0
- llama.cpp/convert-refact-hf-to-gguf.py +263 -0
- llama.cpp/convert-starcoder-hf-to-gguf.py +202 -0
- llama.cpp/examples/finetune/convert-finetune-checkpoint-to-gguf.py +489 -0
- llama.cpp/examples/gguf/CMakeLists.txt +5 -0
- llama.cpp/examples/gguf/gguf.cpp +249 -0
- llama.cpp/examples/llava/convert-image-encoder-to-gguf.py +250 -0
- llama.cpp/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +499 -0
- llama.cpp/gguf-py/LICENSE +21 -0
- llama.cpp/gguf-py/README.md +71 -0
- llama.cpp/gguf-py/gguf/__init__.py +1 -0
- llama.cpp/gguf-py/gguf/__pycache__/gguf.cpython-310.pyc +0 -0
- llama.cpp/gguf-py/gguf/gguf.py +1070 -0
- llama.cpp/gguf-py/gguf/py.typed +0 -0
- llama.cpp/gguf-py/pyproject.toml +29 -0
- llama.cpp/gguf-py/tests/test_gguf.py +7 -0
- llama.cpp/models/ggml-vocab-aquila.gguf +3 -0
- llama.cpp/models/ggml-vocab-falcon.gguf +3 -0
- llama.cpp/models/ggml-vocab-llama.gguf +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llama.cpp/models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
+llama.cpp/models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
gguf-model-4-f16.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7228c76f1035d9e8e3347f53649d705984dcd100bbf7ab918086e9b27ffe36b3
size 14484764256
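The three lines above are a Git LFS pointer: the roughly 14.5 GB f16 model binary itself lives in LFS storage and only its hash and size are committed. As a quick sanity check after downloading, the pointer can be verified locally; this is a minimal sketch (the local file path is an assumption), not part of the commit:

import hashlib
import os

# Assumed local path of the downloaded model; adjust as needed.
path = "gguf-model-4-f16.bin"
expected_sha256 = "7228c76f1035d9e8e3347f53649d705984dcd100bbf7ab918086e9b27ffe36b3"
expected_size = 14484764256

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_sha256, "sha256 mismatch"
print("downloaded file matches the LFS pointer")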
llama.cpp/.github/workflows/gguf-publish.yml ADDED
@@ -0,0 +1,44 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# See `gguf-py/README.md` for how to make a release.

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  workflow_dispatch:
  push:
    # Pattern matched against refs/tags
    tags:
      - 'gguf-v*' # Push events to every version tag


jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.9.x'
      - name: Install dependencies
        run: |
          cd gguf-py
          python -m pip install poetry
          poetry install

      - name: Build package
        run: cd gguf-py && poetry build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          packages-dir: gguf-py/dist
llama.cpp/convert-baichuan-hf-to-gguf.py ADDED
@@ -0,0 +1,310 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


if TYPE_CHECKING:
    from typing import TypeAlias

NDArray: TypeAlias = 'np.ndarray[Any, Any]'

# reverse HF permute back to original pth layout


def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head

    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
    r = weights.shape[0] // 3
    return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))

def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
    r = weights.shape[0] // 3
    return weights[r * n_part : r * n_part + r, ...]

def count_model_parts(dir_model: str) -> int:
    num_parts = 0

    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")

    return num_parts



def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]

if "num_key_value_heads" in hparams:
    head_count_kv = hparams["num_key_value_heads"]
else:
    head_count_kv = head_count

if "_name_or_path" in hparams:
    hf_repo = hparams["_name_or_path"]
else:
    hf_repo = ""

if "max_sequence_length" in hparams:
    ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
    ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
    ctx_length = hparams["model_max_length"]
else:
    print("gguf: can not find ctx length parameter.")

    sys.exit()


gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
    if "type" in hparams["rope_scaling"]:
        if hparams["rope_scaling"]["type"] == "linear":
            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])


# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
    sys.exit(1)

# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")

tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
vocab_size = hparams.get('vocab_size')
if vocab_size is None:
    vocab_size = tokenizer.vocab_size()

for i in range(vocab_size):
    text: bytes
    score: float

    piece = tokenizer.id_to_piece(i)
    text = piece.encode("utf-8")
    score = tokenizer.get_score(i)

    toktype = 1 # defualt to normal token type
    if tokenizer.is_unknown(i):
        toktype = 2
    if tokenizer.is_control(i):
        toktype = 3

    # toktype = 4 is user-defined = tokens from added_tokens.json

    if tokenizer.is_unused(i):
        toktype = 5
    if tokenizer.is_byte(i):
        toktype = 6

    tokens.append(text)
    scores.append(score)
    toktypes.append(toktype)

added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
    with open(added_tokens_file, "r", encoding="utf-8") as f:
        addtokens_json = json.load(f)

        print("gguf: get added tokens")

        for key in addtokens_json:
            tokens.append( key.encode("utf-8") )
            scores.append(-1000.0)
            toktypes.append(4) # user-defined token type


gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )


for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    tmp=model_part
    for i in range(block_count):
        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
            print(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]

    for name in model_part.keys():
        data = model_part[name]
        # we don't need these
        if name.endswith(".rotary_emb.inv_freq"):
            continue

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
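The W_pack unpacking in the script above relies on reverse_hf_permute to restore the original pth row ordering of the rotary dimensions. The toy sketch below shows what that reshape/swapaxes does to the row order; the tiny 8x2 shape and n_head=1 are assumptions chosen purely for illustration, not values from the script.

import numpy as np

def reverse_hf_permute(weights, n_head, n_kv_head=None):
    # same expression as in convert-baichuan-hf-to-gguf.py above
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

w = np.arange(16).reshape(8, 2)   # toy weight: rows 0..7, first column holds 0,2,4,...,14
out = reverse_hf_permute(w, n_head=1)
print(out[:, 0])                  # [ 0  8  2 10  4 12  6 14]: rows reordered as 0,4,1,5,2,6,3,7

In other words, per head the two half-blocks of rows in the HF layout are re-interleaved into adjacent pairs, which is the layout the original pth checkpoints (and GGUF) expect.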
llama.cpp/convert-bloom-hf-to-gguf.py ADDED
@@ -0,0 +1,238 @@
#!/usr/bin/env python3
# HF bloom --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


# Supported Models:
# https://huggingface.co/bigscience/bloom-1b7
# https://huggingface.co/bigscience/bloom-3b
# https://huggingface.co/bigscience/bloom-7b1
# https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False

    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
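The query_key_value re-layout above is easiest to see on a toy tensor. The sketch below uses labelled rows instead of real weights (the tiny n_head=2, head_dim=2 sizes are assumptions) to show how the per-head [q, k, v] blocks are regrouped into all queries, then all keys, then all values, which is the GPT-2 style ordering the script targets:

import numpy as np

n_head, head_dim = 2, 2
n_embed = n_head * head_dim

# label each row "h{head}{q|k|v}{row}" so the reordering is visible
rows = [f"h{h}{t}{r}" for h in range(n_head) for t in "qkv" for r in range(head_dim)]
data = np.array(rows).reshape(3 * n_embed, 1)

qkv = data.reshape((n_head, 3, head_dim, 1))
out = np.concatenate((qkv[:, 0].reshape((-1, 1)),
                      qkv[:, 1].reshape((-1, 1)),
                      qkv[:, 2].reshape((-1, 1))), axis=0)
print(out.ravel().tolist())
# ['h0q0','h0q1','h1q0','h1q1', 'h0k0','h0k1','h1k0','h1k1', 'h0v0','h0v1','h1v0','h1v1']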
llama.cpp/convert-falcon-hf-to-gguf.py ADDED
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion

from __future__ import annotations

import argparse
import contextlib
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path, prefix: str) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith(prefix):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "FalconForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model, "model-00")
if num_parts:
    is_safetensors = True
    from safetensors import safe_open
else:
    is_safetensors = False
    num_parts = count_model_parts(dir_model, "pytorch_model-")

ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]

gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["num_attention_heads"])
if "num_kv_heads" in hparams:
    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
else:
    gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i])
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["num_attention_heads"]
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

head_dim = hparams["hidden_size"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
elif is_safetensors:
    part_names = (
        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
    )
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    if is_safetensors:
        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
    else:
        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))

    with ctx as model_part:
        for name in model_part.keys():
            data = model_part.get_tensor(name) if is_safetensors else model_part[name]

            old_dtype = data.dtype

            # convert any unsupported data types to float32
            if data.dtype != torch.float16 and data.dtype != torch.float32:
                data = data.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here,, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py

            if "query_key_value" in name:
                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data = torch.cat((q,k,v)).reshape_as(data)

            data = data.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            if new_name is None:
                print("Can not map tensor '" + name + "'")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

            gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
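The comment block inside the script describes the Falcon query_key_value layout; the toy sketch below makes the rearrangement concrete. The small sizes (n_head=4, n_head_kv=2, head_dim=1) are assumptions chosen so the row order is easy to read, and each row is filled with its original index:

import torch

n_head, n_head_kv, head_dim = 4, 2, 1
hidden = head_dim * n_head

# row i of `data` is filled with the value i so the reordering is easy to see
data = torch.arange((n_head + 2 * n_head_kv) * head_dim, dtype=torch.float32)
data = data.unsqueeze(1).expand(-1, hidden).contiguous()

qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, hidden)
q = qkv[:, :-2 ].reshape(n_head * head_dim, hidden)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, hidden)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, hidden)
out = torch.cat((q, k, v)).reshape_as(data)

print(out[:, 0].tolist())   # [0.0, 1.0, 4.0, 5.0, 2.0, 6.0, 3.0, 7.0]

Rows 0-1 and 4-5 (the queries of both kv groups) come first, followed by the two key rows (2, 6) and the two value rows (3, 7), i.e. queries, then keys, then values in contiguous blocks.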
llama.cpp/convert-gptneox-hf-to-gguf.py ADDED
@@ -0,0 +1,212 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTNeoXForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]

gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        # we don't need these
        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
            continue

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
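All four Hugging Face converters in this commit share the same output-dtype policy for ftype 0 (f32) and ftype 1 (f16): only 2-D ".weight" tensors are stored as float16, while 1-D tensors (norms, biases) are kept or promoted to float32. The helper below is only an illustrative summary of those three if-branches; the function name and the sample tensor names are assumptions, not part of the scripts:

import numpy as np

def output_dtype(ftype: int, data_dtype, name: str, n_dims: int):
    # summary of the three dtype-conversion branches used in the converters above
    if ftype == 0 and data_dtype == np.float16:
        return np.float32
    if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
        return np.float32
    if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return np.float16
    return data_dtype

print(output_dtype(1, np.float32, "gpt_neox.layers.0.attention.query_key_value.weight", 2))  # float16
print(output_dtype(1, np.float16, "gpt_neox.layers.0.input_layernorm.bias", 1))              # float32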
llama.cpp/convert-llama-ggml-to-gguf.py ADDED
@@ -0,0 +1,451 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import math
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

import os
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.F32 : (1, 4),
    gguf.GGMLQuantizationType.F16 : (1, 2),
    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}

class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2

class GGMLFType(IntEnum):
    ALL_F32 = 0
    MOSTLY_F16 = 1
    MOSTLY_Q4_0 = 2
    MOSTLY_Q4_1 = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0 = 7
    MOSTLY_Q5_0 = 8
    MOSTLY_Q5_1 = 9
    MOSTLY_Q2_K = 10
    MOSTLY_Q3_K_S = 11
    MOSTLY_Q3_K_M = 12
    MOSTLY_Q3_K_L = 13
    MOSTLY_Q4_K_S = 14
    MOSTLY_Q4_K_M = 15
    MOSTLY_Q5_K_S = 16
    MOSTLY_Q5_K_M = 17
    MOSTLY_Q6_K = 18

class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
        self.n_layer = self.n_rot = self.n_ff = 0
        self.ftype = GGMLFType.ALL_F32

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]

    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        try:
            self.ftype = GGMLFType(ftype)
        except ValueError:
            raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

class Vocab:
    def __init__(self, load_scores = True):
        self.items = []
        self.load_scores = load_scores

    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
            if self.load_scores:
                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
                offset += 4
            else:
                item_score = 0.0
            self.items.append((item_text, item_score))
        return offset - orig_offset

class Tensor:
    def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
        self.use_padding = use_padding

    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype= dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset

class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []

    def validate_header(self, data, offset):
        magic = bytes(data[offset:offset + 4])
        if magic == b'GGUF':
            raise ValueError('File is already in GGUF format.')
        if magic == b'lmgg':
            self.file_format = GGMLFormat.GGML
            self.format_version = 1
            return 4
        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
        if magic == b'fmgg':
            if version != 1:
                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
            self.file_format = GGMLFormat.GGMF
            self.format_version = version
            return 8
        if magic == b'tjgg':
            if version < 1 or version > 3:
                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
            self.file_format = GGMLFormat.GGJT
            self.format_version = version
            return 8
        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")

    def validate_conversion(self, ftype):
        err = ''
        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
        if len(err) > 0:
            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset

class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        self.special_vocab = special_vocab
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
        print('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
            use_temp_file = False )
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        print(" gguf: write header")
        gguf_writer.write_header_to_file()
        print(" gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print(" gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        if cfg.desc is not None:
            desc = cfg.desc
        else:
            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        print('* Adding model parameters and KV items')
        if name is not None:
+
gguf_writer.add_name(name)
|
270 |
+
gguf_writer.add_description(desc)
|
271 |
+
gguf_writer.add_file_type(int(hp.ftype))
|
272 |
+
if self.params_override is not None:
|
273 |
+
po = self.params_override
|
274 |
+
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
|
275 |
+
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
|
276 |
+
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
|
277 |
+
gguf_writer.add_context_length (po.n_ctx)
|
278 |
+
gguf_writer.add_embedding_length (po.n_embd)
|
279 |
+
gguf_writer.add_block_count (po.n_layer)
|
280 |
+
gguf_writer.add_feed_forward_length (po.n_ff)
|
281 |
+
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
|
282 |
+
gguf_writer.add_head_count (po.n_head)
|
283 |
+
gguf_writer.add_head_count_kv (po.n_head_kv)
|
284 |
+
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
|
285 |
+
return
|
286 |
+
gguf_writer.add_context_length(cfg.context_length)
|
287 |
+
gguf_writer.add_embedding_length(hp.n_embd)
|
288 |
+
gguf_writer.add_block_count(hp.n_layer)
|
289 |
+
gguf_writer.add_feed_forward_length(hp.n_ff)
|
290 |
+
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
|
291 |
+
gguf_writer.add_head_count(hp.n_head)
|
292 |
+
gguf_writer.add_head_count_kv(self.n_kv_head)
|
293 |
+
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
|
294 |
+
|
295 |
+
def add_vocab(self, gguf_writer):
|
296 |
+
hp = self.model.hyperparameters
|
297 |
+
gguf_writer.add_tokenizer_model('llama')
|
298 |
+
tokens = []
|
299 |
+
scores = []
|
300 |
+
toktypes = []
|
301 |
+
if self.vocab_override is not None:
|
302 |
+
vo = self.vocab_override
|
303 |
+
print('* Adding vocab item(s)')
|
304 |
+
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
305 |
+
tokens.append(vbytes)
|
306 |
+
scores.append(score)
|
307 |
+
toktypes.append(ttype)
|
308 |
+
assert len(tokens) == hp.n_vocab, \
|
309 |
+
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
|
310 |
+
gguf_writer.add_token_list(tokens)
|
311 |
+
gguf_writer.add_token_scores(scores)
|
312 |
+
if len(toktypes) > 0:
|
313 |
+
gguf_writer.add_token_types(toktypes)
|
314 |
+
return
|
315 |
+
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
316 |
+
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
|
317 |
+
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
318 |
+
tt = 1 # Normal
|
319 |
+
# Special handling for UNK, BOS, EOS tokens.
|
320 |
+
if tokid <= 2:
|
321 |
+
if tokid == 0:
|
322 |
+
vbytes = b'<unk>'
|
323 |
+
tt = 2
|
324 |
+
elif tokid == 1:
|
325 |
+
vbytes = b'<s>'
|
326 |
+
tt = 3
|
327 |
+
else:
|
328 |
+
vbytes = b'</s>'
|
329 |
+
tt = 3
|
330 |
+
elif len(vbytes) == 0:
|
331 |
+
tt = 3 # Control
|
332 |
+
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
|
333 |
+
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
|
334 |
+
tt = 6 # Byte
|
335 |
+
else:
|
336 |
+
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
|
337 |
+
toktypes.append(tt)
|
338 |
+
tokens.append(vbytes)
|
339 |
+
scores.append(vscore)
|
340 |
+
gguf_writer.add_token_list(tokens)
|
341 |
+
gguf_writer.add_token_scores(scores)
|
342 |
+
gguf_writer.add_token_types(toktypes)
|
343 |
+
gguf_writer.add_unk_token_id(0)
|
344 |
+
gguf_writer.add_bos_token_id(1)
|
345 |
+
gguf_writer.add_eos_token_id(2)
|
346 |
+
|
347 |
+
def add_tensors(self, gguf_writer):
|
348 |
+
tensor_map = self.name_map
|
349 |
+
data = self.data
|
350 |
+
print(f'* Adding {len(self.model.tensors)} tensor(s)')
|
351 |
+
for tensor in self.model.tensors:
|
352 |
+
name = str(tensor.name, 'UTF-8')
|
353 |
+
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
354 |
+
assert mapped_name is not None, f'Bad name {name}'
|
355 |
+
tempdims = list(tensor.dims[:])
|
356 |
+
if len(tempdims) > 1:
|
357 |
+
temp = tempdims[1]
|
358 |
+
tempdims[1] = tempdims[0]
|
359 |
+
tempdims[0] = temp
|
360 |
+
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
|
361 |
+
gguf_writer.add_tensor(
|
362 |
+
mapped_name,
|
363 |
+
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
364 |
+
raw_shape = tempdims,
|
365 |
+
raw_dtype = tensor.dtype )
|
366 |
+
|
367 |
+
def handle_metadata(cfg, hp):
|
368 |
+
import convert
|
369 |
+
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
370 |
+
hf_config_path = cfg.model_metadata_dir / "config.json"
|
371 |
+
orig_config_path = cfg.model_metadata_dir / "params.json"
|
372 |
+
# We pass a fake model here. "original" mode will check the shapes of some
|
373 |
+
# tensors if information is missing in the .json file: other than that, the
|
374 |
+
# model data isn't used so this should be safe (at least for now).
|
375 |
+
fakemodel = {
|
376 |
+
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
|
377 |
+
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
|
378 |
+
}
|
379 |
+
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
|
380 |
+
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
|
381 |
+
if hf_config_path.exists():
|
382 |
+
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
|
383 |
+
elif orig_config_path.exists():
|
384 |
+
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
385 |
+
else:
|
386 |
+
raise ValueError('Unable to load metadata')
|
387 |
+
vocab = convert.load_vocab(
|
388 |
+
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
389 |
+
cfg.vocabtype )
|
390 |
+
# FIXME: Respect cfg.vocab_dir?
|
391 |
+
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
|
392 |
+
convert.check_vocab_size(params, vocab)
|
393 |
+
return (params, vocab, svocab)
|
394 |
+
|
395 |
+
def handle_args():
|
396 |
+
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
397 |
+
parser.add_argument('--input', '-i', type = Path, required = True,
|
398 |
+
help = 'Input GGMLv3 filename')
|
399 |
+
parser.add_argument('--output', '-o', type = Path, required = True,
|
400 |
+
help ='Output GGUF filename')
|
401 |
+
parser.add_argument('--name',
|
402 |
+
help = 'Set model name')
|
403 |
+
parser.add_argument('--desc',
|
404 |
+
help = 'Set model description')
|
405 |
+
parser.add_argument('--gqa', type = int, default = 1,
|
406 |
+
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
407 |
+
parser.add_argument('--eps', default = '5.0e-06',
|
408 |
+
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
409 |
+
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
410 |
+
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
411 |
+
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
412 |
+
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
413 |
+
parser.add_argument("--vocab-dir", type=Path,
|
414 |
+
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
415 |
+
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
416 |
+
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
417 |
+
return parser.parse_args()
|
418 |
+
|
419 |
+
def main():
|
420 |
+
cfg = handle_args()
|
421 |
+
print(f'* Using config: {cfg}')
|
422 |
+
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
|
423 |
+
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
|
424 |
+
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
|
425 |
+
data = np.memmap(cfg.input, mode = 'r')
|
426 |
+
model = GGMLModel()
|
427 |
+
print('* Scanning GGML input file')
|
428 |
+
offset = model.load(data, 0)
|
429 |
+
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
430 |
+
vocab_override = None
|
431 |
+
params_override = None
|
432 |
+
special_vocab = None
|
433 |
+
if cfg.model_metadata_dir is not None:
|
434 |
+
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
|
435 |
+
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
436 |
+
print(f'* Overriding params: {params_override}')
|
437 |
+
print(f'* Overriding vocab: {vocab_override}')
|
438 |
+
print(f'* Special vocab: {special_vocab}')
|
439 |
+
else:
|
440 |
+
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
441 |
+
if model.file_format == GGMLFormat.GGML:
|
442 |
+
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
443 |
+
converter = GGMLToGGUF(model, data, cfg,
|
444 |
+
params_override = params_override,
|
445 |
+
vocab_override = vocab_override,
|
446 |
+
special_vocab = special_vocab )
|
447 |
+
converter.save()
|
448 |
+
print(f'* Successful completion. Output saved to: {cfg.output}')
|
449 |
+
|
450 |
+
if __name__ == '__main__':
|
451 |
+
main()
|
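Aside: the `--gqa` handling in GGMLToGGUF.__init__ above back-solves the number of KV heads by scanning candidate divisors of n_head. A minimal sketch of that arithmetic, with purely illustrative head counts (n_head = 64, gqa = 8, roughly a LLaMA2-70B-style configuration, not values read from any file):

# Minimal sketch of the n_kv_head guess used above; the inputs are assumptions.
def guess_n_kv_head(n_head: int, gqa: float) -> int:
    for x in range(1, 256):
        if float(n_head) / float(x) == gqa:
            return x
    raise ValueError("Couldn't determine n_kv_head from GQA param")

assert guess_n_kv_head(64, 8.0) == 8  # 64 query heads shared over 8 KV heads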
llama.cpp/convert-mpt-hf-to-gguf.py
ADDED
@@ -0,0 +1,218 @@
#!/usr/bin/env python3
# HF mpt --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "MPTForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.MPT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layers"]

gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_seq_len"])
gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
    gguf_writer.add_head_count_kv(kv_n_heads)
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
# accommodate some "reserved" tokens; this is causing problems down the line in
# llama.cpp, so we pad the vocab with dummy tokens:

vocab_size = hparams["vocab_size"]

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            continue  # for the sake of compatibility with some old published models, don't quit
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        # note: MPT output is tied to (same as) wte in original model;
        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
        if new_name == "token_embd.weight":
            gguf_writer.add_tensor("output.weight", data)

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
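Aside: the dummy-token padding used above (embedding rows that have no tokenizer entry become "[PADi]" placeholders) can be shown in isolation. The toy vocab and row count below are invented for demonstration, not MPT's real tokenizer:

# Toy illustration of the [PADi] padding used above.
toy_vocab = {"hello": 0, "world": 1}  # assumed tiny tokenizer
embedding_rows = 5                    # assumed stand-in for hparams["vocab_size"]

reverse_vocab = {tok_id: tok for tok, tok_id in toy_vocab.items()}
tokens = [reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]" for i in range(embedding_rows)]
print(tokens)  # ['hello', 'world', '[PAD2]', '[PAD3]', '[PAD4]']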
llama.cpp/convert-persimmon-to-gguf.py
ADDED
@@ -0,0 +1,130 @@
import torch
import os
from pprint import pprint
import sys
import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
    return None

def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float

        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)

        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
    pass
    return tokens, scores, toktypes

def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])

    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size

    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)

    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)

    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")



if __name__ == '__main__':
    main()
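Aside: _flatten_dict above turns the nested checkpoint dictionary into dotted tensor names before mapping them to GGUF names. A small self-contained sketch of the same idea, using plain values instead of torch tensors so it runs on its own (the nested dict is an invented example):

# Same flattening idea as _flatten_dict above, shown on plain values.
def flatten(dct, out, prefix=None):
    for key, val in dct.items():
        name = f"{prefix}.{key}" if prefix is not None else key
        if isinstance(val, dict):
            flatten(val, out, name)
        else:
            out[name] = val

out = {}
flatten({"model": {"layers": {"0": {"weight": 1.0}}}}, out)
print(out)  # {'model.layers.0.weight': 1.0}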
llama.cpp/convert-refact-hf-to-gguf.py
ADDED
@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# HF refact --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf

def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Refact model to a GGML compatible file"
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model",
        type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype",
        type=int,
        choices=[0, 1],
        default=1,
        nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()


args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTRefactForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

# Get refact feed forward dimension
hidden_dim = hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

block_count = hparams["n_layer"]

gguf_writer.add_name("Refact")
# refact uses Alibi. So this is from config.json which might be used by training.
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])

gguf_writer.add_feed_forward_length(ff_dim)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)

        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
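Aside: the per-layer renaming loop above splits Refact's fused attn.kv and mlp.gate_up_proj matrices by simple row slicing. A small numpy sketch of that split; the sizes below are made-up stand-ins for n_head_kv * head_dim, ff_dim and n_embd, not values from any real checkpoint:

import numpy as np

kv_rows, ff_dim, n_embd = 4, 6, 8  # assumed toy sizes

fused_kv = np.arange(2 * kv_rows * n_embd).reshape(2 * kv_rows, n_embd)
k_proj, v_proj = fused_kv[:kv_rows], fused_kv[kv_rows:]                # same split as attn.kv above

fused_gate_up = np.arange(2 * ff_dim * n_embd).reshape(2 * ff_dim, n_embd)
gate_proj, up_proj = fused_gate_up[:ff_dim], fused_gate_up[ff_dim:]    # same split as mlp.gate_up_proj

assert k_proj.shape == v_proj.shape == (kv_rows, n_embd)
assert gate_proj.shape == up_proj.shape == (ff_dim, n_embd)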
llama.cpp/convert-starcoder-hf-to-gguf.py
ADDED
@@ -0,0 +1,202 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
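Aside: the dtype policy applied in the tensor loop above (and in the MPT and Refact converters) can be restated as one small helper. This is only a sketch of the same rules for clarity; the helper name and the example tensor name are illustrative, not part of any script here:

import numpy as np

def target_dtype(ftype: int, data_dtype, name: str, n_dims: int):
    # ftype == 0: keep everything in float32
    if ftype == 0 and data_dtype == np.float16:
        return np.float32
    # ftype == 1: 1-D float16 tensors are stored as float32 (see the TODO above),
    # and 2-D float32 ".weight" tensors are downcast to float16
    if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
        return np.float32
    if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return np.float16
    return data_dtype

assert target_dtype(1, np.float32, "blk.0.attn_qkv.weight", 2) == np.float16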
llama.cpp/examples/finetune/convert-finetune-checkpoint-to-gguf.py
ADDED
@@ -0,0 +1,489 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# finetune checkpoint --> gguf conversion
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import gguf
|
6 |
+
import os
|
7 |
+
import struct
|
8 |
+
import sys
|
9 |
+
import numpy as np
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
# gguf constants
|
13 |
+
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
14 |
+
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
15 |
+
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
16 |
+
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
17 |
+
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
18 |
+
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
19 |
+
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
20 |
+
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
21 |
+
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
22 |
+
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
23 |
+
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
24 |
+
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
25 |
+
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
26 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
27 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
28 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
29 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
30 |
+
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
31 |
+
|
32 |
+
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
33 |
+
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
34 |
+
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
35 |
+
|
36 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
37 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
38 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
39 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
40 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
41 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
42 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
43 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
44 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
45 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
46 |
+
|
47 |
+
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
48 |
+
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
49 |
+
LLM_KV_TRAINING_TYPE = "training.type"
|
50 |
+
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
51 |
+
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
52 |
+
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
53 |
+
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
54 |
+
|
55 |
+
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
|
56 |
+
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
|
57 |
+
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
|
58 |
+
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
|
59 |
+
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
|
60 |
+
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
|
61 |
+
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
|
62 |
+
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
|
63 |
+
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
|
64 |
+
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
|
65 |
+
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
|
66 |
+
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
|
67 |
+
|
68 |
+
class Tensor:
|
69 |
+
def __init__(self, dtype='f', ne=None):
|
70 |
+
if ne is None:
|
71 |
+
ne = []
|
72 |
+
self.dtype = dtype
|
73 |
+
self.ne = ne
|
74 |
+
self.nbytes = 0
|
75 |
+
if self.dtype == 'f':
|
76 |
+
if len(self.ne) == 0:
|
77 |
+
self.nbytes = 0
|
78 |
+
else:
|
79 |
+
self.nbytes = int(np.product(self.ne)) * 4
|
80 |
+
else:
|
81 |
+
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
82 |
+
|
83 |
+
def load(self, data, offset):
|
84 |
+
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
85 |
+
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
86 |
+
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
87 |
+
|
88 |
+
assert(nd == len(self.ne))
|
89 |
+
ne = []
|
90 |
+
for d in range(nd):
|
91 |
+
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
92 |
+
ne.append(n)
|
93 |
+
|
94 |
+
if tuple(ne) != tuple(self.ne):
|
95 |
+
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
|
96 |
+
|
97 |
+
if self.dtype == 'f':
|
98 |
+
assert(dtype == 0)
|
99 |
+
else:
|
100 |
+
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
101 |
+
|
102 |
+
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
103 |
+
# 32-byte alignment
|
104 |
+
offset += (0 - offset) & 31
|
105 |
+
self.data = data[offset:offset+self.nbytes]
|
106 |
+
offset += self.nbytes
|
107 |
+
return offset
|
108 |
+
|
109 |
+
def max_storage_size(self):
|
110 |
+
result = 0
|
111 |
+
result += 4 # nd
|
112 |
+
result += 4 # namelen
|
113 |
+
result += 4 # dtype
|
114 |
+
result += len(self.ne)*8 # ne
|
115 |
+
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
116 |
+
result += 31 # 32-byte alignment
|
117 |
+
result += self.nbytes
|
118 |
+
return result
|
119 |
+
|
120 |
+
def save_gguf(self, gguf_writer, name):
|
121 |
+
gguf_writer.add_tensor(
|
122 |
+
name=name,
|
123 |
+
tensor=self.data,
|
124 |
+
raw_shape=np.array(list(reversed(self.ne))),
|
125 |
+
raw_dtype=gguf.GGMLQuantizationType.F32)
|
126 |
+
|
127 |
+
class OptimizationContext:
|
128 |
+
def __init__(self):
|
129 |
+
pass
|
130 |
+
|
131 |
+
def load(self, data, offset):
|
132 |
+
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
133 |
+
offset += 4
|
134 |
+
|
135 |
+
if self.version != 1:
|
136 |
+
raise ValueError('Invalid version of optimization context in checkpoint file')
|
137 |
+
|
138 |
+
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
139 |
+
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
140 |
+
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
141 |
+
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
142 |
+
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
143 |
+
|
144 |
+
self.adam_m = Tensor('f', [self.nx])
|
145 |
+
self.adam_v = Tensor('f', [self.nx])
|
146 |
+
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
147 |
+
|
148 |
+
self.lbfgs_x = Tensor('f', [self.nx])
|
149 |
+
self.lbfgs_xp = Tensor('f', [self.nx])
|
150 |
+
self.lbfgs_g = Tensor('f', [self.nx])
|
151 |
+
self.lbfgs_gp = Tensor('f', [self.nx])
|
152 |
+
self.lbfgs_d = Tensor('f', [self.nx])
|
153 |
+
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
154 |
+
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
155 |
+
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
156 |
+
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
157 |
+
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
158 |
+
|
159 |
+
# forgot to save type in version 1:
|
160 |
+
# guess self.type from number of remaining bytes
|
161 |
+
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
162 |
+
[self.adam_m, self.adam_v]
|
163 |
+
+([self.adam_pf] if (self.past > 0) else [])])
|
164 |
+
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
165 |
+
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
166 |
+
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
167 |
+
self.lbfgs_lmal, self.lbfgs_lmys,
|
168 |
+
self.lbfgs_lms, self.lbfgs_lmy]
|
169 |
+
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
170 |
+
# due to alignment padding the size might not be exact
|
171 |
+
# but the difference in size for both types is significant,
|
172 |
+
# so we can just use whichever is closest
|
173 |
+
remaining = len(data) - offset
|
174 |
+
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
175 |
+
self.type = 0
|
176 |
+
else:
|
177 |
+
self.type = 1
|
178 |
+
|
179 |
+
if self.type == 0:
|
180 |
+
offset = self.adam_m.load(data, offset)
|
181 |
+
offset = self.adam_v.load(data, offset)
|
182 |
+
offset = self.adam_pf.load(data,offset)
|
183 |
+
|
184 |
+
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
185 |
+
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
186 |
+
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
187 |
+
|
188 |
+
elif self.type == 1:
|
189 |
+
offset = self.lbfgs_x.load(data, offset)
|
190 |
+
offset = self.lbfgs_xp.load(data, offset)
|
191 |
+
offset = self.lbfgs_g.load(data, offset)
|
192 |
+
offset = self.lbfgs_gp.load(data, offset)
|
193 |
+
offset = self.lbfgs_d.load(data, offset)
|
194 |
+
offset = self.lbfgs_pf.load(data, offset)
|
195 |
+
offset = self.lbfgs_lmal.load(data, offset)
|
196 |
+
offset = self.lbfgs_lmys.load(data, offset)
|
197 |
+
offset = self.lbfgs_lms.load(data, offset)
|
198 |
+
offset = self.lbfgs_lmy.load(data, offset)
|
199 |
+
|
200 |
+
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
201 |
+
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
202 |
+
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
203 |
+
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
204 |
+
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
205 |
+
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
206 |
+
|
207 |
+
else:
|
208 |
+
raise ValueError(f"Invalid optimizer type '{self.type}'")
|
209 |
+
|
210 |
+
return offset
|
211 |
+
|
212 |
+
def save_gguf(self, gguf_writer):
|
213 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
214 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
215 |
+
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
216 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
217 |
+
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
218 |
+
|
219 |
+
if self.type == 0:
|
220 |
+
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
221 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
222 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
223 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
224 |
+
|
225 |
+
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
226 |
+
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
227 |
+
if self.past > 0:
|
228 |
+
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
229 |
+
|
230 |
+
elif self.type == 1:
|
231 |
+
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
232 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
233 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
234 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
235 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
236 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
237 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
238 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
239 |
+
|
240 |
+
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
241 |
+
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
242 |
+
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
243 |
+
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
244 |
+
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
245 |
+
if self.past > 0:
|
246 |
+
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
247 |
+
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
248 |
+
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
249 |
+
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
250 |
+
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
251 |
+
else:
|
252 |
+
raise ValueError('Unknown optimizer type')
|
253 |
+
|
254 |
+
class LoraParams:
|
255 |
+
def __init__(self):
|
256 |
+
pass
|
257 |
+
|
258 |
+
def load(self, data, offset):
|
259 |
+
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
260 |
+
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
261 |
+
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
262 |
+
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
263 |
+
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
264 |
+
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
265 |
+
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
266 |
+
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
267 |
+
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
268 |
+
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
269 |
+
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
270 |
+
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
271 |
+
return offset
|
272 |
+
|
273 |
+
def save_gguf(self, gguf_writer):
|
274 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
|
275 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
|
276 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
|
277 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
|
278 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
|
279 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
|
280 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
|
281 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
|
282 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
|
283 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
|
284 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
|
285 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
|
286 |
+
|
287 |
+
class ModelParams:
|
288 |
+
def __init__(self, n_ff = None):
|
289 |
+
self.n_ff = n_ff
|
290 |
+
|
291 |
+
def load(self, data, offset):
|
292 |
+
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
293 |
+
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
294 |
+
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
295 |
+
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
296 |
+
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
297 |
+
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
298 |
+
return offset
|
299 |
+
|
300 |
+
def get_n_ff(self):
|
301 |
+
if self.n_ff is None:
|
302 |
+
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
303 |
+
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
304 |
+
else:
|
305 |
+
return self.n_ff
|
306 |
+
|
307 |
+
def save_gguf(self, gguf_writer):
|
308 |
+
# self.n_vocab not saved
|
309 |
+
gguf_writer.add_embedding_length(self.n_embd)
|
310 |
+
gguf_writer.add_head_count(self.n_head)
|
311 |
+
gguf_writer.add_block_count(self.n_layer)
|
312 |
+
gguf_writer.add_rope_dimension_count(self.n_rot)
|
313 |
+
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
314 |
+
|
315 |
+
def tensor_name(key, bid=None, suffix=".weight"):
|
316 |
+
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
|
317 |
+
|
318 |
+
class Layer:
|
319 |
+
def __init__(self, params, lora_params, bid):
|
320 |
+
self.bid = bid
|
321 |
+
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
|
322 |
+
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
|
323 |
+
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
324 |
+
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
325 |
+
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
326 |
+
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
327 |
+
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
328 |
+
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
329 |
+
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
330 |
+
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
331 |
+
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
|
332 |
+
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
|
333 |
+
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
|
334 |
+
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
|
335 |
+
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
|
336 |
+
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
|
337 |
+
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
|
338 |
+
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
|
339 |
+
|
340 |
+
def load(self, data, offset):
|
341 |
+
offset = self.att_norm_a.load(data, offset)
|
342 |
+
offset = self.att_norm_b.load(data, offset)
|
343 |
+
offset = self.wq_a.load(data, offset)
|
344 |
+
offset = self.wq_b.load(data, offset)
|
345 |
+
offset = self.wk_a.load(data, offset)
|
346 |
+
offset = self.wk_b.load(data, offset)
|
347 |
+
offset = self.wv_a.load(data, offset)
|
348 |
+
offset = self.wv_b.load(data, offset)
|
349 |
+
offset = self.wo_a.load(data, offset)
|
350 |
+
offset = self.wo_b.load(data, offset)
|
351 |
+
offset = self.ffn_norm_a.load(data, offset)
|
352 |
+
offset = self.ffn_norm_b.load(data, offset)
|
353 |
+
offset = self.w1_a.load(data, offset)
|
354 |
+
offset = self.w1_b.load(data, offset)
|
355 |
+
offset = self.w2_a.load(data, offset)
|
356 |
+
offset = self.w2_b.load(data, offset)
|
357 |
+
offset = self.w3_a.load(data, offset)
|
358 |
+
offset = self.w3_b.load(data, offset)
|
359 |
+
return offset
|
360 |
+
|
361 |
+
def save_gguf(self, gguf_writer):
|
362 |
+
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
|
363 |
+
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
|
364 |
+
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
|
365 |
+
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
|
366 |
+
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
|
367 |
+
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
|
368 |
+
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
|
369 |
+
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
|
370 |
+
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
|
371 |
+
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
|
372 |
+
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
|
373 |
+
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
|
374 |
+
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
|
375 |
+
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
|
376 |
+
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
|
377 |
+
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
|
378 |
+
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
|
379 |
+
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
|
380 |
+
|
381 |
+
class LoraModel:
|
382 |
+
def __init__(self, n_ff = None):
|
383 |
+
self.params = ModelParams(n_ff = n_ff)
|
384 |
+
self.lora_params = LoraParams()
|
385 |
+
self.layers = []
|
386 |
+
|
387 |
+
def load(self, data, offset):
|
388 |
+
offset = self.params.load(data, offset)
|
389 |
+
offset = self.lora_params.load(data, offset)
|
390 |
+
|
391 |
+
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
|
392 |
+
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
|
393 |
+
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
|
394 |
+
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
|
395 |
+
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
|
396 |
+
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
|
397 |
+
|
398 |
+
offset = self.tok_embd_a.load(data, offset)
|
399 |
+
offset = self.tok_embd_b.load(data, offset)
|
400 |
+
offset = self.norm_a.load(data, offset)
|
401 |
+
offset = self.norm_b.load(data, offset)
|
402 |
+
offset = self.output_a.load(data, offset)
|
403 |
+
offset = self.output_b.load(data, offset)
|
404 |
+
|
405 |
+
self.layers.clear()
|
406 |
+
for bid in range(self.params.n_layer):
|
407 |
+
layer = Layer(self.params, self.lora_params, bid)
|
408 |
+
offset = layer.load(data, offset)
|
409 |
+
self.layers.append(layer)
|
410 |
+
|
411 |
+
return offset
|
412 |
+
|
413 |
+
def save_gguf(self, gguf_writer):
|
414 |
+
self.params.save_gguf(gguf_writer)
|
415 |
+
self.lora_params.save_gguf(gguf_writer)
|
416 |
+
|
417 |
+
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
|
418 |
+
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
|
419 |
+
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
|
420 |
+
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
|
421 |
+
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
|
422 |
+
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
|
423 |
+
|
424 |
+
for layer in self.layers:
|
425 |
+
layer.save_gguf(gguf_writer)
|
426 |
+
|
427 |
+
class LoraCheckpoint:
|
428 |
+
def __init__(self, n_ff = None):
|
429 |
+
self.model = LoraModel(n_ff = n_ff)
|
430 |
+
self.opt_ctx = OptimizationContext()
|
431 |
+
|
432 |
+
def load(self, data, offset):
|
433 |
+
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
434 |
+
if magic != b'ggcl':
|
435 |
+
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
|
436 |
+
|
437 |
+
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
438 |
+
if self.version != 0:
|
439 |
+
raise ValueError('Invalid version of checkpoint file')
|
440 |
+
|
441 |
+
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
442 |
+
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
443 |
+
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
444 |
+
|
445 |
+
offset = self.model.load(data, offset)
|
446 |
+
offset = self.opt_ctx.load(data, offset)
|
447 |
+
|
448 |
+
return offset
|
449 |
+
|
450 |
+
def save_gguf(self, gguf_writer):
|
451 |
+
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
452 |
+
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
453 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
454 |
+
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
|
455 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
456 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
457 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
458 |
+
self.model.save_gguf(gguf_writer)
|
459 |
+
self.opt_ctx.save_gguf(gguf_writer)
|
460 |
+
|
461 |
+
def handle_args():
|
462 |
+
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
|
463 |
+
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
|
464 |
+
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
|
465 |
+
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
|
466 |
+
return parser.parse_args()
|
467 |
+
|
468 |
+
def main():
|
469 |
+
cfg = handle_args()
|
470 |
+
print(cfg)
|
471 |
+
data = np.memmap(cfg.input, mode = 'r')
|
472 |
+
chk = LoraCheckpoint(n_ff = cfg.ff)
|
473 |
+
offset = 0
|
474 |
+
offset = chk.load(data, offset)
|
475 |
+
# we should have read all available data
|
476 |
+
assert(offset == len(data))
|
477 |
+
|
478 |
+
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
479 |
+
chk.save_gguf(gguf_writer)
|
480 |
+
print(" gguf: write header")
|
481 |
+
gguf_writer.write_header_to_file()
|
482 |
+
print(" gguf: write metadata")
|
483 |
+
gguf_writer.write_kv_data_to_file()
|
484 |
+
print(" gguf: write tensors")
|
485 |
+
gguf_writer.write_tensors_to_file()
|
486 |
+
gguf_writer.close()
|
487 |
+
|
488 |
+
if __name__ == '__main__':
|
489 |
+
main()
|
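For reference, the classes above can also be driven directly from Python rather than via the command line; the following is a minimal sketch that mirrors the script's own main() flow (the checkpoint and output filenames are hypothetical):

import numpy as np
import gguf

# map the finetune-lora checkpoint into memory and parse it with the classes above
data = np.memmap('checkpoint-lora.bin', mode='r')    # hypothetical input path
chk = LoraCheckpoint(n_ff=None)                      # pass n_ff explicitly if Tensor.load reports a size mismatch
offset = chk.load(data, 0)
assert offset == len(data)                           # the whole file should have been consumed

# write the parsed checkpoint back out as GGUF
gguf_writer = gguf.GGUFWriter('checkpoint-lora.gguf', gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file=False)
chk.save_gguf(gguf_writer)
gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file()
gguf_writer.close()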
llama.cpp/examples/gguf/CMakeLists.txt
ADDED
@@ -0,0 +1,5 @@
+set(TARGET gguf)
+add_executable(${TARGET} gguf.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
llama.cpp/examples/gguf/gguf.cpp
ADDED
@@ -0,0 +1,249 @@
1 |
+
#include "ggml.h"
|
2 |
+
#include "llama.h"
|
3 |
+
|
4 |
+
#include <cstdio>
|
5 |
+
#include <cinttypes>
|
6 |
+
#include <string>
|
7 |
+
#include <sstream>
|
8 |
+
#include <fstream>
|
9 |
+
#include <vector>
|
10 |
+
|
11 |
+
#undef MIN
|
12 |
+
#undef MAX
|
13 |
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
14 |
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
15 |
+
|
16 |
+
template <typename T>
|
17 |
+
static std::string to_string(const T & val) {
|
18 |
+
std::stringstream ss;
|
19 |
+
ss << val;
|
20 |
+
return ss.str();
|
21 |
+
}
|
22 |
+
|
23 |
+
static bool gguf_ex_write(const std::string & fname) {
|
24 |
+
struct gguf_context * ctx = gguf_init_empty();
|
25 |
+
|
26 |
+
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
27 |
+
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
|
28 |
+
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
|
29 |
+
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
|
30 |
+
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
31 |
+
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
32 |
+
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
33 |
+
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
|
34 |
+
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
|
35 |
+
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
|
36 |
+
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
37 |
+
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
38 |
+
|
39 |
+
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
|
40 |
+
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
|
41 |
+
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
|
42 |
+
|
43 |
+
struct ggml_init_params params = {
|
44 |
+
/*.mem_size =*/ 128ull*1024ull*1024ull,
|
45 |
+
/*.mem_buffer =*/ NULL,
|
46 |
+
/*.no_alloc =*/ false,
|
47 |
+
};
|
48 |
+
|
49 |
+
struct ggml_context * ctx_data = ggml_init(params);
|
50 |
+
|
51 |
+
const int n_tensors = 10;
|
52 |
+
|
53 |
+
// tensor infos
|
54 |
+
for (int i = 0; i < n_tensors; ++i) {
|
55 |
+
const std::string name = "tensor_" + to_string(i);
|
56 |
+
|
57 |
+
int64_t ne[GGML_MAX_DIMS] = { 1 };
|
58 |
+
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
|
59 |
+
|
60 |
+
for (int j = 0; j < n_dims; ++j) {
|
61 |
+
ne[j] = rand() % 10 + 1;
|
62 |
+
}
|
63 |
+
|
64 |
+
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
|
65 |
+
ggml_set_name(cur, name.c_str());
|
66 |
+
|
67 |
+
{
|
68 |
+
float * data = (float *) cur->data;
|
69 |
+
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
70 |
+
data[j] = 100 + i;
|
71 |
+
}
|
72 |
+
}
|
73 |
+
|
74 |
+
gguf_add_tensor(ctx, cur);
|
75 |
+
}
|
76 |
+
|
77 |
+
gguf_write_to_file(ctx, fname.c_str(), false);
|
78 |
+
|
79 |
+
printf("%s: wrote file '%s;\n", __func__, fname.c_str());
|
80 |
+
|
81 |
+
ggml_free(ctx_data);
|
82 |
+
gguf_free(ctx);
|
83 |
+
|
84 |
+
return true;
|
85 |
+
}
|
86 |
+
|
87 |
+
// just read tensor info
|
88 |
+
static bool gguf_ex_read_0(const std::string & fname) {
|
89 |
+
struct gguf_init_params params = {
|
90 |
+
/*.no_alloc = */ false,
|
91 |
+
/*.ctx = */ NULL,
|
92 |
+
};
|
93 |
+
|
94 |
+
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
95 |
+
|
96 |
+
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
|
97 |
+
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
98 |
+
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
99 |
+
|
100 |
+
// kv
|
101 |
+
{
|
102 |
+
const int n_kv = gguf_get_n_kv(ctx);
|
103 |
+
|
104 |
+
printf("%s: n_kv: %d\n", __func__, n_kv);
|
105 |
+
|
106 |
+
for (int i = 0; i < n_kv; ++i) {
|
107 |
+
const char * key = gguf_get_key(ctx, i);
|
108 |
+
|
109 |
+
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
110 |
+
}
|
111 |
+
}
|
112 |
+
|
113 |
+
// find kv string
|
114 |
+
{
|
115 |
+
const char * findkey = "some.parameter.string";
|
116 |
+
|
117 |
+
const int keyidx = gguf_find_key(ctx, findkey);
|
118 |
+
if (keyidx == -1) {
|
119 |
+
printf("%s: find key: %s not found.\n", __func__, findkey);
|
120 |
+
} else {
|
121 |
+
const char * key_value = gguf_get_val_str(ctx, keyidx);
|
122 |
+
printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
|
123 |
+
}
|
124 |
+
}
|
125 |
+
|
126 |
+
// tensor info
|
127 |
+
{
|
128 |
+
const int n_tensors = gguf_get_n_tensors(ctx);
|
129 |
+
|
130 |
+
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
131 |
+
|
132 |
+
for (int i = 0; i < n_tensors; ++i) {
|
133 |
+
const char * name = gguf_get_tensor_name (ctx, i);
|
134 |
+
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
135 |
+
|
136 |
+
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
137 |
+
}
|
138 |
+
}
|
139 |
+
|
140 |
+
gguf_free(ctx);
|
141 |
+
|
142 |
+
return true;
|
143 |
+
}
|
144 |
+
|
145 |
+
// read and create ggml_context containing the tensors and their data
|
146 |
+
static bool gguf_ex_read_1(const std::string & fname) {
|
147 |
+
struct ggml_context * ctx_data = NULL;
|
148 |
+
|
149 |
+
struct gguf_init_params params = {
|
150 |
+
/*.no_alloc = */ false,
|
151 |
+
/*.ctx = */ &ctx_data,
|
152 |
+
};
|
153 |
+
|
154 |
+
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
155 |
+
|
156 |
+
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
|
157 |
+
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
158 |
+
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
159 |
+
|
160 |
+
// kv
|
161 |
+
{
|
162 |
+
const int n_kv = gguf_get_n_kv(ctx);
|
163 |
+
|
164 |
+
printf("%s: n_kv: %d\n", __func__, n_kv);
|
165 |
+
|
166 |
+
for (int i = 0; i < n_kv; ++i) {
|
167 |
+
const char * key = gguf_get_key(ctx, i);
|
168 |
+
|
169 |
+
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
170 |
+
}
|
171 |
+
}
|
172 |
+
|
173 |
+
// tensor info
|
174 |
+
{
|
175 |
+
const int n_tensors = gguf_get_n_tensors(ctx);
|
176 |
+
|
177 |
+
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
178 |
+
|
179 |
+
for (int i = 0; i < n_tensors; ++i) {
|
180 |
+
const char * name = gguf_get_tensor_name (ctx, i);
|
181 |
+
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
182 |
+
|
183 |
+
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
184 |
+
}
|
185 |
+
}
|
186 |
+
|
187 |
+
// data
|
188 |
+
{
|
189 |
+
const int n_tensors = gguf_get_n_tensors(ctx);
|
190 |
+
|
191 |
+
for (int i = 0; i < n_tensors; ++i) {
|
192 |
+
printf("%s: reading tensor %d data\n", __func__, i);
|
193 |
+
|
194 |
+
const char * name = gguf_get_tensor_name(ctx, i);
|
195 |
+
|
196 |
+
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
197 |
+
|
198 |
+
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
199 |
+
|
200 |
+
// print first 10 elements
|
201 |
+
const float * data = (const float *) cur->data;
|
202 |
+
|
203 |
+
printf("%s data[:10] : ", name);
|
204 |
+
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
|
205 |
+
printf("%f ", data[j]);
|
206 |
+
}
|
207 |
+
printf("\n\n");
|
208 |
+
|
209 |
+
// check data
|
210 |
+
{
|
211 |
+
const float * data = (const float *) cur->data;
|
212 |
+
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
213 |
+
if (data[j] != 100 + i) {
|
214 |
+
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
|
215 |
+
return false;
|
216 |
+
}
|
217 |
+
}
|
218 |
+
}
|
219 |
+
}
|
220 |
+
}
|
221 |
+
|
222 |
+
printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
223 |
+
|
224 |
+
ggml_free(ctx_data);
|
225 |
+
gguf_free(ctx);
|
226 |
+
|
227 |
+
return true;
|
228 |
+
}
|
229 |
+
|
230 |
+
int main(int argc, char ** argv) {
|
231 |
+
if (argc < 3) {
|
232 |
+
printf("usage: %s data.gguf r|w\n", argv[0]);
|
233 |
+
return -1;
|
234 |
+
}
|
235 |
+
|
236 |
+
const std::string fname(argv[1]);
|
237 |
+
const std::string mode (argv[2]);
|
238 |
+
|
239 |
+
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
|
240 |
+
|
241 |
+
if (mode == "w") {
|
242 |
+
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
243 |
+
} else if (mode == "r") {
|
244 |
+
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
245 |
+
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
246 |
+
}
|
247 |
+
|
248 |
+
return 0;
|
249 |
+
}
|
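As a quick cross-check of the file produced by gguf_ex_write, the fixed-size GGUF header can be read back from Python with nothing but struct. This is a minimal sketch assuming the GGUF v2 header layout (4-byte magic 'GGUF', uint32 version, uint64 tensor count, uint64 metadata KV count); the file name is hypothetical:

import struct

with open('data.gguf', 'rb') as f:                   # hypothetical path, e.g. produced by the 'w' mode above
    magic = f.read(4)
    assert magic == b'GGUF', 'not a GGUF file'
    version,   = struct.unpack('<I', f.read(4))      # file format version
    n_tensors, = struct.unpack('<Q', f.read(8))      # number of tensors
    n_kv,      = struct.unpack('<Q', f.read(8))      # number of metadata key/value pairs

print(f'version={version}, tensors={n_tensors}, kv pairs={n_kv}')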
llama.cpp/examples/llava/convert-image-encoder-to-gguf.py
ADDED
@@ -0,0 +1,250 @@
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import numpy as np
|
7 |
+
from gguf import *
|
8 |
+
from transformers import CLIPModel, CLIPProcessor
|
9 |
+
|
10 |
+
TEXT = "clip.text"
|
11 |
+
VISION = "clip.vision"
|
12 |
+
|
13 |
+
|
14 |
+
def k(raw_key: str, arch: str) -> str:
|
15 |
+
return raw_key.format(arch=arch)
|
16 |
+
|
17 |
+
|
18 |
+
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
|
19 |
+
if name in (
|
20 |
+
"logit_scale",
|
21 |
+
"text_model.embeddings.position_ids",
|
22 |
+
"vision_model.embeddings.position_ids",
|
23 |
+
):
|
24 |
+
return True
|
25 |
+
|
26 |
+
if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
|
27 |
+
return True
|
28 |
+
|
29 |
+
if name.startswith("v") and not has_vision:
|
30 |
+
return True
|
31 |
+
|
32 |
+
if name.startswith("t") and not has_text:
|
33 |
+
return True
|
34 |
+
|
35 |
+
return False
|
36 |
+
|
37 |
+
|
38 |
+
def get_tensor_name(name: str) -> str:
|
39 |
+
if "projection" in name:
|
40 |
+
return name
|
41 |
+
|
42 |
+
if "mm_projector" in name:
|
43 |
+
return name.replace("model.mm_projector", "mm")
|
44 |
+
|
45 |
+
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
46 |
+
|
47 |
+
|
48 |
+
def bytes_to_unicode():
|
49 |
+
"""
|
50 |
+
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
|
51 |
+
The reversible bpe codes work on unicode strings.
|
52 |
+
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
53 |
+
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
54 |
+
This is a significant percentage of your normal, say, 32K bpe vocab.
|
55 |
+
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
56 |
+
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
57 |
+
"""
|
58 |
+
bs = (
|
59 |
+
list(range(ord("!"), ord("~") + 1))
|
60 |
+
+ list(range(ord("¡"), ord("¬") + 1))
|
61 |
+
+ list(range(ord("®"), ord("ÿ") + 1))
|
62 |
+
)
|
63 |
+
cs = bs[:]
|
64 |
+
n = 0
|
65 |
+
for b in range(2**8):
|
66 |
+
if b not in bs:
|
67 |
+
bs.append(b)
|
68 |
+
cs.append(2**8 + n)
|
69 |
+
n += 1
|
70 |
+
cs = [chr(n) for n in cs]
|
71 |
+
return dict(zip(bs, cs))
|
72 |
+
|
73 |
+
|
74 |
+
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
|
75 |
+
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
76 |
+
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
77 |
+
ap.add_argument("--text-only", action="store_true", required=False,
|
78 |
+
help="Save a text-only model. It can't be used to encode images")
|
79 |
+
ap.add_argument("--vision-only", action="store_true", required=False,
|
80 |
+
help="Save a vision-only model. It can't be used to encode texts")
|
81 |
+
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
82 |
+
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
83 |
+
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
84 |
+
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
85 |
+
|
86 |
+
args = ap.parse_args()
|
87 |
+
|
88 |
+
|
89 |
+
if args.text_only and args.vision_only:
|
90 |
+
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
91 |
+
exit(1)
|
92 |
+
|
93 |
+
if args.use_f32:
|
94 |
+
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
95 |
+
|
96 |
+
# output in the same directory as the model if output_dir is None
|
97 |
+
dir_model = args.model_dir
|
98 |
+
|
99 |
+
|
100 |
+
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
101 |
+
vocab = json.load(f)
|
102 |
+
tokens = [key for key in vocab]
|
103 |
+
|
104 |
+
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
105 |
+
config = json.load(f)
|
106 |
+
v_hparams = config["vision_config"]
|
107 |
+
t_hparams = config["text_config"]
|
108 |
+
|
109 |
+
# possible data types
|
110 |
+
# ftype == 0 -> float32
|
111 |
+
# ftype == 1 -> float16
|
112 |
+
#
|
113 |
+
# map from ftype to string
|
114 |
+
ftype_str = ["f32", "f16"]
|
115 |
+
|
116 |
+
ftype = 1
|
117 |
+
if args.use_f32:
|
118 |
+
ftype = 0
|
119 |
+
|
120 |
+
|
121 |
+
model = CLIPModel.from_pretrained(dir_model)
|
122 |
+
processor = CLIPProcessor.from_pretrained(dir_model)
|
123 |
+
|
124 |
+
fname_middle = None
|
125 |
+
has_text_encoder = True
|
126 |
+
has_vision_encoder = True
|
127 |
+
has_llava_projector = False
|
128 |
+
if args.text_only:
|
129 |
+
fname_middle = "text-"
|
130 |
+
has_vision_encoder = False
|
131 |
+
elif args.vision_only:
|
132 |
+
fname_middle = "vision-"
|
133 |
+
has_text_encoder = False
|
134 |
+
elif args.llava_projector is not None:
|
135 |
+
fname_middle = "mmproj-"
|
136 |
+
has_text_encoder = False
|
137 |
+
has_llava_projector = True
|
138 |
+
else:
|
139 |
+
fname_middle = ""
|
140 |
+
|
141 |
+
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
142 |
+
os.makedirs(output_dir, exist_ok=True)
|
143 |
+
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
144 |
+
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
145 |
+
fout = GGUFWriter(path=fname_out, arch="clip")
|
146 |
+
|
147 |
+
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
148 |
+
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
149 |
+
fout.add_bool("clip.has_llava_projector", has_llava_projector)
|
150 |
+
fout.add_file_type(ftype)
|
151 |
+
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
|
152 |
+
fout.add_name(model_name)
|
153 |
+
if args.text_only:
|
154 |
+
fout.add_description("text-only CLIP model")
|
155 |
+
elif args.vision_only and not has_llava_projector:
|
156 |
+
fout.add_description("vision-only CLIP model")
|
157 |
+
elif has_llava_projector:
|
158 |
+
fout.add_description("image encoder for LLaVA")
|
159 |
+
else:
|
160 |
+
fout.add_description("two-tower CLIP model")
|
161 |
+
|
162 |
+
if has_text_encoder:
|
163 |
+
# text_model hparams
|
164 |
+
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
165 |
+
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
166 |
+
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
167 |
+
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
168 |
+
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
169 |
+
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
170 |
+
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
171 |
+
fout.add_token_list(tokens)
|
172 |
+
|
173 |
+
if has_vision_encoder:
|
174 |
+
# vision_model hparams
|
175 |
+
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
176 |
+
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
177 |
+
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
178 |
+
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
179 |
+
fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
|
180 |
+
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
181 |
+
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
|
182 |
+
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
183 |
+
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
184 |
+
|
185 |
+
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
|
186 |
+
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
|
187 |
+
fout.add_array("clip.vision.image_mean", image_mean)
|
188 |
+
fout.add_array("clip.vision.image_std", image_std)
|
189 |
+
|
190 |
+
use_gelu = v_hparams["hidden_act"] == "gelu"
|
191 |
+
fout.add_bool("clip.use_gelu", use_gelu)
|
192 |
+
|
193 |
+
|
194 |
+
if has_llava_projector:
|
195 |
+
model.vision_model.encoder.layers.pop(-1)
|
196 |
+
projector = torch.load(args.llava_projector)
|
197 |
+
for name, data in projector.items():
|
198 |
+
name = get_tensor_name(name)
|
199 |
+
if data.ndim == 2:
|
200 |
+
data = data.squeeze().numpy().astype(np.float16)
|
201 |
+
else:
|
202 |
+
data = data.squeeze().numpy().astype(np.float32)
|
203 |
+
|
204 |
+
fout.add_tensor(name, data)
|
205 |
+
|
206 |
+
print("Projector tensors added\n")
|
207 |
+
|
208 |
+
state_dict = model.state_dict()
|
209 |
+
for name, data in state_dict.items():
|
210 |
+
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
|
211 |
+
# we don't need this
|
212 |
+
print(f"skipping parameter: {name}")
|
213 |
+
continue
|
214 |
+
|
215 |
+
name = get_tensor_name(name)
|
216 |
+
data = data.squeeze().numpy()
|
217 |
+
|
218 |
+
n_dims = len(data.shape)
|
219 |
+
|
220 |
+
# ftype == 0 -> float32, ftype == 1 -> float16
|
221 |
+
ftype_cur = 0
|
222 |
+
if n_dims == 4:
|
223 |
+
print(f"tensor {name} is always saved in f16")
|
224 |
+
data = data.astype(np.float16)
|
225 |
+
ftype_cur = 1
|
226 |
+
elif ftype == 1:
|
227 |
+
if name[-7:] == ".weight" and n_dims == 2:
|
228 |
+
print(" Converting to float16")
|
229 |
+
data = data.astype(np.float16)
|
230 |
+
ftype_cur = 1
|
231 |
+
else:
|
232 |
+
print(" Converting to float32")
|
233 |
+
data = data.astype(np.float32)
|
234 |
+
ftype_cur = 0
|
235 |
+
else:
|
236 |
+
if data.dtype != np.float32:
|
237 |
+
print(" Converting to float32")
|
238 |
+
data = data.astype(np.float32)
|
239 |
+
ftype_cur = 0
|
240 |
+
|
241 |
+
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
|
242 |
+
fout.add_tensor(name, data)
|
243 |
+
|
244 |
+
|
245 |
+
fout.write_header_to_file()
|
246 |
+
fout.write_kv_data_to_file()
|
247 |
+
fout.write_tensors_to_file()
|
248 |
+
fout.close()
|
249 |
+
|
250 |
+
print("Done. Output file: " + fname_out)
|
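To make the renaming performed by get_tensor_name easier to follow, here is a small sketch that applies it to two illustrative HF CLIP tensor names (treating the function as a standalone helper; the input names are examples, not taken from a specific checkpoint):

# illustrative inputs; the output follows the replace-chain in get_tensor_name above
for name in ['vision_model.encoder.layers.0.self_attn.q_proj.weight',
             'text_model.embeddings.token_embedding.weight']:
    print(name, '->', get_tensor_name(name))
# e.g. 'vision_model.encoder.layers.0.self_attn.q_proj.weight' maps to 'v.blk.0.attn_q.weight'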
llama.cpp/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
ADDED
@@ -0,0 +1,499 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# train-text-from-scratch checkpoint --> gguf conversion
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import os
|
6 |
+
import struct
|
7 |
+
import sys
|
8 |
+
import numpy as np
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
12 |
+
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
|
13 |
+
import gguf
|
14 |
+
|
15 |
+
# gguf constants
|
16 |
+
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
17 |
+
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
18 |
+
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
19 |
+
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
20 |
+
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
21 |
+
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
22 |
+
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
23 |
+
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
24 |
+
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
25 |
+
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
26 |
+
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
27 |
+
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
28 |
+
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
29 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
30 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
31 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
32 |
+
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
33 |
+
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
34 |
+
|
35 |
+
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
36 |
+
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
37 |
+
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
38 |
+
|
39 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
40 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
41 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
42 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
43 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
44 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
45 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
46 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
47 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
48 |
+
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
49 |
+
|
50 |
+
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
51 |
+
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
52 |
+
LLM_KV_TRAINING_TYPE = "training.type"
|
53 |
+
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
54 |
+
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
55 |
+
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
56 |
+
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
57 |
+
|
58 |
+
class Tensor:
|
59 |
+
def __init__(self, dtype='f', ne=None):
|
60 |
+
if ne is None:
|
61 |
+
ne = []
|
62 |
+
self.dtype = dtype
|
63 |
+
self.ne = ne
|
64 |
+
self.nbytes = 0
|
65 |
+
if self.dtype == 'f':
|
66 |
+
if len(self.ne) == 0:
|
67 |
+
self.nbytes = 0
|
68 |
+
else:
|
69 |
+
self.nbytes = int(np.product(self.ne)) * 4
|
70 |
+
else:
|
71 |
+
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
72 |
+
|
73 |
+
def load(self, data, offset):
|
74 |
+
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
75 |
+
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
76 |
+
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
77 |
+
|
78 |
+
assert(nd == len(self.ne))
|
79 |
+
ne = []
|
80 |
+
for d in range(nd):
|
81 |
+
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
82 |
+
ne.append(n)
|
83 |
+
|
84 |
+
assert(tuple(ne) == tuple(self.ne))
|
85 |
+
|
86 |
+
if self.dtype == 'f':
|
87 |
+
assert(dtype == 0)
|
88 |
+
else:
|
89 |
+
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
90 |
+
|
91 |
+
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
92 |
+
# 32-byte alignment
|
93 |
+
offset += (0 - offset) & 31
|
94 |
+
self.data = data[offset:offset+self.nbytes]
|
95 |
+
offset += self.nbytes
|
96 |
+
return offset
|
97 |
+
|
98 |
+
def max_storage_size(self):
|
99 |
+
result = 0
|
100 |
+
result += 4 # nd
|
101 |
+
result += 4 # namelen
|
102 |
+
result += 4 # dtype
|
103 |
+
result += len(self.ne)*8 # ne
|
104 |
+
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
105 |
+
result += 31 # 32-byte alignment
|
106 |
+
result += self.nbytes
|
107 |
+
return result
|
108 |
+
|
109 |
+
def save_gguf(self, gguf_writer, name):
|
110 |
+
gguf_writer.add_tensor(
|
111 |
+
name=name,
|
112 |
+
tensor=self.data,
|
113 |
+
raw_shape=np.array(list(reversed(self.ne))),
|
114 |
+
raw_dtype=gguf.GGMLQuantizationType.F32)
|
115 |
+
|
116 |
+
class OptimizationParamsV0:
|
117 |
+
def __init__(self):
|
118 |
+
pass
|
119 |
+
|
120 |
+
def load(self, data, offset):
|
121 |
+
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
122 |
+
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
123 |
+
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
124 |
+
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
125 |
+
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
126 |
+
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
127 |
+
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
128 |
+
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
129 |
+
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
130 |
+
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
131 |
+
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
132 |
+
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
133 |
+
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
134 |
+
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
135 |
+
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
136 |
+
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
137 |
+
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
138 |
+
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
139 |
+
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
140 |
+
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
141 |
+
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
142 |
+
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
143 |
+
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
144 |
+
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
145 |
+
return offset
|
146 |
+
|
147 |
+
class OptimizationContext:
|
148 |
+
def __init__(self):
|
149 |
+
pass
|
150 |
+
|
151 |
+
def load(self, data, offset):
|
152 |
+
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
153 |
+
offset += 4
|
154 |
+
|
155 |
+
if self.version == 0:
|
156 |
+
params = OptimizationParamsV0()
|
157 |
+
offset = params.load(data, offset)
|
158 |
+
self.past = params.past
|
159 |
+
self.lbfgs_m = params.lbfgs_m
|
160 |
+
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
161 |
+
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
162 |
+
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
163 |
+
self.type = params.type
|
164 |
+
|
165 |
+
self.adam_m = Tensor('f', [self.nx])
|
166 |
+
self.adam_v = Tensor('f', [self.nx])
|
167 |
+
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
168 |
+
|
169 |
+
self.lbfgs_x = Tensor('f', [self.nx])
|
170 |
+
self.lbfgs_xp = Tensor('f', [self.nx])
|
171 |
+
self.lbfgs_g = Tensor('f', [self.nx])
|
172 |
+
self.lbfgs_gp = Tensor('f', [self.nx])
|
173 |
+
self.lbfgs_d = Tensor('f', [self.nx])
|
174 |
+
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
175 |
+
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
176 |
+
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
177 |
+
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
178 |
+
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
179 |
+
|
180 |
+
if self.type == 0:
|
181 |
+
# these tensors are stored, but we don't need their data
|
182 |
+
x = Tensor('f', [self.nx])
|
183 |
+
g = Tensor('f', [self.nx])
|
184 |
+
g2 = Tensor('f', [self.nx])
|
185 |
+
mh = Tensor('f', [self.nx])
|
186 |
+
vh = Tensor('f', [self.nx])
|
187 |
+
|
188 |
+
offset = x.load(data, offset)
|
189 |
+
offset = g.load(data, offset)
|
190 |
+
offset = g2.load(data, offset)
|
191 |
+
offset = self.adam_m.load(data, offset)
|
192 |
+
offset = self.adam_v.load(data, offset)
|
193 |
+
offset = mh.load(data, offset)
|
194 |
+
offset = vh.load(data, offset)
|
195 |
+
offset = self.adam_pf.load(data, offset)
|
196 |
+
|
197 |
+
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
198 |
+
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
199 |
+
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
200 |
+
|
201 |
+
elif self.type == 1:
|
202 |
+
offset = self.lbfgs_x.load(data, offset)
|
203 |
+
offset = self.lbfgs_xp.load(data, offset)
|
204 |
+
offset = self.lbfgs_g.load(data, offset)
|
205 |
+
offset = self.lbfgs_gp.load(data, offset)
|
206 |
+
offset = self.lbfgs_d.load(data, offset)
|
207 |
+
offset = self.lbfgs_pf.load(data, offset)
|
208 |
+
offset = self.lbfgs_lmal.load(data, offset)
|
209 |
+
offset = self.lbfgs_lmys.load(data, offset)
|
210 |
+
offset = self.lbfgs_lms.load(data, offset)
|
211 |
+
offset = self.lbfgs_lmy.load(data, offset)
|
212 |
+
|
213 |
+
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
214 |
+
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
215 |
+
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
216 |
+
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
217 |
+
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
218 |
+
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
219 |
+
|
220 |
+
else:
|
221 |
+
raise ValueError('Unknown optimizer type')
|
222 |
+
|
223 |
+
|
224 |
+
elif self.version == 1:
|
225 |
+
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
226 |
+
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
227 |
+
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
228 |
+
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
229 |
+
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
230 |
+
|
231 |
+
self.adam_m = Tensor('f', [self.nx])
|
232 |
+
self.adam_v = Tensor('f', [self.nx])
|
233 |
+
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
234 |
+
|
235 |
+
self.lbfgs_x = Tensor('f', [self.nx])
|
236 |
+
self.lbfgs_xp = Tensor('f', [self.nx])
|
237 |
+
self.lbfgs_g = Tensor('f', [self.nx])
|
238 |
+
self.lbfgs_gp = Tensor('f', [self.nx])
|
239 |
+
self.lbfgs_d = Tensor('f', [self.nx])
|
240 |
+
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
241 |
+
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
242 |
+
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
243 |
+
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
244 |
+
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
245 |
+
|
246 |
+
# forgot to save type in version 1:
|
247 |
+
# guess self.type from number of remaining bytes
|
248 |
+
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
249 |
+
[self.adam_m, self.adam_v]
|
250 |
+
+([self.adam_pf] if (self.past > 0) else [])])
|
251 |
+
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
252 |
+
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
253 |
+
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
254 |
+
self.lbfgs_lmal, self.lbfgs_lmys,
|
255 |
+
self.lbfgs_lms, self.lbfgs_lmy]
|
256 |
+
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
257 |
+
# due to alignment padding the size might not by exact
|
258 |
+
# but the difference in size for both types is significant,
|
259 |
+
# so we can just use whichever is closest
|
260 |
+
remaining = len(data) - offset
|
261 |
+
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
262 |
+
self.type = 0
|
263 |
+
else:
|
264 |
+
self.type = 1
|
265 |
+
|
266 |
+
if self.type == 0:
|
267 |
+
offset = self.adam_m.load(data, offset)
|
268 |
+
offset = self.adam_v.load(data, offset)
|
269 |
+
offset = self.adam_pf.load(data,offset)
|
270 |
+
|
271 |
+
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
272 |
+
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
273 |
+
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
274 |
+
|
275 |
+
elif self.type == 1:
|
276 |
+
offset = self.lbfgs_x.load(data, offset)
|
277 |
+
offset = self.lbfgs_xp.load(data, offset)
|
278 |
+
offset = self.lbfgs_g.load(data, offset)
|
279 |
+
offset = self.lbfgs_gp.load(data, offset)
|
280 |
+
offset = self.lbfgs_d.load(data, offset)
|
281 |
+
offset = self.lbfgs_pf.load(data, offset)
|
282 |
+
offset = self.lbfgs_lmal.load(data, offset)
|
283 |
+
offset = self.lbfgs_lmys.load(data, offset)
|
284 |
+
offset = self.lbfgs_lms.load(data, offset)
|
285 |
+
offset = self.lbfgs_lmy.load(data, offset)
|
286 |
+
|
287 |
+
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
288 |
+
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
289 |
+
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
290 |
+
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
291 |
+
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
292 |
+
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
293 |
+
|
294 |
+
else:
|
295 |
+
raise ValueError('Invalid version of checkpoint file')
|
296 |
+
|
297 |
+
return offset
|
298 |
+
|
299 |
+
def save_gguf(self, gguf_writer):
|
300 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
301 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
302 |
+
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
303 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
304 |
+
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
305 |
+
|
306 |
+
if self.type == 0:
|
307 |
+
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
308 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
309 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
310 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
311 |
+
|
312 |
+
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
313 |
+
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
314 |
+
if self.past > 0:
|
315 |
+
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
316 |
+
|
317 |
+
elif self.type == 1:
|
318 |
+
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
319 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
320 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
321 |
+
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
322 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
323 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
324 |
+
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
325 |
+
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
326 |
+
|
327 |
+
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
328 |
+
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
329 |
+
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
330 |
+
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
331 |
+
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
332 |
+
if self.past > 0:
|
333 |
+
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
334 |
+
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
335 |
+
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
336 |
+
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
337 |
+
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
338 |
+
else:
|
339 |
+
raise ValueError('Unknown optimizer type')
|
340 |
+
|
341 |
+
class ModelParams:
|
342 |
+
def __init__(self):
|
343 |
+
pass
|
344 |
+
|
345 |
+
def load(self, data, offset):
|
346 |
+
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
347 |
+
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
348 |
+
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
349 |
+
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
350 |
+
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
351 |
+
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
352 |
+
return offset
|
353 |
+
|
354 |
+
def get_n_ff(self):
|
355 |
+
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
356 |
+
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
357 |
+
|
358 |
+
def save_gguf(self, gguf_writer):
|
359 |
+
# self.n_vocab not saved
|
360 |
+
gguf_writer.add_embedding_length(self.n_embd)
|
361 |
+
gguf_writer.add_head_count(self.n_head)
|
362 |
+
gguf_writer.add_block_count(self.n_layer)
|
363 |
+
gguf_writer.add_rope_dimension_count(self.n_rot)
|
364 |
+
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
365 |
+
|
366 |
+
def tensor_name(key, bid=None):
|
367 |
+
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
|
368 |
+
|
369 |
+
class Layer:
|
370 |
+
def __init__(self, params, bid):
|
371 |
+
self.bid = bid
|
372 |
+
self.att_norm = Tensor('f', [params.n_embd])
|
373 |
+
self.wq = Tensor('f', [params.n_embd, params.n_embd])
|
374 |
+
self.wk = Tensor('f', [params.n_embd, params.n_embd])
|
375 |
+
self.wv = Tensor('f', [params.n_embd, params.n_embd])
|
376 |
+
self.wo = Tensor('f', [params.n_embd, params.n_embd])
|
377 |
+
self.ffn_norm = Tensor('f', [params.n_embd])
|
378 |
+
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
379 |
+
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
|
380 |
+
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
381 |
+
|
382 |
+
def load(self, data, offset):
|
383 |
+
offset = self.att_norm.load(data, offset)
|
384 |
+
offset = self.wq.load(data, offset)
|
385 |
+
offset = self.wk.load(data, offset)
|
386 |
+
offset = self.wv.load(data, offset)
|
387 |
+
offset = self.wo.load(data, offset)
|
388 |
+
offset = self.ffn_norm.load(data, offset)
|
389 |
+
offset = self.w1.load(data, offset)
|
390 |
+
offset = self.w2.load(data, offset)
|
391 |
+
offset = self.w3.load(data, offset)
|
392 |
+
return offset
|
393 |
+
|
394 |
+
def save_gguf(self, gguf_writer):
|
395 |
+
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
396 |
+
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
397 |
+
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
398 |
+
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
399 |
+
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
400 |
+
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
401 |
+
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
402 |
+
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
403 |
+
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
404 |
+
|
405 |
+
class Model:
|
406 |
+
def __init__(self):
|
407 |
+
self.params = ModelParams()
|
408 |
+
self.layers = []
|
409 |
+
|
410 |
+
def load(self, data, offset):
|
411 |
+
offset = self.params.load(data, offset)
|
412 |
+
|
413 |
+
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
414 |
+
self.norm = Tensor('f', [self.params.n_embd])
|
415 |
+
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
416 |
+
|
417 |
+
offset = self.tok_embd.load(data, offset)
|
418 |
+
offset = self.norm.load(data, offset)
|
419 |
+
offset = self.output.load(data, offset)
|
420 |
+
|
421 |
+
self.layers.clear()
|
422 |
+
for bid in range(self.params.n_layer):
|
423 |
+
layer = Layer(self.params, bid)
|
424 |
+
offset = layer.load(data, offset)
|
425 |
+
self.layers.append(layer)
|
426 |
+
|
427 |
+
return offset
|
428 |
+
|
429 |
+
def save_gguf(self, gguf_writer):
|
430 |
+
self.params.save_gguf(gguf_writer)
|
431 |
+
|
432 |
+
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
433 |
+
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
434 |
+
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
435 |
+
|
436 |
+
for layer in self.layers:
|
437 |
+
layer.save_gguf(gguf_writer)
|
438 |
+
|
439 |
+
class Checkpoint:
|
440 |
+
def __init__(self):
|
441 |
+
self.model = Model()
|
442 |
+
self.opt_ctx = OptimizationContext()
|
443 |
+
|
444 |
+
def load(self, data, offset):
|
445 |
+
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
446 |
+
if magic != b'ggcp':
|
447 |
+
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
448 |
+
|
449 |
+
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
450 |
+
if self.version != 0:
|
451 |
+
raise ValueError('Invalid version of checkpoint file')
|
452 |
+
|
453 |
+
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
454 |
+
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
455 |
+
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
456 |
+
|
457 |
+
offset = self.model.load(data, offset)
|
458 |
+
offset = self.opt_ctx.load(data, offset)
|
459 |
+
|
460 |
+
return offset
|
461 |
+
|
462 |
+
def save_gguf(self, gguf_writer):
|
463 |
+
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
464 |
+
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
465 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
466 |
+
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
|
467 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
468 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
469 |
+
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
470 |
+
self.model.save_gguf(gguf_writer)
|
471 |
+
self.opt_ctx.save_gguf(gguf_writer)
|
472 |
+
|
473 |
+
def handle_args():
|
474 |
+
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
475 |
+
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
476 |
+
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
477 |
+
return parser.parse_args()
|
478 |
+
|
479 |
+
def main():
|
480 |
+
cfg = handle_args()
|
481 |
+
data = np.memmap(cfg.input, mode = 'r')
|
482 |
+
chk = Checkpoint()
|
483 |
+
offset = 0
|
484 |
+
offset = chk.load(data, offset)
|
485 |
+
# we should have read all available data
|
486 |
+
assert(offset == len(data))
|
487 |
+
|
488 |
+
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
489 |
+
chk.save_gguf(gguf_writer)
|
490 |
+
print(" gguf: write header")
|
491 |
+
gguf_writer.write_header_to_file()
|
492 |
+
print(" gguf: write metadata")
|
493 |
+
gguf_writer.write_kv_data_to_file()
|
494 |
+
print(" gguf: write tensors")
|
495 |
+
gguf_writer.write_tensors_to_file()
|
496 |
+
gguf_writer.close()
|
497 |
+
|
498 |
+
if __name__ == '__main__':
|
499 |
+
main()
|
llama.cpp/gguf-py/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Georgi Gerganov
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
llama.cpp/gguf-py/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## gguf
|
2 |
+
|
3 |
+
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
4 |
+
(GGML Universal File) format.
|
5 |
+
|
6 |
+
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
|
7 |
+
as an example for its usage.
|
8 |
+
|
9 |
+
## Installation
|
10 |
+
```sh
|
11 |
+
pip install gguf
|
12 |
+
```
|
13 |
+
|
14 |
+
## Development
|
15 |
+
Maintainers who participate in development of this package are advised to install it in editable mode:
|
16 |
+
|
17 |
+
```sh
|
18 |
+
cd /path/to/llama.cpp/gguf-py
|
19 |
+
|
20 |
+
pip install --editable .
|
21 |
+
```
|
22 |
+
|
23 |
+
**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`.
|
24 |
+
In this case, upgrade Pip to the latest:
|
25 |
+
|
26 |
+
```sh
|
27 |
+
pip install --upgrade pip
|
28 |
+
```
|
29 |
+
|
30 |
+
## Automatic publishing with CI
|
31 |
+
|
32 |
+
There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
|
33 |
+
|
34 |
+
1. Bump the version in `pyproject.toml`.
|
35 |
+
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
|
36 |
+
|
37 |
+
```sh
|
38 |
+
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
|
39 |
+
```
|
40 |
+
|
41 |
+
3. Push the tags.
|
42 |
+
|
43 |
+
```sh
|
44 |
+
git push origin --tags
|
45 |
+
```
|
46 |
+
|
47 |
+
## Manual publishing
|
48 |
+
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
|
49 |
+
|
50 |
+
```sh
|
51 |
+
pip install build twine
|
52 |
+
```
|
53 |
+
|
54 |
+
Then, folow these steps to release a new version:
|
55 |
+
|
56 |
+
1. Bump the version in `pyproject.toml`.
|
57 |
+
2. Build the package:
|
58 |
+
|
59 |
+
```sh
|
60 |
+
python -m build
|
61 |
+
```
|
62 |
+
|
63 |
+
3. Upload the generated distribution archives:
|
64 |
+
|
65 |
+
```sh
|
66 |
+
python -m twine upload dist/*
|
67 |
+
```
|
68 |
+
|
69 |
+
## TODO
|
70 |
+
- [ ] Add tests
|
71 |
+
- [ ] Include conversion scripts as command line entry points in this package.
|
llama.cpp/gguf-py/gguf/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .gguf import *
|
llama.cpp/gguf-py/gguf/__pycache__/gguf.cpython-310.pyc
ADDED
Binary file (31.2 kB). View file
|
|
llama.cpp/gguf-py/gguf/gguf.py
ADDED
@@ -0,0 +1,1070 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import shutil
|
7 |
+
import struct
|
8 |
+
import sys
|
9 |
+
import tempfile
|
10 |
+
from enum import IntEnum, auto
|
11 |
+
from io import BufferedWriter
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import IO, Any, BinaryIO, Callable, Sequence
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
#
|
18 |
+
# constants
|
19 |
+
#
|
20 |
+
|
21 |
+
GGUF_MAGIC = 0x46554747
|
22 |
+
GGUF_VERSION = 2
|
23 |
+
GGUF_DEFAULT_ALIGNMENT = 32
|
24 |
+
|
25 |
+
# general
|
26 |
+
KEY_GENERAL_ARCHITECTURE = "general.architecture"
|
27 |
+
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
|
28 |
+
KEY_GENERAL_ALIGNMENT = "general.alignment"
|
29 |
+
KEY_GENERAL_NAME = "general.name"
|
30 |
+
KEY_GENERAL_AUTHOR = "general.author"
|
31 |
+
KEY_GENERAL_URL = "general.url"
|
32 |
+
KEY_GENERAL_DESCRIPTION = "general.description"
|
33 |
+
KEY_GENERAL_LICENSE = "general.license"
|
34 |
+
KEY_GENERAL_SOURCE_URL = "general.source.url"
|
35 |
+
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
|
36 |
+
KEY_GENERAL_FILE_TYPE = "general.file_type"
|
37 |
+
|
38 |
+
# LLM
|
39 |
+
KEY_CONTEXT_LENGTH = "{arch}.context_length"
|
40 |
+
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
41 |
+
KEY_BLOCK_COUNT = "{arch}.block_count"
|
42 |
+
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
43 |
+
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
44 |
+
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
45 |
+
|
46 |
+
# attention
|
47 |
+
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
|
48 |
+
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
|
49 |
+
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
|
50 |
+
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
|
51 |
+
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
52 |
+
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
53 |
+
|
54 |
+
# RoPE
|
55 |
+
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
56 |
+
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
|
57 |
+
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
|
58 |
+
|
59 |
+
# tokenization
|
60 |
+
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
|
61 |
+
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
|
62 |
+
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
|
63 |
+
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
|
64 |
+
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
|
65 |
+
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
|
66 |
+
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
|
67 |
+
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
|
68 |
+
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
|
69 |
+
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
|
70 |
+
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
|
71 |
+
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
72 |
+
|
73 |
+
|
74 |
+
#
|
75 |
+
# recommended mapping of model tensor names for storage in gguf
|
76 |
+
#
|
77 |
+
|
78 |
+
|
79 |
+
class MODEL_ARCH(IntEnum):
|
80 |
+
LLAMA : int = auto()
|
81 |
+
FALCON : int = auto()
|
82 |
+
BAICHUAN : int = auto()
|
83 |
+
GPT2 : int = auto()
|
84 |
+
GPTJ : int = auto()
|
85 |
+
GPTNEOX : int = auto()
|
86 |
+
MPT : int = auto()
|
87 |
+
STARCODER : int = auto()
|
88 |
+
PERSIMMON : int = auto()
|
89 |
+
REFACT : int = auto()
|
90 |
+
BERT : int = auto()
|
91 |
+
BLOOM : int = auto()
|
92 |
+
|
93 |
+
|
94 |
+
class MODEL_TENSOR(IntEnum):
|
95 |
+
TOKEN_EMBD : int = auto()
|
96 |
+
TOKEN_EMBD_NORM : int = auto()
|
97 |
+
TOKEN_TYPES : int = auto()
|
98 |
+
POS_EMBD : int = auto()
|
99 |
+
OUTPUT : int = auto()
|
100 |
+
OUTPUT_NORM : int = auto()
|
101 |
+
ROPE_FREQS : int = auto()
|
102 |
+
ATTN_Q : int = auto()
|
103 |
+
ATTN_K : int = auto()
|
104 |
+
ATTN_V : int = auto()
|
105 |
+
ATTN_QKV : int = auto()
|
106 |
+
ATTN_OUT : int = auto()
|
107 |
+
ATTN_NORM : int = auto()
|
108 |
+
ATTN_NORM_2 : int = auto()
|
109 |
+
ATTN_ROT_EMBD : int = auto()
|
110 |
+
FFN_GATE : int = auto()
|
111 |
+
FFN_DOWN : int = auto()
|
112 |
+
FFN_UP : int = auto()
|
113 |
+
FFN_NORM : int = auto()
|
114 |
+
ATTN_Q_NORM : int = auto()
|
115 |
+
ATTN_K_NORM : int = auto()
|
116 |
+
|
117 |
+
|
118 |
+
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
119 |
+
MODEL_ARCH.LLAMA: "llama",
|
120 |
+
MODEL_ARCH.FALCON: "falcon",
|
121 |
+
MODEL_ARCH.BAICHUAN: "baichuan",
|
122 |
+
MODEL_ARCH.GPT2: "gpt2",
|
123 |
+
MODEL_ARCH.GPTJ: "gptj",
|
124 |
+
MODEL_ARCH.GPTNEOX: "gptneox",
|
125 |
+
MODEL_ARCH.MPT: "mpt",
|
126 |
+
MODEL_ARCH.STARCODER: "starcoder",
|
127 |
+
MODEL_ARCH.PERSIMMON: "persimmon",
|
128 |
+
MODEL_ARCH.REFACT: "refact",
|
129 |
+
MODEL_ARCH.BERT: "bert",
|
130 |
+
MODEL_ARCH.BLOOM: "bloom",
|
131 |
+
}
|
132 |
+
|
133 |
+
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
134 |
+
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
135 |
+
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
136 |
+
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
137 |
+
MODEL_TENSOR.POS_EMBD: "position_embd",
|
138 |
+
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
139 |
+
MODEL_TENSOR.OUTPUT: "output",
|
140 |
+
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
141 |
+
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
142 |
+
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
143 |
+
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
144 |
+
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
145 |
+
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
146 |
+
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
147 |
+
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
148 |
+
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
149 |
+
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
150 |
+
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
151 |
+
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
152 |
+
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
153 |
+
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
154 |
+
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
155 |
+
}
|
156 |
+
|
157 |
+
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
158 |
+
MODEL_ARCH.LLAMA: [
|
159 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
160 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
161 |
+
MODEL_TENSOR.OUTPUT,
|
162 |
+
MODEL_TENSOR.ROPE_FREQS,
|
163 |
+
MODEL_TENSOR.ATTN_NORM,
|
164 |
+
MODEL_TENSOR.ATTN_Q,
|
165 |
+
MODEL_TENSOR.ATTN_K,
|
166 |
+
MODEL_TENSOR.ATTN_V,
|
167 |
+
MODEL_TENSOR.ATTN_OUT,
|
168 |
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
169 |
+
MODEL_TENSOR.FFN_NORM,
|
170 |
+
MODEL_TENSOR.FFN_GATE,
|
171 |
+
MODEL_TENSOR.FFN_DOWN,
|
172 |
+
MODEL_TENSOR.FFN_UP,
|
173 |
+
],
|
174 |
+
MODEL_ARCH.GPTNEOX: [
|
175 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
176 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
177 |
+
MODEL_TENSOR.OUTPUT,
|
178 |
+
MODEL_TENSOR.ATTN_NORM,
|
179 |
+
MODEL_TENSOR.ATTN_QKV,
|
180 |
+
MODEL_TENSOR.ATTN_OUT,
|
181 |
+
MODEL_TENSOR.FFN_NORM,
|
182 |
+
MODEL_TENSOR.FFN_DOWN,
|
183 |
+
MODEL_TENSOR.FFN_UP,
|
184 |
+
],
|
185 |
+
MODEL_ARCH.FALCON: [
|
186 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
187 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
188 |
+
MODEL_TENSOR.OUTPUT,
|
189 |
+
MODEL_TENSOR.ATTN_NORM,
|
190 |
+
MODEL_TENSOR.ATTN_NORM_2,
|
191 |
+
MODEL_TENSOR.ATTN_QKV,
|
192 |
+
MODEL_TENSOR.ATTN_OUT,
|
193 |
+
MODEL_TENSOR.FFN_DOWN,
|
194 |
+
MODEL_TENSOR.FFN_UP,
|
195 |
+
],
|
196 |
+
MODEL_ARCH.BAICHUAN: [
|
197 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
198 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
199 |
+
MODEL_TENSOR.OUTPUT,
|
200 |
+
MODEL_TENSOR.ROPE_FREQS,
|
201 |
+
MODEL_TENSOR.ATTN_NORM,
|
202 |
+
MODEL_TENSOR.ATTN_Q,
|
203 |
+
MODEL_TENSOR.ATTN_K,
|
204 |
+
MODEL_TENSOR.ATTN_V,
|
205 |
+
MODEL_TENSOR.ATTN_OUT,
|
206 |
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
207 |
+
MODEL_TENSOR.FFN_NORM,
|
208 |
+
MODEL_TENSOR.FFN_GATE,
|
209 |
+
MODEL_TENSOR.FFN_DOWN,
|
210 |
+
MODEL_TENSOR.FFN_UP,
|
211 |
+
],
|
212 |
+
MODEL_ARCH.STARCODER: [
|
213 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
214 |
+
MODEL_TENSOR.POS_EMBD,
|
215 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
216 |
+
MODEL_TENSOR.OUTPUT,
|
217 |
+
MODEL_TENSOR.ATTN_NORM,
|
218 |
+
MODEL_TENSOR.ATTN_QKV,
|
219 |
+
MODEL_TENSOR.ATTN_OUT,
|
220 |
+
MODEL_TENSOR.FFN_NORM,
|
221 |
+
MODEL_TENSOR.FFN_DOWN,
|
222 |
+
MODEL_TENSOR.FFN_UP,
|
223 |
+
],
|
224 |
+
MODEL_ARCH.BERT: [
|
225 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
226 |
+
MODEL_TENSOR.TOKEN_TYPES,
|
227 |
+
MODEL_TENSOR.POS_EMBD,
|
228 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
229 |
+
MODEL_TENSOR.ATTN_NORM,
|
230 |
+
MODEL_TENSOR.ATTN_Q,
|
231 |
+
MODEL_TENSOR.ATTN_K,
|
232 |
+
MODEL_TENSOR.ATTN_V,
|
233 |
+
MODEL_TENSOR.ATTN_OUT,
|
234 |
+
MODEL_TENSOR.FFN_NORM,
|
235 |
+
MODEL_TENSOR.FFN_DOWN,
|
236 |
+
MODEL_TENSOR.FFN_UP,
|
237 |
+
],
|
238 |
+
MODEL_ARCH.MPT: [
|
239 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
240 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
241 |
+
MODEL_TENSOR.OUTPUT,
|
242 |
+
MODEL_TENSOR.ATTN_NORM,
|
243 |
+
MODEL_TENSOR.ATTN_QKV,
|
244 |
+
MODEL_TENSOR.ATTN_OUT,
|
245 |
+
MODEL_TENSOR.FFN_NORM,
|
246 |
+
MODEL_TENSOR.FFN_DOWN,
|
247 |
+
MODEL_TENSOR.FFN_UP,
|
248 |
+
],
|
249 |
+
MODEL_ARCH.GPTJ: [
|
250 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
251 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
252 |
+
MODEL_TENSOR.OUTPUT,
|
253 |
+
MODEL_TENSOR.ATTN_NORM,
|
254 |
+
MODEL_TENSOR.ATTN_Q,
|
255 |
+
MODEL_TENSOR.ATTN_K,
|
256 |
+
MODEL_TENSOR.ATTN_V,
|
257 |
+
MODEL_TENSOR.ATTN_OUT,
|
258 |
+
MODEL_TENSOR.FFN_DOWN,
|
259 |
+
MODEL_TENSOR.FFN_UP,
|
260 |
+
],
|
261 |
+
MODEL_ARCH.PERSIMMON: [
|
262 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
263 |
+
MODEL_TENSOR.OUTPUT,
|
264 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
265 |
+
MODEL_TENSOR.ATTN_NORM,
|
266 |
+
MODEL_TENSOR.ATTN_QKV,
|
267 |
+
MODEL_TENSOR.ATTN_OUT,
|
268 |
+
MODEL_TENSOR.FFN_NORM,
|
269 |
+
MODEL_TENSOR.FFN_DOWN,
|
270 |
+
MODEL_TENSOR.FFN_UP,
|
271 |
+
MODEL_TENSOR.ATTN_Q_NORM,
|
272 |
+
MODEL_TENSOR.ATTN_K_NORM,
|
273 |
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
274 |
+
],
|
275 |
+
MODEL_ARCH.REFACT: [
|
276 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
277 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
278 |
+
MODEL_TENSOR.OUTPUT,
|
279 |
+
MODEL_TENSOR.ATTN_NORM,
|
280 |
+
MODEL_TENSOR.ATTN_Q,
|
281 |
+
MODEL_TENSOR.ATTN_K,
|
282 |
+
MODEL_TENSOR.ATTN_V,
|
283 |
+
MODEL_TENSOR.ATTN_OUT,
|
284 |
+
MODEL_TENSOR.FFN_NORM,
|
285 |
+
MODEL_TENSOR.FFN_GATE,
|
286 |
+
MODEL_TENSOR.FFN_DOWN,
|
287 |
+
MODEL_TENSOR.FFN_UP,
|
288 |
+
],
|
289 |
+
MODEL_ARCH.BLOOM: [
|
290 |
+
MODEL_TENSOR.TOKEN_EMBD,
|
291 |
+
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
292 |
+
MODEL_TENSOR.OUTPUT_NORM,
|
293 |
+
MODEL_TENSOR.OUTPUT,
|
294 |
+
MODEL_TENSOR.ATTN_NORM,
|
295 |
+
MODEL_TENSOR.ATTN_QKV,
|
296 |
+
MODEL_TENSOR.ATTN_OUT,
|
297 |
+
MODEL_TENSOR.FFN_NORM,
|
298 |
+
MODEL_TENSOR.FFN_DOWN,
|
299 |
+
MODEL_TENSOR.FFN_UP,
|
300 |
+
],
|
301 |
+
MODEL_ARCH.GPT2: [
|
302 |
+
# TODO
|
303 |
+
],
|
304 |
+
# TODO
|
305 |
+
}
|
306 |
+
|
307 |
+
# tensors that will not be serialized
|
308 |
+
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
309 |
+
MODEL_ARCH.LLAMA: [
|
310 |
+
MODEL_TENSOR.ROPE_FREQS,
|
311 |
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
312 |
+
],
|
313 |
+
MODEL_ARCH.BAICHUAN: [
|
314 |
+
MODEL_TENSOR.ROPE_FREQS,
|
315 |
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
316 |
+
],
|
317 |
+
MODEL_ARCH.PERSIMMON: [
|
318 |
+
MODEL_TENSOR.ROPE_FREQS,
|
319 |
+
]
|
320 |
+
}
|
321 |
+
|
322 |
+
|
323 |
+
class TensorNameMap:
|
324 |
+
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
325 |
+
# Token embeddings
|
326 |
+
MODEL_TENSOR.TOKEN_EMBD: (
|
327 |
+
"gpt_neox.embed_in", # gptneox
|
328 |
+
"transformer.wte", # gpt2 gpt-j mpt refact
|
329 |
+
"transformer.word_embeddings", # falcon
|
330 |
+
"word_embeddings", # bloom
|
331 |
+
"model.embed_tokens", # llama-hf
|
332 |
+
"tok_embeddings", # llama-pth
|
333 |
+
"embeddings.word_embeddings", # bert
|
334 |
+
"language_model.embedding.word_embeddings", # persimmon
|
335 |
+
),
|
336 |
+
|
337 |
+
# Token type embeddings
|
338 |
+
MODEL_TENSOR.TOKEN_TYPES: (
|
339 |
+
"embeddings.token_type_embeddings", # bert
|
340 |
+
),
|
341 |
+
|
342 |
+
# Normalization of token embeddings
|
343 |
+
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
344 |
+
"word_embeddings_layernorm", # bloom
|
345 |
+
),
|
346 |
+
|
347 |
+
# Position embeddings
|
348 |
+
MODEL_TENSOR.POS_EMBD: (
|
349 |
+
"transformer.wpe", # gpt2
|
350 |
+
"embeddings.position_embeddings", # bert
|
351 |
+
),
|
352 |
+
|
353 |
+
# Output
|
354 |
+
MODEL_TENSOR.OUTPUT: (
|
355 |
+
"embed_out", # gptneox
|
356 |
+
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
357 |
+
"output", # llama-pth bloom
|
358 |
+
"word_embeddings_for_head", # persimmon
|
359 |
+
),
|
360 |
+
|
361 |
+
# Output norm
|
362 |
+
MODEL_TENSOR.OUTPUT_NORM: (
|
363 |
+
"gpt_neox.final_layer_norm", # gptneox
|
364 |
+
"transformer.ln_f", # gpt2 gpt-j falcon
|
365 |
+
"model.norm", # llama-hf baichuan
|
366 |
+
"norm", # llama-pth
|
367 |
+
"embeddings.LayerNorm", # bert
|
368 |
+
"transformer.norm_f", # mpt
|
369 |
+
"ln_f", # refact bloom
|
370 |
+
"language_model.encoder.final_layernorm", # persimmon
|
371 |
+
),
|
372 |
+
|
373 |
+
# Rope frequencies
|
374 |
+
MODEL_TENSOR.ROPE_FREQS: (
|
375 |
+
"rope.freqs", # llama-pth
|
376 |
+
),
|
377 |
+
}
|
378 |
+
|
379 |
+
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
380 |
+
# Attention norm
|
381 |
+
MODEL_TENSOR.ATTN_NORM: (
|
382 |
+
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
383 |
+
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
384 |
+
"transformer.blocks.{bid}.norm_1", # mpt
|
385 |
+
"transformer.h.{bid}.input_layernorm", # falcon7b
|
386 |
+
"h.{bid}.input_layernorm", # bloom
|
387 |
+
"transformer.h.{bid}.ln_mlp", # falcon40b
|
388 |
+
"model.layers.{bid}.input_layernorm", # llama-hf
|
389 |
+
"layers.{bid}.attention_norm", # llama-pth
|
390 |
+
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
391 |
+
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
392 |
+
),
|
393 |
+
|
394 |
+
# Attention norm 2
|
395 |
+
MODEL_TENSOR.ATTN_NORM_2: (
|
396 |
+
"transformer.h.{bid}.ln_attn", # falcon40b
|
397 |
+
),
|
398 |
+
|
399 |
+
# Attention query-key-value
|
400 |
+
MODEL_TENSOR.ATTN_QKV: (
|
401 |
+
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
402 |
+
"transformer.h.{bid}.attn.c_attn", # gpt2
|
403 |
+
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
404 |
+
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
405 |
+
"h.{bid}.self_attention.query_key_value", # bloom
|
406 |
+
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
407 |
+
),
|
408 |
+
|
409 |
+
# Attention query
|
410 |
+
MODEL_TENSOR.ATTN_Q: (
|
411 |
+
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
412 |
+
"layers.{bid}.attention.wq", # llama-pth
|
413 |
+
"encoder.layer.{bid}.attention.self.query", # bert
|
414 |
+
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
415 |
+
),
|
416 |
+
|
417 |
+
# Attention key
|
418 |
+
MODEL_TENSOR.ATTN_K: (
|
419 |
+
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
420 |
+
"layers.{bid}.attention.wk", # llama-pth
|
421 |
+
"encoder.layer.{bid}.attention.self.key", # bert
|
422 |
+
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
423 |
+
),
|
424 |
+
|
425 |
+
# Attention value
|
426 |
+
MODEL_TENSOR.ATTN_V: (
|
427 |
+
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
428 |
+
"layers.{bid}.attention.wv", # llama-pth
|
429 |
+
"encoder.layer.{bid}.attention.self.value", # bert
|
430 |
+
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
431 |
+
),
|
432 |
+
|
433 |
+
# Attention output
|
434 |
+
MODEL_TENSOR.ATTN_OUT: (
|
435 |
+
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
436 |
+
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
437 |
+
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
438 |
+
"transformer.h.{bid}.self_attention.dense", # falcon
|
439 |
+
"h.{bid}.self_attention.dense", # bloom
|
440 |
+
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
441 |
+
"layers.{bid}.attention.wo", # llama-pth
|
442 |
+
"encoder.layer.{bid}.attention.output.dense", # bert
|
443 |
+
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
444 |
+
"language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
|
445 |
+
),
|
446 |
+
|
447 |
+
# Rotary embeddings
|
448 |
+
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
449 |
+
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
450 |
+
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
451 |
+
),
|
452 |
+
|
453 |
+
# Feed-forward norm
|
454 |
+
MODEL_TENSOR.FFN_NORM: (
|
455 |
+
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
456 |
+
"transformer.h.{bid}.ln_2", # gpt2 refact
|
457 |
+
"h.{bid}.post_attention_layernorm", # bloom
|
458 |
+
"transformer.blocks.{bid}.norm_2", # mpt
|
459 |
+
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
460 |
+
"layers.{bid}.ffn_norm", # llama-pth
|
461 |
+
"encoder.layer.{bid}.output.LayerNorm", # bert
|
462 |
+
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
463 |
+
),
|
464 |
+
|
465 |
+
# Feed-forward up
|
466 |
+
MODEL_TENSOR.FFN_UP: (
|
467 |
+
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
468 |
+
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
469 |
+
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
470 |
+
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
471 |
+
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
472 |
+
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
473 |
+
"layers.{bid}.feed_forward.w3", # llama-pth
|
474 |
+
"encoder.layer.{bid}.intermediate.dense", # bert
|
475 |
+
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
476 |
+
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
477 |
+
),
|
478 |
+
|
479 |
+
# Feed-forward gate
|
480 |
+
MODEL_TENSOR.FFN_GATE: (
|
481 |
+
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
482 |
+
"layers.{bid}.feed_forward.w1", # llama-pth
|
483 |
+
),
|
484 |
+
|
485 |
+
# Feed-forward down
|
486 |
+
MODEL_TENSOR.FFN_DOWN: (
|
487 |
+
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
488 |
+
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
489 |
+
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
490 |
+
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
491 |
+
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
492 |
+
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
493 |
+
"layers.{bid}.feed_forward.w2", # llama-pth
|
494 |
+
"encoder.layer.{bid}.output.dense", # bert
|
495 |
+
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
496 |
+
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
497 |
+
),
|
498 |
+
|
499 |
+
MODEL_TENSOR.ATTN_Q_NORM: (
|
500 |
+
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
501 |
+
),
|
502 |
+
|
503 |
+
MODEL_TENSOR.ATTN_K_NORM: (
|
504 |
+
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
505 |
+
),
|
506 |
+
|
507 |
+
MODEL_TENSOR.ROPE_FREQS: (
|
508 |
+
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
509 |
+
)
|
510 |
+
}
|
511 |
+
|
512 |
+
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
513 |
+
|
514 |
+
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
515 |
+
self.mapping = {}
|
516 |
+
for tensor, keys in self.mappings_cfg.items():
|
517 |
+
if tensor not in MODEL_TENSORS[arch]:
|
518 |
+
continue
|
519 |
+
tensor_name = TENSOR_NAMES[tensor]
|
520 |
+
self.mapping[tensor_name] = (tensor, tensor_name)
|
521 |
+
for key in keys:
|
522 |
+
self.mapping[key] = (tensor, tensor_name)
|
523 |
+
for bid in range(n_blocks):
|
524 |
+
for tensor, keys in self.block_mappings_cfg.items():
|
525 |
+
if tensor not in MODEL_TENSORS[arch]:
|
526 |
+
continue
|
527 |
+
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
528 |
+
self.mapping[tensor_name] = (tensor, tensor_name)
|
529 |
+
for key in keys:
|
530 |
+
key = key.format(bid = bid)
|
531 |
+
self.mapping[key] = (tensor, tensor_name)
|
532 |
+
|
533 |
+
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
534 |
+
result = self.mapping.get(key)
|
535 |
+
if result is not None:
|
536 |
+
return result
|
537 |
+
for suffix in try_suffixes:
|
538 |
+
if key.endswith(suffix):
|
539 |
+
result = self.mapping.get(key[:-len(suffix)])
|
540 |
+
if result is not None:
|
541 |
+
return (result[0], result[1] + suffix)
|
542 |
+
return None
|
543 |
+
|
544 |
+
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
|
545 |
+
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
546 |
+
if result is None:
|
547 |
+
return None
|
548 |
+
return result[1]
|
549 |
+
|
550 |
+
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
551 |
+
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
552 |
+
if result is None:
|
553 |
+
return None
|
554 |
+
return result[0]
|
555 |
+
|
556 |
+
def __getitem__(self, key: str) -> str:
|
557 |
+
try:
|
558 |
+
return self.mapping[key][1]
|
559 |
+
except KeyError:
|
560 |
+
raise KeyError(key)
|
561 |
+
|
562 |
+
def __contains__(self, key: str) -> bool:
|
563 |
+
return key in self.mapping
|
564 |
+
|
565 |
+
def __repr__(self) -> str:
|
566 |
+
return repr(self.mapping)
|
567 |
+
|
568 |
+
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
|
569 |
+
return TensorNameMap(arch, n_blocks)
|
570 |
+
|
571 |
+
class TokenType(IntEnum):
|
572 |
+
NORMAL = 1
|
573 |
+
UNKNOWN = 2
|
574 |
+
CONTROL = 3
|
575 |
+
USER_DEFINED = 4
|
576 |
+
UNUSED = 5
|
577 |
+
BYTE = 6
|
578 |
+
|
579 |
+
#
|
580 |
+
# implementation
|
581 |
+
#
|
582 |
+
|
583 |
+
|
584 |
+
class GGMLQuantizationType(IntEnum):
|
585 |
+
F32 = 0
|
586 |
+
F16 = 1
|
587 |
+
Q4_0 = 2
|
588 |
+
Q4_1 = 3
|
589 |
+
Q5_0 = 6
|
590 |
+
Q5_1 = 7
|
591 |
+
Q8_0 = 8
|
592 |
+
Q8_1 = 9
|
593 |
+
Q2_K = 10
|
594 |
+
Q3_K = 11
|
595 |
+
Q4_K = 12
|
596 |
+
Q5_K = 13
|
597 |
+
Q6_K = 14
|
598 |
+
Q8_K = 15
|
599 |
+
|
600 |
+
|
601 |
+
class GGUFValueType(IntEnum):
|
602 |
+
UINT8 = 0
|
603 |
+
INT8 = 1
|
604 |
+
UINT16 = 2
|
605 |
+
INT16 = 3
|
606 |
+
UINT32 = 4
|
607 |
+
INT32 = 5
|
608 |
+
FLOAT32 = 6
|
609 |
+
BOOL = 7
|
610 |
+
STRING = 8
|
611 |
+
ARRAY = 9
|
612 |
+
UINT64 = 10
|
613 |
+
INT64 = 11
|
614 |
+
FLOAT64 = 12
|
615 |
+
|
616 |
+
@staticmethod
|
617 |
+
def get_type(val):
|
618 |
+
if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
|
619 |
+
return GGUFValueType.STRING
|
620 |
+
elif isinstance(val, list):
|
621 |
+
return GGUFValueType.ARRAY
|
622 |
+
elif isinstance(val, float):
|
623 |
+
return GGUFValueType.FLOAT32
|
624 |
+
elif isinstance(val, bool):
|
625 |
+
return GGUFValueType.BOOL
|
626 |
+
elif isinstance(val, int):
|
627 |
+
return GGUFValueType.INT32
|
628 |
+
# TODO: need help with 64-bit types in Python
|
629 |
+
else:
|
630 |
+
print("Unknown type: "+str(type(val)))
|
631 |
+
sys.exit()
|
632 |
+
|
633 |
+
|
634 |
+
class GGUFWriter:
|
635 |
+
fout: BufferedWriter
|
636 |
+
arch: str
|
637 |
+
offset_tensor = 0
|
638 |
+
data_alignment = GGUF_DEFAULT_ALIGNMENT
|
639 |
+
kv_data = b""
|
640 |
+
kv_data_count = 0
|
641 |
+
ti_data = b""
|
642 |
+
ti_data_count = 0
|
643 |
+
use_temp_file: bool
|
644 |
+
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
|
645 |
+
tensors: list[tuple[np.ndarray[Any, Any], int]]
|
646 |
+
|
647 |
+
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
|
648 |
+
self.fout = open(path, "wb")
|
649 |
+
self.arch = arch
|
650 |
+
self.add_architecture()
|
651 |
+
self.use_temp_file = use_temp_file
|
652 |
+
self.tensors = []
|
653 |
+
|
654 |
+
def write_header_to_file(self):
|
655 |
+
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
656 |
+
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
657 |
+
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
658 |
+
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
659 |
+
self.flush()
|
660 |
+
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
661 |
+
|
662 |
+
def write_kv_data_to_file(self):
|
663 |
+
self.fout.write(self.kv_data)
|
664 |
+
self.flush()
|
665 |
+
|
666 |
+
def write_ti_data_to_file(self):
|
667 |
+
self.fout.write(self.ti_data)
|
668 |
+
self.flush()
|
669 |
+
|
670 |
+
def add_key(self, key: str):
|
671 |
+
self.add_val(key, GGUFValueType.STRING, add_vtype=False)
|
672 |
+
|
673 |
+
def add_uint8(self, key: str, val: int):
|
674 |
+
self.add_key(key)
|
675 |
+
self.add_val(val, GGUFValueType.UINT8)
|
676 |
+
|
677 |
+
def add_int8(self, key: str, val: int):
|
678 |
+
self.add_key(key)
|
679 |
+
self.add_val(val, GGUFValueType.INT8)
|
680 |
+
|
681 |
+
def add_uint16(self, key: str, val: int):
|
682 |
+
self.add_key(key)
|
683 |
+
self.add_val(val, GGUFValueType.UINT16)
|
684 |
+
|
685 |
+
def add_int16(self, key: str, val: int):
|
686 |
+
self.add_key(key)
|
687 |
+
self.add_val(val, GGUFValueType.INT16)
|
688 |
+
|
689 |
+
def add_uint32(self, key: str, val: int):
|
690 |
+
self.add_key(key)
|
691 |
+
self.add_val(val, GGUFValueType.UINT32)
|
692 |
+
|
693 |
+
def add_int32(self, key: str, val: int):
|
694 |
+
self.add_key(key)
|
695 |
+
self.add_val(val, GGUFValueType.INT32)
|
696 |
+
|
697 |
+
def add_float32(self, key: str, val: float):
|
698 |
+
self.add_key(key)
|
699 |
+
self.add_val(val, GGUFValueType.FLOAT32)
|
700 |
+
|
701 |
+
def add_uint64(self, key: str, val: int):
|
702 |
+
self.add_key(key)
|
703 |
+
self.add_val(val, GGUFValueType.UINT64)
|
704 |
+
|
705 |
+
def add_int64(self, key: str, val: int):
|
706 |
+
self.add_key(key)
|
707 |
+
self.add_val(val, GGUFValueType.INT64)
|
708 |
+
|
709 |
+
def add_float64(self, key: str, val: float):
|
710 |
+
self.add_key(key)
|
711 |
+
self.add_val(val, GGUFValueType.FLOAT64)
|
712 |
+
|
713 |
+
def add_bool(self, key: str, val: bool):
|
714 |
+
self.add_key(key)
|
715 |
+
self.add_val(val, GGUFValueType.BOOL)
|
716 |
+
|
717 |
+
def add_string(self, key: str, val: str):
|
718 |
+
if len(val) == 0:
|
719 |
+
return
|
720 |
+
self.add_key(key)
|
721 |
+
self.add_val(val, GGUFValueType.STRING)
|
722 |
+
|
723 |
+
def add_array(self, key: str, val: Sequence[Any]):
|
724 |
+
if not isinstance(val, Sequence):
|
725 |
+
raise ValueError("Value must be a sequence for array type")
|
726 |
+
|
727 |
+
self.add_key(key)
|
728 |
+
self.add_val(val, GGUFValueType.ARRAY)
|
729 |
+
|
730 |
+
_simple_value_packing = {
|
731 |
+
GGUFValueType.UINT8: "<B",
|
732 |
+
GGUFValueType.INT8: "<b",
|
733 |
+
GGUFValueType.UINT16: "<H",
|
734 |
+
GGUFValueType.INT16: "<h",
|
735 |
+
GGUFValueType.UINT32: "<I",
|
736 |
+
GGUFValueType.INT32: "<i",
|
737 |
+
GGUFValueType.FLOAT32: "<f",
|
738 |
+
GGUFValueType.UINT64: "<Q",
|
739 |
+
GGUFValueType.INT64: "<q",
|
740 |
+
GGUFValueType.FLOAT64: "<d",
|
741 |
+
GGUFValueType.BOOL: "?" ,
|
742 |
+
}
|
743 |
+
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
|
744 |
+
if vtype is None:
|
745 |
+
vtype = GGUFValueType.get_type(val)
|
746 |
+
|
747 |
+
if add_vtype:
|
748 |
+
self.kv_data += struct.pack("<I", vtype)
|
749 |
+
self.kv_data_count += 1
|
750 |
+
|
751 |
+
pack_fmt = self._simple_value_packing.get(vtype)
|
752 |
+
if pack_fmt is not None:
|
753 |
+
self.kv_data += struct.pack(pack_fmt, val)
|
754 |
+
elif vtype == GGUFValueType.STRING:
|
755 |
+
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
756 |
+
self.kv_data += struct.pack("<Q", len(encoded_val))
|
757 |
+
self.kv_data += encoded_val
|
758 |
+
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
|
759 |
+
ltype = GGUFValueType.get_type(val[0])
|
760 |
+
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
|
761 |
+
raise ValueError("All items in a GGUF array should be of the same type")
|
762 |
+
self.kv_data += struct.pack("<I", ltype)
|
763 |
+
self.kv_data += struct.pack("<Q", len(val))
|
764 |
+
for item in val:
|
765 |
+
self.add_val(item, add_vtype=False)
|
766 |
+
else:
|
767 |
+
raise ValueError("Invalid GGUF metadata value type or value")
|
768 |
+
|
769 |
+
@staticmethod
|
770 |
+
def ggml_pad(x: int, n: int) -> int:
|
771 |
+
return ((x + n - 1) // n) * n
|
772 |
+
|
773 |
+
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
|
774 |
+
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
775 |
+
|
776 |
+
encoded_name = name.encode("utf8")
|
777 |
+
self.ti_data += struct.pack("<Q", len(encoded_name))
|
778 |
+
self.ti_data += encoded_name
|
779 |
+
n_dims = len(tensor_shape)
|
780 |
+
self.ti_data += struct.pack("<I", n_dims)
|
781 |
+
for i in range(n_dims):
|
782 |
+
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
|
783 |
+
if raw_dtype is None:
|
784 |
+
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
785 |
+
else:
|
786 |
+
dtype = raw_dtype
|
787 |
+
self.ti_data += struct.pack("<I", dtype)
|
788 |
+
self.ti_data += struct.pack("<Q", self.offset_tensor)
|
789 |
+
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
790 |
+
self.ti_data_count += 1
|
791 |
+
|
792 |
+
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
|
793 |
+
if self.use_temp_file and self.temp_file is None:
|
794 |
+
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
795 |
+
fp.seek(0)
|
796 |
+
self.temp_file = fp
|
797 |
+
|
798 |
+
shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
|
799 |
+
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
800 |
+
|
801 |
+
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
802 |
+
|
803 |
+
if self.temp_file is None:
|
804 |
+
self.tensors.append((tensor, pad))
|
805 |
+
return
|
806 |
+
|
807 |
+
tensor.tofile(self.temp_file)
|
808 |
+
|
809 |
+
if pad != 0:
|
810 |
+
self.temp_file.write(bytes([0] * pad))
|
811 |
+
|
812 |
+
def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
|
813 |
+
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
|
814 |
+
if pad != 0:
|
815 |
+
fp.write(bytes([0] * pad))
|
816 |
+
|
817 |
+
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
|
818 |
+
self.write_padding(self.fout, self.fout.tell())
|
819 |
+
tensor.tofile(self.fout)
|
820 |
+
self.write_padding(self.fout, tensor.nbytes)
|
821 |
+
|
822 |
+
def write_tensors_to_file(self):
|
823 |
+
self.write_ti_data_to_file()
|
824 |
+
|
825 |
+
        self.write_padding(self.fout, self.fout.tell())

        if self.temp_file is None:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
                    self.fout.write(bytes([0] * currpad))
            return

        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self):
        self.fout.flush()

    def close(self):
        self.fout.close()

    def add_architecture(self):
        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

    def add_author(self, author: str):
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)

    def add_description(self, description: str):
        self.add_string(KEY_GENERAL_DESCRIPTION, description)

    def add_source_url(self, url: str):
        self.add_string(KEY_GENERAL_SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str):
        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int):
        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    def add_name(self, name: str):
        self.add_string(KEY_GENERAL_NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(
            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

    def add_context_length(self, length: int):
        self.add_uint32(
            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(
            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(
            KEY_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(
            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(
            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_head_count(self, count: int):
        self.add_uint32(
            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int):
        self.add_uint32(
            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_max_alibi_bias(self, bias: float):
        self.add_float32(
            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float):
        self.add_float32(
            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float):
        self.add_float32(
            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float):
        self.add_float32(
            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int):
        self.add_uint32(
            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float):
        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scale_linear(self, value: float):
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

    def add_token_scores(self, scores: Sequence[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

    def add_eos_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

    def add_unk_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

    def add_sep_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

    def add_pad_token_id(self, id: int):
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)


class SpecialVocab:
    load_merges: bool = False
    merges: list[str] = []
    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: dict[str, int] = {}

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: tuple[str, ...] | None = None,
    ):
        self.special_token_ids = {}
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
        self._load(Path(path))

    def _load(self, path: Path) -> None:
        if not self._try_load_from_tokenizer_json(path):
            self._try_load_from_config_json(path)

    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
            return False
        with open(tokenizer_file, encoding = 'utf-8') as f:
            tokenizer = json.load(f)
        if self.load_merges:
            merges = tokenizer.get('model', {}).get('merges')
            if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
                self.merges = merges
        tokenizer_config_file = path / 'tokenizer_config.json'
        added_tokens = tokenizer.get('added_tokens')
        if added_tokens is None or not tokenizer_config_file.is_file():
            return True
        with open(tokenizer_config_file, encoding = 'utf-8') as f:
            tokenizer_config = json.load(f)
        for typ in self.special_token_types:
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, str):
                tc_content = entry
            elif isinstance(entry, dict):
                entry_content = entry.get('content')
                if not isinstance(entry_content, str):
                    continue
                tc_content = entry_content
            else:
                continue
            for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                    self.special_token_ids[typ] = maybe_token_id
                break
        return True

    def _try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            maybe_token_id = config.get(f'{typ}_token_id')
            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                self.special_token_ids[typ] = maybe_token_id
        return True

    def add_to_gguf(self, gw: GGUFWriter) -> None:
        if len(self.merges) > 0:
            print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        for typ, tokid in self.special_token_ids.items():
            handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if handler is None:
                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
                continue
            print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)

    def __repr__(self) -> str:
        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'


# Example usage:
if __name__ == "__main__":
    # Example usage with a file
    gguf_writer = GGUFWriter("example.gguf", "llama")

    gguf_writer.add_architecture()
    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()
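The __main__ example above exercises only GGUFWriter; SpecialVocab is meant to be paired with it when converting a Hugging Face checkpoint. A minimal sketch of that pairing, not part of the uploaded file: the directory "path/to/hf-model" and the output name are placeholders, and it assumes the gguf package is installed so both classes are importable.

# Hedged sketch (not in the upload): "path/to/hf-model" stands in for a directory
# containing tokenizer.json / tokenizer_config.json or config.json.
from gguf import GGUFWriter, SpecialVocab

writer = GGUFWriter("model-out.gguf", "llama")
writer.add_architecture()

special_vocab = SpecialVocab("path/to/hf-model", load_merges=True)
special_vocab.add_to_gguf(writer)  # writes merges plus any bos/eos/unk/sep/pad token ids it found

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()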
llama.cpp/gguf-py/gguf/py.typed
ADDED
File without changes
llama.cpp/gguf-py/pyproject.toml
ADDED
@@ -0,0 +1,29 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
    {include = "gguf"},
    {include = "gguf/py.typed"},
]
readme = "README.md"
homepage = "https://ggml.ai"
repository = "https://github.com/ggerganov/llama.cpp"
keywords = ["ggml", "gguf", "llama.cpp"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
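This pyproject.toml is what gets published as the gguf distribution. As a quick, hedged sanity check (assuming the package has been installed in the current environment, for example with pip from the gguf-py directory), the installed metadata should mirror the fields above:

# Hedged check of installed metadata against pyproject.toml; assumes the gguf
# package is installed in the current environment.
from importlib import metadata

print(metadata.version("gguf"))   # e.g. "0.4.4"
print(metadata.requires("gguf"))  # e.g. ["numpy (>=1.17)"]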
llama.cpp/gguf-py/tests/test_gguf.py
ADDED
@@ -0,0 +1,7 @@
import gguf

# TODO: add tests


def test_write_gguf():
    pass
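The test module is still a stub. One possible shape for a first test, sketched here rather than taken from the repository, is to write a tiny file through GGUFWriter and assert the 4-byte "GGUF" magic at the start of the output:

# Hypothetical test sketch (not in the upload); uses pytest's tmp_path fixture.
import numpy as np

import gguf


def test_write_minimal_gguf(tmp_path):
    path = str(tmp_path / "test.gguf")
    writer = gguf.GGUFWriter(path, "llama")
    writer.add_architecture()
    writer.add_block_count(1)
    writer.add_tensor("tensor0", np.ones((8,), dtype=np.float32))
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()
    with open(path, "rb") as f:
        assert f.read(4) == b"GGUF"  # GGUF files start with this magic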
llama.cpp/models/ggml-vocab-aquila.gguf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c53c3c516ac67c7ca12977b9690fdea3d2ef13bbaed6378f98191a13ef5ca00
size 4825676
llama.cpp/models/ggml-vocab-falcon.gguf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ffbc7c119de7e9aab8f4257d617e3fa55f942a9f9ca84139ef3f5b1ca53836a8
size 2547782
llama.cpp/models/ggml-vocab-llama.gguf
ADDED
Binary file (595 kB)