indialcristi committed
Commit
97a961c
1 Parent(s): a9192b0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llama.cpp/models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
+llama.cpp/models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
gguf-model-4-f16.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7228c76f1035d9e8e3347f53649d705984dcd100bbf7ab918086e9b27ffe36b3
+size 14484764256
llama.cpp/.github/workflows/gguf-publish.yml ADDED
@@ -0,0 +1,44 @@
+# This workflow will upload a Python Package using Twine when a GGUF release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# See `gguf-py/README.md` for how to make a release.
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  workflow_dispatch:
+  push:
+    # Pattern matched against refs/tags
+    tags:
+      - 'gguf-v*' # Push events to every version tag
+
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.x'
+      - name: Install dependencies
+        run: |
+          cd gguf-py
+          python -m pip install poetry
+          poetry install
+
+      - name: Build package
+        run: cd gguf-py && poetry build
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          packages-dir: gguf-py/dist
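Note: the workflow above builds the gguf-py package and publishes it to PyPI either when run manually (workflow_dispatch) or when a tag matching gguf-v* is pushed. As a rough illustration only (the version number below is made up), a release would be cut with:

git tag gguf-v0.4.0
git push origin gguf-v0.4.0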
llama.cpp/convert-baichuan-hf-to-gguf.py ADDED
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ # HF baichuan --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING, Any
13
+ import itertools
14
+ import numpy as np
15
+ import torch
16
+ from sentencepiece import SentencePieceProcessor # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ if TYPE_CHECKING:
24
+ from typing import TypeAlias
25
+
26
+ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
27
+
28
+ # reverse HF permute back to original pth layout
29
+
30
+
31
+ def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
32
+ if n_kv_head is not None and n_head != n_kv_head:
33
+ n_head //= n_kv_head
34
+
35
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
36
+ .swapaxes(1, 2)
37
+ .reshape(weights.shape))
38
+
39
+ def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
40
+ r = weights.shape[0] // 3
41
+ return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
42
+
43
+ def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
44
+ r = weights.shape[0] // 3
45
+ return weights[r * n_part : r * n_part + r, ...]
46
+
47
+ def count_model_parts(dir_model: str) -> int:
48
+ num_parts = 0
49
+
50
+ for filename in os.listdir(dir_model):
51
+ if filename.startswith("pytorch_model-"):
52
+ num_parts += 1
53
+
54
+ if num_parts > 0:
55
+ print("gguf: found " + str(num_parts) + " model parts")
56
+
57
+ return num_parts
58
+
59
+
60
+
61
+ def parse_args() -> argparse.Namespace:
62
+ parser = argparse.ArgumentParser(description="Convert a HuggingFace Baichuan model to a GGML compatible file")
63
+ parser.add_argument(
64
+ "--vocab-only", action="store_true",
65
+ help="extract only the vocab",
66
+ )
67
+ parser.add_argument(
68
+ "--outfile", type=Path,
69
+ help="path to write to; default: based on input",
70
+ )
71
+ parser.add_argument(
72
+ "model", type=Path,
73
+ help="directory containing model file, or model file itself (*.bin)",
74
+ )
75
+ parser.add_argument(
76
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
77
+ help="output format - use 0 for float32, 1 for float16",
78
+ )
79
+ return parser.parse_args()
80
+
81
+ args = parse_args()
82
+
83
+ dir_model = args.model
84
+ ftype = args.ftype
85
+ if not dir_model.is_dir():
86
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
87
+ sys.exit(1)
88
+
89
+ # possible tensor data types
90
+ # ftype == 0 -> float32
91
+ # ftype == 1 -> float16
92
+
93
+ # map from ftype to string
94
+ ftype_str = ["f32", "f16"]
95
+
96
+ if args.outfile is not None:
97
+ fname_out = args.outfile
98
+ else:
99
+ # output in the same directory as the model by default
100
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
101
+
102
+ print("gguf: loading model "+dir_model.name)
103
+
104
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
105
+ hparams = json.load(f)
106
+ print("hello print: ",hparams["architectures"][0])
107
+ if hparams["architectures"][0] != "BaichuanForCausalLM":
108
+ print("Model architecture not supported: " + hparams["architectures"][0])
109
+
110
+ sys.exit()
111
+
112
+ # get number of model parts
113
+ num_parts = count_model_parts(dir_model)
114
+ print(f"num_parts:{num_parts}\n")
115
+ ARCH=gguf.MODEL_ARCH.BAICHUAN
116
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
117
+
118
+ print("gguf: get model metadata")
119
+
120
+ block_count = hparams["num_hidden_layers"]
121
+ head_count = hparams["num_attention_heads"]
122
+
123
+ if "num_key_value_heads" in hparams:
124
+ head_count_kv = hparams["num_key_value_heads"]
125
+ else:
126
+ head_count_kv = head_count
127
+
128
+ if "_name_or_path" in hparams:
129
+ hf_repo = hparams["_name_or_path"]
130
+ else:
131
+ hf_repo = ""
132
+
133
+ if "max_sequence_length" in hparams:
134
+ ctx_length = hparams["max_sequence_length"]
135
+ elif "max_position_embeddings" in hparams:
136
+ ctx_length = hparams["max_position_embeddings"]
137
+ elif "model_max_length" in hparams:
138
+ ctx_length = hparams["model_max_length"]
139
+ else:
140
+ print("gguf: can not find ctx length parameter.")
141
+
142
+ sys.exit()
143
+
144
+
145
+ gguf_writer.add_name(dir_model.name)
146
+ gguf_writer.add_source_hf_repo(hf_repo)
147
+ gguf_writer.add_tensor_data_layout("Meta AI original pth")
148
+ gguf_writer.add_context_length(ctx_length)
149
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
150
+ gguf_writer.add_block_count(block_count)
151
+ gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
152
+ gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
153
+ gguf_writer.add_head_count(head_count)
154
+ gguf_writer.add_head_count_kv(head_count_kv)
155
+ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
156
+
157
+ if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
158
+ if "type" in hparams["rope_scaling"]:
159
+ if hparams["rope_scaling"]["type"] == "linear":
160
+ gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
161
+
162
+
163
+ # TOKENIZATION
164
+
165
+ print("gguf: get tokenizer metadata")
166
+
167
+ tokens: list[bytes] = []
168
+ scores: list[float] = []
169
+ toktypes: list[int] = []
170
+
171
+ tokenizer_model_file = dir_model / 'tokenizer.model'
172
+ if not tokenizer_model_file.is_file():
173
+ print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
174
+ sys.exit(1)
175
+
176
+ # vocab type sentencepiece
177
+ print("gguf: get sentencepiece tokenizer vocab, scores and token types")
178
+
179
+ tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
180
+ vocab_size = hparams.get('vocab_size')
181
+ if vocab_size is None:
182
+ vocab_size = tokenizer.vocab_size()
183
+
184
+ for i in range(vocab_size):
185
+ text: bytes
186
+ score: float
187
+
188
+ piece = tokenizer.id_to_piece(i)
189
+ text = piece.encode("utf-8")
190
+ score = tokenizer.get_score(i)
191
+
192
+ toktype = 1 # default to normal token type
193
+ if tokenizer.is_unknown(i):
194
+ toktype = 2
195
+ if tokenizer.is_control(i):
196
+ toktype = 3
197
+
198
+ # toktype = 4 is user-defined = tokens from added_tokens.json
199
+
200
+ if tokenizer.is_unused(i):
201
+ toktype = 5
202
+ if tokenizer.is_byte(i):
203
+ toktype = 6
204
+
205
+ tokens.append(text)
206
+ scores.append(score)
207
+ toktypes.append(toktype)
208
+
209
+ added_tokens_file = dir_model / 'added_tokens.json'
210
+ if added_tokens_file.is_file():
211
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
212
+ addtokens_json = json.load(f)
213
+
214
+ print("gguf: get added tokens")
215
+
216
+ for key in addtokens_json:
217
+ tokens.append( key.encode("utf-8") )
218
+ scores.append(-1000.0)
219
+ toktypes.append(4) # user-defined token type
220
+
221
+
222
+ gguf_writer.add_tokenizer_model("llama")
223
+ gguf_writer.add_token_list(tokens)
224
+ gguf_writer.add_token_scores(scores)
225
+ gguf_writer.add_token_types(toktypes)
226
+
227
+ special_vocab = gguf.SpecialVocab(dir_model)
228
+ special_vocab.add_to_gguf(gguf_writer)
229
+
230
+ # TENSORS
231
+
232
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
233
+
234
+ # tensor info
235
+ print("gguf: get tensor metadata")
236
+
237
+ if num_parts == 0:
238
+ part_names = iter(("pytorch_model.bin",))
239
+ else:
240
+ part_names = (
241
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
242
+ )
243
+
244
+
245
+ for part_name in part_names:
246
+ if args.vocab_only:
247
+ break
248
+ print("gguf: loading model part '" + part_name + "'")
249
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
250
+
251
+ tmp=model_part
252
+ for i in range(block_count):
253
+ if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
254
+ print(f"Unpacking and permuting layer {i}")
255
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
256
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
257
+ tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
258
+ del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
259
+
260
+ for name in model_part.keys():
261
+ data = model_part[name]
262
+ # we don't need these
263
+ if name.endswith(".rotary_emb.inv_freq"):
264
+ continue
265
+
266
+ old_dtype = data.dtype
267
+
268
+ # convert any unsupported data types to float32
269
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
270
+ data = data.to(torch.float32)
271
+
272
+ data = data.squeeze().numpy()
273
+
274
+ # map tensor names
275
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
276
+ if new_name is None:
277
+ print("Can not map tensor '" + name + "'")
278
+ sys.exit()
279
+
280
+ n_dims = len(data.shape)
281
+ data_dtype = data.dtype
282
+
283
+ # if f32 desired, convert any float16 to float32
284
+ if ftype == 0 and data_dtype == np.float16:
285
+ data = data.astype(np.float32)
286
+
287
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
288
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
289
+ data = data.astype(np.float32)
290
+
291
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
292
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
293
+ data = data.astype(np.float16)
294
+
295
+ print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
296
+ gguf_writer.add_tensor(new_name, data)
297
+
298
+
299
+ print("gguf: write header")
300
+ gguf_writer.write_header_to_file()
301
+ print("gguf: write metadata")
302
+ gguf_writer.write_kv_data_to_file()
303
+ if not args.vocab_only:
304
+ print("gguf: write tensors")
305
+ gguf_writer.write_tensors_to_file()
306
+
307
+ gguf_writer.close()
308
+
309
+ print(f"gguf: model successfully exported to '{fname_out}'")
310
+ print("")
llama.cpp/convert-bloom-hf-to-gguf.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ # HF bloom --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import re
10
+ import struct
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import numpy as np
16
+ import torch
17
+ from transformers import AutoTokenizer # type: ignore[import]
18
+
19
+ if 'NO_LOCAL_GGUF' not in os.environ:
20
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
21
+ import gguf
22
+
23
+
24
+ def count_model_parts(dir_model: Path) -> int:
25
+ num_parts = 0
26
+ for filename in os.listdir(dir_model):
27
+ if filename.startswith("pytorch_model-"):
28
+ num_parts += 1
29
+
30
+ if num_parts > 0:
31
+ print("gguf: found " + str(num_parts) + " model parts")
32
+ return num_parts
33
+
34
+
35
+ # Supported Models:
36
+ # https://huggingface.co/bigscience/bloom-1b7
37
+ # https://huggingface.co/bigscience/bloom-3b
38
+ # https://huggingface.co/bigscience/bloom-7b1
39
+ # https://huggingface.co/Langboat/bloom-1b4-zh
40
+ def parse_args() -> argparse.Namespace:
41
+ parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
42
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
43
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
44
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
45
+ parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
46
+ return parser.parse_args()
47
+
48
+ args = parse_args()
49
+
50
+ dir_model = args.model
51
+ ftype = args.ftype
52
+ if not dir_model.is_dir():
53
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
54
+ sys.exit(1)
55
+
56
+ # possible tensor data types
57
+ # ftype == 0 -> float32
58
+ # ftype == 1 -> float16
59
+
60
+ # map from ftype to string
61
+ ftype_str = ["f32", "f16"]
62
+
63
+ if args.outfile is not None:
64
+ fname_out = args.outfile
65
+ else:
66
+ # output in the same directory as the model by default
67
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
68
+
69
+ print("gguf: loading model "+dir_model.name)
70
+
71
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
72
+ hparams = json.load(f)
73
+
74
+ if hparams["architectures"][0] != "BloomForCausalLM":
75
+ print("Model architecture not supported: " + hparams["architectures"][0])
76
+ sys.exit(1)
77
+
78
+ # get number of model parts
79
+ num_parts = count_model_parts(dir_model)
80
+
81
+ ARCH=gguf.MODEL_ARCH.BLOOM
82
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
83
+
84
+ print("gguf: get model metadata")
85
+
86
+ block_count = hparams["n_layer"]
87
+
88
+ gguf_writer.add_name("Bloom")
89
+ n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
90
+ n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
91
+ gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
92
+ gguf_writer.add_embedding_length(n_embed)
93
+ gguf_writer.add_feed_forward_length(4 * n_embed)
94
+ gguf_writer.add_block_count(block_count)
95
+ gguf_writer.add_head_count(n_head)
96
+ gguf_writer.add_head_count_kv(n_head)
97
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
98
+ gguf_writer.add_file_type(ftype)
99
+
100
+ # TOKENIZATION
101
+
102
+ print("gguf: get tokenizer metadata")
103
+
104
+ tokens: list[bytearray] = []
105
+ scores: list[float] = []
106
+ toktypes: list[int] = []
107
+
108
+ # gpt2 tokenizer
109
+ gguf_writer.add_tokenizer_model("gpt2")
110
+
111
+ print("gguf: get gpt2 tokenizer vocab")
112
+
113
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
114
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
115
+
116
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
117
+ # This causes downstream issues with mismatched tensor sizes when running the inference
118
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
119
+ assert max(tokenizer.vocab.values()) < vocab_size
120
+
121
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
122
+
123
+ for i in range(vocab_size):
124
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
125
+ scores.append(0.0) # dummy
126
+ toktypes.append(gguf.TokenType.NORMAL)
127
+
128
+ gguf_writer.add_token_list(tokens)
129
+ gguf_writer.add_token_scores(scores)
130
+ gguf_writer.add_token_types(toktypes)
131
+
132
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
133
+ special_vocab.add_to_gguf(gguf_writer)
134
+
135
+ # TENSORS
136
+
137
+ tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
138
+
139
+ # params for qkv transform
140
+ n_head_kv = hparams.get("n_head_kv", n_head)
141
+ head_dim = n_embed // n_head
142
+
143
+ # tensor info
144
+ print("gguf: get tensor metadata")
145
+
146
+ if num_parts == 0:
147
+ part_names = iter(("pytorch_model.bin",))
148
+ else:
149
+ part_names = (
150
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
151
+ )
152
+
153
+ for part_name in part_names:
154
+ if args.vocab_only:
155
+ break
156
+ print("gguf: loading model part '" + part_name + "'")
157
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
158
+
159
+ has_lm_head = True
160
+ if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
161
+ has_lm_head = False
162
+
163
+ for original_name in model_part.keys():
164
+ data = model_part[original_name]
165
+ name = re.sub(r'transformer\.', '', original_name)
166
+
167
+ old_dtype = data.dtype
168
+
169
+ # convert any unsupported data types to float32
170
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
171
+ data = data.to(torch.float32)
172
+
173
+ data = data.squeeze().numpy()
174
+
175
+ if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
176
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
177
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
178
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
179
+ qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
180
+ data = np.concatenate(
181
+ (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
182
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
183
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
184
+ axis=0
185
+ )
186
+ print("re-format attention.linear_qkv.weight")
187
+ elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
188
+ qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
189
+ data = np.concatenate(
190
+ (qkv_bias[:, 0, :].reshape((n_embed,)),
191
+ qkv_bias[:, 1, :].reshape((n_embed,)),
192
+ qkv_bias[:, 2, :].reshape((n_embed,))),
193
+ axis=0
194
+ )
195
+ print("re-format attention.linear_qkv.bias")
196
+
197
+ # map tensor names
198
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
199
+ if new_name is None:
200
+ print("Can not map tensor '" + name + "'")
201
+ sys.exit()
202
+
203
+ n_dims = len(data.shape)
204
+ data_dtype = data.dtype
205
+
206
+ # if f32 desired, convert any float16 to float32
207
+ if ftype == 0 and data_dtype == np.float16:
208
+ data = data.astype(np.float32)
209
+
210
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
211
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
212
+ data = data.astype(np.float32)
213
+
214
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
215
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
216
+ data = data.astype(np.float16)
217
+
218
+ print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
219
+
220
+ gguf_writer.add_tensor(new_name, data)
221
+
222
+ if not has_lm_head and name == "word_embeddings.weight":
223
+ gguf_writer.add_tensor("output.weight", data)
224
+ print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
225
+
226
+
227
+ print("gguf: write header")
228
+ gguf_writer.write_header_to_file()
229
+ print("gguf: write metadata")
230
+ gguf_writer.write_kv_data_to_file()
231
+ if not args.vocab_only:
232
+ print("gguf: write tensors")
233
+ gguf_writer.write_tensors_to_file()
234
+
235
+ gguf_writer.close()
236
+
237
+ print(f"gguf: model successfully exported to '{fname_out}'")
238
+ print("")
llama.cpp/convert-falcon-hf-to-gguf.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ # HF falcon--> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import contextlib
8
+ import json
9
+ import os
10
+ import struct
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import numpy as np
16
+ import torch
17
+ from transformers import AutoTokenizer # type: ignore[import]
18
+
19
+ if 'NO_LOCAL_GGUF' not in os.environ:
20
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
21
+ import gguf
22
+
23
+
24
+ def count_model_parts(dir_model: Path, prefix: str) -> int:
25
+ num_parts = 0
26
+ for filename in os.listdir(dir_model):
27
+ if filename.startswith(prefix):
28
+ num_parts += 1
29
+
30
+ if num_parts > 0:
31
+ print("gguf: found " + str(num_parts) + " model parts")
32
+ return num_parts
33
+
34
+
35
+ def parse_args() -> argparse.Namespace:
36
+ parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
37
+ parser.add_argument(
38
+ "--vocab-only", action="store_true",
39
+ help="extract only the vocab",
40
+ )
41
+ parser.add_argument(
42
+ "--outfile", type=Path,
43
+ help="path to write to; default: based on input",
44
+ )
45
+ parser.add_argument(
46
+ "model", type=Path,
47
+ help="directory containing model file, or model file itself (*.bin)",
48
+ )
49
+ parser.add_argument(
50
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
51
+ help="output format - use 0 for float32, 1 for float16",
52
+ )
53
+ return parser.parse_args()
54
+
55
+ args = parse_args()
56
+
57
+ dir_model = args.model
58
+ ftype = args.ftype
59
+ if not dir_model.is_dir():
60
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
61
+ sys.exit(1)
62
+
63
+ # possible tensor data types
64
+ # ftype == 0 -> float32
65
+ # ftype == 1 -> float16
66
+
67
+ # map from ftype to string
68
+ ftype_str = ["f32", "f16"]
69
+
70
+ if args.outfile is not None:
71
+ fname_out = args.outfile
72
+ else:
73
+ # output in the same directory as the model by default
74
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
75
+
76
+ print("gguf: loading model "+dir_model.name)
77
+
78
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
79
+ hparams = json.load(f)
80
+
81
+ if hparams["architectures"][0] != "FalconForCausalLM":
82
+ print("Model architecture not supported: " + hparams["architectures"][0])
83
+
84
+ sys.exit(1)
85
+
86
+ # get number of model parts
87
+ num_parts = count_model_parts(dir_model, "model-00")
88
+ if num_parts:
89
+ is_safetensors = True
90
+ from safetensors import safe_open
91
+ else:
92
+ is_safetensors = False
93
+ num_parts = count_model_parts(dir_model, "pytorch_model-")
94
+
95
+ ARCH=gguf.MODEL_ARCH.FALCON
96
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
97
+
98
+ print("gguf: get model metadata")
99
+
100
+ block_count = hparams["num_hidden_layers"]
101
+
102
+ gguf_writer.add_name("Falcon")
103
+ gguf_writer.add_context_length(2048) # not in config.json
104
+ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
105
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
106
+ gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
107
+ gguf_writer.add_block_count(block_count)
108
+ gguf_writer.add_head_count(hparams["num_attention_heads"])
109
+ if "num_kv_heads" in hparams:
110
+ gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
111
+ else:
112
+ gguf_writer.add_head_count_kv(1)
113
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
114
+ gguf_writer.add_file_type(ftype)
115
+
116
+ # TOKENIZATION
117
+
118
+ print("gguf: get tokenizer metadata")
119
+
120
+ tokens: list[bytearray] = []
121
+ scores: list[float] = []
122
+ toktypes: list[int] = []
123
+
124
+ # gpt2 tokenizer
125
+ gguf_writer.add_tokenizer_model("gpt2")
126
+
127
+ print("gguf: get gpt2 tokenizer vocab")
128
+
129
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
130
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
131
+
132
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
133
+ # This causes downstream issues with mismatched tensor sizes when running the inference
134
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
135
+ assert max(tokenizer.vocab.values()) < vocab_size
136
+
137
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
138
+
139
+ for i in range(vocab_size):
140
+ tokens.append(reverse_vocab[i])
141
+ scores.append(0.0) # dummy
142
+ toktypes.append(gguf.TokenType.NORMAL)
143
+
144
+ gguf_writer.add_token_list(tokens)
145
+ gguf_writer.add_token_scores(scores)
146
+ gguf_writer.add_token_types(toktypes)
147
+
148
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
149
+ special_vocab.add_to_gguf(gguf_writer)
150
+
151
+ # TENSORS
152
+
153
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
154
+
155
+ # params for qkv transform
156
+ n_head = hparams["num_attention_heads"]
157
+ n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
158
+
159
+ head_dim = hparams["hidden_size"] // n_head
160
+
161
+ # tensor info
162
+ print("gguf: get tensor metadata")
163
+
164
+ if num_parts == 0:
165
+ part_names = iter(("pytorch_model.bin",))
166
+ elif is_safetensors:
167
+ part_names = (
168
+ f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
169
+ )
170
+ else:
171
+ part_names = (
172
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
173
+ )
174
+
175
+ for part_name in part_names:
176
+ if args.vocab_only:
177
+ break
178
+ print("gguf: loading model part '" + part_name + "'")
179
+ if is_safetensors:
180
+ ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
181
+ else:
182
+ ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
183
+
184
+ with ctx as model_part:
185
+ for name in model_part.keys():
186
+ data = model_part.get_tensor(name) if is_safetensors else model_part[name]
187
+
188
+ old_dtype = data.dtype
189
+
190
+ # convert any unsupported data types to float32
191
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
192
+ data = data.to(torch.float32)
193
+
194
+ # QKV tensor transform
195
+ # The original query_key_value tensor contains n_head_kv "kv groups",
196
+ # each consisting of n_head/n_head_kv query weights followed by one key
197
+ # and one value weight (shared by all query heads in the kv group).
198
+ # This layout makes it a big pain to work with in GGML.
199
+ # So we rearrange them here, so that we have n_head query weights
200
+ # followed by n_head_kv key weights followed by n_head_kv value weights,
201
+ # in contiguous fashion.
202
+ # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
203
+
204
+ if "query_key_value" in name:
205
+ qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
206
+ q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
207
+ k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
208
+ v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
209
+ data = torch.cat((q,k,v)).reshape_as(data)
210
+
211
+ data = data.squeeze().numpy()
212
+
213
+ # map tensor names
214
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
215
+ if new_name is None:
216
+ print("Can not map tensor '" + name + "'")
217
+ sys.exit()
218
+
219
+ n_dims = len(data.shape)
220
+ data_dtype = data.dtype
221
+
222
+ # if f32 desired, convert any float16 to float32
223
+ if ftype == 0 and data_dtype == np.float16:
224
+ data = data.astype(np.float32)
225
+
226
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
227
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
228
+ data = data.astype(np.float32)
229
+
230
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
231
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
232
+ data = data.astype(np.float16)
233
+
234
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
235
+
236
+ gguf_writer.add_tensor(new_name, data)
237
+
238
+
239
+ print("gguf: write header")
240
+ gguf_writer.write_header_to_file()
241
+ print("gguf: write metadata")
242
+ gguf_writer.write_kv_data_to_file()
243
+ if not args.vocab_only:
244
+ print("gguf: write tensors")
245
+ gguf_writer.write_tensors_to_file()
246
+
247
+ gguf_writer.close()
248
+
249
+ print(f"gguf: model successfully exported to '{fname_out}'")
250
+ print("")
llama.cpp/convert-gptneox-hf-to-gguf.py ADDED
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env python3
2
+ # HF gptneox--> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def count_model_parts(dir_model: Path) -> int:
24
+ num_parts = 0
25
+ for filename in os.listdir(dir_model):
26
+ if filename.startswith("pytorch_model-"):
27
+ num_parts += 1
28
+
29
+ if num_parts > 0:
30
+ print("gguf: found " + str(num_parts) + " model parts")
31
+ return num_parts
32
+
33
+
34
+ def parse_args() -> argparse.Namespace:
35
+ parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
36
+ parser.add_argument(
37
+ "--vocab-only", action="store_true",
38
+ help="extract only the vocab",
39
+ )
40
+ parser.add_argument(
41
+ "--outfile", type=Path,
42
+ help="path to write to; default: based on input",
43
+ )
44
+ parser.add_argument(
45
+ "model", type=Path,
46
+ help="directory containing model file, or model file itself (*.bin)",
47
+ )
48
+ parser.add_argument(
49
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
50
+ help="output format - use 0 for float32, 1 for float16",
51
+ )
52
+ return parser.parse_args()
53
+
54
+ args = parse_args()
55
+
56
+ dir_model = args.model
57
+ ftype = args.ftype
58
+ if not dir_model.is_dir():
59
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
60
+ sys.exit(1)
61
+
62
+ # possible tensor data types
63
+ # ftype == 0 -> float32
64
+ # ftype == 1 -> float16
65
+
66
+ # map from ftype to string
67
+ ftype_str = ["f32", "f16"]
68
+
69
+ if args.outfile is not None:
70
+ fname_out = args.outfile
71
+ else:
72
+ # output in the same directory as the model by default
73
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
74
+
75
+ print("gguf: loading model "+dir_model.name)
76
+
77
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
78
+ hparams = json.load(f)
79
+
80
+ if hparams["architectures"][0] != "GPTNeoXForCausalLM":
81
+ print("Model architecture not supported: " + hparams["architectures"][0])
82
+
83
+ sys.exit()
84
+
85
+ # get number of model parts
86
+ num_parts = count_model_parts(dir_model)
87
+
88
+ ARCH=gguf.MODEL_ARCH.GPTNEOX
89
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
90
+
91
+ print("gguf: get model metadata")
92
+
93
+ block_count = hparams["num_hidden_layers"]
94
+
95
+ gguf_writer.add_name(dir_model.name)
96
+ gguf_writer.add_context_length(hparams["max_position_embeddings"])
97
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
98
+ gguf_writer.add_block_count(block_count)
99
+ gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
100
+ gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
101
+ gguf_writer.add_head_count(hparams["num_attention_heads"])
102
+ gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
103
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
104
+
105
+ # TOKENIZATION
106
+
107
+ print("gguf: get tokenizer metadata")
108
+
109
+ tokens: list[bytearray] = []
110
+ scores: list[float] = []
111
+ toktypes: list[int] = []
112
+
113
+ # gpt2 tokenizer
114
+ gguf_writer.add_tokenizer_model("gpt2")
115
+
116
+ print("gguf: get gpt2 tokenizer vocab")
117
+
118
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
119
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
120
+
121
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
122
+ # This causes downstream issues with mismatched tensor sizes when running the inference
123
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
124
+ assert max(tokenizer.vocab.values()) < vocab_size
125
+
126
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
127
+
128
+ for i in range(vocab_size):
129
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
130
+ scores.append(0.0) # dummy
131
+ toktypes.append(gguf.TokenType.NORMAL)
132
+
133
+ gguf_writer.add_token_list(tokens)
134
+ gguf_writer.add_token_scores(scores)
135
+ gguf_writer.add_token_types(toktypes)
136
+
137
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
138
+ special_vocab.add_to_gguf(gguf_writer)
139
+
140
+ # TENSORS
141
+
142
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
143
+
144
+ # tensor info
145
+ print("gguf: get tensor metadata")
146
+
147
+ if num_parts == 0:
148
+ part_names = iter(("pytorch_model.bin",))
149
+ else:
150
+ part_names = (
151
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
152
+ )
153
+
154
+ for part_name in part_names:
155
+ if args.vocab_only:
156
+ break
157
+ print("gguf: loading model part '" + part_name + "'")
158
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
159
+
160
+ for name in model_part.keys():
161
+ data = model_part[name]
162
+
163
+ # we don't need these
164
+ if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
165
+ continue
166
+
167
+ old_dtype = data.dtype
168
+
169
+ # convert any unsupported data types to float32
170
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
171
+ data = data.to(torch.float32)
172
+
173
+ data = data.squeeze().numpy()
174
+
175
+ # map tensor names
176
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
177
+ if new_name is None:
178
+ print("Can not map tensor '" + name + "'")
179
+ sys.exit()
180
+
181
+ n_dims = len(data.shape)
182
+ data_dtype = data.dtype
183
+
184
+ # if f32 desired, convert any float16 to float32
185
+ if ftype == 0 and data_dtype == np.float16:
186
+ data = data.astype(np.float32)
187
+
188
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
189
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
190
+ data = data.astype(np.float32)
191
+
192
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
193
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
194
+ data = data.astype(np.float16)
195
+
196
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
197
+
198
+ gguf_writer.add_tensor(new_name, data)
199
+
200
+
201
+ print("gguf: write header")
202
+ gguf_writer.write_header_to_file()
203
+ print("gguf: write metadata")
204
+ gguf_writer.write_kv_data_to_file()
205
+ if not args.vocab_only:
206
+ print("gguf: write tensors")
207
+ gguf_writer.write_tensors_to_file()
208
+
209
+ gguf_writer.close()
210
+
211
+ print(f"gguf: model successfully exported to '{fname_out}'")
212
+ print("")
llama.cpp/convert-llama-ggml-to-gguf.py ADDED
@@ -0,0 +1,451 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import math
6
+ import struct
7
+ import sys
8
+ from enum import IntEnum
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+
13
+ import os
14
+ if 'NO_LOCAL_GGUF' not in os.environ:
15
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
16
+ import gguf
17
+
18
+ # Note: Does not support GGML_QKK_64
19
+ QK_K = 256
20
+ # Items here are (block size, type size)
21
+ GGML_QUANT_SIZES = {
22
+ gguf.GGMLQuantizationType.F32 : (1, 4),
23
+ gguf.GGMLQuantizationType.F16 : (1, 2),
24
+ gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
25
+ gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
26
+ gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
27
+ gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
28
+ gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
29
+ gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
30
+ gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
31
+ gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
32
+ gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
33
+ gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
34
+ gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
35
+ gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
36
+ }
37
+
38
+ class GGMLFormat(IntEnum):
39
+ GGML = 0
40
+ GGMF = 1
41
+ GGJT = 2
42
+
43
+ class GGMLFType(IntEnum):
44
+ ALL_F32 = 0
45
+ MOSTLY_F16 = 1
46
+ MOSTLY_Q4_0 = 2
47
+ MOSTLY_Q4_1 = 3
48
+ MOSTLY_Q4_1_SOME_F16 = 4
49
+ MOSTLY_Q8_0 = 7
50
+ MOSTLY_Q5_0 = 8
51
+ MOSTLY_Q5_1 = 9
52
+ MOSTLY_Q2_K = 10
53
+ MOSTLY_Q3_K_S = 11
54
+ MOSTLY_Q3_K_M = 12
55
+ MOSTLY_Q3_K_L = 13
56
+ MOSTLY_Q4_K_S = 14
57
+ MOSTLY_Q4_K_M = 15
58
+ MOSTLY_Q5_K_S = 16
59
+ MOSTLY_Q5_K_M = 17
60
+ MOSTLY_Q6_K = 18
61
+
62
+ class Hyperparameters:
63
+ def __init__(self):
64
+ self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
65
+ self.n_layer = self.n_rot = self.n_ff = 0
66
+ self.ftype = GGMLFType.ALL_F32
67
+
68
+ def set_n_ff(self, model):
69
+ ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
70
+ assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
71
+ ff_tensor = model.tensors[ff_tensor_idx]
72
+ self.n_ff = ff_tensor.dims[1]
73
+
74
+ def load(self, data, offset):
75
+ (
76
+ self.n_vocab,
77
+ self.n_embd,
78
+ self.n_mult,
79
+ self.n_head,
80
+ self.n_layer,
81
+ self.n_rot,
82
+ ftype,
83
+ ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
84
+ try:
85
+ self.ftype = GGMLFType(ftype)
86
+ except ValueError:
87
+ raise ValueError(f'Invalid ftype {ftype}')
88
+ return 4 * 7
89
+
90
+ def __str__(self):
91
+ return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
92
+
93
+ class Vocab:
94
+ def __init__(self, load_scores = True):
95
+ self.items = []
96
+ self.load_scores = load_scores
97
+
98
+ def load(self, data, offset, n_vocab):
99
+ orig_offset = offset
100
+ for _ in range(n_vocab):
101
+ itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
102
+ assert itemlen < 4096, 'Absurd vocab item length'
103
+ offset += 4
104
+ item_text = bytes(data[offset:offset + itemlen])
105
+ offset += itemlen
106
+ if self.load_scores:
107
+ item_score = struct.unpack('<f', data[offset:offset + 4])[0]
108
+ offset += 4
109
+ else:
110
+ item_score = 0.0
111
+ self.items.append((item_text, item_score))
112
+ return offset - orig_offset
113
+
114
+ class Tensor:
115
+ def __init__(self, use_padding = True):
116
+ self.name = None
117
+ self.dims: tuple[int, ...] = ()
118
+ self.dtype = None
119
+ self.start_offset = 0
120
+ self.len_bytes = np.int64(0)
121
+ self.use_padding = use_padding
122
+
123
+ def load(self, data, offset):
124
+ orig_offset = offset
125
+ (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
126
+ assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
127
+ assert name_len < 4096, 'Absurd tensor name length'
128
+ quant = GGML_QUANT_SIZES.get(dtype)
129
+ assert quant is not None, 'Unknown tensor type'
130
+ (blksize, tysize) = quant
131
+ offset += 12
132
+ self.dtype= dtype
133
+ self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
134
+ offset += 4 * n_dims
135
+ self.name = bytes(data[offset:offset + name_len])
136
+ offset += name_len
137
+ pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
138
+ offset += pad
139
+ n_elems = np.prod(self.dims)
140
+ n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
141
+ self.start_offset = offset
142
+ self.len_bytes = n_bytes
143
+ offset += n_bytes
144
+ # print(n_dims, name_len, dtype, self.dims, self.name, pad)
145
+ return offset - orig_offset
146
+
147
+ class GGMLModel:
148
+ def __init__(self):
149
+ self.hyperparameters = None
150
+ self.vocab = None
151
+ self.tensor_map = {}
152
+ self.tensors = []
153
+
154
+ def validate_header(self, data, offset):
155
+ magic = bytes(data[offset:offset + 4])
156
+ if magic == b'GGUF':
157
+ raise ValueError('File is already in GGUF format.')
158
+ if magic == b'lmgg':
159
+ self.file_format = GGMLFormat.GGML
160
+ self.format_version = 1
161
+ return 4
162
+ version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
163
+ if magic == b'fmgg':
164
+ if version != 1:
165
+ raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
166
+ self.file_format = GGMLFormat.GGMF
167
+ self.format_version = version
168
+ return 8
169
+ if magic == b'tjgg':
170
+ if version < 1 or version > 3:
171
+ raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
172
+ self.file_format = GGMLFormat.GGJT
173
+ self.format_version = version
174
+ return 8
175
+ raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
176
+
177
+ def validate_conversion(self, ftype):
178
+ err = ''
179
+ if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
180
+ if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
181
+ err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
182
+ elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
183
+ if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
184
+ GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
185
+ err = 'Q4 and Q8 quantizations changed in GGJTv3.'
186
+ if len(err) > 0:
187
+ raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
188
+
189
+ def load(self, data, offset):
190
+ offset += self.validate_header(data, offset)
191
+ hp = Hyperparameters()
192
+ offset += hp.load(data, offset)
193
+ print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
194
+ self.validate_conversion(hp.ftype)
195
+ vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
196
+ offset += vocab.load(data, offset, hp.n_vocab)
197
+ tensors: list[Tensor] = []
198
+ tensor_map = {}
199
+ while offset < len(data):
200
+ tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
201
+ offset += tensor.load(data, offset)
202
+ tensor_map[tensor.name] = len(tensors)
203
+ tensors.append(tensor)
204
+ self.hyperparameters = hp
205
+ self.vocab = vocab
206
+ self.tensors = tensors
207
+ self.tensor_map = tensor_map
208
+ hp.set_n_ff(self)
209
+ return offset
210
+
211
+ class GGMLToGGUF:
212
+ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
213
+ hp = ggml_model.hyperparameters
214
+ self.model = ggml_model
215
+ self.data = data
216
+ self.cfg = cfg
217
+ self.params_override = params_override
218
+ self.vocab_override = vocab_override
219
+ self.special_vocab = special_vocab
220
+ if params_override is not None:
221
+ n_kv_head = params_override.n_head_kv
222
+ else:
223
+ if cfg.gqa == 1:
224
+ n_kv_head = hp.n_head
225
+ else:
226
+ gqa = float(cfg.gqa)
227
+ n_kv_head = None
228
+ for x in range(1, 256):
229
+ if float(hp.n_head) / float(x) == gqa:
230
+ n_kv_head = x
231
+ assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
232
+ print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
233
+ self.n_kv_head = n_kv_head
234
+ self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
235
+
236
+ def save(self):
237
+ print('* Preparing to save GGUF file')
238
+ gguf_writer = gguf.GGUFWriter(
239
+ self.cfg.output,
240
+ gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
241
+ use_temp_file = False )
242
+ self.add_params(gguf_writer)
243
+ self.add_vocab(gguf_writer)
244
+ if self.special_vocab is not None:
245
+ self.special_vocab.add_to_gguf(gguf_writer)
246
+ self.add_tensors(gguf_writer)
247
+ print(" gguf: write header")
248
+ gguf_writer.write_header_to_file()
249
+ print(" gguf: write metadata")
250
+ gguf_writer.write_kv_data_to_file()
251
+ print(" gguf: write tensors")
252
+ gguf_writer.write_tensors_to_file()
253
+ gguf_writer.close()
254
+
255
+ def add_params(self, gguf_writer):
256
+ hp = self.model.hyperparameters
257
+ cfg = self.cfg
258
+ if cfg.desc is not None:
259
+ desc = cfg.desc
260
+ else:
261
+ desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
262
+ try:
263
+ # Filenames aren't necessarily valid UTF8.
264
+ name = cfg.name if cfg.name is not None else cfg.input.name
265
+ except UnicodeDecodeError:
266
+ name = None
267
+ print('* Adding model parameters and KV items')
268
+ if name is not None:
269
+ gguf_writer.add_name(name)
270
+ gguf_writer.add_description(desc)
271
+ gguf_writer.add_file_type(int(hp.ftype))
272
+ if self.params_override is not None:
273
+ po = self.params_override
274
+ assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
275
+ assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
276
+ assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
277
+ gguf_writer.add_context_length (po.n_ctx)
278
+ gguf_writer.add_embedding_length (po.n_embd)
279
+ gguf_writer.add_block_count (po.n_layer)
280
+ gguf_writer.add_feed_forward_length (po.n_ff)
281
+ gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
282
+ gguf_writer.add_head_count (po.n_head)
283
+ gguf_writer.add_head_count_kv (po.n_head_kv)
284
+ gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
285
+ return
286
+ gguf_writer.add_context_length(cfg.context_length)
287
+ gguf_writer.add_embedding_length(hp.n_embd)
288
+ gguf_writer.add_block_count(hp.n_layer)
289
+ gguf_writer.add_feed_forward_length(hp.n_ff)
290
+ gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
291
+ gguf_writer.add_head_count(hp.n_head)
292
+ gguf_writer.add_head_count_kv(self.n_kv_head)
293
+ gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
294
+
295
+ def add_vocab(self, gguf_writer):
296
+ hp = self.model.hyperparameters
297
+ gguf_writer.add_tokenizer_model('llama')
298
+ tokens = []
299
+ scores = []
300
+ toktypes = []
301
+ if self.vocab_override is not None:
302
+ vo = self.vocab_override
303
+ print('* Adding vocab item(s)')
304
+ for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
305
+ tokens.append(vbytes)
306
+ scores.append(score)
307
+ toktypes.append(ttype)
308
+ assert len(tokens) == hp.n_vocab, \
309
+ f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
310
+ gguf_writer.add_token_list(tokens)
311
+ gguf_writer.add_token_scores(scores)
312
+ if len(toktypes) > 0:
313
+ gguf_writer.add_token_types(toktypes)
314
+ return
315
+ print(f'* Adding {hp.n_vocab} vocab item(s)')
316
+ assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
317
+ for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
318
+ tt = 1 # Normal
319
+ # Special handling for UNK, BOS, EOS tokens.
320
+ if tokid <= 2:
321
+ if tokid == 0:
322
+ vbytes = b'<unk>'
323
+ tt = 2
324
+ elif tokid == 1:
325
+ vbytes = b'<s>'
326
+ tt = 3
327
+ else:
328
+ vbytes = b'</s>'
329
+ tt = 3
330
+ elif len(vbytes) == 0:
331
+ tt = 3 # Control
332
+ elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
333
+ vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
334
+ tt = 6 # Byte
335
+ else:
336
+ vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
337
+ toktypes.append(tt)
338
+ tokens.append(vbytes)
339
+ scores.append(vscore)
340
+ gguf_writer.add_token_list(tokens)
341
+ gguf_writer.add_token_scores(scores)
342
+ gguf_writer.add_token_types(toktypes)
343
+ gguf_writer.add_unk_token_id(0)
344
+ gguf_writer.add_bos_token_id(1)
345
+ gguf_writer.add_eos_token_id(2)
346
+
347
+ def add_tensors(self, gguf_writer):
348
+ tensor_map = self.name_map
349
+ data = self.data
350
+ print(f'* Adding {len(self.model.tensors)} tensor(s)')
351
+ for tensor in self.model.tensors:
352
+ name = str(tensor.name, 'UTF-8')
353
+ mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
354
+ assert mapped_name is not None, f'Bad name {name}'
355
+ tempdims = list(tensor.dims[:])
356
+ if len(tempdims) > 1:
357
+ temp = tempdims[1]
358
+ tempdims[1] = tempdims[0]
359
+ tempdims[0] = temp
360
+ # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
361
+ gguf_writer.add_tensor(
362
+ mapped_name,
363
+ data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
364
+ raw_shape = tempdims,
365
+ raw_dtype = tensor.dtype )
366
+
367
+ def handle_metadata(cfg, hp):
368
+ import convert
369
+ assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
370
+ hf_config_path = cfg.model_metadata_dir / "config.json"
371
+ orig_config_path = cfg.model_metadata_dir / "params.json"
372
+ # We pass a fake model here. "original" mode will check the shapes of some
373
+ # tensors if information is missing in the .json file: other than that, the
374
+ # model data isn't used so this should be safe (at least for now).
375
+ fakemodel = {
376
+ 'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
377
+ 'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
378
+ }
379
+ fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
380
+ fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
381
+ if hf_config_path.exists():
382
+ params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
383
+ elif orig_config_path.exists():
384
+ params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
385
+ else:
386
+ raise ValueError('Unable to load metadata')
387
+ vocab = convert.load_vocab(
388
+ cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
389
+ cfg.vocabtype )
390
+ # FIXME: Respect cfg.vocab_dir?
391
+ svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
392
+ convert.check_vocab_size(params, vocab)
393
+ return (params, vocab, svocab)
394
+
395
+ def handle_args():
396
+ parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
397
+ parser.add_argument('--input', '-i', type = Path, required = True,
398
+ help = 'Input GGMLv3 filename')
399
+ parser.add_argument('--output', '-o', type = Path, required = True,
400
+ help ='Output GGUF filename')
401
+ parser.add_argument('--name',
402
+ help = 'Set model name')
403
+ parser.add_argument('--desc',
404
+ help = 'Set model description')
405
+ parser.add_argument('--gqa', type = int, default = 1,
406
+ help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
407
+ parser.add_argument('--eps', default = '5.0e-06',
408
+ help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
409
+ parser.add_argument('--context-length', '-c', type=int, default = 2048,
410
+ help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
411
+ parser.add_argument('--model-metadata-dir', '-m', type = Path,
412
+ help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
413
+ parser.add_argument("--vocab-dir", type=Path,
414
+ help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
415
+ parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
416
+ help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
417
+ return parser.parse_args()
418
+
419
+ def main():
420
+ cfg = handle_args()
421
+ print(f'* Using config: {cfg}')
422
+ print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
423
+ if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
424
+ print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
425
+ data = np.memmap(cfg.input, mode = 'r')
426
+ model = GGMLModel()
427
+ print('* Scanning GGML input file')
428
+ offset = model.load(data, 0)
429
+ print(f'* GGML model hyperparameters: {model.hyperparameters}')
430
+ vocab_override = None
431
+ params_override = None
432
+ special_vocab = None
433
+ if cfg.model_metadata_dir is not None:
434
+ (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
435
+ print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
436
+ print(f'* Overriding params: {params_override}')
437
+ print(f'* Overriding vocab: {vocab_override}')
438
+ print(f'* Special vocab: {special_vocab}')
439
+ else:
440
+ print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
441
+ if model.file_format == GGMLFormat.GGML:
442
+ print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
443
+ converter = GGMLToGGUF(model, data, cfg,
444
+ params_override = params_override,
445
+ vocab_override = vocab_override,
446
+ special_vocab = special_vocab )
447
+ converter.save()
448
+ print(f'* Successful completion. Output saved to: {cfg.output}')
449
+
450
+ if __name__ == '__main__':
451
+ main()
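For reference, a typical invocation of the GGML-to-GGUF converter above might look like the following; the script and file names are illustrative, not taken from this commit:

    python3 convert-ggml-to-gguf.py --input old-model.ggmlv3.bin --output new-model.gguf --eps 1e-5 --gqa 8   # names illustrative

As main() itself prints, when --model-metadata-dir is given the --gqa, --eps and --context-length flags are ignored in favour of the metadata loaded from config.json or params.json.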
llama.cpp/convert-mpt-hf-to-gguf.py ADDED
@@ -0,0 +1,218 @@
1
+ #!/usr/bin/env python3
2
+ # HF mpt --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def count_model_parts(dir_model: Path) -> int:
24
+ num_parts = 0
25
+ for filename in os.listdir(dir_model):
26
+ if filename.startswith("pytorch_model-"):
27
+ num_parts += 1
28
+
29
+ if num_parts > 0:
30
+ print("gguf: found " + str(num_parts) + " model parts")
31
+ return num_parts
32
+
33
+
34
+ def parse_args() -> argparse.Namespace:
35
+ parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
36
+ parser.add_argument(
37
+ "--vocab-only", action="store_true",
38
+ help="extract only the vocab",
39
+ )
40
+ parser.add_argument(
41
+ "--outfile", type=Path,
42
+ help="path to write to; default: based on input",
43
+ )
44
+ parser.add_argument(
45
+ "model", type=Path,
46
+ help="directory containing model file, or model file itself (*.bin)",
47
+ )
48
+ parser.add_argument(
49
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
50
+ help="output format - use 0 for float32, 1 for float16",
51
+ )
52
+ return parser.parse_args()
53
+
54
+ args = parse_args()
55
+
56
+ dir_model = args.model
57
+ ftype = args.ftype
58
+ if not dir_model.is_dir():
59
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
60
+ sys.exit(1)
61
+
62
+ # possible tensor data types
63
+ # ftype == 0 -> float32
64
+ # ftype == 1 -> float16
65
+
66
+ # map from ftype to string
67
+ ftype_str = ["f32", "f16"]
68
+
69
+ if args.outfile is not None:
70
+ fname_out = args.outfile
71
+ else:
72
+ # output in the same directory as the model by default
73
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
74
+
75
+ print("gguf: loading model "+dir_model.name)
76
+
77
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
78
+ hparams = json.load(f)
79
+
80
+ if hparams["architectures"][0] != "MPTForCausalLM":
81
+ print("Model architecture not supported: " + hparams["architectures"][0])
82
+
83
+ sys.exit()
84
+
85
+ # get number of model parts
86
+ num_parts = count_model_parts(dir_model)
87
+
88
+ ARCH=gguf.MODEL_ARCH.MPT
89
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
90
+
91
+ print("gguf: get model metadata")
92
+
93
+ block_count = hparams["n_layers"]
94
+
95
+ gguf_writer.add_name(dir_model.name)
96
+ gguf_writer.add_context_length(hparams["max_seq_len"])
97
+ gguf_writer.add_embedding_length(hparams["d_model"])
98
+ gguf_writer.add_block_count(block_count)
99
+ gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
100
+ gguf_writer.add_head_count(hparams["n_heads"])
101
+ if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
102
+ gguf_writer.add_head_count_kv(kv_n_heads)
103
+ gguf_writer.add_layer_norm_eps(1e-05)
104
+ if hparams["attn_config"]["clip_qkv"] is not None:
105
+ gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
106
+ gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
107
+
108
+ # TOKENIZATION
109
+
110
+ print("gguf: get tokenizer metadata")
111
+
112
+ tokens: list[bytearray] = []
113
+ scores: list[float] = []
114
+ toktypes: list[int] = []
115
+
116
+ # gpt2 tokenizer
117
+ gguf_writer.add_tokenizer_model("gpt2")
118
+
119
+ print("gguf: get gpt2 tokenizer vocab")
120
+
121
+ # MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
122
+ # there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
123
+ # accommodate some "reserved" tokens; this is causing problems down the line in
124
+ # llama.cpp, so we pad the vocab with dummy tokens:
125
+
126
+ vocab_size = hparams["vocab_size"]
127
+
128
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
129
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
130
+
131
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
132
+
133
+ for i in range(vocab_size):
134
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
135
+ scores.append(0.0) # dummy
136
+ toktypes.append(gguf.TokenType.NORMAL)
137
+
138
+ gguf_writer.add_token_list(tokens)
139
+ gguf_writer.add_token_scores(scores)
140
+ gguf_writer.add_token_types(toktypes)
141
+
142
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
143
+ special_vocab.add_to_gguf(gguf_writer)
144
+
145
+ # TENSORS
146
+
147
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
148
+
149
+ # tensor info
150
+ print("gguf: get tensor metadata")
151
+
152
+ if num_parts == 0:
153
+ part_names = iter(("pytorch_model.bin",))
154
+ else:
155
+ part_names = (
156
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
157
+ )
158
+
159
+ for part_name in part_names:
160
+ if args.vocab_only:
161
+ break
162
+ print("gguf: loading model part '" + part_name + "'")
163
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
164
+
165
+ for name in model_part.keys():
166
+ data = model_part[name]
167
+
168
+ old_dtype = data.dtype
169
+
170
+ # convert any unsupported data types to float32
171
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
172
+ data = data.to(torch.float32)
173
+
174
+ data = data.squeeze().numpy()
175
+
176
+ # map tensor names
177
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
178
+ if new_name is None:
179
+ print("Cannot map tensor '" + name + "'")
180
+ continue # for the sake of compatibility with some old published models, don't quit
181
+ sys.exit()
182
+
183
+ n_dims = len(data.shape)
184
+ data_dtype = data.dtype
185
+
186
+ # if f32 desired, convert any float16 to float32
187
+ if ftype == 0 and data_dtype == np.float16:
188
+ data = data.astype(np.float32)
189
+
190
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
191
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
192
+ data = data.astype(np.float32)
193
+
194
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
195
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
196
+ data = data.astype(np.float16)
197
+
198
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
199
+
200
+ gguf_writer.add_tensor(new_name, data)
201
+
202
+ # note: MPT output is tied to (same as) wte in original model;
203
+ # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
204
+ if new_name == "token_embd.weight":
205
+ gguf_writer.add_tensor("output.weight", data)
206
+
207
+ print("gguf: write header")
208
+ gguf_writer.write_header_to_file()
209
+ print("gguf: write metadata")
210
+ gguf_writer.write_kv_data_to_file()
211
+ if not args.vocab_only:
212
+ print("gguf: write tensors")
213
+ gguf_writer.write_tensors_to_file()
214
+
215
+ gguf_writer.close()
216
+
217
+ print(f"gguf: model successfully exported to '{fname_out}'")
218
+ print("")
llama.cpp/convert-persimmon-to-gguf.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ import os
3
+ from pprint import pprint
4
+ import sys
5
+ import argparse
6
+ from pathlib import Path
7
+ from sentencepiece import SentencePieceProcessor
8
+ if 'NO_LOCAL_GGUF' not in os.environ:
9
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
10
+ import gguf
11
+
12
+ def _flatten_dict(dct, tensors, prefix=None):
13
+ assert isinstance(dct, dict)
14
+ for key in dct.keys():
15
+ new_prefix = prefix + '.' + key if prefix is not None else key
16
+ if isinstance(dct[key], torch.Tensor):
17
+ tensors[new_prefix] = dct[key]
18
+ elif isinstance(dct[key], dict):
19
+ _flatten_dict(dct[key], tensors, new_prefix)
20
+ else:
21
+ raise ValueError(type(dct[key]))
22
+ return None
23
+
24
+ def _get_sentencepiece_tokenizer_info(dir_model: Path):
25
+ tokenizer_path = dir_model / 'adept_vocab.model'
26
+ print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
27
+ tokenizer = SentencePieceProcessor(str(tokenizer_path))
28
+ print('gguf: adding tokens')
29
+ tokens: list[bytes] = []
30
+ scores: list[float] = []
31
+ toktypes: list[int] = []
32
+
33
+ for i in range(tokenizer.vocab_size()):
34
+ text: bytes
35
+ score: float
36
+
37
+ piece = tokenizer.id_to_piece(i)
38
+ text = piece.encode("utf-8")
39
+ score = tokenizer.get_score(i)
40
+
41
+ toktype = 1
42
+ if tokenizer.is_unknown(i):
43
+ toktype = 2
44
+ if tokenizer.is_control(i):
45
+ toktype = 3
46
+ if tokenizer.is_unused(i):
47
+ toktype = 5
48
+ if tokenizer.is_byte(i):
49
+ toktype = 6
50
+
51
+ tokens.append(text)
52
+ scores.append(score)
53
+ toktypes.append(toktype)
54
+ pass
55
+ return tokens, scores, toktypes
56
+
57
+ def main():
58
+ parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
59
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
60
+ parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
61
+ parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
62
+ parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
63
+ args = parser.parse_args()
64
+ sys.path.append(str(args.adept_inference_dir))
65
+ persimmon_model = torch.load(args.ckpt_path)
66
+ hparams = persimmon_model['args']
67
+ pprint(hparams)
68
+ tensors = {}
69
+ _flatten_dict(persimmon_model['model'], tensors, None)
70
+
71
+ arch = gguf.MODEL_ARCH.PERSIMMON
72
+ gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
73
+
74
+ block_count = hparams.num_layers
75
+ head_count = hparams.num_attention_heads
76
+ head_count_kv = head_count
77
+ ctx_length = hparams.seq_length
78
+ hidden_size = hparams.hidden_size
79
+
80
+ gguf_writer.add_name('persimmon-8b-chat')
81
+ gguf_writer.add_context_length(ctx_length)
82
+ gguf_writer.add_embedding_length(hidden_size)
83
+ gguf_writer.add_block_count(block_count)
84
+ gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
85
+ gguf_writer.add_rope_dimension_count(hidden_size // head_count)
86
+ gguf_writer.add_head_count(head_count)
87
+ gguf_writer.add_head_count_kv(head_count_kv)
88
+ gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
89
+ gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
90
+
91
+ tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
92
+ gguf_writer.add_tokenizer_model('llama')
93
+ gguf_writer.add_token_list(tokens)
94
+ gguf_writer.add_token_scores(scores)
95
+ gguf_writer.add_token_types(toktypes)
96
+ gguf_writer.add_bos_token_id(71013)
97
+ gguf_writer.add_eos_token_id(71013)
98
+
99
+ tensor_map = gguf.get_tensor_name_map(arch, block_count)
100
+ print(tensor_map)
101
+ for name in tensors.keys():
102
+ data = tensors[name]
103
+ if name.endswith(".self_attention.rotary_emb.inv_freq"):
104
+ continue
105
+ old_dtype = data.dtype
106
+ # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
107
+ data = data.to(torch.float32).squeeze().numpy()
108
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
109
+ if new_name is None:
110
+ print("Can not map tensor '" + name + "'")
111
+ sys.exit()
112
+ n_dims = len(data.shape)
113
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
114
+ gguf_writer.add_tensor(new_name, data)
115
+ print("gguf: write header")
116
+ gguf_writer.write_header_to_file()
117
+ print("gguf: write metadata")
118
+ gguf_writer.write_kv_data_to_file()
119
+ print("gguf: write tensors")
120
+ gguf_writer.write_tensors_to_file()
121
+
122
+ gguf_writer.close()
123
+
124
+ print(f"gguf: model successfully exported to '{args.outfile}'")
125
+ print("")
126
+
127
+
128
+
129
+ if __name__ == '__main__':
130
+ main()
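A usage sketch for the Persimmon converter, assuming Adept's released checkpoint layout and a checkout of their adept-inference repository (all paths are illustrative):

    python3 convert-persimmon-to-gguf.py --ckpt-path /path/to/release/model.pt --model-dir /path/to/release --adept-inference-dir /path/to/adept-inference --outfile persimmon-8b-chat-f32.gguf   # paths illustrative

Note that --model-dir must contain the adept_vocab.model SentencePiece file, and every tensor is written as float32 because of the FP16 issue flagged in the TODO above.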
llama.cpp/convert-refact-hf-to-gguf.py ADDED
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env python3
2
+ # HF refact--> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from transformers import AutoTokenizer # type: ignore[import]
15
+
16
+ if "NO_LOCAL_GGUF" not in os.environ:
17
+ sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
18
+ import gguf
19
+
20
+ def count_model_parts(dir_model: Path) -> int:
21
+ num_parts = 0
22
+ for filename in os.listdir(dir_model):
23
+ if filename.startswith("pytorch_model-"):
24
+ num_parts += 1
25
+
26
+ if num_parts > 0:
27
+ print("gguf: found " + str(num_parts) + " model parts")
28
+ return num_parts
29
+
30
+
31
+ def parse_args() -> argparse.Namespace:
32
+ parser = argparse.ArgumentParser(
33
+ description="Convert a Refact model to a GGML compatible file"
34
+ )
35
+ parser.add_argument(
36
+ "--vocab-only",
37
+ action="store_true",
38
+ help="extract only the vocab",
39
+ )
40
+ parser.add_argument(
41
+ "--outfile",
42
+ type=Path,
43
+ help="path to write to; default: based on input",
44
+ )
45
+ parser.add_argument(
46
+ "model",
47
+ type=Path,
48
+ help="directory containing model file, or model file itself (*.bin)",
49
+ )
50
+ parser.add_argument(
51
+ "ftype",
52
+ type=int,
53
+ choices=[0, 1],
54
+ default=1,
55
+ nargs="?",
56
+ help="output format - use 0 for float32, 1 for float16",
57
+ )
58
+ return parser.parse_args()
59
+
60
+
61
+ args = parse_args()
62
+
63
+ dir_model = args.model
64
+ ftype = args.ftype
65
+ if not dir_model.is_dir():
66
+ print(f"Error: {args.model} is not a directory", file=sys.stderr)
67
+ sys.exit(1)
68
+
69
+ # possible tensor data types
70
+ # ftype == 0 -> float32
71
+ # ftype == 1 -> float16
72
+
73
+ # map from ftype to string
74
+ ftype_str = ["f32", "f16"]
75
+
76
+ if args.outfile is not None:
77
+ fname_out = args.outfile
78
+ else:
79
+ # output in the same directory as the model by default
80
+ fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
81
+
82
+ print("gguf: loading model " + dir_model.name)
83
+
84
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
85
+ hparams = json.load(f)
86
+
87
+ if hparams["architectures"][0] != "GPTRefactForCausalLM":
88
+ print("Model architecture not supported: " + hparams["architectures"][0])
89
+
90
+ sys.exit(1)
91
+
92
+ # get number of model parts
93
+ num_parts = count_model_parts(dir_model)
94
+
95
+ ARCH = gguf.MODEL_ARCH.REFACT
96
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
97
+
98
+ print("gguf: get model metadata")
99
+
100
+ # Get refact feed forward dimension
101
+ hidden_dim = hparams["n_embd"]
102
+ inner_dim = 4 * hidden_dim
103
+ hidden_dim = int(2 * inner_dim / 3)
104
+ multiple_of = 256
105
+ ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
106
+
107
+ block_count = hparams["n_layer"]
108
+
109
+ gguf_writer.add_name("Refact")
110
+ # refact uses Alibi. So this is from config.json which might be used by training.
111
+ gguf_writer.add_context_length(hparams["n_positions"])
112
+ gguf_writer.add_embedding_length(hparams["n_embd"])
113
+
114
+ gguf_writer.add_feed_forward_length(ff_dim)
115
+ gguf_writer.add_block_count(block_count)
116
+ gguf_writer.add_head_count(hparams["n_head"])
117
+ gguf_writer.add_head_count_kv(1)
118
+ gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
119
+ gguf_writer.add_file_type(ftype)
120
+
121
+ # TOKENIZATION
122
+
123
+ print("gguf: get tokenizer metadata")
124
+
125
+ tokens: list[bytearray] = []
126
+ scores: list[float] = []
127
+ toktypes: list[int] = []
128
+
129
+ # gpt2 tokenizer
130
+ gguf_writer.add_tokenizer_model("gpt2")
131
+
132
+ print("gguf: get gpt2 tokenizer vocab")
133
+
134
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
135
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
136
+
137
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
138
+ # This causes downstream issues with mismatched tensor sizes when running the inference
139
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
140
+ assert max(tokenizer.vocab.values()) < vocab_size
141
+
142
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
143
+
144
+ for i in range(vocab_size):
145
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
146
+ scores.append(0.0) # dummy
147
+ toktypes.append(gguf.TokenType.NORMAL)
148
+
149
+ gguf_writer.add_token_list(tokens)
150
+ gguf_writer.add_token_scores(scores)
151
+ gguf_writer.add_token_types(toktypes)
152
+
153
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
154
+ special_vocab.add_to_gguf(gguf_writer)
155
+
156
+ # TENSORS
157
+
158
+ tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
159
+
160
+ # params for qkv transform
161
+ n_head = hparams["n_head"]
162
+ n_head_kv = 1
163
+
164
+ head_dim = hparams["n_embd"] // n_head
165
+
166
+ # tensor info
167
+ print("gguf: get tensor metadata")
168
+
169
+ if num_parts == 0:
170
+ part_names = iter(("pytorch_model.bin",))
171
+ else:
172
+ part_names = (
173
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
174
+ )
175
+ for part_name in part_names:
176
+ if args.vocab_only:
177
+ break
178
+ print("gguf: loading model part '" + part_name + "'")
179
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
180
+
181
+ for i in range(block_count):
182
+ if f"transformer.h.{i}.attn.kv.weight" in model_part:
183
+ data = model_part[f"transformer.h.{i}.attn.kv.weight"]
184
+ model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
185
+ : n_head_kv * head_dim
186
+ ]
187
+ model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
188
+ n_head_kv * head_dim :
189
+ ]
190
+ del model_part[f"transformer.h.{i}.attn.kv.weight"]
191
+ if f"transformer.h.{i}.attn.q.weight" in model_part:
192
+ model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
193
+ f"transformer.h.{i}.attn.q.weight"
194
+ ]
195
+ del model_part[f"transformer.h.{i}.attn.q.weight"]
196
+ if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
197
+ data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
198
+ model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
199
+ model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
200
+ del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
201
+
202
+ for name in model_part.keys():
203
+ data = model_part[name]
204
+
205
+ old_dtype = data.dtype
206
+
207
+ # convert any unsupported data types to float32
208
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
209
+ data = data.to(torch.float32)
210
+
211
+ data = data.squeeze().numpy()
212
+
213
+ # map tensor names
214
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
215
+ if new_name is None:
216
+ print("Can not map tensor '" + name + "'")
217
+ sys.exit()
218
+
219
+ n_dims = len(data.shape)
220
+ data_dtype = data.dtype
221
+
222
+ # if f32 desired, convert any float16 to float32
223
+ if ftype == 0 and data_dtype == np.float16:
224
+ data = data.astype(np.float32)
225
+
226
+ # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
227
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
228
+ data = data.astype(np.float32)
229
+
230
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
231
+ if (
232
+ ftype == 1
233
+ and data_dtype == np.float32
234
+ and name.endswith(".weight")
235
+ and n_dims == 2
236
+ ):
237
+ data = data.astype(np.float16)
238
+
239
+ print(
240
+ new_name
241
+ + ", n_dims = "
242
+ + str(n_dims)
243
+ + ", "
244
+ + str(old_dtype)
245
+ + " --> "
246
+ + str(data.dtype)
247
+ )
248
+
249
+ gguf_writer.add_tensor(new_name, data)
250
+
251
+
252
+ print("gguf: write header")
253
+ gguf_writer.write_header_to_file()
254
+ print("gguf: write metadata")
255
+ gguf_writer.write_kv_data_to_file()
256
+ if not args.vocab_only:
257
+ print("gguf: write tensors")
258
+ gguf_writer.write_tensors_to_file()
259
+
260
+ gguf_writer.close()
261
+
262
+ print(f"gguf: model successfully exported to '{fname_out}'")
263
+ print("")
llama.cpp/convert-starcoder-hf-to-gguf.py ADDED
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/env python3
2
+ # HF starcoder --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def count_model_parts(dir_model: Path) -> int:
24
+ num_parts = 0
25
+ for filename in os.listdir(dir_model):
26
+ if filename.startswith("pytorch_model-"):
27
+ num_parts += 1
28
+
29
+ if num_parts > 0:
30
+ print("gguf: found " + str(num_parts) + " model parts")
31
+ return num_parts
32
+
33
+
34
+ def parse_args() -> argparse.Namespace:
35
+ parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
36
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
37
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
38
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
39
+ parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
40
+ return parser.parse_args()
41
+
42
+ args = parse_args()
43
+
44
+ dir_model = args.model
45
+ ftype = args.ftype
46
+ if not dir_model.is_dir():
47
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
48
+ sys.exit(1)
49
+
50
+ # possible tensor data types
51
+ # ftype == 0 -> float32
52
+ # ftype == 1 -> float16
53
+
54
+ # map from ftype to string
55
+ ftype_str = ["f32", "f16"]
56
+
57
+ if args.outfile is not None:
58
+ fname_out = args.outfile
59
+ else:
60
+ # output in the same directory as the model by default
61
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
62
+
63
+ print("gguf: loading model "+dir_model.name)
64
+
65
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
66
+ hparams = json.load(f)
67
+
68
+ if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
69
+ print("Model architecture not supported: " + hparams["architectures"][0])
70
+
71
+ sys.exit(1)
72
+
73
+ # get number of model parts
74
+ num_parts = count_model_parts(dir_model)
75
+
76
+ ARCH=gguf.MODEL_ARCH.STARCODER
77
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
78
+
79
+ print("gguf: get model metadata")
80
+
81
+ block_count = hparams["n_layer"]
82
+
83
+ gguf_writer.add_name("StarCoder")
84
+ gguf_writer.add_context_length(hparams["n_positions"])
85
+ gguf_writer.add_embedding_length(hparams["n_embd"])
86
+ gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
87
+ gguf_writer.add_block_count(block_count)
88
+ gguf_writer.add_head_count(hparams["n_head"])
89
+ gguf_writer.add_head_count_kv(1)
90
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
91
+ gguf_writer.add_file_type(ftype)
92
+
93
+ # TOKENIZATION
94
+
95
+ print("gguf: get tokenizer metadata")
96
+
97
+ tokens: list[bytearray] = []
98
+ scores: list[float] = []
99
+ toktypes: list[int] = []
100
+
101
+ # gpt2 tokenizer
102
+ gguf_writer.add_tokenizer_model("gpt2")
103
+
104
+ print("gguf: get gpt2 tokenizer vocab")
105
+
106
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
107
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
108
+
109
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
110
+ # This causes downstream issues with mismatched tensor sizes when running the inference
111
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
112
+ assert max(tokenizer.vocab.values()) < vocab_size
113
+
114
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
115
+
116
+ for i in range(vocab_size):
117
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
118
+ scores.append(0.0) # dummy
119
+ toktypes.append(gguf.TokenType.NORMAL)
120
+
121
+ gguf_writer.add_token_list(tokens)
122
+ gguf_writer.add_token_scores(scores)
123
+ gguf_writer.add_token_types(toktypes)
124
+
125
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
126
+ special_vocab.add_to_gguf(gguf_writer)
127
+
128
+ # TENSORS
129
+
130
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
131
+
132
+ # params for qkv transform
133
+ n_head = hparams["n_head"]
134
+ n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
135
+
136
+ head_dim = hparams["n_embd"] // n_head
137
+
138
+ # tensor info
139
+ print("gguf: get tensor metadata")
140
+
141
+ if num_parts == 0:
142
+ part_names = iter(("pytorch_model.bin",))
143
+ else:
144
+ part_names = (
145
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
146
+ )
147
+
148
+ for part_name in part_names:
149
+ if args.vocab_only:
150
+ break
151
+ print("gguf: loading model part '" + part_name + "'")
152
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
153
+
154
+ for name in model_part.keys():
155
+ data = model_part[name]
156
+
157
+ old_dtype = data.dtype
158
+
159
+ # convert any unsupported data types to float32
160
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
161
+ data = data.to(torch.float32)
162
+
163
+ data = data.squeeze().numpy()
164
+
165
+ # map tensor names
166
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
167
+ if new_name is None:
168
+ print("Can not map tensor '" + name + "'")
169
+ sys.exit()
170
+
171
+ n_dims = len(data.shape)
172
+ data_dtype = data.dtype
173
+
174
+ # if f32 desired, convert any float16 to float32
175
+ if ftype == 0 and data_dtype == np.float16:
176
+ data = data.astype(np.float32)
177
+
178
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
179
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
180
+ data = data.astype(np.float32)
181
+
182
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
183
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
184
+ data = data.astype(np.float16)
185
+
186
+ print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
187
+
188
+ gguf_writer.add_tensor(new_name, data)
189
+
190
+
191
+ print("gguf: write header")
192
+ gguf_writer.write_header_to_file()
193
+ print("gguf: write metadata")
194
+ gguf_writer.write_kv_data_to_file()
195
+ if not args.vocab_only:
196
+ print("gguf: write tensors")
197
+ gguf_writer.write_tensors_to_file()
198
+
199
+ gguf_writer.close()
200
+
201
+ print(f"gguf: model successfully exported to '{fname_out}'")
202
+ print("")
llama.cpp/examples/finetune/convert-finetune-checkpoint-to-gguf.py ADDED
@@ -0,0 +1,489 @@
1
+ #!/usr/bin/env python3
2
+ # finetune checkpoint --> gguf conversion
3
+
4
+ import argparse
5
+ import gguf
6
+ import os
7
+ import struct
8
+ import sys
9
+ import numpy as np
10
+ from pathlib import Path
11
+
12
+ # gguf constants
13
+ LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
14
+ LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
15
+ LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
16
+ LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
17
+ LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
18
+ LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
19
+ LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
20
+ LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
21
+ LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
22
+ LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
23
+ LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
24
+ LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
25
+ LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
26
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
27
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
28
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
29
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
30
+ LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
31
+
32
+ LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
33
+ LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
34
+ LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
35
+
36
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
37
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
38
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
39
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
40
+ LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
41
+ LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
42
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
43
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
44
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
45
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
46
+
47
+ LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
48
+ LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
49
+ LLM_KV_TRAINING_TYPE = "training.type"
50
+ LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
51
+ LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
52
+ LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
53
+ LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
54
+
55
+ LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
56
+ LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
57
+ LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
58
+ LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
59
+ LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
60
+ LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
61
+ LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
62
+ LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
63
+ LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
64
+ LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
65
+ LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
66
+ LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
67
+
68
+ class Tensor:
69
+ def __init__(self, dtype='f', ne=None):
70
+ if ne is None:
71
+ ne = []
72
+ self.dtype = dtype
73
+ self.ne = ne
74
+ self.nbytes = 0
75
+ if self.dtype == 'f':
76
+ if len(self.ne) == 0:
77
+ self.nbytes = 0
78
+ else:
79
+ self.nbytes = int(np.product(self.ne)) * 4
80
+ else:
81
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
82
+
83
+ def load(self, data, offset):
84
+ nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
85
+ namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
86
+ dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
87
+
88
+ assert(nd == len(self.ne))
89
+ ne = []
90
+ for d in range(nd):
91
+ n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
92
+ ne.append(n)
93
+
94
+ if tuple(ne) != tuple(self.ne):
95
+ raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
96
+
97
+ if self.dtype == 'f':
98
+ assert(dtype == 0)
99
+ else:
100
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
101
+
102
+ self.name = bytes(data[offset:offset+namelen]); offset += namelen
103
+ # 32-byte alignment
104
+ offset += (0 - offset) & 31
105
+ self.data = data[offset:offset+self.nbytes]
106
+ offset += self.nbytes
107
+ return offset
108
+
109
+ def max_storage_size(self):
110
+ result = 0
111
+ result += 4 # nd
112
+ result += 4 # namelen
113
+ result += 4 # dtype
114
+ result += len(self.ne)*8 # ne
115
+ result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
116
+ result += 31 # 32-byte alignment
117
+ result += self.nbytes
118
+ return result
119
+
120
+ def save_gguf(self, gguf_writer, name):
121
+ gguf_writer.add_tensor(
122
+ name=name,
123
+ tensor=self.data,
124
+ raw_shape=np.array(list(reversed(self.ne))),
125
+ raw_dtype=gguf.GGMLQuantizationType.F32)
126
+
127
+ class OptimizationContext:
128
+ def __init__(self):
129
+ pass
130
+
131
+ def load(self, data, offset):
132
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
133
+ offset += 4
134
+
135
+ if self.version != 1:
136
+ raise ValueError('Invalid version of optimization context in checkpoint file')
137
+
138
+ self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
139
+ self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
140
+ self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
141
+ self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
142
+ self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
143
+
144
+ self.adam_m = Tensor('f', [self.nx])
145
+ self.adam_v = Tensor('f', [self.nx])
146
+ self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
147
+
148
+ self.lbfgs_x = Tensor('f', [self.nx])
149
+ self.lbfgs_xp = Tensor('f', [self.nx])
150
+ self.lbfgs_g = Tensor('f', [self.nx])
151
+ self.lbfgs_gp = Tensor('f', [self.nx])
152
+ self.lbfgs_d = Tensor('f', [self.nx])
153
+ self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
154
+ self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
155
+ self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
156
+ self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
157
+ self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
158
+
159
+ # forgot to save type in version 1:
160
+ # guess self.type from number of remaining bytes
161
+ size_type_0 = 12 + sum([t.max_storage_size() for t in
162
+ [self.adam_m, self.adam_v]
163
+ +([self.adam_pf] if (self.past > 0) else [])])
164
+ size_type_1 = 24 + sum([t.max_storage_size() for t in
165
+ [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
166
+ self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
167
+ self.lbfgs_lmal, self.lbfgs_lmys,
168
+ self.lbfgs_lms, self.lbfgs_lmy]
169
+ +([self.lbfgs_pf] if (self.past > 0) else [])])
170
+ # due to alignment padding the size might not be exact
171
+ # but the difference in size for both types is significant,
172
+ # so we can just use whichever is closest
173
+ remaining = len(data) - offset
174
+ if abs(remaining - size_type_0) < abs(remaining - size_type_1):
175
+ self.type = 0
176
+ else:
177
+ self.type = 1
178
+
179
+ if self.type == 0:
180
+ offset = self.adam_m.load(data, offset)
181
+ offset = self.adam_v.load(data, offset)
182
+ offset = self.adam_pf.load(data,offset)
183
+
184
+ self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
185
+ self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
186
+ self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
187
+
188
+ elif self.type == 1:
189
+ offset = self.lbfgs_x.load(data, offset)
190
+ offset = self.lbfgs_xp.load(data, offset)
191
+ offset = self.lbfgs_g.load(data, offset)
192
+ offset = self.lbfgs_gp.load(data, offset)
193
+ offset = self.lbfgs_d.load(data, offset)
194
+ offset = self.lbfgs_pf.load(data, offset)
195
+ offset = self.lbfgs_lmal.load(data, offset)
196
+ offset = self.lbfgs_lmys.load(data, offset)
197
+ offset = self.lbfgs_lms.load(data, offset)
198
+ offset = self.lbfgs_lmy.load(data, offset)
199
+
200
+ self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
201
+ self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
202
+ self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
203
+ self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
204
+ self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
205
+ self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
206
+
207
+ else:
208
+ raise ValueError(f"Invalid optimizer type '{self.type}'")
209
+
210
+ return offset
211
+
212
+ def save_gguf(self, gguf_writer):
213
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
214
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
215
+ gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
216
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
217
+ gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
218
+
219
+ if self.type == 0:
220
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
221
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
222
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
223
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
224
+
225
+ self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
226
+ self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
227
+ if self.past > 0:
228
+ self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
229
+
230
+ elif self.type == 1:
231
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
232
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
233
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
234
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
235
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
236
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
237
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
238
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
239
+
240
+ self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
241
+ self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
242
+ self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
243
+ self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
244
+ self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
245
+ if self.past > 0:
246
+ self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
247
+ self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
248
+ self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
249
+ self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
250
+ self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
251
+ else:
252
+ raise ValueError('Unknown optimizer type')
253
+
254
+ class LoraParams:
255
+ def __init__(self):
256
+ pass
257
+
258
+ def load(self, data, offset):
259
+ self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
260
+ self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
261
+ self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
262
+ self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
263
+ self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
264
+ self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
265
+ self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
266
+ self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
267
+ self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
268
+ self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
269
+ self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
270
+ self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
271
+ return offset
272
+
273
+ def save_gguf(self, gguf_writer):
274
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
275
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
276
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
277
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
278
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
279
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
280
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
281
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
282
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
283
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
284
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
285
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
286
+
287
+ class ModelParams:
288
+ def __init__(self, n_ff = None):
289
+ self.n_ff = n_ff
290
+
291
+ def load(self, data, offset):
292
+ self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
293
+ self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
294
+ self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
295
+ self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
296
+ self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
297
+ self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
298
+ return offset
299
+
300
+ def get_n_ff(self):
301
+ if self.n_ff is None:
302
+ # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
303
+ return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
304
+ else:
305
+ return self.n_ff
306
+
307
+ def save_gguf(self, gguf_writer):
308
+ # self.n_vocab not saved
309
+ gguf_writer.add_embedding_length(self.n_embd)
310
+ gguf_writer.add_head_count(self.n_head)
311
+ gguf_writer.add_block_count(self.n_layer)
312
+ gguf_writer.add_rope_dimension_count(self.n_rot)
313
+ gguf_writer.add_feed_forward_length(self.get_n_ff())
314
+
315
+ def tensor_name(key, bid=None, suffix=".weight"):
316
+ return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
317
+
318
+ class Layer:
319
+ def __init__(self, params, lora_params, bid):
320
+ self.bid = bid
321
+ self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
322
+ self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
323
+ self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
324
+ self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
325
+ self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
326
+ self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
327
+ self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
328
+ self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
329
+ self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
330
+ self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
331
+ self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
332
+ self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
333
+ self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
334
+ self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
335
+ self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
336
+ self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
337
+ self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
338
+ self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
339
+
340
+ def load(self, data, offset):
341
+ offset = self.att_norm_a.load(data, offset)
342
+ offset = self.att_norm_b.load(data, offset)
343
+ offset = self.wq_a.load(data, offset)
344
+ offset = self.wq_b.load(data, offset)
345
+ offset = self.wk_a.load(data, offset)
346
+ offset = self.wk_b.load(data, offset)
347
+ offset = self.wv_a.load(data, offset)
348
+ offset = self.wv_b.load(data, offset)
349
+ offset = self.wo_a.load(data, offset)
350
+ offset = self.wo_b.load(data, offset)
351
+ offset = self.ffn_norm_a.load(data, offset)
352
+ offset = self.ffn_norm_b.load(data, offset)
353
+ offset = self.w1_a.load(data, offset)
354
+ offset = self.w1_b.load(data, offset)
355
+ offset = self.w2_a.load(data, offset)
356
+ offset = self.w2_b.load(data, offset)
357
+ offset = self.w3_a.load(data, offset)
358
+ offset = self.w3_b.load(data, offset)
359
+ return offset
360
+
361
+ def save_gguf(self, gguf_writer):
362
+ self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
363
+ self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
364
+ self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
365
+ self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
366
+ self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
367
+ self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
368
+ self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
369
+ self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
370
+ self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
371
+ self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
372
+ self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
373
+ self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
374
+ self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
375
+ self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
376
+ self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
377
+ self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
378
+ self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
379
+ self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
380
+
381
+ class LoraModel:
382
+ def __init__(self, n_ff = None):
383
+ self.params = ModelParams(n_ff = n_ff)
384
+ self.lora_params = LoraParams()
385
+ self.layers = []
386
+
387
+ def load(self, data, offset):
388
+ offset = self.params.load(data, offset)
389
+ offset = self.lora_params.load(data, offset)
390
+
391
+ self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
392
+ self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
393
+ self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
394
+ self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
395
+ self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
396
+ self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
397
+
398
+ offset = self.tok_embd_a.load(data, offset)
399
+ offset = self.tok_embd_b.load(data, offset)
400
+ offset = self.norm_a.load(data, offset)
401
+ offset = self.norm_b.load(data, offset)
402
+ offset = self.output_a.load(data, offset)
403
+ offset = self.output_b.load(data, offset)
404
+
405
+ self.layers.clear()
406
+ for bid in range(self.params.n_layer):
407
+ layer = Layer(self.params, self.lora_params, bid)
408
+ offset = layer.load(data, offset)
409
+ self.layers.append(layer)
410
+
411
+ return offset
412
+
413
+ def save_gguf(self, gguf_writer):
414
+ self.params.save_gguf(gguf_writer)
415
+ self.lora_params.save_gguf(gguf_writer)
416
+
417
+ self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
418
+ self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
419
+ self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
420
+ self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
421
+ self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
422
+ self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
423
+
424
+ for layer in self.layers:
425
+ layer.save_gguf(gguf_writer)
426
+
427
+ class LoraCheckpoint:
428
+ def __init__(self, n_ff = None):
429
+ self.model = LoraModel(n_ff = n_ff)
430
+ self.opt_ctx = OptimizationContext()
431
+
432
+ def load(self, data, offset):
433
+ magic = bytes(reversed(data[offset:offset + 4])); offset += 4
434
+ if magic != b'ggcl':
435
+ raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
436
+
437
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
438
+ if self.version != 0:
439
+ raise ValueError('Invalid version of checkpoint file')
440
+
441
+ self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
442
+ self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
443
+ self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
444
+
445
+ offset = self.model.load(data, offset)
446
+ offset = self.opt_ctx.load(data, offset)
447
+
448
+ return offset
449
+
450
+ def save_gguf(self, gguf_writer):
451
+ gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
452
+ gguf_writer.add_layer_norm_rms_eps(1e-5)
453
+ gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
454
+ gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
455
+ gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
456
+ gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
457
+ gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
458
+ self.model.save_gguf(gguf_writer)
459
+ self.opt_ctx.save_gguf(gguf_writer)
460
+
461
+ def handle_args():
462
+ parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
463
+ parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
464
+ parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
465
+ parser.add_argument('--ff', type = int, help = "Feedforward size; if not provided, it is computed from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
466
+ return parser.parse_args()
467
+
468
+ def main():
469
+ cfg = handle_args()
470
+ print(cfg)
471
+ data = np.memmap(cfg.input, mode = 'r')
472
+ chk = LoraCheckpoint(n_ff = cfg.ff)
473
+ offset = 0
474
+ offset = chk.load(data, offset)
475
+ # we should have read all available data
476
+ assert(offset == len(data))
477
+
478
+ gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
479
+ chk.save_gguf(gguf_writer)
480
+ print(" gguf: write header")
481
+ gguf_writer.write_header_to_file()
482
+ print(" gguf: write metadata")
483
+ gguf_writer.write_kv_data_to_file()
484
+ print(" gguf: write tensors")
485
+ gguf_writer.write_tensors_to_file()
486
+ gguf_writer.close()
487
+
488
+ if __name__ == '__main__':
489
+ main()
llama.cpp/examples/gguf/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET gguf)
2
+ add_executable(${TARGET} gguf.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
llama.cpp/examples/gguf/gguf.cpp ADDED
@@ -0,0 +1,249 @@
1
+ #include "ggml.h"
2
+ #include "llama.h"
3
+
4
+ #include <cstdio>
5
+ #include <cinttypes>
6
+ #include <string>
7
+ #include <sstream>
8
+ #include <fstream>
9
+ #include <vector>
10
+
11
+ #undef MIN
12
+ #undef MAX
13
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
14
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
15
+
16
+ template <typename T>
17
+ static std::string to_string(const T & val) {
18
+ std::stringstream ss;
19
+ ss << val;
20
+ return ss.str();
21
+ }
22
+
23
+ static bool gguf_ex_write(const std::string & fname) {
24
+ struct gguf_context * ctx = gguf_init_empty();
25
+
26
+ gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
27
+ gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
28
+ gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
29
+ gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
30
+ gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
31
+ gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
32
+ gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
33
+ gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
34
+ gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
35
+ gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
36
+ gguf_set_val_bool(ctx, "some.parameter.bool", true);
37
+ gguf_set_val_str (ctx, "some.parameter.string", "hello world");
38
+
39
+ gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
40
+ gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
41
+ gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
42
+
43
+ struct ggml_init_params params = {
44
+ /*.mem_size =*/ 128ull*1024ull*1024ull,
45
+ /*.mem_buffer =*/ NULL,
46
+ /*.no_alloc =*/ false,
47
+ };
48
+
49
+ struct ggml_context * ctx_data = ggml_init(params);
50
+
51
+ const int n_tensors = 10;
52
+
53
+ // tensor infos
54
+ for (int i = 0; i < n_tensors; ++i) {
55
+ const std::string name = "tensor_" + to_string(i);
56
+
57
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
58
+ int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
59
+
60
+ for (int j = 0; j < n_dims; ++j) {
61
+ ne[j] = rand() % 10 + 1;
62
+ }
63
+
64
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
65
+ ggml_set_name(cur, name.c_str());
66
+
67
+ {
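+ // fill tensor i with the constant value 100 + i so that gguf_ex_read_1 can verify the data after reading it back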
68
+ float * data = (float *) cur->data;
69
+ for (int j = 0; j < ggml_nelements(cur); ++j) {
70
+ data[j] = 100 + i;
71
+ }
72
+ }
73
+
74
+ gguf_add_tensor(ctx, cur);
75
+ }
76
+
77
+ gguf_write_to_file(ctx, fname.c_str(), false);
78
+
79
+ printf("%s: wrote file '%s;\n", __func__, fname.c_str());
80
+
81
+ ggml_free(ctx_data);
82
+ gguf_free(ctx);
83
+
84
+ return true;
85
+ }
86
+
87
+ // just read tensor info
88
+ static bool gguf_ex_read_0(const std::string & fname) {
89
+ struct gguf_init_params params = {
90
+ /*.no_alloc = */ false,
91
+ /*.ctx = */ NULL,
92
+ };
93
+
94
+ struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
95
+
96
+ printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
97
+ printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
98
+ printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
99
+
100
+ // kv
101
+ {
102
+ const int n_kv = gguf_get_n_kv(ctx);
103
+
104
+ printf("%s: n_kv: %d\n", __func__, n_kv);
105
+
106
+ for (int i = 0; i < n_kv; ++i) {
107
+ const char * key = gguf_get_key(ctx, i);
108
+
109
+ printf("%s: kv[%d]: key = %s\n", __func__, i, key);
110
+ }
111
+ }
112
+
113
+ // find kv string
114
+ {
115
+ const char * findkey = "some.parameter.string";
116
+
117
+ const int keyidx = gguf_find_key(ctx, findkey);
118
+ if (keyidx == -1) {
119
+ printf("%s: find key: %s not found.\n", __func__, findkey);
120
+ } else {
121
+ const char * key_value = gguf_get_val_str(ctx, keyidx);
122
+ printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
123
+ }
124
+ }
125
+
126
+ // tensor info
127
+ {
128
+ const int n_tensors = gguf_get_n_tensors(ctx);
129
+
130
+ printf("%s: n_tensors: %d\n", __func__, n_tensors);
131
+
132
+ for (int i = 0; i < n_tensors; ++i) {
133
+ const char * name = gguf_get_tensor_name (ctx, i);
134
+ const size_t offset = gguf_get_tensor_offset(ctx, i);
135
+
136
+ printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
137
+ }
138
+ }
139
+
140
+ gguf_free(ctx);
141
+
142
+ return true;
143
+ }
144
+
145
+ // read and create ggml_context containing the tensors and their data
146
+ static bool gguf_ex_read_1(const std::string & fname) {
147
+ struct ggml_context * ctx_data = NULL;
148
+
149
+ struct gguf_init_params params = {
150
+ /*.no_alloc = */ false,
151
+ /*.ctx = */ &ctx_data,
152
+ };
153
+
154
+ struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
155
+
156
+ printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
157
+ printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
158
+ printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
159
+
160
+ // kv
161
+ {
162
+ const int n_kv = gguf_get_n_kv(ctx);
163
+
164
+ printf("%s: n_kv: %d\n", __func__, n_kv);
165
+
166
+ for (int i = 0; i < n_kv; ++i) {
167
+ const char * key = gguf_get_key(ctx, i);
168
+
169
+ printf("%s: kv[%d]: key = %s\n", __func__, i, key);
170
+ }
171
+ }
172
+
173
+ // tensor info
174
+ {
175
+ const int n_tensors = gguf_get_n_tensors(ctx);
176
+
177
+ printf("%s: n_tensors: %d\n", __func__, n_tensors);
178
+
179
+ for (int i = 0; i < n_tensors; ++i) {
180
+ const char * name = gguf_get_tensor_name (ctx, i);
181
+ const size_t offset = gguf_get_tensor_offset(ctx, i);
182
+
183
+ printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
184
+ }
185
+ }
186
+
187
+ // data
188
+ {
189
+ const int n_tensors = gguf_get_n_tensors(ctx);
190
+
191
+ for (int i = 0; i < n_tensors; ++i) {
192
+ printf("%s: reading tensor %d data\n", __func__, i);
193
+
194
+ const char * name = gguf_get_tensor_name(ctx, i);
195
+
196
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
197
+
198
+ printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
199
+
200
+ // print first 10 elements
201
+ const float * data = (const float *) cur->data;
202
+
203
+ printf("%s data[:10] : ", name);
204
+ for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
205
+ printf("%f ", data[j]);
206
+ }
207
+ printf("\n\n");
208
+
209
+ // check data
210
+ {
211
+ const float * data = (const float *) cur->data;
212
+ for (int j = 0; j < ggml_nelements(cur); ++j) {
213
+ if (data[j] != 100 + i) {
214
+ fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
215
+ return false;
216
+ }
217
+ }
218
+ }
219
+ }
220
+ }
221
+
222
+ printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
223
+
224
+ ggml_free(ctx_data);
225
+ gguf_free(ctx);
226
+
227
+ return true;
228
+ }
229
+
230
+ int main(int argc, char ** argv) {
231
+ if (argc < 3) {
232
+ printf("usage: %s data.gguf r|w\n", argv[0]);
233
+ return -1;
234
+ }
235
+
236
+ const std::string fname(argv[1]);
237
+ const std::string mode (argv[2]);
238
+
239
+ GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
240
+
241
+ if (mode == "w") {
242
+ GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
243
+ } else if (mode == "r") {
244
+ GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
245
+ GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
246
+ }
247
+
248
+ return 0;
249
+ }
llama.cpp/examples/llava/convert-image-encoder-to-gguf.py ADDED
@@ -0,0 +1,250 @@
1
+ import argparse
2
+ import os
3
+ import json
4
+
5
+ import torch
6
+ import numpy as np
7
+ from gguf import *
8
+ from transformers import CLIPModel, CLIPProcessor
9
+
10
+ TEXT = "clip.text"
11
+ VISION = "clip.vision"
12
+
13
+
14
+ def k(raw_key: str, arch: str) -> str:
15
+ return raw_key.format(arch=arch)
16
+
17
+
18
+ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
19
+ if name in (
20
+ "logit_scale",
21
+ "text_model.embeddings.position_ids",
22
+ "vision_model.embeddings.position_ids",
23
+ ):
24
+ return True
25
+
26
+ if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
27
+ return True
28
+
29
+ if name.startswith("v") and not has_vision:
30
+ return True
31
+
32
+ if name.startswith("t") and not has_text:
33
+ return True
34
+
35
+ return False
36
+
37
+
38
+ def get_tensor_name(name: str) -> str:
39
+ if "projection" in name:
40
+ return name
41
+
42
+ if "mm_projector" in name:
43
+ return name.replace("model.mm_projector", "mm")
44
+
45
+ return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
46
+
47
+
48
+ def bytes_to_unicode():
49
+ """
50
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
51
+ The reversible bpe codes work on unicode strings.
52
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
53
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
54
+ This is a signficant percentage of your normal, say, 32K bpe vocab.
55
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
56
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
57
+ """
58
+ bs = (
59
+ list(range(ord("!"), ord("~") + 1))
60
+ + list(range(ord("¡"), ord("¬") + 1))
61
+ + list(range(ord("®"), ord("ÿ") + 1))
62
+ )
63
+ cs = bs[:]
64
+ n = 0
65
+ for b in range(2**8):
66
+ if b not in bs:
67
+ bs.append(b)
68
+ cs.append(2**8 + n)
69
+ n += 1
70
+ cs = [chr(n) for n in cs]
71
+ return dict(zip(bs, cs))
72
+
73
+
74
+ ap = argparse.ArgumentParser(prog="convert-image-encoder-to-gguf.py")
75
+ ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
76
+ ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
77
+ ap.add_argument("--text-only", action="store_true", required=False,
78
+ help="Save a text-only model. It can't be used to encode images")
79
+ ap.add_argument("--vision-only", action="store_true", required=False,
80
+ help="Save a vision-only model. It can't be used to encode texts")
81
+ ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
82
+ ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
83
+ ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
84
+ ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
85
+
86
+ args = ap.parse_args()
87
+
88
+
89
+ if args.text_only and args.vision_only:
90
+ print("--text-only and --image-only arguments cannot be specified at the same time.")
91
+ exit(1)
92
+
93
+ if args.use_f32:
94
+ print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
95
+
96
+ # output in the same directory as the model if output_dir is None
97
+ dir_model = args.model_dir
98
+
99
+
100
+ with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
101
+ vocab = json.load(f)
102
+ tokens = [key for key in vocab]
103
+
104
+ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
105
+ config = json.load(f)
106
+ v_hparams = config["vision_config"]
107
+ t_hparams = config["text_config"]
108
+
109
+ # possible data types
110
+ # ftype == 0 -> float32
111
+ # ftype == 1 -> float16
112
+ #
113
+ # map from ftype to string
114
+ ftype_str = ["f32", "f16"]
115
+
116
+ ftype = 1
117
+ if args.use_f32:
118
+ ftype = 0
119
+
120
+
121
+ model = CLIPModel.from_pretrained(dir_model)
122
+ processor = CLIPProcessor.from_pretrained(dir_model)
123
+
124
+ fname_middle = None
125
+ has_text_encoder = True
126
+ has_vision_encoder = True
127
+ has_llava_projector = False
128
+ if args.text_only:
129
+ fname_middle = "text-"
130
+ has_vision_encoder = False
131
+ elif args.vision_only:
132
+ fname_middle = "vision-"
133
+ has_text_encoder = False
134
+ elif args.llava_projector is not None:
135
+ fname_middle = "mmproj-"
136
+ has_text_encoder = False
137
+ has_llava_projector = True
138
+ else:
139
+ fname_middle = ""
140
+
141
+ output_dir = args.output_dir if args.output_dir is not None else dir_model
142
+ os.makedirs(output_dir, exist_ok=True)
143
+ output_prefix = os.path.basename(output_dir).replace("ggml_", "")
144
+ fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
145
+ fout = GGUFWriter(path=fname_out, arch="clip")
146
+
147
+ fout.add_bool("clip.has_text_encoder", has_text_encoder)
148
+ fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
149
+ fout.add_bool("clip.has_llava_projector", has_llava_projector)
150
+ fout.add_file_type(ftype)
151
+ model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
152
+ fout.add_name(model_name)
153
+ if args.text_only:
154
+ fout.add_description("text-only CLIP model")
155
+ elif args.vision_only and not has_llava_projector:
156
+ fout.add_description("vision-only CLIP model")
157
+ elif has_llava_projector:
158
+ fout.add_description("image encoder for LLaVA")
159
+ else:
160
+ fout.add_description("two-tower CLIP model")
161
+
162
+ if has_text_encoder:
163
+ # text_model hparams
164
+ fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
165
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
166
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
167
+ fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
168
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
169
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
170
+ fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
171
+ fout.add_token_list(tokens)
172
+
173
+ if has_vision_encoder:
174
+ # vision_model hparams
175
+ fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
176
+ fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
177
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
178
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
179
+ fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
180
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
181
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
182
+ block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
183
+ fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
184
+
185
+ image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
186
+ image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
187
+ fout.add_array("clip.vision.image_mean", image_mean)
188
+ fout.add_array("clip.vision.image_std", image_std)
189
+
190
+ use_gelu = v_hparams["hidden_act"] == "gelu"
191
+ fout.add_bool("clip.use_gelu", use_gelu)
192
+
193
+
194
+ if has_llava_projector:
195
+ model.vision_model.encoder.layers.pop(-1)
196
+ projector = torch.load(args.llava_projector)
197
+ for name, data in projector.items():
198
+ name = get_tensor_name(name)
199
+ if data.ndim == 2:
200
+ data = data.squeeze().numpy().astype(np.float16)
201
+ else:
202
+ data = data.squeeze().numpy().astype(np.float32)
203
+
204
+ fout.add_tensor(name, data)
205
+
206
+ print("Projector tensors added\n")
207
+
208
+ state_dict = model.state_dict()
209
+ for name, data in state_dict.items():
210
+ if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
211
+ # we don't need this
212
+ print(f"skipping parameter: {name}")
213
+ continue
214
+
215
+ name = get_tensor_name(name)
216
+ data = data.squeeze().numpy()
217
+
218
+ n_dims = len(data.shape)
219
+
220
+ # ftype == 0 -> float32, ftype == 1 -> float16
221
+ ftype_cur = 0
222
+ if n_dims == 4:
223
+ print(f"tensor {name} is always saved in f16")
224
+ data = data.astype(np.float16)
225
+ ftype_cur = 1
226
+ elif ftype == 1:
227
+ if name[-7:] == ".weight" and n_dims == 2:
228
+ print(" Converting to float16")
229
+ data = data.astype(np.float16)
230
+ ftype_cur = 1
231
+ else:
232
+ print(" Converting to float32")
233
+ data = data.astype(np.float32)
234
+ ftype_cur = 0
235
+ else:
236
+ if data.dtype != np.float32:
237
+ print(" Converting to float32")
238
+ data = data.astype(np.float32)
239
+ ftype_cur = 0
240
+
241
+ print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
242
+ fout.add_tensor(name, data)
243
+
244
+
245
+ fout.write_header_to_file()
246
+ fout.write_kv_data_to_file()
247
+ fout.write_tensors_to_file()
248
+ fout.close()
249
+
250
+ print("Done. Output file: " + fname_out)
llama.cpp/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py ADDED
@@ -0,0 +1,499 @@
1
+ #!/usr/bin/env python3
2
+ # train-text-from-scratch checkpoint --> gguf conversion
3
+
4
+ import argparse
5
+ import os
6
+ import struct
7
+ import sys
8
+ import numpy as np
9
+ from pathlib import Path
10
+
11
+ if 'NO_LOCAL_GGUF' not in os.environ:
12
+ sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
13
+ import gguf
14
+
15
+ # gguf constants
16
+ LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
17
+ LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
18
+ LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
19
+ LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
20
+ LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
21
+ LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
22
+ LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
23
+ LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
24
+ LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
25
+ LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
26
+ LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
27
+ LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
28
+ LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
29
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
30
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
31
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
32
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
33
+ LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
34
+
35
+ LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
36
+ LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
37
+ LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
38
+
39
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
40
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
41
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
42
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
43
+ LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
44
+ LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
45
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
46
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
47
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
48
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
49
+
50
+ LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
51
+ LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
52
+ LLM_KV_TRAINING_TYPE = "training.type"
53
+ LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
54
+ LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
55
+ LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
56
+ LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
57
+
58
+ class Tensor:
59
+ def __init__(self, dtype='f', ne=None):
60
+ if ne is None:
61
+ ne = []
62
+ self.dtype = dtype
63
+ self.ne = ne
64
+ self.nbytes = 0
65
+ if self.dtype == 'f':
66
+ if len(self.ne) == 0:
67
+ self.nbytes = 0
68
+ else:
69
+ self.nbytes = int(np.product(self.ne)) * 4
70
+ else:
71
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
72
+
73
+ def load(self, data, offset):
74
+ nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
75
+ namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
76
+ dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
77
+
78
+ assert(nd == len(self.ne))
79
+ ne = []
80
+ for d in range(nd):
81
+ n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
82
+ ne.append(n)
83
+
84
+ assert(tuple(ne) == tuple(self.ne))
85
+
86
+ if self.dtype == 'f':
87
+ assert(dtype == 0)
88
+ else:
89
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
90
+
91
+ self.name = bytes(data[offset:offset+namelen]); offset += namelen
92
+ # 32-byte alignment
93
+ offset += (0 - offset) & 31
94
+ self.data = data[offset:offset+self.nbytes]
95
+ offset += self.nbytes
96
+ return offset
97
+
98
+ def max_storage_size(self):
99
+ result = 0
100
+ result += 4 # nd
101
+ result += 4 # namelen
102
+ result += 4 # dtype
103
+ result += len(self.ne)*8 # ne
104
+ result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
105
+ result += 31 # 32-byte alignment
106
+ result += self.nbytes
107
+ return result
108
+
109
+ def save_gguf(self, gguf_writer, name):
110
+ gguf_writer.add_tensor(
111
+ name=name,
112
+ tensor=self.data,
113
+ raw_shape=np.array(list(reversed(self.ne))),
114
+ raw_dtype=gguf.GGMLQuantizationType.F32)
115
+
116
+ class OptimizationParamsV0:
117
+ def __init__(self):
118
+ pass
119
+
120
+ def load(self, data, offset):
121
+ self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
122
+ self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
123
+ self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
124
+ self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
125
+ self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
126
+ self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
127
+ self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
128
+ self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
129
+ self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
130
+ self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
131
+ self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
132
+ self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
133
+ self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
134
+ self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
135
+ self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
136
+ self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
137
+ self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
138
+ self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
139
+ self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
140
+ self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
141
+ self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
142
+ self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
143
+ self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
144
+ self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
145
+ return offset
146
+
147
+ class OptimizationContext:
148
+ def __init__(self):
149
+ pass
150
+
151
+ def load(self, data, offset):
152
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
153
+ offset += 4
154
+
155
+ if self.version == 0:
156
+ params = OptimizationParamsV0()
157
+ offset = params.load(data, offset)
158
+ self.past = params.past
159
+ self.lbfgs_m = params.lbfgs_m
160
+ self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
161
+ self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
162
+ self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
163
+ self.type = params.type
164
+
165
+ self.adam_m = Tensor('f', [self.nx])
166
+ self.adam_v = Tensor('f', [self.nx])
167
+ self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
168
+
169
+ self.lbfgs_x = Tensor('f', [self.nx])
170
+ self.lbfgs_xp = Tensor('f', [self.nx])
171
+ self.lbfgs_g = Tensor('f', [self.nx])
172
+ self.lbfgs_gp = Tensor('f', [self.nx])
173
+ self.lbfgs_d = Tensor('f', [self.nx])
174
+ self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
175
+ self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
176
+ self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
177
+ self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
178
+ self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
179
+
180
+ if self.type == 0:
181
+ # these tensors are stored, but we don't need their data
182
+ x = Tensor('f', [self.nx])
183
+ g = Tensor('f', [self.nx])
184
+ g2 = Tensor('f', [self.nx])
185
+ mh = Tensor('f', [self.nx])
186
+ vh = Tensor('f', [self.nx])
187
+
188
+ offset = x.load(data, offset)
189
+ offset = g.load(data, offset)
190
+ offset = g2.load(data, offset)
191
+ offset = self.adam_m.load(data, offset)
192
+ offset = self.adam_v.load(data, offset)
193
+ offset = mh.load(data, offset)
194
+ offset = vh.load(data, offset)
195
+ offset = self.adam_pf.load(data, offset)
196
+
197
+ self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
198
+ self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
199
+ self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
200
+
201
+ elif self.type == 1:
202
+ offset = self.lbfgs_x.load(data, offset)
203
+ offset = self.lbfgs_xp.load(data, offset)
204
+ offset = self.lbfgs_g.load(data, offset)
205
+ offset = self.lbfgs_gp.load(data, offset)
206
+ offset = self.lbfgs_d.load(data, offset)
207
+ offset = self.lbfgs_pf.load(data, offset)
208
+ offset = self.lbfgs_lmal.load(data, offset)
209
+ offset = self.lbfgs_lmys.load(data, offset)
210
+ offset = self.lbfgs_lms.load(data, offset)
211
+ offset = self.lbfgs_lmy.load(data, offset)
212
+
213
+ self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
214
+ self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
215
+ self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
216
+ self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
217
+ self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
218
+ self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
219
+
220
+ else:
221
+ raise ValueError('Unknown optimizer type')
222
+
223
+
224
+ elif self.version == 1:
225
+ self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
226
+ self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
227
+ self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
228
+ self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
229
+ self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
230
+
231
+ self.adam_m = Tensor('f', [self.nx])
232
+ self.adam_v = Tensor('f', [self.nx])
233
+ self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
234
+
235
+ self.lbfgs_x = Tensor('f', [self.nx])
236
+ self.lbfgs_xp = Tensor('f', [self.nx])
237
+ self.lbfgs_g = Tensor('f', [self.nx])
238
+ self.lbfgs_gp = Tensor('f', [self.nx])
239
+ self.lbfgs_d = Tensor('f', [self.nx])
240
+ self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
241
+ self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
242
+ self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
243
+ self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
244
+ self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
245
+
246
+ # forgot to save type in version 1:
247
+ # guess self.type from number of remaining bytes
248
+ size_type_0 = 12 + sum([t.max_storage_size() for t in
249
+ [self.adam_m, self.adam_v]
250
+ +([self.adam_pf] if (self.past > 0) else [])])
251
+ size_type_1 = 24 + sum([t.max_storage_size() for t in
252
+ [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
253
+ self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
254
+ self.lbfgs_lmal, self.lbfgs_lmys,
255
+ self.lbfgs_lms, self.lbfgs_lmy]
256
+ +([self.lbfgs_pf] if (self.past > 0) else [])])
257
+ # due to alignment padding the size might not by exact
258
+ # but the difference in size for both types is significant,
259
+ # so we can just use whichever is closest
260
+ remaining = len(data) - offset
261
+ if abs(remaining - size_type_0) < abs(remaining - size_type_1):
262
+ self.type = 0
263
+ else:
264
+ self.type = 1
265
+
266
+ if self.type == 0:
267
+ offset = self.adam_m.load(data, offset)
268
+ offset = self.adam_v.load(data, offset)
269
+ offset = self.adam_pf.load(data,offset)
270
+
271
+ self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
272
+ self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
273
+ self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
274
+
275
+ elif self.type == 1:
276
+ offset = self.lbfgs_x.load(data, offset)
277
+ offset = self.lbfgs_xp.load(data, offset)
278
+ offset = self.lbfgs_g.load(data, offset)
279
+ offset = self.lbfgs_gp.load(data, offset)
280
+ offset = self.lbfgs_d.load(data, offset)
281
+ offset = self.lbfgs_pf.load(data, offset)
282
+ offset = self.lbfgs_lmal.load(data, offset)
283
+ offset = self.lbfgs_lmys.load(data, offset)
284
+ offset = self.lbfgs_lms.load(data, offset)
285
+ offset = self.lbfgs_lmy.load(data, offset)
286
+
287
+ self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
288
+ self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
289
+ self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
290
+ self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
291
+ self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
292
+ self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
293
+
294
+ else:
295
+ raise ValueError('Invalid version of checkpoint file')
296
+
297
+ return offset
298
+
299
+ def save_gguf(self, gguf_writer):
300
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
301
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
302
+ gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
303
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
304
+ gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
305
+
306
+ if self.type == 0:
307
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
308
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
309
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
310
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
311
+
312
+ self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
313
+ self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
314
+ if self.past > 0:
315
+ self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
316
+
317
+ elif self.type == 1:
318
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
319
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
320
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
321
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
322
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
323
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
324
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
325
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
326
+
327
+ self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
328
+ self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
329
+ self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
330
+ self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
331
+ self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
332
+ if self.past > 0:
333
+ self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
334
+ self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
335
+ self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
336
+ self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
337
+ self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
338
+ else:
339
+ raise ValueError('Unknown optimizer type')
340
+
341
+ class ModelParams:
342
+ def __init__(self):
343
+ pass
344
+
345
+ def load(self, data, offset):
346
+ self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
347
+ self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
348
+ self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
349
+ self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
350
+ self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
351
+ self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
352
+ return offset
353
+
354
+ def get_n_ff(self):
355
+ # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
356
+ return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
357
+
358
+ def save_gguf(self, gguf_writer):
359
+ # self.n_vocab not saved
360
+ gguf_writer.add_embedding_length(self.n_embd)
361
+ gguf_writer.add_head_count(self.n_head)
362
+ gguf_writer.add_block_count(self.n_layer)
363
+ gguf_writer.add_rope_dimension_count(self.n_rot)
364
+ gguf_writer.add_feed_forward_length(self.get_n_ff())
365
+
366
+ def tensor_name(key, bid=None):
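+ # e.g. tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid=3) -> "blk.3.attn_q.weight"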
367
+ return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
368
+
369
+ class Layer:
370
+ def __init__(self, params, bid):
371
+ self.bid = bid
372
+ self.att_norm = Tensor('f', [params.n_embd])
373
+ self.wq = Tensor('f', [params.n_embd, params.n_embd])
374
+ self.wk = Tensor('f', [params.n_embd, params.n_embd])
375
+ self.wv = Tensor('f', [params.n_embd, params.n_embd])
376
+ self.wo = Tensor('f', [params.n_embd, params.n_embd])
377
+ self.ffn_norm = Tensor('f', [params.n_embd])
378
+ self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
379
+ self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
380
+ self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
381
+
382
+ def load(self, data, offset):
383
+ offset = self.att_norm.load(data, offset)
384
+ offset = self.wq.load(data, offset)
385
+ offset = self.wk.load(data, offset)
386
+ offset = self.wv.load(data, offset)
387
+ offset = self.wo.load(data, offset)
388
+ offset = self.ffn_norm.load(data, offset)
389
+ offset = self.w1.load(data, offset)
390
+ offset = self.w2.load(data, offset)
391
+ offset = self.w3.load(data, offset)
392
+ return offset
393
+
394
+ def save_gguf(self, gguf_writer):
395
+ self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
396
+ self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
397
+ self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
398
+ self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
399
+ self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
400
+ self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
401
+ self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
402
+ self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
403
+ self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
404
+
405
+ class Model:
406
+ def __init__(self):
407
+ self.params = ModelParams()
408
+ self.layers = []
409
+
410
+ def load(self, data, offset):
411
+ offset = self.params.load(data, offset)
412
+
413
+ self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
414
+ self.norm = Tensor('f', [self.params.n_embd])
415
+ self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
416
+
417
+ offset = self.tok_embd.load(data, offset)
418
+ offset = self.norm.load(data, offset)
419
+ offset = self.output.load(data, offset)
420
+
421
+ self.layers.clear()
422
+ for bid in range(self.params.n_layer):
423
+ layer = Layer(self.params, bid)
424
+ offset = layer.load(data, offset)
425
+ self.layers.append(layer)
426
+
427
+ return offset
428
+
429
+ def save_gguf(self, gguf_writer):
430
+ self.params.save_gguf(gguf_writer)
431
+
432
+ self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
433
+ self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
434
+ self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
435
+
436
+ for layer in self.layers:
437
+ layer.save_gguf(gguf_writer)
438
+
439
+ class Checkpoint:
440
+ def __init__(self):
441
+ self.model = Model()
442
+ self.opt_ctx = OptimizationContext()
443
+
444
+ def load(self, data, offset):
445
+ magic = bytes(reversed(data[offset:offset + 4])); offset += 4
446
+ if magic != b'ggcp':
447
+ raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
448
+
449
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
450
+ if self.version != 0:
451
+ raise ValueError('Invalid version of checkpoint file')
452
+
453
+ self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
454
+ self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
455
+ self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
456
+
457
+ offset = self.model.load(data, offset)
458
+ offset = self.opt_ctx.load(data, offset)
459
+
460
+ return offset
461
+
462
+ def save_gguf(self, gguf_writer):
463
+ gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
464
+ gguf_writer.add_layer_norm_rms_eps(1e-5)
465
+ gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
466
+ gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
467
+ gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
468
+ gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
469
+ gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
470
+ self.model.save_gguf(gguf_writer)
471
+ self.opt_ctx.save_gguf(gguf_writer)
472
+
473
+ def handle_args():
474
+ parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
475
+ parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
476
+ parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
477
+ return parser.parse_args()
478
+
479
+ def main():
480
+ cfg = handle_args()
481
+ data = np.memmap(cfg.input, mode = 'r')
482
+ chk = Checkpoint()
483
+ offset = 0
484
+ offset = chk.load(data, offset)
485
+ # we should have read all available data
486
+ assert(offset == len(data))
487
+
488
+ gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
489
+ chk.save_gguf(gguf_writer)
490
+ print(" gguf: write header")
491
+ gguf_writer.write_header_to_file()
492
+ print(" gguf: write metadata")
493
+ gguf_writer.write_kv_data_to_file()
494
+ print(" gguf: write tensors")
495
+ gguf_writer.write_tensors_to_file()
496
+ gguf_writer.close()
497
+
498
+ if __name__ == '__main__':
499
+ main()
llama.cpp/gguf-py/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Georgi Gerganov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
llama.cpp/gguf-py/README.md ADDED
@@ -0,0 +1,71 @@
1
+ ## gguf
2
+
3
+ This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
4
+ (GGML Universal File) format.
5
+
6
+ See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
7
+ as an example for its usage.
8
+
9
+ ## Installation
10
+ ```sh
11
+ pip install gguf
12
+ ```
13
+
14
+ ## Development
15
+ Maintainers who participate in development of this package are advised to install it in editable mode:
16
+
17
+ ```sh
18
+ cd /path/to/llama.cpp/gguf-py
19
+
20
+ pip install --editable .
21
+ ```
22
+
23
+ **Note**: This may require you to upgrade your Pip installation, indicated by a message saying that editable installation currently requires `setup.py`.
24
+ In this case, upgrade Pip to the latest:
25
+
26
+ ```sh
27
+ pip install --upgrade pip
28
+ ```
29
+
30
+ ## Automatic publishing with CI
31
+
32
+ There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
33
+
34
+ 1. Bump the version in `pyproject.toml`.
35
+ 2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
36
+
37
+ ```sh
38
+ git tag -a gguf-v1.0.0 -m "Version 1.0 release"
39
+ ```
40
+
41
+ 3. Push the tags.
42
+
43
+ ```sh
44
+ git push origin --tags
45
+ ```
46
+
47
+ ## Manual publishing
48
+ If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
49
+
50
+ ```sh
51
+ pip install build twine
52
+ ```
53
+
54
+ Then, follow these steps to release a new version:
55
+
56
+ 1. Bump the version in `pyproject.toml`.
57
+ 2. Build the package:
58
+
59
+ ```sh
60
+ python -m build
61
+ ```
62
+
63
+ 3. Upload the generated distribution archives:
64
+
65
+ ```sh
66
+ python -m twine upload dist/*
67
+ ```
68
+
69
+ ## TODO
70
+ - [ ] Add tests
71
+ - [ ] Include conversion scripts as command line entry points in this package.
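For orientation, here is a minimal sketch (not part of the committed files) of driving the `GGUFWriter` API the same way the conversion scripts in this commit do. The model name, the custom key `example.block_count`, the tensor name and the output filename are illustrative placeholders only.

```python
# Minimal sketch: write a tiny GGUF file with the gguf-py package added above.
# Assumes the package is installed (pip install gguf, or the editable install described in the README).
import numpy as np
from gguf import GGUFWriter

writer = GGUFWriter("example.gguf", arch="llama", use_temp_file=False)
writer.add_name("example-model")             # written as general.name
writer.add_uint32("example.block_count", 1)  # arbitrary custom key-value pair
writer.add_tensor("tensor_0", np.ones((4, 4), dtype=np.float32))

# the same three-phase write sequence used by the conversion scripts
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```

The convert-*.py scripts in this commit follow exactly this pattern, only with the architecture-specific keys and tensor names defined in `gguf.py`.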
llama.cpp/gguf-py/gguf/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .gguf import *
llama.cpp/gguf-py/gguf/__pycache__/gguf.cpython-310.pyc ADDED
Binary file (31.2 kB).
llama.cpp/gguf-py/gguf/gguf.py ADDED
@@ -0,0 +1,1070 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import shutil
7
+ import struct
8
+ import sys
9
+ import tempfile
10
+ from enum import IntEnum, auto
11
+ from io import BufferedWriter
12
+ from pathlib import Path
13
+ from typing import IO, Any, BinaryIO, Callable, Sequence
14
+
15
+ import numpy as np
16
+
17
+ #
18
+ # constants
19
+ #
20
+
21
+ GGUF_MAGIC = 0x46554747
22
+ GGUF_VERSION = 2
23
+ GGUF_DEFAULT_ALIGNMENT = 32
24
+
25
+ # general
26
+ KEY_GENERAL_ARCHITECTURE = "general.architecture"
27
+ KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
28
+ KEY_GENERAL_ALIGNMENT = "general.alignment"
29
+ KEY_GENERAL_NAME = "general.name"
30
+ KEY_GENERAL_AUTHOR = "general.author"
31
+ KEY_GENERAL_URL = "general.url"
32
+ KEY_GENERAL_DESCRIPTION = "general.description"
33
+ KEY_GENERAL_LICENSE = "general.license"
34
+ KEY_GENERAL_SOURCE_URL = "general.source.url"
35
+ KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
36
+ KEY_GENERAL_FILE_TYPE = "general.file_type"
37
+
38
+ # LLM
39
+ KEY_CONTEXT_LENGTH = "{arch}.context_length"
40
+ KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
41
+ KEY_BLOCK_COUNT = "{arch}.block_count"
42
+ KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
43
+ KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
44
+ KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
45
+
46
+ # attention
47
+ KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
48
+ KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
49
+ KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
50
+ KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
51
+ KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
52
+ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
53
+
54
+ # RoPE
55
+ KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
56
+ KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
57
+ KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
58
+
59
+ # tokenization
60
+ KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
61
+ KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
62
+ KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
63
+ KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
64
+ KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
65
+ KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
66
+ KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
67
+ KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
68
+ KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
69
+ KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
70
+ KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
71
+ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
72
+
73
+
74
+ #
75
+ # recommended mapping of model tensor names for storage in gguf
76
+ #
77
+
78
+
79
+ class MODEL_ARCH(IntEnum):
80
+ LLAMA : int = auto()
81
+ FALCON : int = auto()
82
+ BAICHUAN : int = auto()
83
+ GPT2 : int = auto()
84
+ GPTJ : int = auto()
85
+ GPTNEOX : int = auto()
86
+ MPT : int = auto()
87
+ STARCODER : int = auto()
88
+ PERSIMMON : int = auto()
89
+ REFACT : int = auto()
90
+ BERT : int = auto()
91
+ BLOOM : int = auto()
92
+
93
+
94
+ class MODEL_TENSOR(IntEnum):
95
+ TOKEN_EMBD : int = auto()
96
+ TOKEN_EMBD_NORM : int = auto()
97
+ TOKEN_TYPES : int = auto()
98
+ POS_EMBD : int = auto()
99
+ OUTPUT : int = auto()
100
+ OUTPUT_NORM : int = auto()
101
+ ROPE_FREQS : int = auto()
102
+ ATTN_Q : int = auto()
103
+ ATTN_K : int = auto()
104
+ ATTN_V : int = auto()
105
+ ATTN_QKV : int = auto()
106
+ ATTN_OUT : int = auto()
107
+ ATTN_NORM : int = auto()
108
+ ATTN_NORM_2 : int = auto()
109
+ ATTN_ROT_EMBD : int = auto()
110
+ FFN_GATE : int = auto()
111
+ FFN_DOWN : int = auto()
112
+ FFN_UP : int = auto()
113
+ FFN_NORM : int = auto()
114
+ ATTN_Q_NORM : int = auto()
115
+ ATTN_K_NORM : int = auto()
116
+
117
+
118
+ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
119
+ MODEL_ARCH.LLAMA: "llama",
120
+ MODEL_ARCH.FALCON: "falcon",
121
+ MODEL_ARCH.BAICHUAN: "baichuan",
122
+ MODEL_ARCH.GPT2: "gpt2",
123
+ MODEL_ARCH.GPTJ: "gptj",
124
+ MODEL_ARCH.GPTNEOX: "gptneox",
125
+ MODEL_ARCH.MPT: "mpt",
126
+ MODEL_ARCH.STARCODER: "starcoder",
127
+ MODEL_ARCH.PERSIMMON: "persimmon",
128
+ MODEL_ARCH.REFACT: "refact",
129
+ MODEL_ARCH.BERT: "bert",
130
+ MODEL_ARCH.BLOOM: "bloom",
131
+ }
132
+
133
+ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
134
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
135
+ MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
136
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
137
+ MODEL_TENSOR.POS_EMBD: "position_embd",
138
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
139
+ MODEL_TENSOR.OUTPUT: "output",
140
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
141
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
142
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
143
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
144
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
145
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
146
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
147
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
148
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
149
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
150
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
151
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
152
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
153
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
154
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
155
+ }
156
+
157
+ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
158
+ MODEL_ARCH.LLAMA: [
159
+ MODEL_TENSOR.TOKEN_EMBD,
160
+ MODEL_TENSOR.OUTPUT_NORM,
161
+ MODEL_TENSOR.OUTPUT,
162
+ MODEL_TENSOR.ROPE_FREQS,
163
+ MODEL_TENSOR.ATTN_NORM,
164
+ MODEL_TENSOR.ATTN_Q,
165
+ MODEL_TENSOR.ATTN_K,
166
+ MODEL_TENSOR.ATTN_V,
167
+ MODEL_TENSOR.ATTN_OUT,
168
+ MODEL_TENSOR.ATTN_ROT_EMBD,
169
+ MODEL_TENSOR.FFN_NORM,
170
+ MODEL_TENSOR.FFN_GATE,
171
+ MODEL_TENSOR.FFN_DOWN,
172
+ MODEL_TENSOR.FFN_UP,
173
+ ],
174
+ MODEL_ARCH.GPTNEOX: [
175
+ MODEL_TENSOR.TOKEN_EMBD,
176
+ MODEL_TENSOR.OUTPUT_NORM,
177
+ MODEL_TENSOR.OUTPUT,
178
+ MODEL_TENSOR.ATTN_NORM,
179
+ MODEL_TENSOR.ATTN_QKV,
180
+ MODEL_TENSOR.ATTN_OUT,
181
+ MODEL_TENSOR.FFN_NORM,
182
+ MODEL_TENSOR.FFN_DOWN,
183
+ MODEL_TENSOR.FFN_UP,
184
+ ],
185
+ MODEL_ARCH.FALCON: [
186
+ MODEL_TENSOR.TOKEN_EMBD,
187
+ MODEL_TENSOR.OUTPUT_NORM,
188
+ MODEL_TENSOR.OUTPUT,
189
+ MODEL_TENSOR.ATTN_NORM,
190
+ MODEL_TENSOR.ATTN_NORM_2,
191
+ MODEL_TENSOR.ATTN_QKV,
192
+ MODEL_TENSOR.ATTN_OUT,
193
+ MODEL_TENSOR.FFN_DOWN,
194
+ MODEL_TENSOR.FFN_UP,
195
+ ],
196
+ MODEL_ARCH.BAICHUAN: [
197
+ MODEL_TENSOR.TOKEN_EMBD,
198
+ MODEL_TENSOR.OUTPUT_NORM,
199
+ MODEL_TENSOR.OUTPUT,
200
+ MODEL_TENSOR.ROPE_FREQS,
201
+ MODEL_TENSOR.ATTN_NORM,
202
+ MODEL_TENSOR.ATTN_Q,
203
+ MODEL_TENSOR.ATTN_K,
204
+ MODEL_TENSOR.ATTN_V,
205
+ MODEL_TENSOR.ATTN_OUT,
206
+ MODEL_TENSOR.ATTN_ROT_EMBD,
207
+ MODEL_TENSOR.FFN_NORM,
208
+ MODEL_TENSOR.FFN_GATE,
209
+ MODEL_TENSOR.FFN_DOWN,
210
+ MODEL_TENSOR.FFN_UP,
211
+ ],
212
+ MODEL_ARCH.STARCODER: [
213
+ MODEL_TENSOR.TOKEN_EMBD,
214
+ MODEL_TENSOR.POS_EMBD,
215
+ MODEL_TENSOR.OUTPUT_NORM,
216
+ MODEL_TENSOR.OUTPUT,
217
+ MODEL_TENSOR.ATTN_NORM,
218
+ MODEL_TENSOR.ATTN_QKV,
219
+ MODEL_TENSOR.ATTN_OUT,
220
+ MODEL_TENSOR.FFN_NORM,
221
+ MODEL_TENSOR.FFN_DOWN,
222
+ MODEL_TENSOR.FFN_UP,
223
+ ],
224
+ MODEL_ARCH.BERT: [
225
+ MODEL_TENSOR.TOKEN_EMBD,
226
+ MODEL_TENSOR.TOKEN_TYPES,
227
+ MODEL_TENSOR.POS_EMBD,
228
+ MODEL_TENSOR.OUTPUT_NORM,
229
+ MODEL_TENSOR.ATTN_NORM,
230
+ MODEL_TENSOR.ATTN_Q,
231
+ MODEL_TENSOR.ATTN_K,
232
+ MODEL_TENSOR.ATTN_V,
233
+ MODEL_TENSOR.ATTN_OUT,
234
+ MODEL_TENSOR.FFN_NORM,
235
+ MODEL_TENSOR.FFN_DOWN,
236
+ MODEL_TENSOR.FFN_UP,
237
+ ],
238
+ MODEL_ARCH.MPT: [
239
+ MODEL_TENSOR.TOKEN_EMBD,
240
+ MODEL_TENSOR.OUTPUT_NORM,
241
+ MODEL_TENSOR.OUTPUT,
242
+ MODEL_TENSOR.ATTN_NORM,
243
+ MODEL_TENSOR.ATTN_QKV,
244
+ MODEL_TENSOR.ATTN_OUT,
245
+ MODEL_TENSOR.FFN_NORM,
246
+ MODEL_TENSOR.FFN_DOWN,
247
+ MODEL_TENSOR.FFN_UP,
248
+ ],
249
+ MODEL_ARCH.GPTJ: [
250
+ MODEL_TENSOR.TOKEN_EMBD,
251
+ MODEL_TENSOR.OUTPUT_NORM,
252
+ MODEL_TENSOR.OUTPUT,
253
+ MODEL_TENSOR.ATTN_NORM,
254
+ MODEL_TENSOR.ATTN_Q,
255
+ MODEL_TENSOR.ATTN_K,
256
+ MODEL_TENSOR.ATTN_V,
257
+ MODEL_TENSOR.ATTN_OUT,
258
+ MODEL_TENSOR.FFN_DOWN,
259
+ MODEL_TENSOR.FFN_UP,
260
+ ],
261
+ MODEL_ARCH.PERSIMMON: [
262
+ MODEL_TENSOR.TOKEN_EMBD,
263
+ MODEL_TENSOR.OUTPUT,
264
+ MODEL_TENSOR.OUTPUT_NORM,
265
+ MODEL_TENSOR.ATTN_NORM,
266
+ MODEL_TENSOR.ATTN_QKV,
267
+ MODEL_TENSOR.ATTN_OUT,
268
+ MODEL_TENSOR.FFN_NORM,
269
+ MODEL_TENSOR.FFN_DOWN,
270
+ MODEL_TENSOR.FFN_UP,
271
+ MODEL_TENSOR.ATTN_Q_NORM,
272
+ MODEL_TENSOR.ATTN_K_NORM,
273
+ MODEL_TENSOR.ATTN_ROT_EMBD,
274
+ ],
275
+ MODEL_ARCH.REFACT: [
276
+ MODEL_TENSOR.TOKEN_EMBD,
277
+ MODEL_TENSOR.OUTPUT_NORM,
278
+ MODEL_TENSOR.OUTPUT,
279
+ MODEL_TENSOR.ATTN_NORM,
280
+ MODEL_TENSOR.ATTN_Q,
281
+ MODEL_TENSOR.ATTN_K,
282
+ MODEL_TENSOR.ATTN_V,
283
+ MODEL_TENSOR.ATTN_OUT,
284
+ MODEL_TENSOR.FFN_NORM,
285
+ MODEL_TENSOR.FFN_GATE,
286
+ MODEL_TENSOR.FFN_DOWN,
287
+ MODEL_TENSOR.FFN_UP,
288
+ ],
289
+ MODEL_ARCH.BLOOM: [
290
+ MODEL_TENSOR.TOKEN_EMBD,
291
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
292
+ MODEL_TENSOR.OUTPUT_NORM,
293
+ MODEL_TENSOR.OUTPUT,
294
+ MODEL_TENSOR.ATTN_NORM,
295
+ MODEL_TENSOR.ATTN_QKV,
296
+ MODEL_TENSOR.ATTN_OUT,
297
+ MODEL_TENSOR.FFN_NORM,
298
+ MODEL_TENSOR.FFN_DOWN,
299
+ MODEL_TENSOR.FFN_UP,
300
+ ],
301
+ MODEL_ARCH.GPT2: [
302
+ # TODO
303
+ ],
304
+ # TODO
305
+ }
306
+
307
+ # tensors that will not be serialized
308
+ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
309
+ MODEL_ARCH.LLAMA: [
310
+ MODEL_TENSOR.ROPE_FREQS,
311
+ MODEL_TENSOR.ATTN_ROT_EMBD,
312
+ ],
313
+ MODEL_ARCH.BAICHUAN: [
314
+ MODEL_TENSOR.ROPE_FREQS,
315
+ MODEL_TENSOR.ATTN_ROT_EMBD,
316
+ ],
317
+ MODEL_ARCH.PERSIMMON: [
318
+ MODEL_TENSOR.ROPE_FREQS,
319
+ ]
320
+ }
321
+
322
+
323
+ class TensorNameMap:
324
+ mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
325
+ # Token embeddings
326
+ MODEL_TENSOR.TOKEN_EMBD: (
327
+ "gpt_neox.embed_in", # gptneox
328
+ "transformer.wte", # gpt2 gpt-j mpt refact
329
+ "transformer.word_embeddings", # falcon
330
+ "word_embeddings", # bloom
331
+ "model.embed_tokens", # llama-hf
332
+ "tok_embeddings", # llama-pth
333
+ "embeddings.word_embeddings", # bert
334
+ "language_model.embedding.word_embeddings", # persimmon
335
+ ),
336
+
337
+ # Token type embeddings
338
+ MODEL_TENSOR.TOKEN_TYPES: (
339
+ "embeddings.token_type_embeddings", # bert
340
+ ),
341
+
342
+ # Normalization of token embeddings
343
+ MODEL_TENSOR.TOKEN_EMBD_NORM: (
344
+ "word_embeddings_layernorm", # bloom
345
+ ),
346
+
347
+ # Position embeddings
348
+ MODEL_TENSOR.POS_EMBD: (
349
+ "transformer.wpe", # gpt2
350
+ "embeddings.position_embeddings", # bert
351
+ ),
352
+
353
+ # Output
354
+ MODEL_TENSOR.OUTPUT: (
355
+ "embed_out", # gptneox
356
+ "lm_head", # gpt2 mpt falcon llama-hf baichuan
357
+ "output", # llama-pth bloom
358
+ "word_embeddings_for_head", # persimmon
359
+ ),
360
+
361
+ # Output norm
362
+ MODEL_TENSOR.OUTPUT_NORM: (
363
+ "gpt_neox.final_layer_norm", # gptneox
364
+ "transformer.ln_f", # gpt2 gpt-j falcon
365
+ "model.norm", # llama-hf baichuan
366
+ "norm", # llama-pth
367
+ "embeddings.LayerNorm", # bert
368
+ "transformer.norm_f", # mpt
369
+ "ln_f", # refact bloom
370
+ "language_model.encoder.final_layernorm", # persimmon
371
+ ),
372
+
373
+ # Rope frequencies
374
+ MODEL_TENSOR.ROPE_FREQS: (
375
+ "rope.freqs", # llama-pth
376
+ ),
377
+ }
378
+
379
+ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
380
+ # Attention norm
381
+ MODEL_TENSOR.ATTN_NORM: (
382
+ "gpt_neox.layers.{bid}.input_layernorm", # gptneox
383
+ "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
384
+ "transformer.blocks.{bid}.norm_1", # mpt
385
+ "transformer.h.{bid}.input_layernorm", # falcon7b
386
+ "h.{bid}.input_layernorm", # bloom
387
+ "transformer.h.{bid}.ln_mlp", # falcon40b
388
+ "model.layers.{bid}.input_layernorm", # llama-hf
389
+ "layers.{bid}.attention_norm", # llama-pth
390
+ "encoder.layer.{bid}.attention.output.LayerNorm", # bert
391
+ "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
392
+ ),
393
+
394
+ # Attention norm 2
395
+ MODEL_TENSOR.ATTN_NORM_2: (
396
+ "transformer.h.{bid}.ln_attn", # falcon40b
397
+ ),
398
+
399
+ # Attention query-key-value
400
+ MODEL_TENSOR.ATTN_QKV: (
401
+ "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
402
+ "transformer.h.{bid}.attn.c_attn", # gpt2
403
+ "transformer.blocks.{bid}.attn.Wqkv", # mpt
404
+ "transformer.h.{bid}.self_attention.query_key_value", # falcon
405
+ "h.{bid}.self_attention.query_key_value", # bloom
406
+ "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
407
+ ),
408
+
409
+ # Attention query
410
+ MODEL_TENSOR.ATTN_Q: (
411
+ "model.layers.{bid}.self_attn.q_proj", # llama-hf
412
+ "layers.{bid}.attention.wq", # llama-pth
413
+ "encoder.layer.{bid}.attention.self.query", # bert
414
+ "transformer.h.{bid}.attn.q_proj", # gpt-j
415
+ ),
416
+
417
+ # Attention key
418
+ MODEL_TENSOR.ATTN_K: (
419
+ "model.layers.{bid}.self_attn.k_proj", # llama-hf
420
+ "layers.{bid}.attention.wk", # llama-pth
421
+ "encoder.layer.{bid}.attention.self.key", # bert
422
+ "transformer.h.{bid}.attn.k_proj", # gpt-j
423
+ ),
424
+
425
+ # Attention value
426
+ MODEL_TENSOR.ATTN_V: (
427
+ "model.layers.{bid}.self_attn.v_proj", # llama-hf
428
+ "layers.{bid}.attention.wv", # llama-pth
429
+ "encoder.layer.{bid}.attention.self.value", # bert
430
+ "transformer.h.{bid}.attn.v_proj", # gpt-j
431
+ ),
432
+
433
+ # Attention output
434
+ MODEL_TENSOR.ATTN_OUT: (
435
+ "gpt_neox.layers.{bid}.attention.dense", # gptneox
436
+ "transformer.h.{bid}.attn.c_proj", # gpt2 refact
437
+ "transformer.blocks.{bid}.attn.out_proj", # mpt
438
+ "transformer.h.{bid}.self_attention.dense", # falcon
439
+ "h.{bid}.self_attention.dense", # bloom
440
+ "model.layers.{bid}.self_attn.o_proj", # llama-hf
441
+ "layers.{bid}.attention.wo", # llama-pth
442
+ "encoder.layer.{bid}.attention.output.dense", # bert
443
+ "transformer.h.{bid}.attn.out_proj", # gpt-j
444
+ "language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
445
+ ),
446
+
447
+ # Rotary embeddings
448
+ MODEL_TENSOR.ATTN_ROT_EMBD: (
449
+ "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
450
+ "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
451
+ ),
452
+
453
+ # Feed-forward norm
454
+ MODEL_TENSOR.FFN_NORM: (
455
+ "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
456
+ "transformer.h.{bid}.ln_2", # gpt2 refact
457
+ "h.{bid}.post_attention_layernorm", # bloom
458
+ "transformer.blocks.{bid}.norm_2", # mpt
459
+ "model.layers.{bid}.post_attention_layernorm", # llama-hf
460
+ "layers.{bid}.ffn_norm", # llama-pth
461
+ "encoder.layer.{bid}.output.LayerNorm", # bert
462
+ "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
463
+ ),
464
+
465
+ # Feed-forward up
466
+ MODEL_TENSOR.FFN_UP: (
467
+ "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
468
+ "transformer.h.{bid}.mlp.c_fc", # gpt2
469
+ "transformer.blocks.{bid}.ffn.up_proj", # mpt
470
+ "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
471
+ "h.{bid}.mlp.dense_h_to_4h", # bloom
472
+ "model.layers.{bid}.mlp.up_proj", # llama-hf refact
473
+ "layers.{bid}.feed_forward.w3", # llama-pth
474
+ "encoder.layer.{bid}.intermediate.dense", # bert
475
+ "transformer.h.{bid}.mlp.fc_in", # gpt-j
476
+ "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
477
+ ),
478
+
479
+ # Feed-forward gate
480
+ MODEL_TENSOR.FFN_GATE: (
481
+ "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
482
+ "layers.{bid}.feed_forward.w1", # llama-pth
483
+ ),
484
+
485
+ # Feed-forward down
486
+ MODEL_TENSOR.FFN_DOWN: (
487
+ "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
488
+ "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
489
+ "transformer.blocks.{bid}.ffn.down_proj", # mpt
490
+ "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
491
+ "h.{bid}.mlp.dense_4h_to_h", # bloom
492
+ "model.layers.{bid}.mlp.down_proj", # llama-hf
493
+ "layers.{bid}.feed_forward.w2", # llama-pth
494
+ "encoder.layer.{bid}.output.dense", # bert
495
+ "transformer.h.{bid}.mlp.fc_out", # gpt-j
496
+ "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
497
+ ),
498
+
499
+ MODEL_TENSOR.ATTN_Q_NORM: (
500
+ "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
501
+ ),
502
+
503
+ MODEL_TENSOR.ATTN_K_NORM: (
504
+ "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
505
+ ),
506
+
507
+ MODEL_TENSOR.ROPE_FREQS: (
508
+ "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
509
+ )
510
+ }
511
+
512
+ mapping: dict[str, tuple[MODEL_TENSOR, str]]
513
+
514
+ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
515
+ self.mapping = {}
516
+ for tensor, keys in self.mappings_cfg.items():
517
+ if tensor not in MODEL_TENSORS[arch]:
518
+ continue
519
+ tensor_name = TENSOR_NAMES[tensor]
520
+ self.mapping[tensor_name] = (tensor, tensor_name)
521
+ for key in keys:
522
+ self.mapping[key] = (tensor, tensor_name)
523
+ for bid in range(n_blocks):
524
+ for tensor, keys in self.block_mappings_cfg.items():
525
+ if tensor not in MODEL_TENSORS[arch]:
526
+ continue
527
+ tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
528
+ self.mapping[tensor_name] = (tensor, tensor_name)
529
+ for key in keys:
530
+ key = key.format(bid = bid)
531
+ self.mapping[key] = (tensor, tensor_name)
532
+
533
+ def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
534
+ result = self.mapping.get(key)
535
+ if result is not None:
536
+ return result
537
+ for suffix in try_suffixes:
538
+ if key.endswith(suffix):
539
+ result = self.mapping.get(key[:-len(suffix)])
540
+ if result is not None:
541
+ return (result[0], result[1] + suffix)
542
+ return None
543
+
544
+ def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
545
+ result = self.get_type_and_name(key, try_suffixes = try_suffixes)
546
+ if result is None:
547
+ return None
548
+ return result[1]
549
+
550
+ def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
551
+ result = self.get_type_and_name(key, try_suffixes = try_suffixes)
552
+ if result is None:
553
+ return None
554
+ return result[0]
555
+
556
+ def __getitem__(self, key: str) -> str:
557
+ try:
558
+ return self.mapping[key][1]
559
+ except KeyError:
560
+ raise KeyError(key)
561
+
562
+ def __contains__(self, key: str) -> bool:
563
+ return key in self.mapping
564
+
565
+ def __repr__(self) -> str:
566
+ return repr(self.mapping)
567
+
568
+ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
569
+ return TensorNameMap(arch, n_blocks)
570
+
571
+ class TokenType(IntEnum):
572
+ NORMAL = 1
573
+ UNKNOWN = 2
574
+ CONTROL = 3
575
+ USER_DEFINED = 4
576
+ UNUSED = 5
577
+ BYTE = 6
578
+
579
+ #
580
+ # implementation
581
+ #
582
+
583
+
584
+ class GGMLQuantizationType(IntEnum):
585
+ F32 = 0
586
+ F16 = 1
587
+ Q4_0 = 2
588
+ Q4_1 = 3
589
+ Q5_0 = 6
590
+ Q5_1 = 7
591
+ Q8_0 = 8
592
+ Q8_1 = 9
593
+ Q2_K = 10
594
+ Q3_K = 11
595
+ Q4_K = 12
596
+ Q5_K = 13
597
+ Q6_K = 14
598
+ Q8_K = 15
599
+
600
+
601
+ class GGUFValueType(IntEnum):
602
+ UINT8 = 0
603
+ INT8 = 1
604
+ UINT16 = 2
605
+ INT16 = 3
606
+ UINT32 = 4
607
+ INT32 = 5
608
+ FLOAT32 = 6
609
+ BOOL = 7
610
+ STRING = 8
611
+ ARRAY = 9
612
+ UINT64 = 10
613
+ INT64 = 11
614
+ FLOAT64 = 12
615
+
616
+ @staticmethod
617
+ def get_type(val):
618
+ if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
619
+ return GGUFValueType.STRING
620
+ elif isinstance(val, list):
621
+ return GGUFValueType.ARRAY
622
+ elif isinstance(val, float):
623
+ return GGUFValueType.FLOAT32
624
+ elif isinstance(val, bool):
625
+ return GGUFValueType.BOOL
626
+ elif isinstance(val, int):
627
+ return GGUFValueType.INT32
628
+ # TODO: need help with 64-bit types in Python
629
+ else:
630
+ print("Unknown type: "+str(type(val)))
631
+ sys.exit()
632
+
633
+
634
+ class GGUFWriter:
635
+ fout: BufferedWriter
636
+ arch: str
637
+ offset_tensor = 0
638
+ data_alignment = GGUF_DEFAULT_ALIGNMENT
639
+ kv_data = b""
640
+ kv_data_count = 0
641
+ ti_data = b""
642
+ ti_data_count = 0
643
+ use_temp_file: bool
644
+ temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
645
+ tensors: list[tuple[np.ndarray[Any, Any], int]]
646
+
647
+ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
648
+ self.fout = open(path, "wb")
649
+ self.arch = arch
650
+ self.add_architecture()
651
+ self.use_temp_file = use_temp_file
652
+ self.tensors = []
653
+
654
+ def write_header_to_file(self):
655
+ self.fout.write(struct.pack("<I", GGUF_MAGIC))
656
+ self.fout.write(struct.pack("<I", GGUF_VERSION))
657
+ self.fout.write(struct.pack("<Q", self.ti_data_count))
658
+ self.fout.write(struct.pack("<Q", self.kv_data_count))
659
+ self.flush()
660
+ # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
661
+
662
+ def write_kv_data_to_file(self):
663
+ self.fout.write(self.kv_data)
664
+ self.flush()
665
+
666
+ def write_ti_data_to_file(self):
667
+ self.fout.write(self.ti_data)
668
+ self.flush()
669
+
670
+ def add_key(self, key: str):
671
+ self.add_val(key, GGUFValueType.STRING, add_vtype=False)
672
+
673
+ def add_uint8(self, key: str, val: int):
674
+ self.add_key(key)
675
+ self.add_val(val, GGUFValueType.UINT8)
676
+
677
+ def add_int8(self, key: str, val: int):
678
+ self.add_key(key)
679
+ self.add_val(val, GGUFValueType.INT8)
680
+
681
+ def add_uint16(self, key: str, val: int):
682
+ self.add_key(key)
683
+ self.add_val(val, GGUFValueType.UINT16)
684
+
685
+ def add_int16(self, key: str, val: int):
686
+ self.add_key(key)
687
+ self.add_val(val, GGUFValueType.INT16)
688
+
689
+ def add_uint32(self, key: str, val: int):
690
+ self.add_key(key)
691
+ self.add_val(val, GGUFValueType.UINT32)
692
+
693
+ def add_int32(self, key: str, val: int):
694
+ self.add_key(key)
695
+ self.add_val(val, GGUFValueType.INT32)
696
+
697
+ def add_float32(self, key: str, val: float):
698
+ self.add_key(key)
699
+ self.add_val(val, GGUFValueType.FLOAT32)
700
+
701
+ def add_uint64(self, key: str, val: int):
702
+ self.add_key(key)
703
+ self.add_val(val, GGUFValueType.UINT64)
704
+
705
+ def add_int64(self, key: str, val: int):
706
+ self.add_key(key)
707
+ self.add_val(val, GGUFValueType.INT64)
708
+
709
+ def add_float64(self, key: str, val: float):
710
+ self.add_key(key)
711
+ self.add_val(val, GGUFValueType.FLOAT64)
712
+
713
+ def add_bool(self, key: str, val: bool):
714
+ self.add_key(key)
715
+ self.add_val(val, GGUFValueType.BOOL)
716
+
717
+ def add_string(self, key: str, val: str):
718
+ if len(val) == 0:
719
+ return
720
+ self.add_key(key)
721
+ self.add_val(val, GGUFValueType.STRING)
722
+
723
+ def add_array(self, key: str, val: Sequence[Any]):
724
+ if not isinstance(val, Sequence):
725
+ raise ValueError("Value must be a sequence for array type")
726
+
727
+ self.add_key(key)
728
+ self.add_val(val, GGUFValueType.ARRAY)
729
+
730
+ _simple_value_packing = {
731
+ GGUFValueType.UINT8: "<B",
732
+ GGUFValueType.INT8: "<b",
733
+ GGUFValueType.UINT16: "<H",
734
+ GGUFValueType.INT16: "<h",
735
+ GGUFValueType.UINT32: "<I",
736
+ GGUFValueType.INT32: "<i",
737
+ GGUFValueType.FLOAT32: "<f",
738
+ GGUFValueType.UINT64: "<Q",
739
+ GGUFValueType.INT64: "<q",
740
+ GGUFValueType.FLOAT64: "<d",
741
+ GGUFValueType.BOOL: "?" ,
742
+ }
743
+ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
744
+ if vtype is None:
745
+ vtype = GGUFValueType.get_type(val)
746
+
747
+ if add_vtype:
748
+ self.kv_data += struct.pack("<I", vtype)
749
+ self.kv_data_count += 1
750
+
751
+ pack_fmt = self._simple_value_packing.get(vtype)
752
+ if pack_fmt is not None:
753
+ self.kv_data += struct.pack(pack_fmt, val)
754
+ elif vtype == GGUFValueType.STRING:
755
+ encoded_val = val.encode("utf8") if isinstance(val, str) else val
756
+ self.kv_data += struct.pack("<Q", len(encoded_val))
757
+ self.kv_data += encoded_val
758
+ elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
759
+ ltype = GGUFValueType.get_type(val[0])
760
+ if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
761
+ raise ValueError("All items in a GGUF array should be of the same type")
762
+ self.kv_data += struct.pack("<I", ltype)
763
+ self.kv_data += struct.pack("<Q", len(val))
764
+ for item in val:
765
+ self.add_val(item, add_vtype=False)
766
+ else:
767
+ raise ValueError("Invalid GGUF metadata value type or value")
768
+
769
+ @staticmethod
770
+ def ggml_pad(x: int, n: int) -> int:
771
+ return ((x + n - 1) // n) * n
772
+
773
+ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
774
+ assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
775
+
776
+ encoded_name = name.encode("utf8")
777
+ self.ti_data += struct.pack("<Q", len(encoded_name))
778
+ self.ti_data += encoded_name
779
+ n_dims = len(tensor_shape)
780
+ self.ti_data += struct.pack("<I", n_dims)
781
+ for i in range(n_dims):
782
+ self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
783
+ if raw_dtype is None:
784
+ dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
785
+ else:
786
+ dtype = raw_dtype
787
+ self.ti_data += struct.pack("<I", dtype)
788
+ self.ti_data += struct.pack("<Q", self.offset_tensor)
789
+ self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
790
+ self.ti_data_count += 1
791
+
792
+ def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
793
+ if self.use_temp_file and self.temp_file is None:
794
+ fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
795
+ fp.seek(0)
796
+ self.temp_file = fp
797
+
798
+ shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
799
+ self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
800
+
801
+ pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
802
+
803
+ if self.temp_file is None:
804
+ self.tensors.append((tensor, pad))
805
+ return
806
+
807
+ tensor.tofile(self.temp_file)
808
+
809
+ if pad != 0:
810
+ self.temp_file.write(bytes([0] * pad))
811
+
812
+ def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
813
+ pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
814
+ if pad != 0:
815
+ fp.write(bytes([0] * pad))
816
+
817
+ def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
818
+ self.write_padding(self.fout, self.fout.tell())
819
+ tensor.tofile(self.fout)
820
+ self.write_padding(self.fout, tensor.nbytes)
821
+
822
+ def write_tensors_to_file(self):
823
+ self.write_ti_data_to_file()
824
+
825
+ self.write_padding(self.fout, self.fout.tell())
826
+
827
+ if self.temp_file is None:
828
+ for (currtensor, currpad) in self.tensors:
829
+ currtensor.tofile(self.fout)
830
+ if currpad != 0:
831
+ self.fout.write(bytes([0] * currpad))
832
+ return
833
+
834
+ self.temp_file.seek(0)
835
+
836
+ shutil.copyfileobj(self.temp_file, self.fout)
837
+ self.flush()
838
+ self.temp_file.close()
839
+
840
+ def flush(self):
841
+ self.fout.flush()
842
+
843
+ def close(self):
844
+ self.fout.close()
845
+
846
+ def add_architecture(self):
847
+ self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
848
+
849
+ def add_author(self, author: str):
850
+ self.add_string(KEY_GENERAL_AUTHOR, author)
851
+
852
+ def add_tensor_data_layout(self, layout: str):
853
+ self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
854
+
855
+ def add_url(self, url: str):
856
+ self.add_string(KEY_GENERAL_URL, url)
857
+
858
+ def add_description(self, description: str):
859
+ self.add_string(KEY_GENERAL_DESCRIPTION, description)
860
+
861
+ def add_source_url(self, url: str):
862
+ self.add_string(KEY_GENERAL_SOURCE_URL, url)
863
+
864
+ def add_source_hf_repo(self, repo: str):
865
+ self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
866
+
867
+ def add_file_type(self, ftype: int):
868
+ self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
869
+
870
+ def add_name(self, name: str):
871
+ self.add_string(KEY_GENERAL_NAME, name)
872
+
873
+ def add_quantization_version(self, quantization_version: GGMLQuantizationType):
874
+ self.add_uint32(
875
+ KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
876
+
877
+ def add_custom_alignment(self, alignment: int):
878
+ self.data_alignment = alignment
879
+ self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
880
+
881
+ def add_context_length(self, length: int):
882
+ self.add_uint32(
883
+ KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
884
+
885
+ def add_embedding_length(self, length: int):
886
+ self.add_uint32(
887
+ KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
888
+
889
+ def add_block_count(self, length: int):
890
+ self.add_uint32(
891
+ KEY_BLOCK_COUNT.format(arch=self.arch), length)
892
+
893
+ def add_feed_forward_length(self, length: int):
894
+ self.add_uint32(
895
+ KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
896
+
897
+ def add_parallel_residual(self, use: bool):
898
+ self.add_bool(
899
+ KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
900
+
901
+ def add_head_count(self, count: int):
902
+ self.add_uint32(
903
+ KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
904
+
905
+ def add_head_count_kv(self, count: int):
906
+ self.add_uint32(
907
+ KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
908
+
909
+ def add_max_alibi_bias(self, bias: float):
910
+ self.add_float32(
911
+ KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
912
+
913
+ def add_clamp_kqv(self, value: float):
914
+ self.add_float32(
915
+ KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
916
+
917
+ def add_layer_norm_eps(self, value: float):
918
+ self.add_float32(
919
+ KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
920
+
921
+ def add_layer_norm_rms_eps(self, value: float):
922
+ self.add_float32(
923
+ KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
924
+
925
+ def add_rope_dimension_count(self, count: int):
926
+ self.add_uint32(
927
+ KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
928
+
929
+ def add_rope_freq_base(self, value: float):
930
+ self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
931
+
932
+ def add_rope_scale_linear(self, value: float):
933
+ self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
934
+
935
+ def add_tokenizer_model(self, model: str):
936
+ self.add_string(KEY_TOKENIZER_MODEL, model)
937
+
938
+ def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
939
+ self.add_array(KEY_TOKENIZER_LIST, tokens)
940
+
941
+ def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
942
+ self.add_array(KEY_TOKENIZER_MERGES, merges)
943
+
944
+ def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
945
+ self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
946
+
947
+ def add_token_scores(self, scores: Sequence[float]):
948
+ self.add_array(KEY_TOKENIZER_SCORES, scores)
949
+
950
+ def add_bos_token_id(self, id: int):
951
+ self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
952
+
953
+ def add_eos_token_id(self, id: int):
954
+ self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
955
+
956
+ def add_unk_token_id(self, id: int):
957
+ self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
958
+
959
+ def add_sep_token_id(self, id: int):
960
+ self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
961
+
962
+ def add_pad_token_id(self, id: int):
963
+ self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
964
+
965
+
966
+ class SpecialVocab:
967
+ load_merges: bool = False
968
+ merges: list[str] = []
969
+ special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
970
+ special_token_ids: dict[str, int] = {}
971
+
972
+ def __init__(
973
+ self, path: str | os.PathLike[str], load_merges: bool = False,
974
+ special_token_types: tuple[str, ...] | None = None,
975
+ ):
976
+ self.special_token_ids = {}
977
+ self.load_merges = load_merges
978
+ if special_token_types is not None:
979
+ self.special_token_types = special_token_types
980
+ self._load(Path(path))
981
+
982
+ def _load(self, path: Path) -> None:
983
+ if not self._try_load_from_tokenizer_json(path):
984
+ self._try_load_from_config_json(path)
985
+
986
+ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
987
+ tokenizer_file = path / 'tokenizer.json'
988
+ if not tokenizer_file.is_file():
989
+ return False
990
+ with open(tokenizer_file, encoding = 'utf-8') as f:
991
+ tokenizer = json.load(f)
992
+ if self.load_merges:
993
+ merges = tokenizer.get('model', {}).get('merges')
994
+ if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
995
+ self.merges = merges
996
+ tokenizer_config_file = path / 'tokenizer_config.json'
997
+ added_tokens = tokenizer.get('added_tokens')
998
+ if added_tokens is None or not tokenizer_config_file.is_file():
999
+ return True
1000
+ with open(tokenizer_config_file, encoding = 'utf-8') as f:
1001
+ tokenizer_config = json.load(f)
1002
+ for typ in self.special_token_types:
1003
+ entry = tokenizer_config.get(f'{typ}_token')
1004
+ if isinstance(entry, str):
1005
+ tc_content = entry
1006
+ elif isinstance(entry, dict):
1007
+ entry_content = entry.get('content')
1008
+ if not isinstance(entry_content, str):
1009
+ continue
1010
+ tc_content = entry_content
1011
+ else:
1012
+ continue
1013
+ for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
1014
+ if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
1015
+ self.special_token_ids[typ] = maybe_token_id
1016
+ break
1017
+ return True
1018
+
1019
+ def _try_load_from_config_json(self, path: Path) -> bool:
1020
+ config_file = path / 'config.json'
1021
+ if not config_file.is_file():
1022
+ return False
1023
+ with open(config_file, encoding = 'utf-8') as f:
1024
+ config = json.load(f)
1025
+ for typ in self.special_token_types:
1026
+ maybe_token_id = config.get(f'{typ}_token_id')
1027
+ if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
1028
+ self.special_token_ids[typ] = maybe_token_id
1029
+ return True
1030
+
1031
+ def add_to_gguf(self, gw: GGUFWriter) -> None:
1032
+ if len(self.merges) > 0:
1033
+ print(f'gguf: Adding {len(self.merges)} merge(s).')
1034
+ gw.add_token_merges(self.merges)
1035
+ for typ, tokid in self.special_token_ids.items():
1036
+ handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
1037
+ if handler is None:
1038
+ print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
1039
+ continue
1040
+ print(f'gguf: Setting special token type {typ} to {tokid}')
1041
+ handler(tokid)
1042
+
1043
+ def __repr__(self) -> str:
1044
+ return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
1045
+
1046
+
1047
+ # Example usage:
1048
+ if __name__ == "__main__":
1049
+ # Example usage with a file
1050
+ gguf_writer = GGUFWriter("example.gguf", "llama")
1051
+
1052
+ gguf_writer.add_architecture()
1053
+ gguf_writer.add_block_count(12)
1054
+ gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
1055
+ gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
1056
+ gguf_writer.add_custom_alignment(64)
1057
+
1058
+ tensor1 = np.ones((32,), dtype=np.float32) * 100.0
1059
+ tensor2 = np.ones((64,), dtype=np.float32) * 101.0
1060
+ tensor3 = np.ones((96,), dtype=np.float32) * 102.0
1061
+
1062
+ gguf_writer.add_tensor("tensor1", tensor1)
1063
+ gguf_writer.add_tensor("tensor2", tensor2)
1064
+ gguf_writer.add_tensor("tensor3", tensor3)
1065
+
1066
+ gguf_writer.write_header_to_file()
1067
+ gguf_writer.write_kv_data_to_file()
1068
+ gguf_writer.write_tensors_to_file()
1069
+
1070
+ gguf_writer.close()
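The example block above only exercises GGUFWriter on its own. For orientation, here is a minimal sketch (not part of this commit) of how the pieces defined in this file are typically combined by a conversion script: TensorNameMap translates the source checkpoint's tensor names into the canonical GGUF names, GGUFWriter collects metadata and tensor data, and SpecialVocab contributes tokenizer special tokens and merges. The state_dict/hparams inputs and the HF-style key names below are assumptions for illustration only.

import numpy as np
import gguf

def convert_sketch(state_dict, hparams, model_dir, out_path):
    # hypothetical inputs: state_dict maps HF tensor names to numpy arrays,
    # hparams is the parsed config.json, model_dir holds the tokenizer files
    arch = gguf.MODEL_ARCH.LLAMA
    n_blocks = hparams["num_hidden_layers"]
    writer = gguf.GGUFWriter(out_path, gguf.MODEL_ARCH_NAMES[arch])

    # model-level metadata (keys are formatted per architecture)
    writer.add_context_length(hparams["max_position_embeddings"])
    writer.add_embedding_length(hparams["hidden_size"])
    writer.add_block_count(n_blocks)
    writer.add_head_count(hparams["num_attention_heads"])

    # translate tensor names and append the tensor data
    name_map = gguf.get_tensor_name_map(arch, n_blocks)
    for hf_name, data in state_dict.items():
        new_name = name_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            continue  # tensor is unknown or skipped for this architecture
        writer.add_tensor(new_name, np.asarray(data, dtype=np.float32))

    # special tokens and merges from the original model directory
    gguf.SpecialVocab(model_dir, load_merges=True).add_to_gguf(writer)

    # header and key/value data are written before the tensor data
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()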
llama.cpp/gguf-py/gguf/py.typed ADDED
File without changes
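(py.typed is intentionally empty: under PEP 561, shipping this marker file tells type checkers that the gguf package provides its own inline type annotations.)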
llama.cpp/gguf-py/pyproject.toml ADDED
@@ -0,0 +1,29 @@
1
+ [tool.poetry]
2
+ name = "gguf"
3
+ version = "0.4.4"
4
+ description = "Write ML models in GGUF for GGML"
5
+ authors = ["GGML <ggml@ggml.ai>"]
6
+ packages = [
7
+ {include = "gguf"},
8
+ {include = "gguf/py.typed"},
9
+ ]
10
+ readme = "README.md"
11
+ homepage = "https://ggml.ai"
12
+ repository = "https://github.com/ggerganov/llama.cpp"
13
+ keywords = ["ggml", "gguf", "llama.cpp"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+
20
+ [tool.poetry.dependencies]
21
+ python = ">=3.8"
22
+ numpy = ">=1.17"
23
+
24
+ [tool.poetry.dev-dependencies]
25
+ pytest = "^5.2"
26
+
27
+ [build-system]
28
+ requires = ["poetry-core>=1.0.0"]
29
+ build-backend = "poetry.core.masonry.api"
llama.cpp/gguf-py/tests/test_gguf.py ADDED
@@ -0,0 +1,7 @@
1
+ import gguf
2
+
3
+ # TODO: add tests
4
+
5
+
6
+ def test_write_gguf():
7
+ pass
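The test module above is still a stub. As a hedged sketch of what a first test could look like (an assumption, not something present in this commit), one could write a tiny file with GGUFWriter and check that the on-disk header carries the magic and version constants the writer itself uses in write_header_to_file, assuming the package re-exports GGUF_MAGIC and GGUF_VERSION; tmp_path is pytest's standard temporary-directory fixture.

import struct

import numpy as np

import gguf


def test_write_gguf(tmp_path):
    # write a minimal GGUF file with one metadata entry and one tensor
    path = tmp_path / "test.gguf"
    writer = gguf.GGUFWriter(str(path), "llama")
    writer.add_block_count(1)
    writer.add_tensor("tensor0", np.ones((8,), dtype=np.float32))
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    # the writer emits "<I" magic followed by "<I" version at offset 0
    with open(path, "rb") as f:
        magic, version = struct.unpack("<II", f.read(8))
    assert magic == gguf.GGUF_MAGIC
    assert version == gguf.GGUF_VERSION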
llama.cpp/models/ggml-vocab-aquila.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c53c3c516ac67c7ca12977b9690fdea3d2ef13bbaed6378f98191a13ef5ca00
3
+ size 4825676
llama.cpp/models/ggml-vocab-falcon.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbc7c119de7e9aab8f4257d617e3fa55f942a9f9ca84139ef3f5b1ca53836a8
3
+ size 2547782
llama.cpp/models/ggml-vocab-llama.gguf ADDED
Binary file (595 kB).