Arrcttacsrks committed on
Commit
16c390a
1 Parent(s): 1f888e3

Upload llama.cpp/convert_lora_to_gguf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. llama.cpp/convert_lora_to_gguf.py +433 -0
llama.cpp/convert_lora_to_gguf.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import annotations
5
+
6
+ from dataclasses import dataclass
7
+ import logging
8
+ import argparse
9
+ import os
10
+ import sys
11
+ import json
12
+ from math import prod
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
15
+ from transformers import AutoConfig
16
+
17
+ import torch
18
+
19
+ if TYPE_CHECKING:
20
+ from torch import Tensor
21
+
22
+ if 'NO_LOCAL_GGUF' not in os.environ:
23
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
24
+ import gguf
25
+
26
+ # reuse model definitions from convert_hf_to_gguf.py
27
+ from convert_hf_to_gguf import LazyTorchTensor, Model
28
+
29
+ logger = logging.getLogger("lora-to-gguf")
30
+
31
+
32
@dataclass
class PartialLoraTensor:
    """Accumulator for the two halves of one LoRA pair.

    Both fields default to ``None`` so an instance can be created as soon as
    either half is available and completed later.
    """

    # the lora_A half, or None while it is still missing
    A: Tensor | None = None
    # the lora_B half, or None while it is still missing
    B: Tensor | None = None
36
+
37
+
38
+ # magic to support tensor shape modifications and splitting
39
class LoraTorchTensor:
    """Pair of LoRA factors that imitates part of the ``torch.Tensor`` interface.

    Only the two factors are stored. The reported ``shape`` is
    ``(*B.shape[:-1], A.shape[-1])``; indexing, reshaping, permuting,
    stacking and concatenating are applied to the factors and produce a new
    ``LoraTorchTensor``. Operations that cannot be expressed on the factors
    raise ``NotImplementedError``.
    """

    _lora_A: Tensor  # (n_rank, row_size)
    _lora_B: Tensor  # (col_size, n_rank)
    _rank: int

    def __init__(self, A: Tensor, B: Tensor):
        """Wrap the factors *A* and *B*.

        Both must have the same number of dimensions and agree on the rank
        (``A.shape[-2] == B.shape[-1]``). If their dtypes differ, both are
        converted to float32.
        """
        assert len(A.shape) == len(B.shape)
        assert A.shape[-2] == B.shape[-1]
        if A.dtype != B.dtype:
            A = A.to(torch.float32)
            B = B.to(torch.float32)
        self._lora_A = A
        self._lora_B = B
        self._rank = B.shape[-1]

    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
        """Return the stored factors as ``(A, B)``."""
        return (self._lora_A, self._lora_B)

    def __getitem__(
        self,
        indices: (
            SupportsIndex
            | slice
            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
        ),
    ) -> LoraTorchTensor:
        """Index the represented tensor by indexing the factors.

        Raises ``NotImplementedError`` when the result would be a vector or
        when the index type is not supported.
        """
        shape = self.shape
        if isinstance(indices, SupportsIndex):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                raise NotImplementedError  # can't return a vector
        elif isinstance(indices, slice):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                # 2-D: a slice on the first dimension only selects rows of B
                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
        elif isinstance(indices, tuple):
            assert len(indices) > 0
            if indices[-1] is Ellipsis:
                # a trailing ellipsis adds nothing; missing dims are padded below
                return self[indices[:-1]]
            # expand ellipsis: it stands for every dimension that is not
            # covered by one of the other (explicit) indices
            n_expanded = len(shape) - (len(indices) - 1)
            indices = tuple(
                u
                for v in (
                    (slice(None, None) for _ in range(n_expanded)) if i is Ellipsis else (i,)
                    for i in indices
                )
                for u in v
            )

            if len(indices) < len(shape):
                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))

            # TODO: make sure this is correct
            indices_A = (
                *(
                    (
                        # modulo lets an integer index address a leading dim of size 1
                        j.__index__() % self._lora_A.shape[i]
                        if isinstance(j, SupportsIndex)
                        else slice(None, None)
                    )
                    for i, j in enumerate(indices[:-2])
                ),
                slice(None, None),  # the rank dimension of A is never indexed
                indices[-1],
            )
            # B keeps its trailing rank dimension untouched
            indices_B = indices[:-1]
            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
        else:
            raise NotImplementedError  # unknown indice type

    @property
    def dtype(self) -> torch.dtype:
        """Common dtype of both factors."""
        assert self._lora_A.dtype == self._lora_B.dtype
        return self._lora_A.dtype

    @property
    def shape(self) -> tuple[int, ...]:
        """Shape of the represented tensor: leading dims of B, last dim of A."""
        assert len(self._lora_A.shape) == len(self._lora_B.shape)
        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])

    def size(self, dim=None):
        """Return the full shape, or the length of dimension *dim* when given."""
        if dim is None:
            return self.shape
        return self.shape[dim]

    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
        """Reshape the represented tensor; the last dimension must stay unchanged.

        Accepts either separate ints or a single tuple, and at most the usual
        ``-1`` placeholder. Raises ``NotImplementedError`` when the target has
        fewer than two dimensions or changes the row size.
        """
        if isinstance(shape[0], tuple):
            new_shape: tuple[int, ...] = shape[0]
        else:
            new_shape = cast(tuple[int, ...], shape)
        orig_shape = self.shape
        if len(new_shape) < 2:
            raise NotImplementedError  # can't become a vector

        # expand -1 in the shape
        if any(dim == -1 for dim in new_shape):
            n_elems = prod(orig_shape)
            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
            assert n_elems % n_new_elems == 0
            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)

        if new_shape[-1] != orig_shape[-1]:
            raise NotImplementedError  # can't reshape the row size trivially

        # A gets leading dims of size 1; B carries the new leading dims
        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
        shape_B = (*new_shape[:-1], self._rank)
        return LoraTorchTensor(
            self._lora_A.reshape(shape_A),
            self._lora_B.reshape(shape_B),
        )

    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
        """Reshape to the shape of *other*."""
        return self.reshape(*other.shape)

    def view(self, *size: int) -> LoraTorchTensor:
        """Alias of :meth:`reshape`."""
        return self.reshape(*size)

    def permute(self, *dims: int) -> LoraTorchTensor:
        """Permute dimensions.

        Supported cases: the last dimension stays last, or a plain 2-D
        transpose. Anything else raises ``NotImplementedError``.
        """
        shape = self.shape
        # normalize every dim to its negative form
        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
        if dims[-1] == -1:
            # TODO: support higher dimensional A shapes bigger than 1
            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
            # transpose: the permuted factors swap roles
            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
        else:
            # TODO: compose the above two
            raise NotImplementedError

    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
        """Swap two dimensions (implemented through :meth:`permute`)."""
        shape = self.shape
        dims = [i for i in range(len(shape))]
        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
        return self.permute(*dims)

    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        """Alias of :meth:`transpose`."""
        return self.transpose(axis0, axis1)

    def to(self, *args, **kwargs):
        """Forward ``.to(...)`` to both factors."""
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

    @classmethod
    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
        """Handle ``torch.permute``, ``torch.reshape``, ``torch.stack`` and ``torch.cat``.

        ``stack`` and ``cat`` are only supported along dimension 0. Any other
        function raises ``NotImplementedError``.
        """
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.permute:
            return type(args[0]).permute(*args, **kwargs)
        elif func is torch.reshape:
            return type(args[0]).reshape(*args, **kwargs)
        elif func is torch.stack:
            assert isinstance(args[0], Sequence)
            # dim may be given by keyword or as the second positional argument
            dim = kwargs.get("dim", args[1] if len(args) > 1 else 0)
            assert dim == 0
            return LoraTorchTensor(
                torch.stack([a._lora_A for a in args[0]], dim),
                torch.stack([b._lora_B for b in args[0]], dim),
            )
        elif func is torch.cat:
            assert isinstance(args[0], Sequence)
            # dim may be given by keyword or as the second positional argument
            dim = kwargs.get("dim", args[1] if len(args) > 1 else 0)
            assert dim == 0
            if len(args[0][0].shape) > 2:
                return LoraTorchTensor(
                    torch.cat([a._lora_A for a in args[0]], dim),
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
                # 2-D inputs sharing one A: concatenating the B factors is enough
                return LoraTorchTensor(
                    args[0][0]._lora_A,
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError
223
+
224
+
225
def get_base_tensor_name(lora_tensor_name: str) -> str:
    """Map a LoRA tensor name to the name of the base-model tensor it adapts.

    Every ``base_model.model.`` occurrence is removed and the
    ``.lora_A.weight`` / ``.lora_B.weight`` suffixes become ``.weight``.
    """
    substitutions = (
        ("base_model.model.", ""),
        (".lora_A.weight", ".weight"),
        (".lora_B.weight", ".weight"),
    )
    result = lora_tensor_name
    # applied in order, each on the output of the previous one
    for needle, replacement in substitutions:
        result = result.replace(needle, replacement)
    return result
230
+
231
+
232
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the LoRA-to-GGUF converter.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``outfile``, ``outtype``, ``bigendian``,
        ``no_lazy``, ``verbose``, ``dry_run``, ``base`` and ``lora_path``.
    """
    parser = argparse.ArgumentParser(
        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "--no-lazy", action="store_true",
        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="only print out what will be done, without writing any new files",
    )
    parser.add_argument(
        "--base", type=Path,
        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
    )
    parser.add_argument(
        "lora_path", type=Path,
        # the config file the script actually reads is adapter_config.json
        help="directory containing Hugging Face PEFT LoRA config (adapter_config.json) and weights (adapter_model.safetensors or adapter_model.bin)",
    )

    return parser.parse_args(argv)
269
+
270
+
271
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
    """Return the base model configuration for *hf_model_id* as a plain dict.

    An adapter normally does not come with its base model's config, so the
    config is resolved with ``AutoConfig.from_pretrained`` and converted
    with ``to_dict()``.
    """
    return AutoConfig.from_pretrained(hf_model_id).to_dict()
275
+
276
+
277
# Script entry point: read a PEFT LoRA adapter, pair up its lora_A/lora_B
# tensors and write them out as a GGUF adapter file.
if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    # command-line --outtype value -> GGUF file type
    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

    ftype = ftype_map[args.outtype]

    dir_base_model: Path | None = args.base
    dir_lora: Path = args.lora_path
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_lora

    if os.path.exists(input_model):
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file

        lora_model = load_file(input_model, device="cpu")
    else:
        # fall back to the pickle-based weights file; kept as a Path like the
        # safetensors case
        input_model = dir_lora / "adapter_model.bin"
        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

    # load LoRA config (explicit UTF-8 so decoding does not depend on the locale)
    with open(lora_config, "r", encoding="utf-8") as f:
        lparams: dict[str, Any] = json.load(f)

    # load base model
    if dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
                hparams = load_hparams_from_hf(model_id)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
                sys.exit(1)
        else:
            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
            logger.error("Base model config is required. Please download the base model and add its path to --base")
            sys.exit(1)
    else:
        logger.info(f"Loading base model: {dir_base_model.name}")
        hparams = Model.load_hparams(dir_base_model)

    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        class LoraModel(model_class):
            """Converter for the base architecture that emits only LoRA tensors."""

            model_arch = model_class.model_arch

            lora_alpha: float

            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):

                super().__init__(*args, **kwargs)

                self.dir_model_card = dir_lora_model
                self.lora_alpha = float(lora_alpha)

            def set_vocab(self):
                # deliberately a no-op for the adapter
                pass

            def set_type(self):
                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

            def set_gguf_parameters(self):
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)

            def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
                return ()

            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                """Yield ``(base tensor name, LoraTorchTensor)`` for every A/B pair."""
                tensor_map: dict[str, PartialLoraTensor] = {}

                for name, tensor in lora_model.items():
                    if self.lazy:
                        tensor = LazyTorchTensor.from_eager(tensor)
                    base_name = get_base_tensor_name(name)
                    is_lora_a = ".lora_A.weight" in name
                    is_lora_b = ".lora_B.weight" in name
                    if not is_lora_a and not is_lora_b:
                        if ".base_layer.weight" in name:
                            # base-layer weights are skipped
                            continue
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                        sys.exit(1)

                    if base_name in tensor_map:
                        if is_lora_a:
                            tensor_map[base_name].A = tensor
                        else:
                            tensor_map[base_name].B = tensor
                    else:
                        if is_lora_a:
                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
                        else:
                            tensor_map[base_name] = PartialLoraTensor(B=tensor)

                for name, tensor in tensor_map.items():
                    # every base tensor must have received both halves
                    assert tensor.A is not None, f"lora_A tensor is missing for '{name}'"
                    assert tensor.B is not None, f"lora_B tensor is missing for '{name}'"
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                """Run the base class mapping, then split each result into ``.lora_a`` / ``.lora_b``."""
                dest = list(super().modify_tensors(data_torch, name, bid))
                # some archs may have the same tensor for lm_head and output (tie word embeddings)
                # in this case, adapters targeting lm_head will fail when using llama-export-lora
                # therefore, we ignore them for now
                # see: https://github.com/ggerganov/llama.cpp/issues/9065
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()

                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)

        alpha: float = lparams["lora_alpha"]

        model_instance = LoraModel(
            dir_base_model,
            ftype,
            fname_out,
            is_big_endian=args.bigendian,
            use_temp_file=False,
            eager=args.no_lazy,
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            hparams=hparams,
        )

        logger.info("Exporting model...")
        model_instance.write()
        logger.info(f"Model successfully exported to {model_instance.fname_out}")