Vui Seng Chua committed on
Commit cfb9114 • 1 Parent(s): e3cc684

Add content

README.md ADDED
@@ -0,0 +1,20 @@
+
+ This repo contains serialized blobs of the up-projection layer of Llama-3-8B (oc=14336, ic=4096).
+ The linear layer has been quantized (GPTQ W4 symmetric, group size 32) and sparsified to 50% sparsity.
+
+ ```
+ ├── sparse_w4
+ │   ├── linear_bitmap_int32.bin
+ │   ├── linear_compressed_qweight_int32.bin
+ │   ├── linear_nnz_int16.bin
+ │   ├── linear_scales_float16.bin
+ │   └── linear_zeros_int32.bin
+ ```
+
+ ### Usage
+ The following script shows how to process the blobs in Python: it covers unpacking, recovery of the zeroed weight locations, and weight dequantization.
+ ```bash
+ python unpack_blobs.py
+ ```
+
+ > You can ignore `internal/`.
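For reference, below is a minimal sketch of how the raw blobs might be inspected with NumPy. It only assumes the dtypes encoded in the filenames and the quantization settings stated above; the exact shapes and the 4-bit/bitmap packing layout are defined by `unpack_blobs.py`, which remains the source of truth.

```python
# Hypothetical helper: report the element count of each serialized blob.
# Dtypes are taken from the filename suffixes; shapes and packing are NOT
# assumed here and should be recovered as done in unpack_blobs.py.
from pathlib import Path
import numpy as np

blob_dir = Path("sparse_w4")
blob_dtypes = {
    "linear_bitmap_int32.bin": np.int32,
    "linear_compressed_qweight_int32.bin": np.int32,
    "linear_nnz_int16.bin": np.int16,
    "linear_scales_float16.bin": np.float16,
    "linear_zeros_int32.bin": np.int32,
}

for fname, dtype in blob_dtypes.items():
    arr = np.fromfile(blob_dir / fname, dtype=dtype)
    print(f"{fname:40s} {arr.size:>12,d} elements of {np.dtype(dtype).name}")

# Dequantization then follows the usual GPTQ form used by the scripts in this
# repo: w = (q - zero) * scale, with one (scale, zero) pair per group of 32
# input channels, while the bitmap/nnz blobs locate the 50% pruned positions.
```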
internal/donttouch_unpacking_autogptq/__pycache__/fake_dequantize.cpython-311.pyc ADDED
Binary file (3.75 kB).
 
internal/donttouch_unpacking_autogptq/autogpt_sample.py ADDED
@@ -0,0 +1,13 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+ model_id = "facebook/opt-125m"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ quantization_config = GPTQConfig(bits=4, sym=True, dataset='wikitext2', tokenizer=tokenizer, group_size=128, desc_act=False, use_exllama=False)
+
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)
+
+ print("joto")
+
+
+
+
internal/donttouch_unpacking_autogptq/blob_manipulate.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ import numpy as np
+
+ blob = torch.load("./opt-125m-gptq4.pth")
+
+ for layer, lblob in blob.items():
+     if 'model.decoder.layers.0.fc1' in layer:
+         print(f"--> {layer}")
+         prepack = lblob['prepack']
+         pack = lblob['pack']
+
+         for k, v in prepack.items():
+             print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
+
+         for k, v in pack.items():
+             print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
+         break
+
+ qweight = pack['qweight'].numpy()
+ scales = pack['scales'].numpy()    # (ngroup, OC)
+ qzeros = pack['qzeros'].numpy()    # (ngroup, OC//numel_per_int32)
+
+ nbit = 4
+ numel_per_int32 = 32 // nbit
+ IC = qweight.shape[0] * numel_per_int32
+ OC = qweight.shape[1]
+ group_size = IC // scales.shape[0]
+
+ # unpack eight 4-bit values from each int32 row (read as int32, cast to float32)
+ qweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+ for row in range(0, qweight.shape[0]):
+     for k in range(0, numel_per_int32):
+         qweight_unpack[row * numel_per_int32 + k, :] = ((qweight[row] >> k * nbit) & 0xF).astype(np.float32)
+
+ torch.allclose(
+     torch.from_numpy(qweight_unpack).to(torch.int32),
+     torch.from_numpy(pack['intweight'].astype(np.int32))
+ )
+
+ scales_float = scales.astype(np.float32)
+
+ # TODO: verify with asymmetric zero points; symmetric zero points are all identical
+ qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
+ for i in range(0, numel_per_int32):
+     shift_multiplier = numel_per_int32 - 1 - i
+     shift_by = shift_multiplier * nbit
+     qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32)
+ qzeros_unpack += 1  # AutoGPTQ packs (zeros - 1), so add 1 back
+
+ qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
+ qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
+ scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
+
+ deqweight_unpack = torch.zeros((IC, OC), dtype=torch.float16)
+ for i in range(IC):
+     gid = i // group_size
+     deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+ print(torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005))
+ print("temp")
+
+ # Numpy path
+ # deqweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+ # for i in range(IC):
+ #     gid = i // group_size
+ #     deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+ # deqweight_unpack = torch.from_numpy(deqweight_unpack).to(torch.float16)
+
+ # torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005)  # was `dequant_float`, which is undefined
+ print("blob")
internal/donttouch_unpacking_autogptq/fake_dequantize.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import numpy as np
+
+
+ def fake_dequantize(qweight, scales, qzeros):
+     nbit = 4
+     numel_per_int32 = 32 // nbit
+
+     qweight = qweight.cpu().numpy()
+     scales = scales.cpu().numpy()    # (ngroup, OC)
+     qzeros = qzeros.cpu().numpy()    # (ngroup, OC//numel_per_int32)
+
+     IC = qweight.shape[0] * numel_per_int32
+     OC = qweight.shape[1]
+     group_size = IC // scales.shape[0]
+
+     # unpack eight 4-bit values from each int32 row (read as int32, cast to float32)
+     qweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+     for row in range(0, qweight.shape[0]):
+         for k in range(0, numel_per_int32):
+             qweight_unpack[row * numel_per_int32 + k, :] = ((qweight[row] >> k * nbit) & 0xF).astype(np.float32)
+
+     scales_float = scales.astype(np.float32)
+
+     qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
+     for i in range(0, numel_per_int32):
+         shift_multiplier = numel_per_int32 - 1 - i
+         shift_by = shift_multiplier * nbit
+         qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32)
+     qzeros_unpack += 1  # AutoGPTQ packs (zeros - 1), so add 1 back
+
+     qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
+     qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
+     scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
+
+     deqweight_unpack = torch.zeros((IC, OC), dtype=torch.float16)
+     for i in range(IC):
+         gid = i // group_size
+         deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+     return deqweight_unpack, scales_float, qzeros_unpack
+
+
+ if __name__ == "__main__":
+     blob = torch.load("./opt-125m-gptq4.pth")
+
+     for layer, lblob in blob.items():
+         print(f"\n\n--> {layer}")
+         prepack = lblob['prepack']
+         pack = lblob['pack']
+
+         # for k, v in prepack.items():
+         #     print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
+
+         # for k, v in pack.items():
+         #     print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
+
+         W, _, _ = fake_dequantize(pack['qweight'], pack['scales'], pack['qzeros'])
+
+         simulated_match = torch.allclose(W, prepack['w'].t(), atol=0.0005)
+
+         print(f"simulated_match? {simulated_match}")
internal/donttouch_unpacking_autogptq/opt-125m-gptq4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0269cabd58cd27261fde469502a01e84760a413b16ffa7989f395c53c65e46f4
+ size 46688098
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py ADDED
@@ -0,0 +1,359 @@
1
+ import math
2
+ from logging import getLogger
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import transformers
8
+
9
+
10
+ logger = getLogger(__name__)
11
+ try:
12
+ import autogptq_cuda_64
13
+ import autogptq_cuda_256
14
+
15
+ _autogptq_cuda_available = True
16
+ except ImportError:
17
+ logger.warning("CUDA extension not installed.")
18
+ autogptq_cuda_256 = None
19
+ autogptq_cuda_64 = None
20
+ _autogptq_cuda_available = False
21
+
22
+
23
+ class QuantLinear(nn.Module):
24
+ QUANT_TYPE = "cuda-old"
25
+
26
+ def __init__(
27
+ self,
28
+ bits,
29
+ group_size,
30
+ infeatures,
31
+ outfeatures,
32
+ bias,
33
+ use_cuda_fp16=True,
34
+ kernel_switch_threshold=128,
35
+ trainable=False,
36
+ weight_dtype=torch.float16,
37
+ ):
38
+ super().__init__()
39
+ global _autogptq_cuda_available
40
+ if bits not in [2, 3, 4, 8]:
41
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
42
+ if trainable:
43
+ _autogptq_cuda_available = False
44
+ self.infeatures = infeatures
45
+ self.outfeatures = outfeatures
46
+ self.bits = bits
47
+ self.group_size = group_size if group_size != -1 else infeatures
48
+ self.maxq = 2**self.bits - 1
49
+
50
+ self.register_buffer(
51
+ "qweight",
52
+ torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
53
+ )
54
+ self.register_buffer(
55
+ "qzeros",
56
+ torch.zeros(
57
+ (
58
+ math.ceil(infeatures / self.group_size),
59
+ outfeatures // 32 * self.bits,
60
+ ),
61
+ dtype=torch.int32,
62
+ ),
63
+ )
64
+ self.register_buffer(
65
+ "scales",
66
+ torch.zeros(
67
+ (math.ceil(infeatures / self.group_size), outfeatures),
68
+ dtype=weight_dtype,
69
+ ),
70
+ )
71
+ self.register_buffer(
72
+ "g_idx",
73
+ torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
74
+ )
75
+
76
+ if bias:
77
+ self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
78
+ else:
79
+ self.bias = None
80
+ self.half_indim = self.infeatures // 2
81
+
82
+ self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
83
+
84
+ # is performed by unpacking the weights and using torch.matmul
85
+ if self.bits in [2, 4, 8]:
86
+ self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
87
+ elif self.bits == 3:
88
+ self.wf = torch.tensor(
89
+ [
90
+ [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
91
+ [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
92
+ [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
93
+ ],
94
+ dtype=torch.int32,
95
+ ).reshape(1, 3, 12)
96
+
97
+ self.kernel_switch_threshold = kernel_switch_threshold
98
+ self.autogptq_cuda_available = _autogptq_cuda_available
99
+ self.autogptq_cuda = autogptq_cuda_256
100
+ if infeatures % 256 != 0 or outfeatures % 256 != 0:
101
+ self.autogptq_cuda = autogptq_cuda_64
102
+ if infeatures % 64 != 0 or outfeatures % 64 != 0:
103
+ self.autogptq_cuda_available = False
104
+
105
+ self.trainable = trainable
106
+
107
+ def post_init(self):
108
+ pass
109
+
110
+ def pack(self, linear, scales, zeros, g_idx):
111
+ W = linear.weight.data.clone()
112
+ if isinstance(linear, nn.Conv2d):
113
+ W = W.flatten(1)
114
+ if isinstance(linear, transformers.pytorch_utils.Conv1D):
115
+ W = W.t()
116
+
117
+ scales = scales.t().contiguous()
118
+ zeros = zeros.t().contiguous()
119
+ scale_zeros = zeros * scales
120
+ self.scales = scales.clone().to(dtype=linear.weight.dtype)
121
+ if linear.bias is not None:
122
+ self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
123
+
124
+ intweight = []
125
+ for idx in range(self.infeatures):
126
+ g_idx = idx // self.group_size
127
+ intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
128
+ intweight = torch.cat(intweight, dim=1)
129
+ intweight = intweight.t().contiguous()
130
+ intweight = intweight.numpy().astype(np.uint32)
131
+ self.intweight = intweight
132
+
133
+ i = 0
134
+ row = 0
135
+ qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
136
+ while row < qweight.shape[0]:
137
+ if self.bits in [2, 4, 8]:
138
+ for j in range(i, i + (32 // self.bits)):
139
+ qweight[row] |= intweight[j] << (self.bits * (j - i))
140
+ i += 32 // self.bits
141
+ row += 1
142
+ elif self.bits == 3:
143
+ for j in range(i, i + 10):
144
+ qweight[row] |= intweight[j] << (3 * (j - i))
145
+ i += 10
146
+ qweight[row] |= intweight[i] << 30
147
+ row += 1
148
+ qweight[row] |= (intweight[i] >> 2) & 1
149
+ i += 1
150
+ for j in range(i, i + 10):
151
+ qweight[row] |= intweight[j] << (3 * (j - i) + 1)
152
+ i += 10
153
+ qweight[row] |= intweight[i] << 31
154
+ row += 1
155
+ qweight[row] |= (intweight[i] >> 1) & 0x3
156
+ i += 1
157
+ for j in range(i, i + 10):
158
+ qweight[row] |= intweight[j] << (3 * (j - i) + 2)
159
+ i += 10
160
+ row += 1
161
+ else:
162
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
163
+
164
+ qweight = qweight.astype(np.int32)
165
+ self.qweight = torch.from_numpy(qweight)
166
+
167
+ zeros -= 1
168
+ zeros = zeros.numpy().astype(np.uint32)
169
+ qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
170
+ i = 0
171
+ col = 0
172
+ while col < qzeros.shape[1]:
173
+ if self.bits in [2, 4, 8]:
174
+ for j in range(i, i + (32 // self.bits)):
175
+ qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
176
+ i += 32 // self.bits
177
+ col += 1
178
+ elif self.bits == 3:
179
+ for j in range(i, i + 10):
180
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
181
+ i += 10
182
+ qzeros[:, col] |= zeros[:, i] << 30
183
+ col += 1
184
+ qzeros[:, col] |= (zeros[:, i] >> 2) & 1
185
+ i += 1
186
+ for j in range(i, i + 10):
187
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
188
+ i += 10
189
+ qzeros[:, col] |= zeros[:, i] << 31
190
+ col += 1
191
+ qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
192
+ i += 1
193
+ for j in range(i, i + 10):
194
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
195
+ i += 10
196
+ col += 1
197
+ else:
198
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
199
+
200
+ qzeros = qzeros.astype(np.int32)
201
+ self.qzeros = torch.from_numpy(qzeros)
202
+
203
+ def forward(self, x):
204
+ x_dtype = x.dtype
205
+ out_shape = x.shape[:-1] + (self.outfeatures,)
206
+ x = x.reshape(-1, x.shape[-1])
207
+ if (
208
+ x.device.type == "cuda"
209
+ and self.autogptq_cuda_available is True
210
+ and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
211
+ ):
212
+ out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
213
+ if self.use_cuda_fp16:
214
+ if x_dtype != torch.float16:
215
+ logger.warning_once(
216
+ f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
217
+ )
218
+
219
+ if self.bits == 2:
220
+ self.autogptq_cuda.vecquant2matmul_faster_old(
221
+ x,
222
+ self.qweight,
223
+ out,
224
+ self.scales.float(),
225
+ self.qzeros,
226
+ self.group_size,
227
+ self.half_indim,
228
+ )
229
+ elif self.bits == 3:
230
+ self.autogptq_cuda.vecquant3matmul_faster_old(
231
+ x,
232
+ self.qweight,
233
+ out,
234
+ self.scales.float(),
235
+ self.qzeros,
236
+ self.group_size,
237
+ self.half_indim,
238
+ )
239
+ elif self.bits == 4:
240
+ self.autogptq_cuda.vecquant4matmul_faster_old(
241
+ x,
242
+ self.qweight,
243
+ out,
244
+ self.scales.float(),
245
+ self.qzeros,
246
+ self.group_size,
247
+ self.half_indim,
248
+ )
249
+
250
+ else:
251
+ raise NotImplementedError("Only 2,3,4 bits are supported.")
252
+ else:
253
+ x = x.to(torch.float32) # This is required for autocast compatibility.
254
+ if self.bits == 2:
255
+ self.autogptq_cuda.vecquant2matmul_old(
256
+ x,
257
+ self.qweight,
258
+ out,
259
+ self.scales.float(),
260
+ self.qzeros,
261
+ self.group_size,
262
+ )
263
+ elif self.bits == 3:
264
+ self.autogptq_cuda.vecquant3matmul_old(
265
+ x,
266
+ self.qweight,
267
+ out,
268
+ self.scales.float(),
269
+ self.qzeros,
270
+ self.group_size,
271
+ )
272
+ elif self.bits == 4:
273
+ self.autogptq_cuda.vecquant4matmul_old(
274
+ x,
275
+ self.qweight,
276
+ out,
277
+ self.scales.float(),
278
+ self.qzeros,
279
+ self.group_size,
280
+ )
281
+ elif self.bits == 8:
282
+ self.autogptq_cuda.vecquant8matmul_old(
283
+ x,
284
+ self.qweight,
285
+ out,
286
+ self.scales.float(),
287
+ self.qzeros,
288
+ self.group_size,
289
+ )
290
+ else:
291
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
292
+ else:
293
+ if self.wf.device != self.qzeros.device:
294
+ self.wf = self.wf.to(self.qzeros.device)
295
+
296
+ if self.bits in [2, 4, 8]:
297
+ zeros = torch.bitwise_right_shift(
298
+ torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
299
+ self.wf.unsqueeze(0),
300
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
301
+
302
+ zeros = zeros + 1
303
+ zeros = torch.bitwise_and(
304
+ zeros, (2**self.bits) - 1
305
+ ) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
306
+
307
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
308
+
309
+ scales = self.scales
310
+ scales = scales.reshape(-1, 1, scales.shape[-1])
311
+
312
+ weight = torch.bitwise_right_shift(
313
+ torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
314
+ self.wf.unsqueeze(-1),
315
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
316
+ weight = torch.bitwise_and(weight, (2**self.bits) - 1)
317
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
318
+ elif self.bits == 3:
319
+ zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
320
+ -1, -1, -1, 12
321
+ )
322
+ zeros = zeros >> self.wf.unsqueeze(0)
323
+ zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
324
+ zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
325
+ zeros = zeros & 0x7
326
+ zeros = torch.cat(
327
+ [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
328
+ dim=2,
329
+ )
330
+
331
+ zeros = zeros + 1
332
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
333
+
334
+ scales = self.scales
335
+ scales = scales.reshape(-1, 1, scales.shape[-1])
336
+
337
+ weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
338
+ -1, -1, 12, -1
339
+ )
340
+ weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
341
+ weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
342
+ weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
343
+ weight = weight & 0x7
344
+ weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
345
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
346
+ else:
347
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
348
+
349
+ weight = scales * (weight - zeros)
350
+ weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
351
+ out = torch.matmul(x, weight)
352
+ out = out.to(dtype=x_dtype).reshape(
353
+ out_shape
354
+ ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
355
+ out = out + self.bias if self.bias is not None else out
356
+ return out
357
+
358
+
359
+ __all__ = ["QuantLinear"]
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py.ori.py ADDED
@@ -0,0 +1,358 @@
1
+ import math
2
+ from logging import getLogger
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import transformers
8
+
9
+
10
+ logger = getLogger(__name__)
11
+ try:
12
+ import autogptq_cuda_64
13
+ import autogptq_cuda_256
14
+
15
+ _autogptq_cuda_available = True
16
+ except ImportError:
17
+ logger.warning("CUDA extension not installed.")
18
+ autogptq_cuda_256 = None
19
+ autogptq_cuda_64 = None
20
+ _autogptq_cuda_available = False
21
+
22
+
23
+ class QuantLinear(nn.Module):
24
+ QUANT_TYPE = "cuda-old"
25
+
26
+ def __init__(
27
+ self,
28
+ bits,
29
+ group_size,
30
+ infeatures,
31
+ outfeatures,
32
+ bias,
33
+ use_cuda_fp16=True,
34
+ kernel_switch_threshold=128,
35
+ trainable=False,
36
+ weight_dtype=torch.float16,
37
+ ):
38
+ super().__init__()
39
+ global _autogptq_cuda_available
40
+ if bits not in [2, 3, 4, 8]:
41
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
42
+ if trainable:
43
+ _autogptq_cuda_available = False
44
+ self.infeatures = infeatures
45
+ self.outfeatures = outfeatures
46
+ self.bits = bits
47
+ self.group_size = group_size if group_size != -1 else infeatures
48
+ self.maxq = 2**self.bits - 1
49
+
50
+ self.register_buffer(
51
+ "qweight",
52
+ torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
53
+ )
54
+ self.register_buffer(
55
+ "qzeros",
56
+ torch.zeros(
57
+ (
58
+ math.ceil(infeatures / self.group_size),
59
+ outfeatures // 32 * self.bits,
60
+ ),
61
+ dtype=torch.int32,
62
+ ),
63
+ )
64
+ self.register_buffer(
65
+ "scales",
66
+ torch.zeros(
67
+ (math.ceil(infeatures / self.group_size), outfeatures),
68
+ dtype=weight_dtype,
69
+ ),
70
+ )
71
+ self.register_buffer(
72
+ "g_idx",
73
+ torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
74
+ )
75
+
76
+ if bias:
77
+ self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
78
+ else:
79
+ self.bias = None
80
+ self.half_indim = self.infeatures // 2
81
+
82
+ self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
83
+
84
+ # is performed by unpacking the weights and using torch.matmul
85
+ if self.bits in [2, 4, 8]:
86
+ self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
87
+ elif self.bits == 3:
88
+ self.wf = torch.tensor(
89
+ [
90
+ [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
91
+ [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
92
+ [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
93
+ ],
94
+ dtype=torch.int32,
95
+ ).reshape(1, 3, 12)
96
+
97
+ self.kernel_switch_threshold = kernel_switch_threshold
98
+ self.autogptq_cuda_available = _autogptq_cuda_available
99
+ self.autogptq_cuda = autogptq_cuda_256
100
+ if infeatures % 256 != 0 or outfeatures % 256 != 0:
101
+ self.autogptq_cuda = autogptq_cuda_64
102
+ if infeatures % 64 != 0 or outfeatures % 64 != 0:
103
+ self.autogptq_cuda_available = False
104
+
105
+ self.trainable = trainable
106
+
107
+ def post_init(self):
108
+ pass
109
+
110
+ def pack(self, linear, scales, zeros, g_idx):
111
+ W = linear.weight.data.clone()
112
+ if isinstance(linear, nn.Conv2d):
113
+ W = W.flatten(1)
114
+ if isinstance(linear, transformers.pytorch_utils.Conv1D):
115
+ W = W.t()
116
+
117
+ scales = scales.t().contiguous()
118
+ zeros = zeros.t().contiguous()
119
+ scale_zeros = zeros * scales
120
+ self.scales = scales.clone().to(dtype=linear.weight.dtype)
121
+ if linear.bias is not None:
122
+ self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
123
+
124
+ intweight = []
125
+ for idx in range(self.infeatures):
126
+ g_idx = idx // self.group_size
127
+ intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
128
+ intweight = torch.cat(intweight, dim=1)
129
+ intweight = intweight.t().contiguous()
130
+ intweight = intweight.numpy().astype(np.uint32)
131
+
132
+ i = 0
133
+ row = 0
134
+ qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
135
+ while row < qweight.shape[0]:
136
+ if self.bits in [2, 4, 8]:
137
+ for j in range(i, i + (32 // self.bits)):
138
+ qweight[row] |= intweight[j] << (self.bits * (j - i))
139
+ i += 32 // self.bits
140
+ row += 1
141
+ elif self.bits == 3:
142
+ for j in range(i, i + 10):
143
+ qweight[row] |= intweight[j] << (3 * (j - i))
144
+ i += 10
145
+ qweight[row] |= intweight[i] << 30
146
+ row += 1
147
+ qweight[row] |= (intweight[i] >> 2) & 1
148
+ i += 1
149
+ for j in range(i, i + 10):
150
+ qweight[row] |= intweight[j] << (3 * (j - i) + 1)
151
+ i += 10
152
+ qweight[row] |= intweight[i] << 31
153
+ row += 1
154
+ qweight[row] |= (intweight[i] >> 1) & 0x3
155
+ i += 1
156
+ for j in range(i, i + 10):
157
+ qweight[row] |= intweight[j] << (3 * (j - i) + 2)
158
+ i += 10
159
+ row += 1
160
+ else:
161
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
162
+
163
+ qweight = qweight.astype(np.int32)
164
+ self.qweight = torch.from_numpy(qweight)
165
+
166
+ zeros -= 1
167
+ zeros = zeros.numpy().astype(np.uint32)
168
+ qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
169
+ i = 0
170
+ col = 0
171
+ while col < qzeros.shape[1]:
172
+ if self.bits in [2, 4, 8]:
173
+ for j in range(i, i + (32 // self.bits)):
174
+ qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
175
+ i += 32 // self.bits
176
+ col += 1
177
+ elif self.bits == 3:
178
+ for j in range(i, i + 10):
179
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
180
+ i += 10
181
+ qzeros[:, col] |= zeros[:, i] << 30
182
+ col += 1
183
+ qzeros[:, col] |= (zeros[:, i] >> 2) & 1
184
+ i += 1
185
+ for j in range(i, i + 10):
186
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
187
+ i += 10
188
+ qzeros[:, col] |= zeros[:, i] << 31
189
+ col += 1
190
+ qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
191
+ i += 1
192
+ for j in range(i, i + 10):
193
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
194
+ i += 10
195
+ col += 1
196
+ else:
197
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
198
+
199
+ qzeros = qzeros.astype(np.int32)
200
+ self.qzeros = torch.from_numpy(qzeros)
201
+
202
+ def forward(self, x):
203
+ x_dtype = x.dtype
204
+ out_shape = x.shape[:-1] + (self.outfeatures,)
205
+ x = x.reshape(-1, x.shape[-1])
206
+ if (
207
+ x.device.type == "cuda"
208
+ and self.autogptq_cuda_available is True
209
+ and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
210
+ ):
211
+ out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
212
+ if self.use_cuda_fp16:
213
+ if x_dtype != torch.float16:
214
+ logger.warning_once(
215
+ f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
216
+ )
217
+
218
+ if self.bits == 2:
219
+ self.autogptq_cuda.vecquant2matmul_faster_old(
220
+ x,
221
+ self.qweight,
222
+ out,
223
+ self.scales.float(),
224
+ self.qzeros,
225
+ self.group_size,
226
+ self.half_indim,
227
+ )
228
+ elif self.bits == 3:
229
+ self.autogptq_cuda.vecquant3matmul_faster_old(
230
+ x,
231
+ self.qweight,
232
+ out,
233
+ self.scales.float(),
234
+ self.qzeros,
235
+ self.group_size,
236
+ self.half_indim,
237
+ )
238
+ elif self.bits == 4:
239
+ self.autogptq_cuda.vecquant4matmul_faster_old(
240
+ x,
241
+ self.qweight,
242
+ out,
243
+ self.scales.float(),
244
+ self.qzeros,
245
+ self.group_size,
246
+ self.half_indim,
247
+ )
248
+
249
+ else:
250
+ raise NotImplementedError("Only 2,3,4 bits are supported.")
251
+ else:
252
+ x = x.to(torch.float32) # This is required for autocast compatibility.
253
+ if self.bits == 2:
254
+ self.autogptq_cuda.vecquant2matmul_old(
255
+ x,
256
+ self.qweight,
257
+ out,
258
+ self.scales.float(),
259
+ self.qzeros,
260
+ self.group_size,
261
+ )
262
+ elif self.bits == 3:
263
+ self.autogptq_cuda.vecquant3matmul_old(
264
+ x,
265
+ self.qweight,
266
+ out,
267
+ self.scales.float(),
268
+ self.qzeros,
269
+ self.group_size,
270
+ )
271
+ elif self.bits == 4:
272
+ self.autogptq_cuda.vecquant4matmul_old(
273
+ x,
274
+ self.qweight,
275
+ out,
276
+ self.scales.float(),
277
+ self.qzeros,
278
+ self.group_size,
279
+ )
280
+ elif self.bits == 8:
281
+ self.autogptq_cuda.vecquant8matmul_old(
282
+ x,
283
+ self.qweight,
284
+ out,
285
+ self.scales.float(),
286
+ self.qzeros,
287
+ self.group_size,
288
+ )
289
+ else:
290
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
291
+ else:
292
+ if self.wf.device != self.qzeros.device:
293
+ self.wf = self.wf.to(self.qzeros.device)
294
+
295
+ if self.bits in [2, 4, 8]:
296
+ zeros = torch.bitwise_right_shift(
297
+ torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
298
+ self.wf.unsqueeze(0),
299
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
300
+
301
+ zeros = zeros + 1
302
+ zeros = torch.bitwise_and(
303
+ zeros, (2**self.bits) - 1
304
+ ) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
305
+
306
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
307
+
308
+ scales = self.scales
309
+ scales = scales.reshape(-1, 1, scales.shape[-1])
310
+
311
+ weight = torch.bitwise_right_shift(
312
+ torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
313
+ self.wf.unsqueeze(-1),
314
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
315
+ weight = torch.bitwise_and(weight, (2**self.bits) - 1)
316
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
317
+ elif self.bits == 3:
318
+ zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
319
+ -1, -1, -1, 12
320
+ )
321
+ zeros = zeros >> self.wf.unsqueeze(0)
322
+ zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
323
+ zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
324
+ zeros = zeros & 0x7
325
+ zeros = torch.cat(
326
+ [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
327
+ dim=2,
328
+ )
329
+
330
+ zeros = zeros + 1
331
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
332
+
333
+ scales = self.scales
334
+ scales = scales.reshape(-1, 1, scales.shape[-1])
335
+
336
+ weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
337
+ -1, -1, 12, -1
338
+ )
339
+ weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
340
+ weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
341
+ weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
342
+ weight = weight & 0x7
343
+ weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
344
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
345
+ else:
346
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
347
+
348
+ weight = scales * (weight - zeros)
349
+ weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
350
+ out = torch.matmul(x, weight)
351
+ out = out.to(dtype=x_dtype).reshape(
352
+ out_shape
353
+ ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
354
+ out = out + self.bias if self.bias is not None else out
355
+ return out
356
+
357
+
358
+ __all__ = ["QuantLinear"]
internal/donttouch_unpacking_autogptq/quantizer.py ADDED
@@ -0,0 +1,816 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ import os
17
+ from enum import Enum
18
+ from logging import getLogger
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import nn
23
+ from tqdm.auto import tqdm
24
+ from transformers import AutoTokenizer
25
+ from transformers.pytorch_utils import Conv1D
26
+ from transformers.utils.quantization_config import QuantizationMethod
27
+
28
+ from ..utils import is_accelerate_available, is_auto_gptq_available
29
+ from ..utils.modeling_utils import recurse_getattr
30
+ from .constants import GPTQ_CONFIG
31
+ from .data import get_dataset, prepare_dataset
32
+ from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
33
+ from collections import OrderedDict
34
+
35
+ if is_accelerate_available():
36
+ from accelerate import (
37
+ cpu_offload_with_hook,
38
+ load_checkpoint_and_dispatch,
39
+ )
40
+ from accelerate.hooks import remove_hook_from_module
41
+
42
+ if is_auto_gptq_available():
43
+ from auto_gptq import exllama_set_max_input_length
44
+ from auto_gptq.modeling._utils import autogptq_post_init
45
+ from auto_gptq.quantization import GPTQ
46
+ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
47
+
48
+ logger = getLogger(__name__)
49
+
50
+
51
+ class ExllamaVersion(int, Enum):
52
+ ONE = 1
53
+ TWO = 2
54
+
55
+
56
+ class GPTQQuantizer(object):
57
+ r"""
58
+ A simple API for GPTQ Quantization
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ bits: int,
64
+ dataset: Optional[Union[List[str], str]] = None,
65
+ group_size: int = 128,
66
+ damp_percent: float = 0.1,
67
+ desc_act: bool = False,
68
+ sym: bool = True,
69
+ true_sequential: bool = True,
70
+ use_cuda_fp16: bool = False,
71
+ model_seqlen: Optional[int] = None,
72
+ block_name_to_quantize: Optional[str] = None,
73
+ module_name_preceding_first_block: Optional[List[str]] = None,
74
+ batch_size: int = 1,
75
+ pad_token_id: Optional[int] = None,
76
+ disable_exllama: bool = False,
77
+ exllama_config: Dict[str, Any] = None,
78
+ max_input_length: Optional[int] = None,
79
+ cache_block_outputs: Optional[bool] = True,
80
+ modules_in_block_to_quantize: Optional[List[List[str]]] = None,
81
+ *args,
82
+ **kwargs,
83
+ ):
84
+ """
85
+ Args:
86
+ bits (`int`):
87
+ The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
88
+ dataset (`Union[List[str], str, Any]`, defaults to `None`):
89
+ The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
90
+ (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
91
+ or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
92
+ group_size (int, defaults to 128):
93
+ The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
94
+ damp_percent (`float`, defaults to `0.1`):
95
+ The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
96
+ desc_act (`bool`, defaults to `False`):
97
+ Whether to quantize columns in order of decreasing activation size.
98
+ Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
99
+ Also known as act-order.
100
+ sym (`bool`, defaults to `True`):
101
+ Whether to use symetric quantization.
102
+ true_sequential (`bool`, defaults to `True`):
103
+ Whether to perform sequential quantization even within a single Transformer block.
104
+ Instead of quantizing the entire block at once, we perform layer-wise quantization.
105
+ As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
106
+ use_cuda_fp16 (`bool`, defaults to `False`):
107
+ Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
108
+ model_seqlen (`Optional[int]`, defaults to `None`):
109
+ The maximum sequence length that the model can take.
110
+ block_name_to_quantize (`Optional[str]`, defaults to `None`):
111
+ The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
112
+ module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
113
+ The layers that are preceding the first Transformer block.
114
+ batch_size (`int`, defaults to `1`):
115
+ The batch size of the dataset
116
+ pad_token_id (`Optional[int]`, defaults to `None`):
117
+ The pad token id. Needed to prepare the dataset when `batch_size` > 1.
118
+ disable_exllama (`bool`, defaults to `False`):
119
+ Whether to use exllama backend. Only works with `bits` = 4.
120
+ exllama_config (`Dict[str, Any]`, *optional*):
121
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
122
+ max_input_length (`Optional[int]`, defaults to `None`):
123
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
124
+ It is specific to the exllama backend with act-order.
125
+ cache_block_outputs (`bool`, defaults to `True`):
126
+ Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
127
+ (e.g. ChatGLM) but can require more time.
128
+ modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
129
+ List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
130
+ The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
131
+ If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
132
+ """
133
+
134
+ self.bits = bits
135
+ self.dataset = dataset
136
+ self.group_size = group_size
137
+ self.damp_percent = damp_percent
138
+ self.desc_act = desc_act
139
+ self.sym = sym
140
+ self.true_sequential = true_sequential
141
+ self.use_cuda_fp16 = use_cuda_fp16
142
+ self.model_seqlen = model_seqlen
143
+ self.block_name_to_quantize = block_name_to_quantize
144
+ self.module_name_preceding_first_block = module_name_preceding_first_block
145
+ self.batch_size = batch_size
146
+ self.pad_token_id = pad_token_id
147
+ self.disable_exllama = disable_exllama
148
+ self.exllama_config = exllama_config
149
+ self.max_input_length = max_input_length
150
+ self.quant_method = QuantizationMethod.GPTQ
151
+ self.cache_block_outputs = cache_block_outputs
152
+ self.modules_in_block_to_quantize = modules_in_block_to_quantize
153
+
154
+ self.serialization_keys = [
155
+ "bits",
156
+ "dataset",
157
+ "group_size",
158
+ "damp_percent",
159
+ "desc_act",
160
+ "sym",
161
+ "true_sequential",
162
+ "quant_method",
163
+ "modules_in_block_to_quantize",
164
+ ]
165
+
166
+ if self.bits not in [2, 3, 4, 8]:
167
+ raise ValueError("only support quantize to [2,3,4,8] bits.")
168
+ if self.group_size != -1 and self.group_size <= 0:
169
+ raise ValueError("group_size must be greater than 0 or equal to -1")
170
+ if not (0 < self.damp_percent < 1):
171
+ raise ValueError("damp_percent must between 0 and 1.")
172
+
173
+ if self.exllama_config is None:
174
+ self.exllama_config = {"version": ExllamaVersion.TWO}
175
+ else:
176
+ if "version" not in self.exllama_config:
177
+ raise ValueError("`exllama_config` needs to have a `version` key")
178
+ elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
179
+ version = self.exllama_config["version"]
180
+ raise ValueError(
181
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
182
+ )
183
+ self.exllama_version = self.exllama_config["version"]
184
+
185
+ def to_dict(self):
186
+ """
187
+ Returns the args in dict format.
188
+ """
189
+ gptq_dict = {}
190
+ for key in self.serialization_keys:
191
+ gptq_dict[key] = getattr(self, key)
192
+ return gptq_dict
193
+
194
+ @classmethod
195
+ def from_dict(cls, config_dict: Dict[str, Any]):
196
+ """
197
+ Instantiates a `GPTQQuantizer` using config_dict as kwargs
198
+
199
+ Args:
200
+ config_dict (`Dict[str,Any]`):
201
+ quantization config
202
+
203
+ Returns:
204
+ `GPTQQuantizer`: The quantizer object instantiated from those parameters.
205
+ """
206
+ return cls(**config_dict)
207
+
208
+ def convert_model(self, model: nn.Module):
209
+ """
210
+ Convert the model to a GPTQ model by getting and replacing the layers.
211
+
212
+ Args:
213
+ model (`nn.Module`):
214
+ Model to be converted
215
+
216
+ """
217
+ if self.block_name_to_quantize is None:
218
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
219
+ block_name = self.block_name_to_quantize
220
+ layers_to_be_replaced = get_layers(model, prefix=block_name)
221
+ if self.modules_in_block_to_quantize is not None:
222
+ layers_to_keep = sum(self.modules_in_block_to_quantize, [])
223
+ for name in list(layers_to_be_replaced.keys()):
224
+ if not any(name.endswith(layer) for layer in layers_to_keep):
225
+ logger.info(
226
+ f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
227
+ )
228
+ del layers_to_be_replaced[name]
229
+ self._replace_by_quant_layers(model, layers_to_be_replaced)
230
+ return model
231
+
232
+ def get_no_split_module_classes(self, model):
233
+ """
234
+ Get the modules that should not be split across multiple devices.
235
+ Args:
236
+ model (`nn.Module`):
237
+ The input model
238
+ """
239
+
240
+ block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
241
+ no_split_module_classes = [block_class_name]
242
+ return no_split_module_classes
243
+
244
+ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
245
+ """
246
+ Replaces linear layers in `module` by `QuantLinear`
247
+
248
+ Args:
249
+ module (`nn.Module`):
250
+ Module to quantize
251
+ names (`List[str]`):
252
+ List of names of the module to quantize
253
+ name (`str`, defaults to `""`):
254
+ To keep track of the name of the current module
255
+ """
256
+ QuantLinear = dynamically_import_QuantLinear(
257
+ use_triton=False,
258
+ desc_act=self.desc_act,
259
+ group_size=self.group_size,
260
+ bits=self.bits,
261
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
262
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
263
+ )
264
+ if isinstance(module, QuantLinear):
265
+ return
266
+ for attr in dir(module):
267
+ layer = getattr(module, attr)
268
+ name1 = name + "." + attr if name != "" else attr
269
+ if name1 in names:
270
+ device = get_device(layer)
271
+ delattr(module, attr)
272
+ if isinstance(layer, nn.Linear):
273
+ in_features = layer.in_features
274
+ out_features = layer.out_features
275
+ elif isinstance(layer, nn.Conv2d):
276
+ in_features = layer.in_channels
277
+ out_features = layer.out_channels
278
+ elif isinstance(layer, Conv1D):
279
+ in_features = layer.weight.shape[0]
280
+ out_features = layer.weight.shape[1]
281
+ bias = layer.bias is not None
282
+ if not (self.desc_act) or self.group_size == -1:
283
+ new_layer = QuantLinear(
284
+ self.bits,
285
+ self.group_size,
286
+ in_features,
287
+ out_features,
288
+ bias,
289
+ use_cuda_fp16=self.use_cuda_fp16,
290
+ weight_dtype=layer.weight.dtype,
291
+ )
292
+ else:
293
+ new_layer = QuantLinear(
294
+ self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
295
+ )
296
+ new_layer.device = device
297
+ setattr(module, attr, new_layer.to(device))
298
+ for name1, child in module.named_children():
299
+ self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
300
+
301
+ @torch.no_grad()
302
+ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
303
+ """
304
+ Quantizes the model using the dataset
305
+
306
+ Args:
307
+ model (`nn.Module`):
308
+ The model to quantize
309
+ tokenizer (Optional[`Any`], defaults to `None`):
310
+ The tokenizer to use in order to prepare the dataset. You can pass either:
311
+ - A custom tokenizer object.
312
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
313
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
314
+ user or organization name, like `dbmdz/bert-base-german-cased`.
315
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
316
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
317
+ Returns:
318
+ `nn.Module`: The quantized model
319
+ """
320
+
321
+ if not is_auto_gptq_available():
322
+ raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
323
+ if not torch.cuda.is_available():
324
+ raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
325
+
326
+ model.eval()
327
+
328
+ # For Transformer model
329
+ has_config = False
330
+ has_device_map = False
331
+ if hasattr(model, "config"):
332
+ has_config = True
333
+ use_cache = model.config.use_cache
334
+ model.config.use_cache = False
335
+
336
+ # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
337
+ if hasattr(model, "hf_device_map"):
338
+ devices = list(model.hf_device_map.values())
339
+ has_device_map = True
340
+ if "disk" in devices:
341
+ raise ValueError("disk offload is not supported with GPTQ quantization")
342
+ if "cpu" in devices or torch.device("cpu") in devices:
343
+ if len(model.hf_device_map) > 1:
344
+ logger.info("Cpu offload is not recommended. There might be some issues with the memory")
345
+ hook = None
346
+ for name, device in model.hf_device_map.items():
347
+ if device == "cpu":
348
+ module = recurse_getattr(model, name)
349
+ remove_hook_from_module(module, recurse=True)
350
+ module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
351
+ else:
352
+ has_device_map = False
353
+
354
+ if hasattr(model, "dtype"):
355
+ self.use_cuda_fp16 = model.dtype == torch.float16
356
+
357
+ if self.model_seqlen is None:
358
+ # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
359
+ self.model_seqlen = min(4028, get_seqlen(model))
360
+
361
+ device = get_device(model)
362
+
363
+ # Step 1: Prepare the data
364
+ if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
365
+ dataset = self.dataset
366
+ logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
367
+ else:
368
+ if isinstance(tokenizer, str):
369
+ try:
370
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
371
+ except Exception:
372
+ raise ValueError(
373
+ f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
374
+ with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
375
+ For now, we only support quantization for text model. Support for vision, speech and multimodel will come later."""
376
+ )
377
+ if self.dataset is None:
378
+ raise ValueError("You need to pass `dataset` in order to quantize your model")
379
+ elif isinstance(self.dataset, str):
380
+ dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
381
+ elif isinstance(self.dataset, list):
382
+ dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
383
+ else:
384
+ raise ValueError(
385
+ f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
386
+ )
387
+
388
+ dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
389
+
390
+ # Step 2: get the input of the 1st block
391
+ # To do that, we need to put the modules preceding the first block on the same device as the first bloc.
392
+ # Then we run the model and it will stop at the first bloc as we added a prehook that raise an Exception after storing the inputs.
393
+
394
+ layer_inputs = []
395
+ layer_outputs = []
396
+ layer_input_kwargs = []
397
+
398
+ if self.block_name_to_quantize is None:
399
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
400
+
401
+ if self.module_name_preceding_first_block is None:
402
+ self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
403
+
404
+ blocks = recurse_getattr(model, self.block_name_to_quantize)
405
+
406
+ if not has_device_map:
407
+ # put modules from module_name_preceding_first_block on cuda
408
+ for module_name in self.module_name_preceding_first_block:
409
+ module = recurse_getattr(model, module_name)
410
+ if module is None:
411
+ raise ValueError(f"Module {module_name} was not found in model")
412
+ module = module.to(0)
413
+ blocks[0] = blocks[0].to(0)
414
+
415
+ def store_input_hook(_, input, *args):
416
+ kwargs = args[0]
417
+ if input is None:
418
+ if "hidden_states" in kwargs:
419
+ input = (kwargs["hidden_states"],)
420
+ else:
421
+ raise ValueError("No input value found in the foward pass")
422
+ layer_inputs.append(input)
423
+ other_kwargs = {}
424
+ for k, v in kwargs.items(): # make sure other arguments also be captured
425
+ if k not in ["hidden_states"]:
426
+ other_kwargs[k] = v
427
+ layer_input_kwargs.append(other_kwargs)
428
+ raise ValueError
429
+
430
+ if self.cache_block_outputs:
431
+ handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
432
+ for data in dataset:
433
+ for k, v in data.items():
434
+ # put the data on gpu, we won't put them back to cpu
435
+ data[k] = v.to(0)
436
+ try:
437
+ model(**data)
438
+ except ValueError:
439
+ pass
440
+ handle.remove()
441
+
442
+ if not has_device_map:
443
+ blocks[0].to(device)
444
+ for module_name in self.module_name_preceding_first_block:
445
+ module = recurse_getattr(model, module_name)
446
+ if module is None:
447
+ raise ValueError(f"Module {module_name} was not found in model")
448
+
449
+ torch.cuda.empty_cache()
450
+
451
+ # Step 3: Quantize the blocks
452
+ quantizers = {}
453
+ for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
454
+ logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
455
+
456
+ if not self.cache_block_outputs:
457
+ handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
458
+ for data in dataset:
459
+ for k, v in data.items():
460
+ # put the data on gpu, we won't put them back to cpu
461
+ data[k] = v.to(0)
462
+ try:
463
+ model(**data)
464
+ except ValueError:
465
+ pass
466
+ handle.remove()
467
+
468
+ # move block to cuda if needed
469
+ # in case we have offload modules, we need to put them on cuda because of GPTQ object
470
+ if not has_device_map or get_device(block) == torch.device("cpu"):
471
+ block = block.to(0)
472
+ layers = get_layers(block)
473
+ if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
474
+ if self.true_sequential:
475
+ layers_name_list = self.modules_in_block_to_quantize
476
+ else:
477
+ layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
478
+ else:
479
+ if self.true_sequential:
480
+ # lazy sequential but works well
481
+ layers_name_list = [[key] for key in layers.keys()]
482
+ else:
483
+ layers_name_list = [list(layers.keys())]
484
+ logger.info(f"Module to quantize {layers_name_list}")
485
+ for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
486
+ subset_layers = {name: layers[name] for name in subset_name_list}
487
+ gptq = {}
488
+ handles = []
489
+ # add hook for each layer in subset_layers
490
+ for name in subset_layers:
491
+ gptq[name] = GPTQ(subset_layers[name])
492
+ gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
493
+
494
+ def add_batch(name):
495
+ def tmp(_, input, output):
496
+ gptq[name].add_batch(input[0].data, output.data)
497
+
498
+ return tmp
499
+
500
+ # because it adding a hook will replace the old one.
501
+ handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
502
+ # update Hessian for each layer in subset_layers thanks to the hook
503
+ for j in range(len(dataset)):
504
+ # the args are already on the gpu
505
+ # don't need to store the output
506
+ block(*layer_inputs[j], **layer_input_kwargs[j])
507
+ # remove hook
508
+ for h in handles:
509
+ h.remove()
510
+ for name in subset_name_list:
511
+ logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
512
+ scale, zero, g_idx = gptq[name].fasterquant(
513
+ percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
514
+ )
515
+ quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
516
+ gptq[name].quantizer,
517
+ scale,
518
+ zero,
519
+ g_idx,
520
+ )
521
+ gptq[name].free()
522
+ del subset_layers
523
+ # we get the new output from the partial quantized block
524
+ if self.cache_block_outputs:
525
+ for j in range(len(dataset)):
526
+ layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
527
+ layer_outputs.append(layer_output)
528
+
529
+ # put back to device
530
+ if not has_device_map:
531
+ blocks[i] = block.to(device)
532
+ del layers
533
+ del layer_inputs
534
+ layer_inputs, layer_outputs = layer_outputs, []
535
+ else:
536
+ del layers
537
+ del layer_inputs
538
+ layer_inputs = []
539
+ torch.cuda.empty_cache()
540
+ if i==5:
541
+ break
542
+
543
+ if self.bits == 4:
544
+ # device not on gpu
545
+ if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
546
+ if not self.disable_exllama:
547
+ logger.warning(
548
+ "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
549
+ )
550
+ self.disable_exllama = True
551
+ # act order and exllama
552
+ elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
553
+ logger.warning(
554
+ "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
555
+ "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
556
+ )
557
+ self.disable_exllama = True
558
+ elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
559
+ logger.warning(
560
+ "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights. "
561
+ "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
562
+ )
563
+ self.disable_exllama = True
564
+ # Step 4: Pack the model at the end (Replacing the layers)
565
+ self.pack_model(model=model, quantizers=quantizers)
566
+
567
+ model.is_quantized = True
568
+ model.quantization_method = QuantizationMethod.GPTQ
569
+ if has_config:
570
+ model.config.use_cache = use_cache
571
+ model.config.quantization_config = self.to_dict()
572
+
573
+ # Step 5: Any post-initialization that require device information, for example buffers initialization on device.
574
+ model = self.post_init_model(model)
575
+
576
+ torch.cuda.empty_cache()
577
+ return model
578
+
579
+ def post_init_model(self, model):
580
+ """
581
+ Post-initialization that require device information, for example buffers initialization on device.
582
+
583
+ Args:
584
+ model (`nn.Module`):
585
+ The input model
586
+ """
587
+ if self.bits == 4 and not self.disable_exllama:
588
+ if get_device(model) == torch.device("cpu") or (
589
+ hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
590
+ ):
591
+ raise ValueError(
592
+ "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU. "
593
+ "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
594
+ )
595
+
596
+ class StoreAttr(object):
597
+ pass
598
+
599
+ model.quantize_config = StoreAttr()
600
+ model.quantize_config.desc_act = self.desc_act
601
+ model = autogptq_post_init(model, use_act_order=self.desc_act)
602
+ if (
603
+ self.desc_act
604
+ and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
605
+ and self.max_input_length is not None
606
+ ):
607
+ model = exllama_set_max_input_length(model, self.max_input_length)
608
+ return model
609
+
610
+ def pack_model(
611
+ self,
612
+ model: nn.Module,
613
+ quantizers: Dict[str, Tuple],
614
+ ):
615
+ """
616
+ Pack the model by replacing the layers by quantized layers
617
+
618
+ Args:
619
+ model (`nn.Module`):
620
+ The model to pack
621
+ quantizers (`Dict[str,Tuple]`):
622
+ A mapping of the layer name and the data needed to pack the layer
623
+ """
624
+ QuantLinear = dynamically_import_QuantLinear(
625
+ use_triton=False,
626
+ desc_act=self.desc_act,
627
+ group_size=self.group_size,
628
+ bits=self.bits,
629
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
630
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
631
+ )
632
+ logger.info("Packing model...")
633
+ layers = get_layers(model)
634
+ layers = {n: layers[n] for n in quantizers}
635
+ self._replace_by_quant_layers(model, quantizers)
636
+ qlayers = get_layers(model, [QuantLinear])
637
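+ # hack: capture both the pre-pack tensors (fp weight, bias, scale, zero, g_idx) and the packed buffers for offline unpacking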
+ autogptq_blobs = OrderedDict()
638
+ for i, name in enumerate(qlayers):
639
+ logger.info(name)
640
+ quantizers[name], scale, zero, g_idx = quantizers[name]
641
+ # so far can only pack layer on CPU
642
+ layer_device = qlayers[name].device
643
+ qlayers[name].to("cpu")
644
+ layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
645
+ autogptq_blobs[name] = {
646
+ "prepack": dict(
647
+ w=layers[name].weight,
648
+ b=layers[name].bias,
649
+ scale=scale,
650
+ zero=zero,
651
+ g_idx=g_idx
652
+ )
653
+ }
654
+ qlayers[name].pack(layers[name], scale, zero, g_idx)
655
+ autogptq_blobs[name]["pack"] = dict(
656
+ qweight=qlayers[name].qweight,
657
+ bias=qlayers[name].bias,
658
+ scales=qlayers[name].scales,
659
+ qzeros=qlayers[name].qzeros,
660
+ g_idx=qlayers[name].g_idx,
661
+ intweight=qlayers[name].intweight
662
+ )
663
+ qlayers[name].to(layer_device)
664
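+ # hack: pack only the first 6 layers, then dump the captured blobs below and exit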
+ if i==5:
665
+ break
666
+ torch.save(autogptq_blobs, "./opt-125m-gptq4.pth")
667
+ exit()
668
+ logger.info("Model packed.")
669
+
670
+ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
671
+ """
672
+ Save model state dict and configs
673
+
674
+ Args:
675
+ model (`nn.Module`):
676
+ Model to be saved. The model can be wrapped or unwrapped.
677
+ save_dir (`str`):
678
+ Directory to which to save. Will be created if it doesn't exist.
679
+ max_shard_size (`str`, defaults to `"10GB"`):
680
+ The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be
681
+ smaller than this size. If expressed as a string, it needs to be digits followed by a unit (like `"5MB"`).
682
+ <Tip warning={true}>
683
+
684
+ If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
685
+ which will be bigger than `max_shard_size`.
686
+
687
+ </Tip>
688
+ safe_serialization (`bool`, defaults to `True`):
689
+ Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
690
+
691
+ """
692
+ os.makedirs(save_dir, exist_ok=True)
693
+ model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
694
+ with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
695
+ json.dump(self.to_dict(), f, indent=2)
696
+
697
+
698
+ def load_quantized_model(
699
+ model: nn.Module,
700
+ save_folder: str,
701
+ quant_config_name: str = GPTQ_CONFIG,
702
+ state_dict_name: Optional[str] = None,
703
+ device_map: Optional[str] = None,
704
+ max_memory: Optional[Dict] = None,
705
+ no_split_module_classes: Optional[Dict] = None,
706
+ offload_folder: Optional[str] = None,
707
+ offload_buffers: Optional[str] = None,
708
+ offload_state_dict: bool = False,
709
+ disable_exllama: bool = False,
710
+ exllama_config: Optional[Dict[str, Any]] = None,
711
+ max_input_length: Optional[int] = None,
712
+ ):
713
+ """
714
+ Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
715
+
716
+ Args:
717
+ model (`nn.Module`):
718
+ The model can be empty or not.
719
+ save_folder (`str`):
720
+ Directory from which to load the weights.
721
+ quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
722
+ Name of the quantization config file
723
+ state_dict_name (`Optional[str]`, defaults to `None`):
724
+ Name of the state dict file
725
+ device_map (`Optional[str]`, defaults to `None`):
726
+ A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
727
+ name, once a given module name is inside, every submodule of it will be sent to the same device.
728
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
729
+ max_memory (`Optional[Dict]`, defaults to `None`):
730
+ A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available for each GPU
731
+ and the available CPU RAM if unset.
732
+ no_split_module_classes (`Optional[Dict]`, defaults to `None`):
733
+ A list of layer class names that should never be split across devices (for instance any layer that has a
734
+ residual connection).
735
+ offload_folder (`Optional[str]`, defaults to `None`):
736
+ If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
737
+ offload_buffers (`Optional[str]`, defaults to `None`):
738
+ In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
739
+ well as the parameters.
740
+ offload_state_dict (`bool`, defaults to `False`):
741
+ If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
742
+ the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
743
+ picked contains `"disk"` values.
744
+ disable_exllama (`bool`, defaults to `False`):
745
+ Whether to disable the exllama backend. Only works with `bits` = 4.
746
+ exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
747
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
748
+ max_input_length (`Optional[int]`, defaults to `None`):
749
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
750
+ It is specific to the exllama backend with act-order.
751
+
752
+ Returns:
753
+ `nn.Module`: The quantized model
754
+ """
755
+ if not torch.cuda.is_available():
756
+ raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
757
+ if not is_auto_gptq_available():
758
+ raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
759
+ if not is_accelerate_available():
760
+ raise RuntimeError(
761
+ "You need to install accelerate in order to load and dispatch weights to "
762
+ "a quantized model. You can do it with `pip install accelerate`"
763
+ )
764
+ if device_map is None:
765
+ device_map = {"": torch.cuda.current_device()}
766
+ logger.info("The device_map was not initialized. " "Setting device_map to `{'':torch.cuda.current_device()}`.")
767
+
768
+ if exllama_config is None:
769
+ exllama_config = {"version": ExllamaVersion.TWO}
770
+ else:
771
+ if "version" not in exllama_config:
772
+ raise ValueError("`exllama_config` needs to have a `version` key")
773
+ elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
774
+ version = exllama_config["version"]
775
+ raise ValueError(
776
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
777
+ )
778
+
779
+ # this branch will check if model is from huggingface
780
+ try:
781
+ if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
782
+ quantize_config_dict = model.config.quantization_config.to_dict()
783
+ else:
784
+ with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
785
+ quantize_config_dict = json.load(f)
786
+ except Exception as err:
787
+ raise ValueError(
788
+ f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
789
+ ) from err
790
+ quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
791
+ quantizer.disable_exllama = disable_exllama
792
+ quantizer.exllama_config = exllama_config
793
+ quantizer.exllama_version = quantizer.exllama_config["version"]
794
+ quantizer.max_input_length = max_input_length
795
+
796
+ model = quantizer.convert_model(model)
797
+
798
+ if no_split_module_classes is None:
799
+ no_split_module_classes = quantizer.get_no_split_module_classes(model)
800
+
801
+ model = load_checkpoint_and_dispatch(
802
+ model,
803
+ checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
804
+ device_map=device_map,
805
+ max_memory=max_memory,
806
+ no_split_module_classes=no_split_module_classes,
807
+ offload_folder=offload_folder,
808
+ offload_buffers=offload_buffers,
809
+ offload_state_dict=offload_state_dict,
810
+ )
811
+
812
+ model = quantizer.post_init_model(model)
813
+ model.is_quantized = True
814
+ model.quantization_method = QuantizationMethod.GPTQ
815
+ model.eval()
816
+ return model
internal/donttouch_unpacking_autogptq/quantizer.py.ori.py ADDED
@@ -0,0 +1,793 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ import os
17
+ from enum import Enum
18
+ from logging import getLogger
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import nn
23
+ from tqdm.auto import tqdm
24
+ from transformers import AutoTokenizer
25
+ from transformers.pytorch_utils import Conv1D
26
+ from transformers.utils.quantization_config import QuantizationMethod
27
+
28
+ from ..utils import is_accelerate_available, is_auto_gptq_available
29
+ from ..utils.modeling_utils import recurse_getattr
30
+ from .constants import GPTQ_CONFIG
31
+ from .data import get_dataset, prepare_dataset
32
+ from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
33
+
34
+
35
+ if is_accelerate_available():
36
+ from accelerate import (
37
+ cpu_offload_with_hook,
38
+ load_checkpoint_and_dispatch,
39
+ )
40
+ from accelerate.hooks import remove_hook_from_module
41
+
42
+ if is_auto_gptq_available():
43
+ from auto_gptq import exllama_set_max_input_length
44
+ from auto_gptq.modeling._utils import autogptq_post_init
45
+ from auto_gptq.quantization import GPTQ
46
+ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
47
+
48
+ logger = getLogger(__name__)
49
+
50
+
51
+ class ExllamaVersion(int, Enum):
52
+ ONE = 1
53
+ TWO = 2
54
+
55
+
56
+ class GPTQQuantizer(object):
57
+ r"""
58
+ A simple API for GPTQ Quantization
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ bits: int,
64
+ dataset: Optional[Union[List[str], str]] = None,
65
+ group_size: int = 128,
66
+ damp_percent: float = 0.1,
67
+ desc_act: bool = False,
68
+ sym: bool = True,
69
+ true_sequential: bool = True,
70
+ use_cuda_fp16: bool = False,
71
+ model_seqlen: Optional[int] = None,
72
+ block_name_to_quantize: Optional[str] = None,
73
+ module_name_preceding_first_block: Optional[List[str]] = None,
74
+ batch_size: int = 1,
75
+ pad_token_id: Optional[int] = None,
76
+ disable_exllama: bool = False,
77
+ exllama_config: Dict[str, Any] = None,
78
+ max_input_length: Optional[int] = None,
79
+ cache_block_outputs: Optional[bool] = True,
80
+ modules_in_block_to_quantize: Optional[List[List[str]]] = None,
81
+ *args,
82
+ **kwargs,
83
+ ):
84
+ """
85
+ Args:
86
+ bits (`int`):
87
+ The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
88
+ dataset (`Union[List[str], str, Any]`, defaults to `None`):
89
+ The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
90
+ (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
91
+ or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
92
+ group_size (int, defaults to 128):
93
+ The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
94
+ damp_percent (`float`, defaults to `0.1`):
95
+ The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
96
+ desc_act (`bool`, defaults to `False`):
97
+ Whether to quantize columns in order of decreasing activation size.
98
+ Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
99
+ Also known as act-order.
100
+ sym (`bool`, defaults to `True`):
101
+ Whether to use symetric quantization.
102
+ true_sequential (`bool`, defaults to `True`):
103
+ Whether to perform sequential quantization even within a single Transformer block.
104
+ Instead of quantizing the entire block at once, we perform layer-wise quantization.
105
+ As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
106
+ use_cuda_fp16 (`bool`, defaults to `False`):
107
+ Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
108
+ model_seqlen (`Optional[int]`, defaults to `None`):
109
+ The maximum sequence length that the model can take.
110
+ block_name_to_quantize (`Optional[str]`, defaults to `None`):
111
+ The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
112
+ module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
113
+ The layers that are preceding the first Transformer block.
114
+ batch_size (`int`, defaults to `1`):
115
+ The batch size of the dataset
116
+ pad_token_id (`Optional[int]`, defaults to `None`):
117
+ The pad token id. Needed to prepare the dataset when `batch_size` > 1.
118
+ disable_exllama (`bool`, defaults to `False`):
119
+ Whether to use exllama backend. Only works with `bits` = 4.
120
+ exllama_config (`Dict[str, Any]`, *optional*):
121
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
122
+ max_input_length (`Optional[int]`, defaults to `None`):
123
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
124
+ It is specific to the exllama backend with act-order.
125
+ cache_block_outputs (`bool`, defaults to `True`):
126
+ Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
127
+ (e.g. ChatGLM) but can require more time.
128
+ modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
129
+ List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
130
+ The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
131
+ If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
132
+ """
133
+
134
+ self.bits = bits
135
+ self.dataset = dataset
136
+ self.group_size = group_size
137
+ self.damp_percent = damp_percent
138
+ self.desc_act = desc_act
139
+ self.sym = sym
140
+ self.true_sequential = true_sequential
141
+ self.use_cuda_fp16 = use_cuda_fp16
142
+ self.model_seqlen = model_seqlen
143
+ self.block_name_to_quantize = block_name_to_quantize
144
+ self.module_name_preceding_first_block = module_name_preceding_first_block
145
+ self.batch_size = batch_size
146
+ self.pad_token_id = pad_token_id
147
+ self.disable_exllama = disable_exllama
148
+ self.exllama_config = exllama_config
149
+ self.max_input_length = max_input_length
150
+ self.quant_method = QuantizationMethod.GPTQ
151
+ self.cache_block_outputs = cache_block_outputs
152
+ self.modules_in_block_to_quantize = modules_in_block_to_quantize
153
+
154
+ self.serialization_keys = [
155
+ "bits",
156
+ "dataset",
157
+ "group_size",
158
+ "damp_percent",
159
+ "desc_act",
160
+ "sym",
161
+ "true_sequential",
162
+ "quant_method",
163
+ "modules_in_block_to_quantize",
164
+ ]
165
+
166
+ if self.bits not in [2, 3, 4, 8]:
167
+ raise ValueError("only support quantize to [2,3,4,8] bits.")
168
+ if self.group_size != -1 and self.group_size <= 0:
169
+ raise ValueError("group_size must be greater than 0 or equal to -1")
170
+ if not (0 < self.damp_percent < 1):
171
+ raise ValueError("damp_percent must between 0 and 1.")
172
+
173
+ if self.exllama_config is None:
174
+ self.exllama_config = {"version": ExllamaVersion.TWO}
175
+ else:
176
+ if "version" not in self.exllama_config:
177
+ raise ValueError("`exllama_config` needs to have a `version` key")
178
+ elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
179
+ version = self.exllama_config["version"]
180
+ raise ValueError(
181
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
182
+ )
183
+ self.exllama_version = self.exllama_config["version"]
184
+
185
+ def to_dict(self):
186
+ """
187
+ Returns the args in dict format.
188
+ """
189
+ gptq_dict = {}
190
+ for key in self.serialization_keys:
191
+ gptq_dict[key] = getattr(self, key)
192
+ return gptq_dict
193
+
194
+ @classmethod
195
+ def from_dict(cls, config_dict: Dict[str, Any]):
196
+ """
197
+ Instantiates a `GPTQQuantizer` using config_dict as kwargs
198
+
199
+ Args:
200
+ config_dict (`Dict[str,Any]`):
201
+ quantization config
202
+
203
+ Returns:
204
+ `GPTQQuantizer`: The quantizer object instantiated from those parameters.
205
+ """
206
+ return cls(**config_dict)
207
+
208
+ def convert_model(self, model: nn.Module):
209
+ """
210
+ Convert the model to a GPTQ model by getting and replacing the layers.
211
+
212
+ Args:
213
+ model (`nn.Module`):
214
+ Model to be converted
215
+
216
+ """
217
+ if self.block_name_to_quantize is None:
218
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
219
+ block_name = self.block_name_to_quantize
220
+ layers_to_be_replaced = get_layers(model, prefix=block_name)
221
+ if self.modules_in_block_to_quantize is not None:
222
+ layers_to_keep = sum(self.modules_in_block_to_quantize, [])
223
+ for name in list(layers_to_be_replaced.keys()):
224
+ if not any(name.endswith(layer) for layer in layers_to_keep):
225
+ logger.info(
226
+ f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
227
+ )
228
+ del layers_to_be_replaced[name]
229
+ self._replace_by_quant_layers(model, layers_to_be_replaced)
230
+ return model
231
+
232
+ def get_no_split_module_classes(self, model):
233
+ """
234
+ Get the modules that should not be split across multiple devices.
235
+ Args:
236
+ model (`nn.Module`):
237
+ The input model
238
+ """
239
+
240
+ block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
241
+ no_split_module_classes = [block_class_name]
242
+ return no_split_module_classes
243
+
244
+ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
245
+ """
246
+ Replaces linear layers in `module` by `QuantLinear`
247
+
248
+ Args:
249
+ module (`nn.Module`):
250
+ Module to quantize
251
+ names (`List[str]`):
252
+ List of names of the module to quantize
253
+ name (`str`, defaults to `""`):
254
+ To keep track of the name of the current module
255
+ """
256
+ QuantLinear = dynamically_import_QuantLinear(
257
+ use_triton=False,
258
+ desc_act=self.desc_act,
259
+ group_size=self.group_size,
260
+ bits=self.bits,
261
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
262
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
263
+ )
264
+ if isinstance(module, QuantLinear):
265
+ return
266
+ for attr in dir(module):
267
+ layer = getattr(module, attr)
268
+ name1 = name + "." + attr if name != "" else attr
269
+ if name1 in names:
270
+ device = get_device(layer)
271
+ delattr(module, attr)
272
+ if isinstance(layer, nn.Linear):
273
+ in_features = layer.in_features
274
+ out_features = layer.out_features
275
+ elif isinstance(layer, nn.Conv2d):
276
+ in_features = layer.in_channels
277
+ out_features = layer.out_channels
278
+ elif isinstance(layer, Conv1D):
279
+ in_features = layer.weight.shape[0]
280
+ out_features = layer.weight.shape[1]
281
+ bias = layer.bias is not None
282
+ if not (self.desc_act) or self.group_size == -1:
283
+ new_layer = QuantLinear(
284
+ self.bits,
285
+ self.group_size,
286
+ in_features,
287
+ out_features,
288
+ bias,
289
+ use_cuda_fp16=self.use_cuda_fp16,
290
+ weight_dtype=layer.weight.dtype,
291
+ )
292
+ else:
293
+ new_layer = QuantLinear(
294
+ self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
295
+ )
296
+ new_layer.device = device
297
+ setattr(module, attr, new_layer.to(device))
298
+ for name1, child in module.named_children():
299
+ self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
300
+
301
+ @torch.no_grad()
302
+ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
303
+ """
304
+ Quantizes the model using the dataset
305
+
306
+ Args:
307
+ model (`nn.Module`):
308
+ The model to quantize
309
+ tokenizer (Optional[`Any`], defaults to `None`):
310
+ The tokenizer to use in order to prepare the dataset. You can pass either:
311
+ - A custom tokenizer object.
312
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
313
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
314
+ user or organization name, like `dbmdz/bert-base-german-cased`.
315
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
316
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
317
+ Returns:
318
+ `nn.Module`: The quantized model
319
+ """
320
+
321
+ if not is_auto_gptq_available():
322
+ raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
323
+ if not torch.cuda.is_available():
324
+ raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
325
+
326
+ model.eval()
327
+
328
+ # For Transformer model
329
+ has_config = False
330
+ has_device_map = False
331
+ if hasattr(model, "config"):
332
+ has_config = True
333
+ use_cache = model.config.use_cache
334
+ model.config.use_cache = False
335
+
336
+ # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
337
+ if hasattr(model, "hf_device_map"):
338
+ devices = list(model.hf_device_map.values())
339
+ has_device_map = True
340
+ if "disk" in devices:
341
+ raise ValueError("disk offload is not supported with GPTQ quantization")
342
+ if "cpu" in devices or torch.device("cpu") in devices:
343
+ if len(model.hf_device_map) > 1:
344
+ logger.info("Cpu offload is not recommended. There might be some issues with the memory")
345
+ hook = None
346
+ for name, device in model.hf_device_map.items():
347
+ if device == "cpu":
348
+ module = recurse_getattr(model, name)
349
+ remove_hook_from_module(module, recurse=True)
350
+ module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
351
+ else:
352
+ has_device_map = False
353
+
354
+ if hasattr(model, "dtype"):
355
+ self.use_cuda_fp16 = model.dtype == torch.float16
356
+
357
+ if self.model_seqlen is None:
358
+ # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
359
+ self.model_seqlen = min(4028, get_seqlen(model))
360
+
361
+ device = get_device(model)
362
+
363
+ # Step 1: Prepare the data
364
+ if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
365
+ dataset = self.dataset
366
+ logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
367
+ else:
368
+ if isinstance(tokenizer, str):
369
+ try:
370
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
371
+ except Exception:
372
+ raise ValueError(
373
+ f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
374
+ with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
375
+ For now, we only support quantization for text model. Support for vision, speech and multimodel will come later."""
376
+ )
377
+ if self.dataset is None:
378
+ raise ValueError("You need to pass `dataset` in order to quantize your model")
379
+ elif isinstance(self.dataset, str):
380
+ dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
381
+ elif isinstance(self.dataset, list):
382
+ dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
383
+ else:
384
+ raise ValueError(
385
+ f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
386
+ )
387
+
388
+ dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
389
+
390
+ # Step 2: get the input of the 1st block
391
+ # To do that, we need to put the modules preceding the first block on the same device as the first bloc.
392
+ # Then we run the model and it will stop at the first bloc as we added a prehook that raise an Exception after storing the inputs.
393
+
394
+ layer_inputs = []
395
+ layer_outputs = []
396
+ layer_input_kwargs = []
397
+
398
+ if self.block_name_to_quantize is None:
399
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
400
+
401
+ if self.module_name_preceding_first_block is None:
402
+ self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
403
+
404
+ blocks = recurse_getattr(model, self.block_name_to_quantize)
405
+
406
+ if not has_device_map:
407
+ # put modules from module_name_preceding_first_block on cuda
408
+ for module_name in self.module_name_preceding_first_block:
409
+ module = recurse_getattr(model, module_name)
410
+ if module is None:
411
+ raise ValueError(f"Module {module_name} was not found in model")
412
+ module = module.to(0)
413
+ blocks[0] = blocks[0].to(0)
414
+
415
+ def store_input_hook(_, input, *args):
416
+ kwargs = args[0]
417
+ if input is None:
418
+ if "hidden_states" in kwargs:
419
+ input = (kwargs["hidden_states"],)
420
+ else:
421
+ raise ValueError("No input value found in the foward pass")
422
+ layer_inputs.append(input)
423
+ other_kwargs = {}
424
+ for k, v in kwargs.items(): # make sure other arguments also be captured
425
+ if k not in ["hidden_states"]:
426
+ other_kwargs[k] = v
427
+ layer_input_kwargs.append(other_kwargs)
428
+ raise ValueError
429
+
430
+ if self.cache_block_outputs:
431
+ handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
432
+ for data in dataset:
433
+ for k, v in data.items():
434
+ # put the data on gpu, we won't put them back to cpu
435
+ data[k] = v.to(0)
436
+ try:
437
+ model(**data)
438
+ except ValueError:
439
+ pass
440
+ handle.remove()
441
+
442
+ if not has_device_map:
443
+ blocks[0].to(device)
444
+ for module_name in self.module_name_preceding_first_block:
445
+ module = recurse_getattr(model, module_name)
446
+ if module is None:
447
+ raise ValueError(f"Module {module_name} was not found in model")
448
+
449
+ torch.cuda.empty_cache()
450
+
451
+ # Step 3: Quantize the blocks
452
+ quantizers = {}
453
+ for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
454
+ logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
455
+
456
+ if not self.cache_block_outputs:
457
+ handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
458
+ for data in dataset:
459
+ for k, v in data.items():
460
+ # put the data on gpu, we won't put them back to cpu
461
+ data[k] = v.to(0)
462
+ try:
463
+ model(**data)
464
+ except ValueError:
465
+ pass
466
+ handle.remove()
467
+
468
+ # move block to cuda if needed
469
+ # in case we have offload modules, we need to put them on cuda because of GPTQ object
470
+ if not has_device_map or get_device(block) == torch.device("cpu"):
471
+ block = block.to(0)
472
+ layers = get_layers(block)
473
+ if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
474
+ if self.true_sequential:
475
+ layers_name_list = self.modules_in_block_to_quantize
476
+ else:
477
+ layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
478
+ else:
479
+ if self.true_sequential:
480
+ # lazy sequential but works well
481
+ layers_name_list = [[key] for key in layers.keys()]
482
+ else:
483
+ layers_name_list = [list(layers.keys())]
484
+ logger.info(f"Module to quantize {layers_name_list}")
485
+ for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
486
+ subset_layers = {name: layers[name] for name in subset_name_list}
487
+ gptq = {}
488
+ handles = []
489
+ # add hook for each layer in subset_layers
490
+ for name in subset_layers:
491
+ gptq[name] = GPTQ(subset_layers[name])
492
+ gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
493
+
494
+ def add_batch(name):
495
+ def tmp(_, input, output):
496
+ gptq[name].add_batch(input[0].data, output.data)
497
+
498
+ return tmp
499
+
500
+ # because it adding a hook will replace the old one.
501
+ handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
502
+ # update Hessian for each layer in subset_layers thanks to the hook
503
+ for j in range(len(dataset)):
504
+ # the args are already on the gpu
505
+ # don't need to store the output
506
+ block(*layer_inputs[j], **layer_input_kwargs[j])
507
+ # remove hook
508
+ for h in handles:
509
+ h.remove()
510
+ for name in subset_name_list:
511
+ logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
512
+ scale, zero, g_idx = gptq[name].fasterquant(
513
+ percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
514
+ )
515
+ quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
516
+ gptq[name].quantizer,
517
+ scale,
518
+ zero,
519
+ g_idx,
520
+ )
521
+ gptq[name].free()
522
+ del subset_layers
523
+ # we get the new output from the partial quantized block
524
+ if self.cache_block_outputs:
525
+ for j in range(len(dataset)):
526
+ layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
527
+ layer_outputs.append(layer_output)
528
+
529
+ # put back to device
530
+ if not has_device_map:
531
+ blocks[i] = block.to(device)
532
+ del layers
533
+ del layer_inputs
534
+ layer_inputs, layer_outputs = layer_outputs, []
535
+ else:
536
+ del layers
537
+ del layer_inputs
538
+ layer_inputs = []
539
+ torch.cuda.empty_cache()
540
+
541
+ if self.bits == 4:
542
+ # device not on gpu
543
+ if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
544
+ if not self.disable_exllama:
545
+ logger.warning(
546
+ "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
547
+ )
548
+ self.disable_exllama = True
549
+ # act order and exllama
550
+ elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
551
+ logger.warning(
552
+ "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
553
+ "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
554
+ )
555
+ self.disable_exllama = True
556
+ elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
557
+ logger.warning(
558
+ "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
559
+ "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
560
+ )
561
+ self.disable_exllama = True
562
+ # Step 4: Pack the model at the end (Replacing the layers)
563
+ self.pack_model(model=model, quantizers=quantizers)
564
+
565
+ model.is_quantized = True
566
+ model.quantization_method = QuantizationMethod.GPTQ
567
+ if has_config:
568
+ model.config.use_cache = use_cache
569
+ model.config.quantization_config = self.to_dict()
570
+
571
+ # Step 5: Any post-initialization that require device information, for example buffers initialization on device.
572
+ model = self.post_init_model(model)
573
+
574
+ torch.cuda.empty_cache()
575
+ return model
576
+
577
+ def post_init_model(self, model):
578
+ """
579
+ Post-initialization that require device information, for example buffers initialization on device.
580
+
581
+ Args:
582
+ model (`nn.Module`):
583
+ The input model
584
+ """
585
+ if self.bits == 4 and not self.disable_exllama:
586
+ if get_device(model) == torch.device("cpu") or (
587
+ hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
588
+ ):
589
+ raise ValueError(
590
+ "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
591
+ "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
592
+ )
593
+
594
+ class StoreAttr(object):
595
+ pass
596
+
597
+ model.quantize_config = StoreAttr()
598
+ model.quantize_config.desc_act = self.desc_act
599
+ model = autogptq_post_init(model, use_act_order=self.desc_act)
600
+ if (
601
+ self.desc_act
602
+ and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
603
+ and self.max_input_length is not None
604
+ ):
605
+ model = exllama_set_max_input_length(model, self.max_input_length)
606
+ return model
607
+
608
+ def pack_model(
609
+ self,
610
+ model: nn.Module,
611
+ quantizers: Dict[str, Tuple],
612
+ ):
613
+ """
614
+ Pack the model by replacing the layers by quantized layers
615
+
616
+ Args:
617
+ model (`nn.Module`):
618
+ The model to pack
619
+ quantizers (`Dict[str,Tuple]`):
620
+ A mapping of the layer name and the data needed to pack the layer
621
+ """
622
+ QuantLinear = dynamically_import_QuantLinear(
623
+ use_triton=False,
624
+ desc_act=self.desc_act,
625
+ group_size=self.group_size,
626
+ bits=self.bits,
627
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
628
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
629
+ )
630
+ logger.info("Packing model...")
631
+ layers = get_layers(model)
632
+ layers = {n: layers[n] for n in quantizers}
633
+ self._replace_by_quant_layers(model, quantizers)
634
+ qlayers = get_layers(model, [QuantLinear])
635
+ for name in qlayers:
636
+ logger.info(name)
637
+ quantizers[name], scale, zero, g_idx = quantizers[name]
638
+ # so far can only pack layer on CPU
639
+ layer_device = qlayers[name].device
640
+ qlayers[name].to("cpu")
641
+ layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
642
+ qlayers[name].pack(layers[name], scale, zero, g_idx)
643
+ qlayers[name].to(layer_device)
644
+
645
+ logger.info("Model packed.")
646
+
647
+ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
648
+ """
649
+ Save model state dict and configs
650
+
651
+ Args:
652
+ model (`nn.Module`):
653
+ Model to be saved. The model can be wrapped or unwraped.
654
+ save_dir (`str`):
655
+ Directory to which to save. Will be created if it doesn't exist.
656
+ max_shard_size (`str`, defaults to `"10GB"`):
657
+ The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
658
+ lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
659
+ <Tip warning={true}>
660
+
661
+ If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
662
+ which will be bigger than `max_shard_size`.
663
+
664
+ </Tip>
665
+ safe_serialization (`bool`, defaults to `True`):
666
+ Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
667
+
668
+ """
669
+ os.makedirs(save_dir, exist_ok=True)
670
+ model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
671
+ with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
672
+ json.dump(self.to_dict(), f, indent=2)
673
+
674
+
675
+ def load_quantized_model(
676
+ model: nn.Module,
677
+ save_folder: str,
678
+ quant_config_name: str = GPTQ_CONFIG,
679
+ state_dict_name: Optional[str] = None,
680
+ device_map: Optional[str] = None,
681
+ max_memory: Optional[Dict] = None,
682
+ no_split_module_classes: Optional[Dict] = None,
683
+ offload_folder: Optional[str] = None,
684
+ offload_buffers: Optional[str] = None,
685
+ offload_state_dict: bool = False,
686
+ disable_exllama: bool = False,
687
+ exllama_config: Optional[Dict[str, Any]] = None,
688
+ max_input_length: Optional[int] = None,
689
+ ):
690
+ """
691
+ Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
692
+
693
+ Args:
694
+ model (`nn.Module`):
695
+ The model can be enpty or not.
696
+ save_folder (`str`):
697
+ Directory to which to load the weights.
698
+ quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
699
+ Name of the quantization config file
700
+ state_dict_name (`Optional[str]`, defaults to `None`):
701
+ Name of the state dict file
702
+ device_map (`Optional[str]`, defaults to `None`):
703
+ A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
704
+ name, once a given module name is inside, every submodule of it will be sent to the same device.
705
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
706
+ max_memory (`Optional[Dict]`, defaults to `None`):
707
+ A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
708
+ and the available CPU RAM if unset.
709
+ no_split_module_classes (`Optional[Dict]`, defaults to `None`):
710
+ A list of layer class names that should never be split across device (for instance any layer that has a
711
+ residual connection).
712
+ offload_folder (`Optional[str]`, defaults to `None`):
713
+ If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
714
+ offload_buffers (`Optional[str]`, defaults to `None`):
715
+ In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
716
+ well as the parameters.
717
+ offload_state_dict (`bool`, defaults to `False`):
718
+ If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
719
+ the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
720
+ picked contains `"disk"` values.
721
+ disable_exllama (`Optional[bool]`, defaults to `None`):
722
+ Whether to use exllama backend. Only works with `bits` = 4.
723
+ exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
724
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
725
+ max_input_length (`Optional[int]`, defaults to `None`):
726
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
727
+ It is specific to the exllama backend with act-order.
728
+
729
+ Returns:
730
+ `nn.Module`: The quantized model
731
+ """
732
+ if not torch.cuda.is_available():
733
+ raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
734
+ if not is_auto_gptq_available():
735
+ raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
736
+ if not is_accelerate_available():
737
+ raise RuntimeError(
738
+ "You need to install accelerate in order to load and dispatch weights to"
739
+ "a quantized model. You can do it with `pip install accelerate`"
740
+ )
741
+ if device_map is None:
742
+ device_map = {"": torch.cuda.current_device()}
743
+ logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")
744
+
745
+ if exllama_config is None:
746
+ exllama_config = {"version": ExllamaVersion.TWO}
747
+ else:
748
+ if "version" not in exllama_config:
749
+ raise ValueError("`exllama_config` needs to have a `version` key")
750
+ elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
751
+ version = exllama_config["version"]
752
+ raise ValueError(
753
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
754
+ )
755
+
756
+ # this branch will check if model is from huggingface
757
+ try:
758
+ if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
759
+ quantize_config_dict = model.config.quantization_config.to_dict()
760
+ else:
761
+ with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
762
+ quantize_config_dict = json.load(f)
763
+ except Exception as err:
764
+ raise ValueError(
765
+ f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
766
+ ) from err
767
+ quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
768
+ quantizer.disable_exllama = disable_exllama
769
+ quantizer.exllama_config = exllama_config
770
+ quantizer.exllama_version = quantizer.exllama_config["version"]
771
+ quantizer.max_input_length = max_input_length
772
+
773
+ model = quantizer.convert_model(model)
774
+
775
+ if no_split_module_classes is None:
776
+ no_split_module_classes = quantizer.get_no_split_module_classes(model)
777
+
778
+ model = load_checkpoint_and_dispatch(
779
+ model,
780
+ checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
781
+ device_map=device_map,
782
+ max_memory=max_memory,
783
+ no_split_module_classes=no_split_module_classes,
784
+ offload_folder=offload_folder,
785
+ offload_buffers=offload_buffers,
786
+ offload_state_dict=offload_state_dict,
787
+ )
788
+
789
+ model = quantizer.post_init_model(model)
790
+ model.is_quantized = True
791
+ model.quantization_method = QuantizationMethod.GPTQ
792
+ model.eval()
793
+ return model
internal/donttouch_unpacking_autogptq/readme.md ADDED
@@ -0,0 +1,12 @@
1
+ use autogpt_sample.py to dump opt-125m-gptq4.pth
2
+ but before that, a few files need to be hacked:
3
+
4
+ patch the files below according to the delta
5
+ /data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/optimum/gptq/quantizer.py
6
+ /data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/auto_gptq/nn_modules/qlinear/
7
+
8
+ then use blob_manipulate.py
9
+
10
+ verify_unpacking_logic.py
11
+
12
+ fake_dequantize.py
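+
+ rough order of commands (a sketch; assumes the patched environment above is active):
+
+     python autogpt_sample.py          # dumps ./opt-125m-gptq4.pth via the hacked pack_model()
+     python blob_manipulate.py         # unpacks and inspects one layer's prepack/pack tensors
+     python verify_unpacking_logic.py  # checks unpacking and dequantization against prepack['w']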
internal/donttouch_unpacking_autogptq/run_sqft.py ADDED
@@ -0,0 +1,101 @@
1
+ import transformers
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
6
+ from fake_dequantize import fake_dequantize
7
+ from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear
8
+
9
+ DEBUG=False
10
+
11
+ class SparseCompressLinear(nn.Linear):
12
+ def __init__(self, in_features, out_features, bias=True, verbose=DEBUG):
13
+ super(SparseCompressLinear, self).__init__(in_features, out_features, bias)
14
+ self.verbose = verbose # for debug
15
+
16
+ def forward(self, input):
17
+ if self.verbose is True:
18
+ print("SparseCompressLinear Forward!")
19
+ return super(SparseCompressLinear, self).forward(input)
20
+
21
+ def __repr__(self):
22
+ # Custom print out
23
+ return f"SparseCompressLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None})"
24
+
25
+
26
+ def make_linear_from_QuantLinear(QuantLinearObj):
27
+ device = QuantLinearObj.scales.device
28
+
29
+ qweight = QuantLinearObj.qweight
30
+ scales = QuantLinearObj.scales
31
+ qzeros = QuantLinearObj.qzeros
32
+
33
+ with torch.no_grad():
34
+ W, scales, zeros = fake_dequantize(qweight, scales, qzeros)
35
+ IC, OC = W.shape
36
+
37
+ linear = SparseCompressLinear(in_features=IC, out_features=OC, bias=(QuantLinearObj.bias is not None))
38
+
39
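+ # nn.Linear stores weight as (out_features, in_features) while W comes back as (IC, OC), hence the transpose below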
+ assert linear.weight.shape == W.t().shape, "Logical Error"
40
+ linear.weight.data = W.t().contiguous()
41
+
42
+ if QuantLinearObj.bias is not None:
43
+ linear.bias.data = QuantLinearObj.bias
44
+
45
+ linear.register_buffer("scales", scales)
46
+ linear.register_buffer("zeros", zeros)
47
+
48
+ return linear.to(device)
49
+
50
+
51
+ def replace_QuantLinear_with_SparseCompressLinear(model):
52
+ for name, module in model.named_children():
53
+ if isinstance(module, QuantLinear):
54
+ if DEBUG is True:
55
+ print(f"Restoring {name}")
56
+ restored_linear = make_linear_from_QuantLinear(module)
57
+ restored_linear = restored_linear.to(torch.float16) #TODO: Hardcoding
58
+ setattr(model, name, restored_linear)
59
+ else:
60
+ # Recursively apply to child modules
61
+ replace_QuantLinear_with_SparseCompressLinear(module)
62
+ return model
63
+
64
+
65
+ if __name__ == "__main__":
66
+
67
+ # model_id = "/data4/vchua/hf-model/Meta-Llama-3-8B-Instruct"
68
+ # model_id = "/data4/vchua/hf-model/Meta-Llama-3-70B"
69
+
70
+ model_id = "/home/vchua/sqft-qa-sparsepeft-llama-3-8b-50-gptq-gsm8k"
71
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")
72
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
73
+
74
+ prompt = "Alan Turing theorized that computers would one day become"
75
+ input_ids = tokenizer([prompt]).input_ids
76
+ input_ids = torch.as_tensor(input_ids)
77
+
78
+ # -----------------------------------------
79
+ output_ids = model.generate(
80
+ input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
81
+ )
82
+
83
+ output_sqft = tokenizer.batch_decode(output_ids.cpu())
84
+ print(f"\n++ Baseline sqft output:\n\n{output_sqft[0]}\n\n")
85
+
86
+ # -----------------------------------------
87
+ replace_QuantLinear_with_SparseCompressLinear(model)
88
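+ # sanity check: after swapping QuantLinear for fp16 Linear built from fake-dequantized weights, generation should match the baseline above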
+ output_ids = model.generate(
89
+ input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
90
+ )
91
+
92
+ output_fake_dequantize = tokenizer.batch_decode(output_ids.cpu())
93
+ print(f"\n++ fake dequantize sqft output:\n\n{output_fake_dequantize[0]}\n\n")
94
+
95
+ tx1mlp = model.model.layers[0].mlp
96
+ torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
97
+ # -----------------------------------------
98
+ print()
99
+
100
+
101
+ # torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
internal/donttouch_unpacking_autogptq/verify_unpacking_logic.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ blob = torch.load("./opt-125m-gptq4.pth")
5
+
6
+ def verify_unpack_logic(prepack, pack, nbit=4):
7
+ numel_per_int32 = 32//nbit
8
+
9
+ qweight = pack['qweight'].numpy()
10
+ scales = pack['scales'].numpy() #(ngroup, OC)
11
+ qzeros = pack['qzeros'].numpy() #(ngroup, OC//numel_per_int32)
12
+
13
+ IC = qweight.shape[0]*numel_per_int32
14
+ OC = qweight.shape[1]
15
+ group_size = IC//scales.shape[0]
16
+
17
+ qweight_unpack = np.zeros((IC,OC), dtype=np.float32)
18
+ for row in range(0, qweight.shape[0]):
19
+ for k in range(0, numel_per_int32):
20
+ qweight_unpack[row*numel_per_int32+k, :] = ((qweight[row] >> k*nbit) & 0xF).astype(np.float32) # read as int32 and cast to float32
21
+
22
+ intweight_match = torch.allclose(
23
+ torch.from_numpy(qweight_unpack).to(torch.int32),
24
+ torch.from_numpy(pack['intweight'].astype(np.int32))
25
+ )
26
+
27
+ assert intweight_match, "intweight and qweight_unpack do not match! pls debug"
28
+
29
+ scales_float = scales.astype(np.float32)
30
+
31
+ # TODO: verify with asym zero point. sym zero points are all identical
32
+ qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
33
+ for i in range(0, numel_per_int32):
34
+ # shift multiplier
35
+ shift_multiplier = numel_per_int32 - 1 - i
36
+ shift_by = shift_multiplier * nbit
37
+ qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32) # read as int32 and cast to float32
38
+ qzeros_unpack += 1 # AutoGPTQ stores (zero - 1) in qzeros, so add 1 back to recover the zero point
39
+
40
+ qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
41
+ qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
42
+ scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
43
+
44
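+ # dequantize row by row: w[i] = (q[i] - zero[g]) * scale[g], where g = i // group_size is the group of row i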
+ deqweight_unpack = torch.zeros((IC,OC), dtype=torch.float16)
45
+ for i in range(IC):
46
+ gid = i//group_size
47
+ deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]
48
+
49
+ simulated_match = torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005)
50
+
51
+ assert simulated_match, "prepack['w'] and deqweight_unpack do not match! pls debug"
52
+
53
+ print(f"intweight_match: {intweight_match}, simulated_match: {simulated_match}")
54
+
55
+
56
+ for layer, lblob in blob.items():
57
+ print(f"\n\n--> {layer}")
58
+ prepack = lblob['prepack']
59
+ pack = lblob['pack']
60
+
61
+ # for k, v in prepack.items():
62
+ # print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
63
+
64
+ # for k, v in pack.items():
65
+ # print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
66
+
67
+ verify_unpack_logic(prepack, pack)
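Note: the row-by-row loop in `verify_unpack_logic` is easy to read but slow for a large layer. A vectorized NumPy equivalent of the same 4-bit `qweight` unpacking is sketched below for illustration only; it is not one of the repo scripts, and `unpack_int4_qweight` is a hypothetical helper name.

```python
import numpy as np

def unpack_int4_qweight(qweight: np.ndarray, nbit: int = 4) -> np.ndarray:
    """Vectorized form of the per-row loop: (IC//8, OC) int32 -> (IC, OC) float32 codes."""
    numel_per_int32 = 32 // nbit
    shifts = np.arange(numel_per_int32, dtype=np.int32) * nbit  # [0, 4, 8, ..., 28]
    # broadcast to (rows, 8, OC): entry [r, k, :] equals (qweight[r] >> k*nbit) & 0xF
    nibbles = (qweight[:, None, :] >> shifts[None, :, None]) & 0xF
    return nibbles.reshape(-1, qweight.shape[1]).astype(np.float32)
```

It should reproduce `qweight_unpack` above, e.g. `np.array_equal(unpack_int4_qweight(pack['qweight'].numpy()), qweight_unpack)`.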
internal/pack_sparse_linear.py ADDED
@@ -0,0 +1,251 @@
1
+ import torch
2
+ import numpy as np
3
+ import os
4
+
5
+ def calc_sparsity(tensor):
6
+ if isinstance(tensor, torch.Tensor):
7
+ nnz = tensor.count_nonzero()
8
+ rate = 1-(nnz/tensor.numel())
9
+ return rate.item(), nnz
10
+ else:
11
+ nnz = np.count_nonzero(tensor)
12
+ rate = 1-(nnz/tensor.size)
13
+ return rate, nnz
14
+
15
+ if __name__ == "__main__":
16
+ sd = torch.load("./sqft_llama3_8B_gptq_tx1_mlp.pth")
17
+
18
+ for k,v in sd.items():
19
+ print(k)
20
+
21
+ weight = sd['up_proj.weight'] # OC x IC
22
+ scales = sd['up_proj.scales'] # n_group x OC
23
+ zeros = sd['up_proj.zeros'] # n_group x OC
24
+
25
+ nbit=4
26
+ OC, IC = weight.shape
27
+ numel_per_int32 = 32//nbit
28
+ # 16x128B tile: 16 output channels by 128 bytes (128*8/nbit = 256 int4 values) per row
29
+ stride_oc = 16
30
+ stride_ic = 128 * 8 // nbit
31
+
32
+ # always make contiguous!
33
+ weight = weight.contiguous() # OC x IC
34
+ scales = scales.t().contiguous() # OC x n_group
35
+ zeros = zeros.t().contiguous() # OC x n_group
36
+
37
+ # TODO: hardcoded/temporary. Livia requires a group size of 32 but our model uses 128, so we repeat each scale/zero value 4x
38
+ group_size = 32
39
+ scales = scales.repeat_interleave(4, dim=1)
40
+ zeros = zeros.repeat_interleave(4, dim=1)
41
+
42
+ # Tile weight into target block size
43
+ tiled_weight = weight.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic, stride_ic)
44
+ tiled_scales = scales.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
45
+ tiled_zeros = zeros.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
46
+
47
+ assert tiled_weight.shape[:2] == tiled_scales.shape[:2], "pls debug"
48
+ assert tiled_weight.shape[:2] == tiled_zeros.shape[:2], "pls debug"
49
+
50
+ tiled_qweight = torch.zeros_like(tiled_weight)
51
+ tiled_bitmap = torch.zeros_like(tiled_weight).to(torch.bool)
52
+ tiled_nnz = torch.zeros(tiled_weight.shape[:2]).to(torch.int16)
53
+
54
+     non_zero_removed_tiled_qweight = torch.zeros_like(tiled_weight) # quantized tiles before zero removal, kept for later verification
55
+ for tile_r in range(0, tiled_weight.shape[0]):
56
+ for tile_c in range(0, tiled_weight.shape[1]):
57
+
58
+ # metadata: number of non-zero elements (nnz)
59
+ sparsity, nnz = calc_sparsity(tiled_weight[tile_r, tile_c])
60
+ print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")
61
+
62
+ # metadata: generate bitmask
63
+ nonzero_bool = (tiled_weight[tile_r, tile_c] != 0)
64
+ assert nonzero_bool.sum() == nnz, "pls debug"
65
+ tiled_bitmap[tile_r, tile_c] = nonzero_bool
66
+ tiled_nnz[tile_r, tile_c] = nnz
67
+
68
+ r = tile_r
69
+ c = tile_c
70
+
71
+ # get quantize val
72
+ w = tiled_weight[r, c]
73
+ qw = torch.zeros_like(tiled_weight[r, c])
74
+ s = tiled_scales[r, c]
75
+ z = tiled_zeros[r, c]
76
+
77
+ # for every column of groups
78
+ for col in range(tiled_scales.shape[-1]):
79
+ sidx = col*group_size
80
+ eidx = (col+1)*group_size
81
+
82
+                 # unsqueeze is needed to broadcast the per-group scale/zero as a column vector
83
+ qw[:, sidx:eidx] = ( w[:, sidx:eidx] + (s[:,col]*z[:,col]).unsqueeze(-1) ) / s[:,col].unsqueeze(-1)
84
+
85
+ #for debug
86
+ non_zero_removed_tiled_qweight[r, c]=qw
87
+
88
+ # Zero Removal and pad to tile length (per Livia's request)
89
+ assert len(qw[nonzero_bool]) == nnz, "pls debug"
90
+             compress_qw = (torch.ones_like(qw)*8).reshape(-1) # the int4 code 8 dequantizes to 0.0 (symmetric W4), so filling with 8 gives the padding effect
91
+ compress_qw[:nnz] = qw[nonzero_bool]
92
+ assert (compress_qw != 8).sum() == nnz, "pls debug"
93
+ compress_qw = compress_qw.reshape(qw.shape)
94
+
95
+ tiled_qweight[r, c] = compress_qw
96
+ # nnz
97
+ # scale
98
+ # zeros
99
+
100
+ tiled_qweight = tiled_qweight.to(torch.int32).contiguous()
101
+ tiled_zeros = tiled_zeros.to(torch.int32).contiguous()
102
+ tiled_scales = tiled_scales.to(torch.float16).contiguous()
103
+ tiled_bitmap = tiled_bitmap.to(torch.int32).contiguous()
104
+ tiled_nnz = tiled_nnz.to(torch.int16).contiguous()
105
+
106
+
107
+ linear_nnz = tiled_nnz
108
+ linear_scales = tiled_scales.reshape(-1)
109
+
110
+ linear_qweight = tiled_qweight.reshape(-1).reshape(-1, 8).cpu().numpy()
111
+ linear_qweight_pack = np.zeros((linear_qweight.shape[0], 1), dtype=np.int32)
112
+ for i in range(0, numel_per_int32):
113
+ linear_qweight_pack[:, 0] |= linear_qweight[:, i] << (numel_per_int32 - 1 - i)*nbit
114
+ linear_qweight_pack = linear_qweight_pack.reshape(-1)
115
+
116
+ linear_zeros = tiled_zeros.reshape(-1).reshape(-1, 8).cpu().numpy()
117
+ linear_zeros_pack = np.zeros((linear_zeros.shape[0], 1), dtype=np.int32)
118
+ for i in range(0, numel_per_int32):
119
+ linear_zeros_pack[:, 0] |= linear_zeros[:, i] << (numel_per_int32 - 1 - i)*nbit
120
+ linear_zeros_pack = linear_zeros_pack.reshape(-1)
121
+
122
+     linear_bitmap = tiled_bitmap.reshape(-1).reshape(-1, 32).cpu().numpy() # 32 bitmap bits are packed into each int32 word
123
+ linear_bitmap_pack = np.zeros((linear_bitmap.shape[0], 1), dtype=np.int32)
124
+ for i in range(0, 32):
125
+ linear_bitmap_pack[:, 0] |= linear_bitmap[:, i] << (32 - 1 - i)
126
+ linear_bitmap_pack = linear_bitmap_pack.reshape(-1)
127
+
128
+ os.makedirs("sparse_w4", exist_ok=True)
129
+ linear_qweight_pack.tofile('sparse_w4/linear_compressed_qweight_int32.bin')
130
+ linear_zeros_pack.tofile('sparse_w4/linear_zeros_int32.bin')
131
+ linear_scales.cpu().contiguous().numpy().tofile('sparse_w4/linear_scales_float16.bin')
132
+ linear_bitmap_pack.tofile('sparse_w4/linear_bitmap_int32.bin')
133
+ linear_nnz.cpu().contiguous().numpy().tofile('sparse_w4/linear_nnz_int16.bin')
134
+
135
+     print("serialized blobs written to sparse_w4/")
136
+
137
+ loaded_linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
138
+ loaded_tiled_nnz = loaded_linear_nnz.reshape(896,16)
139
+
140
+ assert torch.all(torch.from_numpy(loaded_tiled_nnz) == tiled_nnz), "pls debug"
141
+
142
+ loaded_linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
143
+ loaded_tiled_scales = loaded_linear_scales.reshape(896, 16, 16, 8)
144
+
145
+ assert torch.all(torch.from_numpy(loaded_tiled_scales).to("cuda") == tiled_scales), "pls debug"
146
+
147
+ loaded_linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
148
+ loaded_linear_bitmap_pack = np.expand_dims(loaded_linear_bitmap_pack, axis=-1)
149
+ loaded_linear_bitmap = np.zeros((loaded_linear_bitmap_pack.shape[0], 32), dtype=np.int32)
150
+ for i in range(0, 32):
151
+ loaded_linear_bitmap[:, i] = ( loaded_linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
152
+ loaded_tiled_bitmap = loaded_linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)
153
+
154
+ assert torch.all(torch.from_numpy(loaded_tiled_bitmap).to("cuda") == tiled_bitmap), "pls debug"
155
+
156
+ loaded_linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
157
+ loaded_linear_qweight_pack = np.expand_dims(loaded_linear_qweight_pack, axis=-1)
158
+ loaded_linear_qweight = np.zeros((loaded_linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
159
+ for i in range(0, numel_per_int32):
160
+ loaded_linear_qweight[:, i] = ( loaded_linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
161
+ loaded_tiled_qweight = loaded_linear_qweight.reshape(-1).reshape(896, 16, 16, 256)
162
+
163
+ assert torch.all(torch.from_numpy(loaded_tiled_qweight).to("cuda") == tiled_qweight), "pls debug"
164
+
165
+ loaded_linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
166
+ loaded_linear_zeros_pack = np.expand_dims(loaded_linear_zeros_pack, axis=-1)
167
+ loaded_linear_zeros = np.zeros((loaded_linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
168
+ for i in range(0, numel_per_int32):
169
+ loaded_linear_zeros[:, i] = ( loaded_linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
170
+ loaded_tiled_zeros = loaded_linear_zeros.reshape(-1).reshape(896, 16, 16, 8)
171
+
172
+ assert torch.all(torch.from_numpy(loaded_tiled_zeros).to("cuda") == tiled_zeros), "pls debug"
173
+
174
+ zero_recovered_tiles = np.ones_like(loaded_tiled_qweight)*8 # zero is represented by value of 8
175
+ for r in range(0, loaded_tiled_qweight.shape[0]):
176
+ for c in range(0, loaded_tiled_qweight.shape[1]):
177
+ zero_removed_padded_tile = loaded_tiled_qweight[r, c]
178
+ nnz=loaded_tiled_nnz[r, c]
179
+ tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
180
+ nnz_indices = np.nonzero(loaded_tiled_bitmap[r, c])
181
+ zero_recovered_tiles[r, c][nnz_indices] = tile_values
182
+
183
+ assert torch.all(non_zero_removed_tiled_qweight.to(torch.int32) == torch.from_numpy(zero_recovered_tiles).to("cuda")), "pls debug"
184
+
185
+ dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)
186
+
187
+ zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
188
+ loaded_tiled_zeros = loaded_tiled_zeros.astype(np.float16)
189
+ loaded_tiled_scales = loaded_tiled_scales.astype(np.float16)
190
+ for i in range(0, zero_recovered_tiles.shape[-1], group_size):
191
+ gid = i//group_size
192
+ dequantized_tiles[:, :, :, i:i+group_size] = \
193
+ ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
194
+ np.expand_dims(loaded_tiled_zeros[:, :, :, gid], axis=-1) ) * \
195
+ np.expand_dims(loaded_tiled_scales[:, :, :, gid], axis=-1)
196
+
197
+     print("round-trip verification of serialized blobs done.")
198
+ # torch.allclose(linear_tiled_W[0], tiled_W[0,0])
199
+ # torch.allclose(linear_tiled_W[1], tiled_W[0,1])
200
+ # torch.allclose(linear_tiled_W[12], tiled_W[1,0])
201
+ # torch.allclose(linear_tiled_W[26], tiled_W[2,2])
202
+ # torch.allclose(linear_tiled_W[-1], tiled_W[-1,-1])
203
+ # In [18]: torch.allclose(tiled_W[0,1], W[0:16, 256:512])
204
+ # Out[18]: True
205
+
206
+ # In [19]: torch.allclose(tiled_W[1,1], W[16:32, 256:512])
207
+ # Out[19]: True
208
+
209
+ # In [20]: torch.allclose(tiled_W[-1,-1], W[(768-16):768, (3072-256):3072])
210
+ # Out[20]: True
211
+
212
+
213
+
214
+ # If you want to serialize the tensor such that a single bit indicates if an element is zero or non-zero, you can achieve this by creating a byte array where each bit corresponds to the zero/non-zero status of each element. Here’s how you can do it:
215
+
216
+ # Convert the tensor to a boolean tensor indicating zero or non-zero.
217
+ # Flatten the boolean tensor.
218
+ # Pack the boolean values into bytes.
219
+ # Here’s a step-by-step example:
220
+
221
+ # python
222
+ # Copy code
223
+ # import torch
224
+
225
+ # # Example tensor
226
+ # tensor = torch.tensor([[0, 1, 2], [3, 0, 4], [5, 6, 0]])
227
+
228
+ # # Step 1: Create a boolean tensor indicating zero or non-zero values
229
+ # zero_indicator = torch.eq(tensor, 0)
230
+
231
+ # # Step 2: Flatten the boolean tensor
232
+ # flat_zero_indicator = zero_indicator.flatten()
233
+
234
+ # # Step 3: Convert boolean tensor to a list of bytes
235
+ # byte_array = []
236
+ # byte = 0
237
+ # for i, bit in enumerate(flat_zero_indicator):
238
+ # if bit:
239
+ # byte |= 1 << (i % 8)
240
+ # if (i % 8) == 7:
241
+ # byte_array.append(byte)
242
+ # byte = 0
243
+
244
+ # # Append the last byte if necessary
245
+ # if (len(flat_zero_indicator) % 8) != 0:
246
+ # byte_array.append(byte)
247
+
248
+ # # Convert to bytearray
249
+ # result = bytearray(byte_array)
250
+
251
+ # print(result)
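To make the per-tile compaction above easier to follow, here is a small self-contained sketch on a toy tile (not the real 16x256 tiles): zero removal, padding with the code 8, and the bitmap/nnz bookkeeping, followed by the recovery step that unpack_blobs.py performs. In this toy, 0 simply marks a pruned position; in the real pipeline the bitmap comes from the sparsified weights and kept positions carry their int4 codes.

```python
import numpy as np

tile = np.array([[3, 0, 15, 0],
                 [0, 7,  0, 1]], dtype=np.int32)    # toy quantized tile; 0 marks pruned weights

bitmap = (tile != 0)                                 # metadata: one bit per element
nnz = int(bitmap.sum())                              # metadata: number of kept values

compressed = np.full(tile.size, 8, dtype=np.int32)   # code 8 dequantizes to 0.0, so it acts as padding
compressed[:nnz] = tile[bitmap]                      # kept codes packed to the front, row-major

# recovery (mirrors the zero-location recovery loop in unpack_blobs.py)
recovered = np.full(tile.size, 8, dtype=np.int32)
recovered[np.flatnonzero(bitmap)] = compressed[:nnz]
recovered = recovered.reshape(tile.shape)

assert np.array_equal(recovered, np.where(tile == 0, 8, tile))
```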
internal/sqft_llama3_8B_gptq_tx1_mlp.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4b052cf767df68cde1e08ab4c5e1adf19d821d64b6f9ff5727ef5b615f97a7
3
+ size 357830528
sparse_w4/linear_bitmap_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db9c9466c5e2f5efdb426685b479794520c35f196e6811e175cb5066b9b874b
3
+ size 7340032
sparse_w4/linear_compressed_qweight_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f20a8d23c239a5d002686ff8c0867bb49ffc0daec5480fedef4a5163877ca7f
3
+ size 29360128
sparse_w4/linear_nnz_int16.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8d24ef1e4e2af4d04f7ef8e3f52d2023b916336c1bd013a4256f8d96805736
3
+ size 28672
sparse_w4/linear_scales_float16.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f825735214928e40a0c5850f95f5b55bc8de1b31bf7c1a67974df544f247b45
3
+ size 3670016
sparse_w4/linear_zeros_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf1b625d7d0b024b60e69eae10e8f7bf74ec7d6a249ab6e0e2dee6c482123946
3
+ size 917504
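The blob sizes above can be sanity-checked against the layer geometry (oc=14336, ic=4096, 4-bit weights, group size 32, 16x256 tiles). A small check along these lines, assuming the blobs sit under sparse_w4/ as listed, could be run before unpacking:

```python
import os
import numpy as np

OC, IC, nbit, group_size = 14336, 4096, 4, 32
codes_per_word = 32 // nbit
n_tiles = (OC // 16) * (IC // 256)                    # 896 x 16 tile grid

expected = {
    "sparse_w4/linear_compressed_qweight_int32.bin": (np.int32,   OC * IC // codes_per_word),
    "sparse_w4/linear_bitmap_int32.bin":             (np.int32,   OC * IC // 32),
    "sparse_w4/linear_nnz_int16.bin":                (np.int16,   n_tiles),
    "sparse_w4/linear_scales_float16.bin":           (np.float16, OC * IC // group_size),
    "sparse_w4/linear_zeros_int32.bin":              (np.int32,   OC * IC // group_size // codes_per_word),
}

for path, (dtype, n_elem) in expected.items():
    n_found = os.path.getsize(path) // np.dtype(dtype).itemsize
    print(f"{path}: found {n_found} elements, expected {n_elem}, ok={n_found == n_elem}")
```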
unpack_blobs.py ADDED
@@ -0,0 +1,77 @@
1
+ import numpy as np
2
+
3
+ # Python sample showing how to unpack and recover the zero-compressed W4 blobs
4
+
5
+ nbit=4
6
+ numel_per_int32 = 32//nbit
7
+ group_size=32
8
+
9
+ linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
10
+ tiled_nnz = linear_nnz.reshape(896,16)  # one nnz count per tile: (14336//16, 4096//256) tile grid
11
+
12
+
13
+ linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
14
+ tiled_scales = linear_scales.reshape(896, 16, 16, 8)  # (tile_r, tile_c, 16 rows, 8 groups of 32 columns)
15
+
16
+
17
+ linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
18
+ linear_bitmap_pack = np.expand_dims(linear_bitmap_pack, axis=-1)
19
+ linear_bitmap = np.zeros((linear_bitmap_pack.shape[0], 32), dtype=np.int32)
20
+ for i in range(0, 32):
21
+ linear_bitmap[:, i] = ( linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
22
+ tiled_bitmap = linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)  # (tile_r, tile_c, 16 rows, 256 columns)
23
+
24
+
25
+ linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
26
+ linear_qweight_pack = np.expand_dims(linear_qweight_pack, axis=-1)
27
+ linear_qweight = np.zeros((linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
28
+ for i in range(0, numel_per_int32):
29
+ linear_qweight[:, i] = ( linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
30
+ tiled_qweight = linear_qweight.reshape(-1).reshape(896, 16, 16, 256)
31
+
32
+
33
+ linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
34
+ linear_zeros_pack = np.expand_dims(linear_zeros_pack, axis=-1)
35
+ linear_zeros = np.zeros((linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
36
+ for i in range(0, numel_per_int32):
37
+ linear_zeros[:, i] = ( linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
38
+ tiled_zeros = linear_zeros.reshape(-1).reshape(896, 16, 16, 8)
39
+
40
+ # ------------------------------------------------------------
41
+ # Decompress the tile, recover the zero locations
42
+ zero_recovered_tiles = np.ones_like(tiled_qweight)*8 # zero is represented by value of 8
43
+ for r in range(0, tiled_qweight.shape[0]):
44
+ for c in range(0, tiled_qweight.shape[1]):
45
+ zero_removed_padded_tile = tiled_qweight[r, c]
46
+ nnz=tiled_nnz[r, c]
47
+ tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
48
+ nnz_indices = np.nonzero(tiled_bitmap[r, c])
49
+ zero_recovered_tiles[r, c][nnz_indices] = tile_values
50
+
51
+ # ------------------------------------------------------------
52
+ # Simulate dequantization of 4-bit weight to floating value
53
+ dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)
54
+
55
+ zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
56
+ tiled_zeros = tiled_zeros.astype(np.float16)
57
+ tiled_scales = tiled_scales.astype(np.float16)
58
+ for i in range(0, zero_recovered_tiles.shape[-1], group_size):
59
+ gid = i//group_size
60
+ dequantized_tiles[:, :, :, i:i+group_size] = \
61
+ ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
62
+ np.expand_dims(tiled_zeros[:, :, :, gid], axis=-1) ) * \
63
+ np.expand_dims(tiled_scales[:, :, :, gid], axis=-1)
64
+
65
+ # ------------------------------------------------------------
66
+ # Check sparsity per tile
67
+ def calc_sparsity(tensor):
68
+ nnz = np.count_nonzero(tensor)
69
+ rate = 1-(nnz/tensor.size)
70
+ return rate, nnz
71
+
72
+ for tile_r in range(0, dequantized_tiles.shape[0]):
73
+ for tile_c in range(0, dequantized_tiles.shape[1]):
74
+ sparsity, nnz = calc_sparsity(dequantized_tiles[tile_r, tile_c])
75
+ print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")
76
+
77
+ print("end.")
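unpack_blobs.py stops at per-tile tensors. If a dense matrix is needed, the unfold-based tiling from the packing script can be reversed; the helper below is a possible follow-up (not part of the repo) that stitches dequantized_tiles of shape (896, 16, 16, 256) back into the full (14336, 4096) layout, so that dense[r, c] == tiles[r // 16, c // 256, r % 16, c % 256].

```python
import numpy as np

def tiles_to_dense(tiles: np.ndarray, stride_oc: int = 16, stride_ic: int = 256) -> np.ndarray:
    # tiles: (OC // stride_oc, IC // stride_ic, stride_oc, stride_ic)
    n_tile_r, n_tile_c = tiles.shape[:2]
    # reorder to (tile_r, row-in-tile, tile_c, col-in-tile), then collapse to (OC, IC)
    return tiles.transpose(0, 2, 1, 3).reshape(n_tile_r * stride_oc, n_tile_c * stride_ic)

# e.g. dense_w = tiles_to_dense(dequantized_tiles)  # -> (14336, 4096) float16
```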