Vui Seng Chua
commited on
Commit
β’
cfb9114
1
Parent(s):
e3cc684
Add content
Browse files- README.md +20 -0
- internal/donttouch_unpacking_autogptq/__pycache__/fake_dequantize.cpython-311.pyc +0 -0
- internal/donttouch_unpacking_autogptq/autogpt_sample.py +13 -0
- internal/donttouch_unpacking_autogptq/blob_manipulate.py +73 -0
- internal/donttouch_unpacking_autogptq/fake_dequantize.py +65 -0
- internal/donttouch_unpacking_autogptq/opt-125m-gptq4.pth +3 -0
- internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py +359 -0
- internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py.ori.py +358 -0
- internal/donttouch_unpacking_autogptq/quantizer.py +816 -0
- internal/donttouch_unpacking_autogptq/quantizer.py.ori.py +793 -0
- internal/donttouch_unpacking_autogptq/readme.md +12 -0
- internal/donttouch_unpacking_autogptq/run_sqft.py +101 -0
- internal/donttouch_unpacking_autogptq/verify_unpacking_logic.py +67 -0
- internal/pack_sparse_linear.py +251 -0
- internal/sqft_llama3_8B_gptq_tx1_mlp.pth +3 -0
- sparse_w4/linear_bitmap_int32.bin +3 -0
- sparse_w4/linear_compressed_qweight_int32.bin +3 -0
- sparse_w4/linear_nnz_int16.bin +3 -0
- sparse_w4/linear_scales_float16.bin +3 -0
- sparse_w4/linear_zeros_int32.bin +3 -0
- unpack_blobs.py +77 -0
README.md
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
This repo contains serialized blobs of an up projection layer of llama3-8B (oc=14336, ic=4096).
|
3 |
+
The linear layer has been quantized (GPTQ W4 Sym with group size 32) and sparsified by 50%.
|
4 |
+
|
5 |
+
```
|
6 |
+
βββ sparse_w4
|
7 |
+
β βββ linear_bitmap_int32.bin
|
8 |
+
β βββ linear_compressed_qweight_int32.bin
|
9 |
+
β βββ linear_nnz_int16.bin
|
10 |
+
β βββ linear_scales_float16.bin
|
11 |
+
β βββ linear_zeros_int32.bin
|
12 |
+
```
|
13 |
+
|
14 |
+
### Usage
|
15 |
+
The following script shows how to process the blobs in python. It shows unpacking, zero location recovery, as well as weight dequantization process.
|
16 |
+
```bash
|
17 |
+
python unpack_blobs.py
|
18 |
+
```
|
19 |
+
|
20 |
+
> you can ignore `internal/`
|
internal/donttouch_unpacking_autogptq/__pycache__/fake_dequantize.cpython-311.pyc
ADDED
Binary file (3.75 kB). View file
|
|
internal/donttouch_unpacking_autogptq/autogpt_sample.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
2 |
+
|
3 |
+
model_id = "facebook/opt-125m"
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
5 |
+
quantization_config = GPTQConfig(bits=4, sym=True, dataset = 'wikitext2', tokenizer=tokenizer, group_size=128, desc_act=False, use_exllama=False)
|
6 |
+
|
7 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)
|
8 |
+
|
9 |
+
print("joto")
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
|
internal/donttouch_unpacking_autogptq/blob_manipulate.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
blob = torch.load("./opt-125m-gptq4.pth")
|
5 |
+
|
6 |
+
for layer, lblob in blob.items():
|
7 |
+
if 'model.decoder.layers.0.fc1' in layer:
|
8 |
+
print(f"--> {layer}")
|
9 |
+
prepack = lblob['prepack']
|
10 |
+
pack = lblob['pack']
|
11 |
+
|
12 |
+
for k, v in prepack.items():
|
13 |
+
print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
|
14 |
+
|
15 |
+
for k, v in pack.items():
|
16 |
+
print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
|
17 |
+
break
|
18 |
+
|
19 |
+
qweight = pack['qweight'].numpy()
|
20 |
+
scales = pack['scales'].numpy() #(ngroup, OC)
|
21 |
+
qzeros = pack['qzeros'].numpy() #(ngroup, OC//numel_per_int32)
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
nbit=4
|
26 |
+
numel_per_int32 = 32//nbit
|
27 |
+
IC = qweight.shape[0]*numel_per_int32
|
28 |
+
OC = qweight.shape[1]
|
29 |
+
group_size = IC//scales.shape[0]
|
30 |
+
|
31 |
+
qweight_unpack = np.zeros((IC,OC), dtype=np.float32)
|
32 |
+
for row in range(0, qweight.shape[0]):
|
33 |
+
for k in range(0, numel_per_int32):
|
34 |
+
qweight_unpack[row*numel_per_int32+k, :] = ((qweight[row] >> k*nbit) & 0xF).astype(np.float32) # read as int32 and cast to float32
|
35 |
+
|
36 |
+
torch.allclose(
|
37 |
+
torch.from_numpy(qweight_unpack).to(torch.int32),
|
38 |
+
torch.from_numpy(pack['intweight'].astype(np.int32))
|
39 |
+
)
|
40 |
+
|
41 |
+
scales_float = scales.astype(np.float32)
|
42 |
+
|
43 |
+
# TODO: verify with asym zero point. sym zero points are all identical
|
44 |
+
qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
|
45 |
+
for i in range(0, numel_per_int32):
|
46 |
+
# shift multiplier
|
47 |
+
shift_multiplier = numel_per_int32 - 1 - i
|
48 |
+
shift_by = shift_multiplier * nbit
|
49 |
+
qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32) # read as int32 and cast to float32
|
50 |
+
qzeros_unpack += 1 # for some reason they minus 1
|
51 |
+
|
52 |
+
qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
|
53 |
+
qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
|
54 |
+
scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
|
55 |
+
|
56 |
+
deqweight_unpack = torch.zeros((IC,OC), dtype=torch.float16)
|
57 |
+
for i in range(IC):
|
58 |
+
gid = i//group_size
|
59 |
+
deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]
|
60 |
+
|
61 |
+
print(torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005))
|
62 |
+
print("temp")
|
63 |
+
|
64 |
+
# Numpy path
|
65 |
+
# deqweight_unpack = np.zeros((IC,OC), dtype=np.float32)
|
66 |
+
# for i in range(IC):
|
67 |
+
# gid = i//group_size
|
68 |
+
# deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]
|
69 |
+
|
70 |
+
# deqweight_unpack = torch.from_numpy(deqweight_unpack).to(torch.float16)
|
71 |
+
|
72 |
+
torch.allclose(dequant_float, prepack['w'].t(), atol=0.0005)
|
73 |
+
print("blob")
|
internal/donttouch_unpacking_autogptq/fake_dequantize.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
def fake_dequantize(qweight, scales, qzeros):
|
7 |
+
nbit=4
|
8 |
+
numel_per_int32 = 32//nbit
|
9 |
+
|
10 |
+
qweight = qweight.cpu().numpy()
|
11 |
+
scales = scales.cpu().numpy() #(ngroup, OC)
|
12 |
+
qzeros = qzeros.cpu().numpy() #(ngroup, OC//numel_per_int32)
|
13 |
+
|
14 |
+
IC = qweight.shape[0]*numel_per_int32
|
15 |
+
OC = qweight.shape[1]
|
16 |
+
group_size = IC//scales.shape[0]
|
17 |
+
|
18 |
+
qweight_unpack = np.zeros((IC,OC), dtype=np.float32)
|
19 |
+
for row in range(0, qweight.shape[0]):
|
20 |
+
for k in range(0, numel_per_int32):
|
21 |
+
qweight_unpack[row*numel_per_int32+k, :] = ((qweight[row] >> k*nbit) & 0xF).astype(np.float32) # read as int32 and cast to float32
|
22 |
+
|
23 |
+
scales_float = scales.astype(np.float32)
|
24 |
+
|
25 |
+
qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
|
26 |
+
for i in range(0, numel_per_int32):
|
27 |
+
# shift multiplier
|
28 |
+
shift_multiplier = numel_per_int32 - 1 - i
|
29 |
+
shift_by = shift_multiplier * nbit
|
30 |
+
qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32) # read as int32 and cast to float32
|
31 |
+
qzeros_unpack += 1 # for some reason they minus 1
|
32 |
+
|
33 |
+
qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
|
34 |
+
qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
|
35 |
+
scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
|
36 |
+
|
37 |
+
deqweight_unpack = torch.zeros((IC,OC), dtype=torch.float16)
|
38 |
+
for i in range(IC):
|
39 |
+
gid = i//group_size
|
40 |
+
deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]
|
41 |
+
|
42 |
+
return deqweight_unpack, scales_float, qzeros_unpack
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
if __name__ == "__main__":
|
47 |
+
blob = torch.load("./opt-125m-gptq4.pth")
|
48 |
+
|
49 |
+
for layer, lblob in blob.items():
|
50 |
+
print(f"\n\n--> {layer}")
|
51 |
+
prepack = lblob['prepack']
|
52 |
+
pack = lblob['pack']
|
53 |
+
|
54 |
+
# for k, v in prepack.items():
|
55 |
+
# print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
|
56 |
+
|
57 |
+
# for k, v in pack.items():
|
58 |
+
# print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
|
59 |
+
|
60 |
+
W, _, _ = fake_dequantize(pack['qweight'], pack['scales'], pack['qzeros'])
|
61 |
+
|
62 |
+
simulated_match = torch.allclose(W, prepack['w'].t(), atol=0.0005)
|
63 |
+
|
64 |
+
print(f"simulated_match? {simulated_match}")
|
65 |
+
|
internal/donttouch_unpacking_autogptq/opt-125m-gptq4.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0269cabd58cd27261fde469502a01e84760a413b16ffa7989f395c53c65e46f4
|
3 |
+
size 46688098
|
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py
ADDED
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from logging import getLogger
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import transformers
|
8 |
+
|
9 |
+
|
10 |
+
logger = getLogger(__name__)
|
11 |
+
try:
|
12 |
+
import autogptq_cuda_64
|
13 |
+
import autogptq_cuda_256
|
14 |
+
|
15 |
+
_autogptq_cuda_available = True
|
16 |
+
except ImportError:
|
17 |
+
logger.warning("CUDA extension not installed.")
|
18 |
+
autogptq_cuda_256 = None
|
19 |
+
autogptq_cuda_64 = None
|
20 |
+
_autogptq_cuda_available = False
|
21 |
+
|
22 |
+
|
23 |
+
class QuantLinear(nn.Module):
|
24 |
+
QUANT_TYPE = "cuda-old"
|
25 |
+
|
26 |
+
def __init__(
|
27 |
+
self,
|
28 |
+
bits,
|
29 |
+
group_size,
|
30 |
+
infeatures,
|
31 |
+
outfeatures,
|
32 |
+
bias,
|
33 |
+
use_cuda_fp16=True,
|
34 |
+
kernel_switch_threshold=128,
|
35 |
+
trainable=False,
|
36 |
+
weight_dtype=torch.float16,
|
37 |
+
):
|
38 |
+
super().__init__()
|
39 |
+
global _autogptq_cuda_available
|
40 |
+
if bits not in [2, 3, 4, 8]:
|
41 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
42 |
+
if trainable:
|
43 |
+
_autogptq_cuda_available = False
|
44 |
+
self.infeatures = infeatures
|
45 |
+
self.outfeatures = outfeatures
|
46 |
+
self.bits = bits
|
47 |
+
self.group_size = group_size if group_size != -1 else infeatures
|
48 |
+
self.maxq = 2**self.bits - 1
|
49 |
+
|
50 |
+
self.register_buffer(
|
51 |
+
"qweight",
|
52 |
+
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
|
53 |
+
)
|
54 |
+
self.register_buffer(
|
55 |
+
"qzeros",
|
56 |
+
torch.zeros(
|
57 |
+
(
|
58 |
+
math.ceil(infeatures / self.group_size),
|
59 |
+
outfeatures // 32 * self.bits,
|
60 |
+
),
|
61 |
+
dtype=torch.int32,
|
62 |
+
),
|
63 |
+
)
|
64 |
+
self.register_buffer(
|
65 |
+
"scales",
|
66 |
+
torch.zeros(
|
67 |
+
(math.ceil(infeatures / self.group_size), outfeatures),
|
68 |
+
dtype=weight_dtype,
|
69 |
+
),
|
70 |
+
)
|
71 |
+
self.register_buffer(
|
72 |
+
"g_idx",
|
73 |
+
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
|
74 |
+
)
|
75 |
+
|
76 |
+
if bias:
|
77 |
+
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
|
78 |
+
else:
|
79 |
+
self.bias = None
|
80 |
+
self.half_indim = self.infeatures // 2
|
81 |
+
|
82 |
+
self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
|
83 |
+
|
84 |
+
# is performed by unpacking the weights and using torch.matmul
|
85 |
+
if self.bits in [2, 4, 8]:
|
86 |
+
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
|
87 |
+
elif self.bits == 3:
|
88 |
+
self.wf = torch.tensor(
|
89 |
+
[
|
90 |
+
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
|
91 |
+
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
|
92 |
+
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
|
93 |
+
],
|
94 |
+
dtype=torch.int32,
|
95 |
+
).reshape(1, 3, 12)
|
96 |
+
|
97 |
+
self.kernel_switch_threshold = kernel_switch_threshold
|
98 |
+
self.autogptq_cuda_available = _autogptq_cuda_available
|
99 |
+
self.autogptq_cuda = autogptq_cuda_256
|
100 |
+
if infeatures % 256 != 0 or outfeatures % 256 != 0:
|
101 |
+
self.autogptq_cuda = autogptq_cuda_64
|
102 |
+
if infeatures % 64 != 0 or outfeatures % 64 != 0:
|
103 |
+
self.autogptq_cuda_available = False
|
104 |
+
|
105 |
+
self.trainable = trainable
|
106 |
+
|
107 |
+
def post_init(self):
|
108 |
+
pass
|
109 |
+
|
110 |
+
def pack(self, linear, scales, zeros, g_idx):
|
111 |
+
W = linear.weight.data.clone()
|
112 |
+
if isinstance(linear, nn.Conv2d):
|
113 |
+
W = W.flatten(1)
|
114 |
+
if isinstance(linear, transformers.pytorch_utils.Conv1D):
|
115 |
+
W = W.t()
|
116 |
+
|
117 |
+
scales = scales.t().contiguous()
|
118 |
+
zeros = zeros.t().contiguous()
|
119 |
+
scale_zeros = zeros * scales
|
120 |
+
self.scales = scales.clone().to(dtype=linear.weight.dtype)
|
121 |
+
if linear.bias is not None:
|
122 |
+
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
|
123 |
+
|
124 |
+
intweight = []
|
125 |
+
for idx in range(self.infeatures):
|
126 |
+
g_idx = idx // self.group_size
|
127 |
+
intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
|
128 |
+
intweight = torch.cat(intweight, dim=1)
|
129 |
+
intweight = intweight.t().contiguous()
|
130 |
+
intweight = intweight.numpy().astype(np.uint32)
|
131 |
+
self.intweight = intweight
|
132 |
+
|
133 |
+
i = 0
|
134 |
+
row = 0
|
135 |
+
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
|
136 |
+
while row < qweight.shape[0]:
|
137 |
+
if self.bits in [2, 4, 8]:
|
138 |
+
for j in range(i, i + (32 // self.bits)):
|
139 |
+
qweight[row] |= intweight[j] << (self.bits * (j - i))
|
140 |
+
i += 32 // self.bits
|
141 |
+
row += 1
|
142 |
+
elif self.bits == 3:
|
143 |
+
for j in range(i, i + 10):
|
144 |
+
qweight[row] |= intweight[j] << (3 * (j - i))
|
145 |
+
i += 10
|
146 |
+
qweight[row] |= intweight[i] << 30
|
147 |
+
row += 1
|
148 |
+
qweight[row] |= (intweight[i] >> 2) & 1
|
149 |
+
i += 1
|
150 |
+
for j in range(i, i + 10):
|
151 |
+
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
|
152 |
+
i += 10
|
153 |
+
qweight[row] |= intweight[i] << 31
|
154 |
+
row += 1
|
155 |
+
qweight[row] |= (intweight[i] >> 1) & 0x3
|
156 |
+
i += 1
|
157 |
+
for j in range(i, i + 10):
|
158 |
+
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
|
159 |
+
i += 10
|
160 |
+
row += 1
|
161 |
+
else:
|
162 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
163 |
+
|
164 |
+
qweight = qweight.astype(np.int32)
|
165 |
+
self.qweight = torch.from_numpy(qweight)
|
166 |
+
|
167 |
+
zeros -= 1
|
168 |
+
zeros = zeros.numpy().astype(np.uint32)
|
169 |
+
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
|
170 |
+
i = 0
|
171 |
+
col = 0
|
172 |
+
while col < qzeros.shape[1]:
|
173 |
+
if self.bits in [2, 4, 8]:
|
174 |
+
for j in range(i, i + (32 // self.bits)):
|
175 |
+
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
|
176 |
+
i += 32 // self.bits
|
177 |
+
col += 1
|
178 |
+
elif self.bits == 3:
|
179 |
+
for j in range(i, i + 10):
|
180 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
|
181 |
+
i += 10
|
182 |
+
qzeros[:, col] |= zeros[:, i] << 30
|
183 |
+
col += 1
|
184 |
+
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
|
185 |
+
i += 1
|
186 |
+
for j in range(i, i + 10):
|
187 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
|
188 |
+
i += 10
|
189 |
+
qzeros[:, col] |= zeros[:, i] << 31
|
190 |
+
col += 1
|
191 |
+
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
|
192 |
+
i += 1
|
193 |
+
for j in range(i, i + 10):
|
194 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
|
195 |
+
i += 10
|
196 |
+
col += 1
|
197 |
+
else:
|
198 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
199 |
+
|
200 |
+
qzeros = qzeros.astype(np.int32)
|
201 |
+
self.qzeros = torch.from_numpy(qzeros)
|
202 |
+
|
203 |
+
def forward(self, x):
|
204 |
+
x_dtype = x.dtype
|
205 |
+
out_shape = x.shape[:-1] + (self.outfeatures,)
|
206 |
+
x = x.reshape(-1, x.shape[-1])
|
207 |
+
if (
|
208 |
+
x.device.type == "cuda"
|
209 |
+
and self.autogptq_cuda_available is True
|
210 |
+
and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
|
211 |
+
):
|
212 |
+
out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
|
213 |
+
if self.use_cuda_fp16:
|
214 |
+
if x_dtype != torch.float16:
|
215 |
+
logger.warning_once(
|
216 |
+
f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
|
217 |
+
)
|
218 |
+
|
219 |
+
if self.bits == 2:
|
220 |
+
self.autogptq_cuda.vecquant2matmul_faster_old(
|
221 |
+
x,
|
222 |
+
self.qweight,
|
223 |
+
out,
|
224 |
+
self.scales.float(),
|
225 |
+
self.qzeros,
|
226 |
+
self.group_size,
|
227 |
+
self.half_indim,
|
228 |
+
)
|
229 |
+
elif self.bits == 3:
|
230 |
+
self.autogptq_cuda.vecquant3matmul_faster_old(
|
231 |
+
x,
|
232 |
+
self.qweight,
|
233 |
+
out,
|
234 |
+
self.scales.float(),
|
235 |
+
self.qzeros,
|
236 |
+
self.group_size,
|
237 |
+
self.half_indim,
|
238 |
+
)
|
239 |
+
elif self.bits == 4:
|
240 |
+
self.autogptq_cuda.vecquant4matmul_faster_old(
|
241 |
+
x,
|
242 |
+
self.qweight,
|
243 |
+
out,
|
244 |
+
self.scales.float(),
|
245 |
+
self.qzeros,
|
246 |
+
self.group_size,
|
247 |
+
self.half_indim,
|
248 |
+
)
|
249 |
+
|
250 |
+
else:
|
251 |
+
raise NotImplementedError("Only 2,3,4 bits are supported.")
|
252 |
+
else:
|
253 |
+
x = x.to(torch.float32) # This is required for autocast compatibility.
|
254 |
+
if self.bits == 2:
|
255 |
+
self.autogptq_cuda.vecquant2matmul_old(
|
256 |
+
x,
|
257 |
+
self.qweight,
|
258 |
+
out,
|
259 |
+
self.scales.float(),
|
260 |
+
self.qzeros,
|
261 |
+
self.group_size,
|
262 |
+
)
|
263 |
+
elif self.bits == 3:
|
264 |
+
self.autogptq_cuda.vecquant3matmul_old(
|
265 |
+
x,
|
266 |
+
self.qweight,
|
267 |
+
out,
|
268 |
+
self.scales.float(),
|
269 |
+
self.qzeros,
|
270 |
+
self.group_size,
|
271 |
+
)
|
272 |
+
elif self.bits == 4:
|
273 |
+
self.autogptq_cuda.vecquant4matmul_old(
|
274 |
+
x,
|
275 |
+
self.qweight,
|
276 |
+
out,
|
277 |
+
self.scales.float(),
|
278 |
+
self.qzeros,
|
279 |
+
self.group_size,
|
280 |
+
)
|
281 |
+
elif self.bits == 8:
|
282 |
+
self.autogptq_cuda.vecquant8matmul_old(
|
283 |
+
x,
|
284 |
+
self.qweight,
|
285 |
+
out,
|
286 |
+
self.scales.float(),
|
287 |
+
self.qzeros,
|
288 |
+
self.group_size,
|
289 |
+
)
|
290 |
+
else:
|
291 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
292 |
+
else:
|
293 |
+
if self.wf.device != self.qzeros.device:
|
294 |
+
self.wf = self.wf.to(self.qzeros.device)
|
295 |
+
|
296 |
+
if self.bits in [2, 4, 8]:
|
297 |
+
zeros = torch.bitwise_right_shift(
|
298 |
+
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
|
299 |
+
self.wf.unsqueeze(0),
|
300 |
+
).to(torch.int16 if self.bits == 8 else torch.int8)
|
301 |
+
|
302 |
+
zeros = zeros + 1
|
303 |
+
zeros = torch.bitwise_and(
|
304 |
+
zeros, (2**self.bits) - 1
|
305 |
+
) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
|
306 |
+
|
307 |
+
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
|
308 |
+
|
309 |
+
scales = self.scales
|
310 |
+
scales = scales.reshape(-1, 1, scales.shape[-1])
|
311 |
+
|
312 |
+
weight = torch.bitwise_right_shift(
|
313 |
+
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
|
314 |
+
self.wf.unsqueeze(-1),
|
315 |
+
).to(torch.int16 if self.bits == 8 else torch.int8)
|
316 |
+
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
|
317 |
+
weight = weight.reshape(-1, self.group_size, weight.shape[2])
|
318 |
+
elif self.bits == 3:
|
319 |
+
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
|
320 |
+
-1, -1, -1, 12
|
321 |
+
)
|
322 |
+
zeros = zeros >> self.wf.unsqueeze(0)
|
323 |
+
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
|
324 |
+
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
|
325 |
+
zeros = zeros & 0x7
|
326 |
+
zeros = torch.cat(
|
327 |
+
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
|
328 |
+
dim=2,
|
329 |
+
)
|
330 |
+
|
331 |
+
zeros = zeros + 1
|
332 |
+
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
|
333 |
+
|
334 |
+
scales = self.scales
|
335 |
+
scales = scales.reshape(-1, 1, scales.shape[-1])
|
336 |
+
|
337 |
+
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
|
338 |
+
-1, -1, 12, -1
|
339 |
+
)
|
340 |
+
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
|
341 |
+
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
|
342 |
+
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
|
343 |
+
weight = weight & 0x7
|
344 |
+
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
|
345 |
+
weight = weight.reshape(-1, self.group_size, weight.shape[2])
|
346 |
+
else:
|
347 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
348 |
+
|
349 |
+
weight = scales * (weight - zeros)
|
350 |
+
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
|
351 |
+
out = torch.matmul(x, weight)
|
352 |
+
out = out.to(dtype=x_dtype).reshape(
|
353 |
+
out_shape
|
354 |
+
) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
|
355 |
+
out = out + self.bias if self.bias is not None else out
|
356 |
+
return out
|
357 |
+
|
358 |
+
|
359 |
+
__all__ = ["QuantLinear"]
|
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py.ori.py
ADDED
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from logging import getLogger
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import transformers
|
8 |
+
|
9 |
+
|
10 |
+
logger = getLogger(__name__)
|
11 |
+
try:
|
12 |
+
import autogptq_cuda_64
|
13 |
+
import autogptq_cuda_256
|
14 |
+
|
15 |
+
_autogptq_cuda_available = True
|
16 |
+
except ImportError:
|
17 |
+
logger.warning("CUDA extension not installed.")
|
18 |
+
autogptq_cuda_256 = None
|
19 |
+
autogptq_cuda_64 = None
|
20 |
+
_autogptq_cuda_available = False
|
21 |
+
|
22 |
+
|
23 |
+
class QuantLinear(nn.Module):
|
24 |
+
QUANT_TYPE = "cuda-old"
|
25 |
+
|
26 |
+
def __init__(
|
27 |
+
self,
|
28 |
+
bits,
|
29 |
+
group_size,
|
30 |
+
infeatures,
|
31 |
+
outfeatures,
|
32 |
+
bias,
|
33 |
+
use_cuda_fp16=True,
|
34 |
+
kernel_switch_threshold=128,
|
35 |
+
trainable=False,
|
36 |
+
weight_dtype=torch.float16,
|
37 |
+
):
|
38 |
+
super().__init__()
|
39 |
+
global _autogptq_cuda_available
|
40 |
+
if bits not in [2, 3, 4, 8]:
|
41 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
42 |
+
if trainable:
|
43 |
+
_autogptq_cuda_available = False
|
44 |
+
self.infeatures = infeatures
|
45 |
+
self.outfeatures = outfeatures
|
46 |
+
self.bits = bits
|
47 |
+
self.group_size = group_size if group_size != -1 else infeatures
|
48 |
+
self.maxq = 2**self.bits - 1
|
49 |
+
|
50 |
+
self.register_buffer(
|
51 |
+
"qweight",
|
52 |
+
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
|
53 |
+
)
|
54 |
+
self.register_buffer(
|
55 |
+
"qzeros",
|
56 |
+
torch.zeros(
|
57 |
+
(
|
58 |
+
math.ceil(infeatures / self.group_size),
|
59 |
+
outfeatures // 32 * self.bits,
|
60 |
+
),
|
61 |
+
dtype=torch.int32,
|
62 |
+
),
|
63 |
+
)
|
64 |
+
self.register_buffer(
|
65 |
+
"scales",
|
66 |
+
torch.zeros(
|
67 |
+
(math.ceil(infeatures / self.group_size), outfeatures),
|
68 |
+
dtype=weight_dtype,
|
69 |
+
),
|
70 |
+
)
|
71 |
+
self.register_buffer(
|
72 |
+
"g_idx",
|
73 |
+
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
|
74 |
+
)
|
75 |
+
|
76 |
+
if bias:
|
77 |
+
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
|
78 |
+
else:
|
79 |
+
self.bias = None
|
80 |
+
self.half_indim = self.infeatures // 2
|
81 |
+
|
82 |
+
self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
|
83 |
+
|
84 |
+
# is performed by unpacking the weights and using torch.matmul
|
85 |
+
if self.bits in [2, 4, 8]:
|
86 |
+
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
|
87 |
+
elif self.bits == 3:
|
88 |
+
self.wf = torch.tensor(
|
89 |
+
[
|
90 |
+
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
|
91 |
+
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
|
92 |
+
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
|
93 |
+
],
|
94 |
+
dtype=torch.int32,
|
95 |
+
).reshape(1, 3, 12)
|
96 |
+
|
97 |
+
self.kernel_switch_threshold = kernel_switch_threshold
|
98 |
+
self.autogptq_cuda_available = _autogptq_cuda_available
|
99 |
+
self.autogptq_cuda = autogptq_cuda_256
|
100 |
+
if infeatures % 256 != 0 or outfeatures % 256 != 0:
|
101 |
+
self.autogptq_cuda = autogptq_cuda_64
|
102 |
+
if infeatures % 64 != 0 or outfeatures % 64 != 0:
|
103 |
+
self.autogptq_cuda_available = False
|
104 |
+
|
105 |
+
self.trainable = trainable
|
106 |
+
|
107 |
+
def post_init(self):
|
108 |
+
pass
|
109 |
+
|
110 |
+
def pack(self, linear, scales, zeros, g_idx):
|
111 |
+
W = linear.weight.data.clone()
|
112 |
+
if isinstance(linear, nn.Conv2d):
|
113 |
+
W = W.flatten(1)
|
114 |
+
if isinstance(linear, transformers.pytorch_utils.Conv1D):
|
115 |
+
W = W.t()
|
116 |
+
|
117 |
+
scales = scales.t().contiguous()
|
118 |
+
zeros = zeros.t().contiguous()
|
119 |
+
scale_zeros = zeros * scales
|
120 |
+
self.scales = scales.clone().to(dtype=linear.weight.dtype)
|
121 |
+
if linear.bias is not None:
|
122 |
+
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
|
123 |
+
|
124 |
+
intweight = []
|
125 |
+
for idx in range(self.infeatures):
|
126 |
+
g_idx = idx // self.group_size
|
127 |
+
intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
|
128 |
+
intweight = torch.cat(intweight, dim=1)
|
129 |
+
intweight = intweight.t().contiguous()
|
130 |
+
intweight = intweight.numpy().astype(np.uint32)
|
131 |
+
|
132 |
+
i = 0
|
133 |
+
row = 0
|
134 |
+
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
|
135 |
+
while row < qweight.shape[0]:
|
136 |
+
if self.bits in [2, 4, 8]:
|
137 |
+
for j in range(i, i + (32 // self.bits)):
|
138 |
+
qweight[row] |= intweight[j] << (self.bits * (j - i))
|
139 |
+
i += 32 // self.bits
|
140 |
+
row += 1
|
141 |
+
elif self.bits == 3:
|
142 |
+
for j in range(i, i + 10):
|
143 |
+
qweight[row] |= intweight[j] << (3 * (j - i))
|
144 |
+
i += 10
|
145 |
+
qweight[row] |= intweight[i] << 30
|
146 |
+
row += 1
|
147 |
+
qweight[row] |= (intweight[i] >> 2) & 1
|
148 |
+
i += 1
|
149 |
+
for j in range(i, i + 10):
|
150 |
+
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
|
151 |
+
i += 10
|
152 |
+
qweight[row] |= intweight[i] << 31
|
153 |
+
row += 1
|
154 |
+
qweight[row] |= (intweight[i] >> 1) & 0x3
|
155 |
+
i += 1
|
156 |
+
for j in range(i, i + 10):
|
157 |
+
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
|
158 |
+
i += 10
|
159 |
+
row += 1
|
160 |
+
else:
|
161 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
162 |
+
|
163 |
+
qweight = qweight.astype(np.int32)
|
164 |
+
self.qweight = torch.from_numpy(qweight)
|
165 |
+
|
166 |
+
zeros -= 1
|
167 |
+
zeros = zeros.numpy().astype(np.uint32)
|
168 |
+
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
|
169 |
+
i = 0
|
170 |
+
col = 0
|
171 |
+
while col < qzeros.shape[1]:
|
172 |
+
if self.bits in [2, 4, 8]:
|
173 |
+
for j in range(i, i + (32 // self.bits)):
|
174 |
+
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
|
175 |
+
i += 32 // self.bits
|
176 |
+
col += 1
|
177 |
+
elif self.bits == 3:
|
178 |
+
for j in range(i, i + 10):
|
179 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
|
180 |
+
i += 10
|
181 |
+
qzeros[:, col] |= zeros[:, i] << 30
|
182 |
+
col += 1
|
183 |
+
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
|
184 |
+
i += 1
|
185 |
+
for j in range(i, i + 10):
|
186 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
|
187 |
+
i += 10
|
188 |
+
qzeros[:, col] |= zeros[:, i] << 31
|
189 |
+
col += 1
|
190 |
+
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
|
191 |
+
i += 1
|
192 |
+
for j in range(i, i + 10):
|
193 |
+
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
|
194 |
+
i += 10
|
195 |
+
col += 1
|
196 |
+
else:
|
197 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
198 |
+
|
199 |
+
qzeros = qzeros.astype(np.int32)
|
200 |
+
self.qzeros = torch.from_numpy(qzeros)
|
201 |
+
|
202 |
+
def forward(self, x):
|
203 |
+
x_dtype = x.dtype
|
204 |
+
out_shape = x.shape[:-1] + (self.outfeatures,)
|
205 |
+
x = x.reshape(-1, x.shape[-1])
|
206 |
+
if (
|
207 |
+
x.device.type == "cuda"
|
208 |
+
and self.autogptq_cuda_available is True
|
209 |
+
and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
|
210 |
+
):
|
211 |
+
out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
|
212 |
+
if self.use_cuda_fp16:
|
213 |
+
if x_dtype != torch.float16:
|
214 |
+
logger.warning_once(
|
215 |
+
f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
|
216 |
+
)
|
217 |
+
|
218 |
+
if self.bits == 2:
|
219 |
+
self.autogptq_cuda.vecquant2matmul_faster_old(
|
220 |
+
x,
|
221 |
+
self.qweight,
|
222 |
+
out,
|
223 |
+
self.scales.float(),
|
224 |
+
self.qzeros,
|
225 |
+
self.group_size,
|
226 |
+
self.half_indim,
|
227 |
+
)
|
228 |
+
elif self.bits == 3:
|
229 |
+
self.autogptq_cuda.vecquant3matmul_faster_old(
|
230 |
+
x,
|
231 |
+
self.qweight,
|
232 |
+
out,
|
233 |
+
self.scales.float(),
|
234 |
+
self.qzeros,
|
235 |
+
self.group_size,
|
236 |
+
self.half_indim,
|
237 |
+
)
|
238 |
+
elif self.bits == 4:
|
239 |
+
self.autogptq_cuda.vecquant4matmul_faster_old(
|
240 |
+
x,
|
241 |
+
self.qweight,
|
242 |
+
out,
|
243 |
+
self.scales.float(),
|
244 |
+
self.qzeros,
|
245 |
+
self.group_size,
|
246 |
+
self.half_indim,
|
247 |
+
)
|
248 |
+
|
249 |
+
else:
|
250 |
+
raise NotImplementedError("Only 2,3,4 bits are supported.")
|
251 |
+
else:
|
252 |
+
x = x.to(torch.float32) # This is required for autocast compatibility.
|
253 |
+
if self.bits == 2:
|
254 |
+
self.autogptq_cuda.vecquant2matmul_old(
|
255 |
+
x,
|
256 |
+
self.qweight,
|
257 |
+
out,
|
258 |
+
self.scales.float(),
|
259 |
+
self.qzeros,
|
260 |
+
self.group_size,
|
261 |
+
)
|
262 |
+
elif self.bits == 3:
|
263 |
+
self.autogptq_cuda.vecquant3matmul_old(
|
264 |
+
x,
|
265 |
+
self.qweight,
|
266 |
+
out,
|
267 |
+
self.scales.float(),
|
268 |
+
self.qzeros,
|
269 |
+
self.group_size,
|
270 |
+
)
|
271 |
+
elif self.bits == 4:
|
272 |
+
self.autogptq_cuda.vecquant4matmul_old(
|
273 |
+
x,
|
274 |
+
self.qweight,
|
275 |
+
out,
|
276 |
+
self.scales.float(),
|
277 |
+
self.qzeros,
|
278 |
+
self.group_size,
|
279 |
+
)
|
280 |
+
elif self.bits == 8:
|
281 |
+
self.autogptq_cuda.vecquant8matmul_old(
|
282 |
+
x,
|
283 |
+
self.qweight,
|
284 |
+
out,
|
285 |
+
self.scales.float(),
|
286 |
+
self.qzeros,
|
287 |
+
self.group_size,
|
288 |
+
)
|
289 |
+
else:
|
290 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
291 |
+
else:
|
292 |
+
if self.wf.device != self.qzeros.device:
|
293 |
+
self.wf = self.wf.to(self.qzeros.device)
|
294 |
+
|
295 |
+
if self.bits in [2, 4, 8]:
|
296 |
+
zeros = torch.bitwise_right_shift(
|
297 |
+
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
|
298 |
+
self.wf.unsqueeze(0),
|
299 |
+
).to(torch.int16 if self.bits == 8 else torch.int8)
|
300 |
+
|
301 |
+
zeros = zeros + 1
|
302 |
+
zeros = torch.bitwise_and(
|
303 |
+
zeros, (2**self.bits) - 1
|
304 |
+
) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
|
305 |
+
|
306 |
+
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
|
307 |
+
|
308 |
+
scales = self.scales
|
309 |
+
scales = scales.reshape(-1, 1, scales.shape[-1])
|
310 |
+
|
311 |
+
weight = torch.bitwise_right_shift(
|
312 |
+
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
|
313 |
+
self.wf.unsqueeze(-1),
|
314 |
+
).to(torch.int16 if self.bits == 8 else torch.int8)
|
315 |
+
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
|
316 |
+
weight = weight.reshape(-1, self.group_size, weight.shape[2])
|
317 |
+
elif self.bits == 3:
|
318 |
+
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
|
319 |
+
-1, -1, -1, 12
|
320 |
+
)
|
321 |
+
zeros = zeros >> self.wf.unsqueeze(0)
|
322 |
+
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
|
323 |
+
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
|
324 |
+
zeros = zeros & 0x7
|
325 |
+
zeros = torch.cat(
|
326 |
+
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
|
327 |
+
dim=2,
|
328 |
+
)
|
329 |
+
|
330 |
+
zeros = zeros + 1
|
331 |
+
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
|
332 |
+
|
333 |
+
scales = self.scales
|
334 |
+
scales = scales.reshape(-1, 1, scales.shape[-1])
|
335 |
+
|
336 |
+
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
|
337 |
+
-1, -1, 12, -1
|
338 |
+
)
|
339 |
+
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
|
340 |
+
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
|
341 |
+
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
|
342 |
+
weight = weight & 0x7
|
343 |
+
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
|
344 |
+
weight = weight.reshape(-1, self.group_size, weight.shape[2])
|
345 |
+
else:
|
346 |
+
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
|
347 |
+
|
348 |
+
weight = scales * (weight - zeros)
|
349 |
+
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
|
350 |
+
out = torch.matmul(x, weight)
|
351 |
+
out = out.to(dtype=x_dtype).reshape(
|
352 |
+
out_shape
|
353 |
+
) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
|
354 |
+
out = out + self.bias if self.bias is not None else out
|
355 |
+
return out
|
356 |
+
|
357 |
+
|
358 |
+
__all__ = ["QuantLinear"]
|
internal/donttouch_unpacking_autogptq/quantizer.py
ADDED
@@ -0,0 +1,816 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import json
|
16 |
+
import os
|
17 |
+
from enum import Enum
|
18 |
+
from logging import getLogger
|
19 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch import nn
|
23 |
+
from tqdm.auto import tqdm
|
24 |
+
from transformers import AutoTokenizer
|
25 |
+
from transformers.pytorch_utils import Conv1D
|
26 |
+
from transformers.utils.quantization_config import QuantizationMethod
|
27 |
+
|
28 |
+
from ..utils import is_accelerate_available, is_auto_gptq_available
|
29 |
+
from ..utils.modeling_utils import recurse_getattr
|
30 |
+
from .constants import GPTQ_CONFIG
|
31 |
+
from .data import get_dataset, prepare_dataset
|
32 |
+
from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
|
33 |
+
from collections import OrderedDict
|
34 |
+
|
35 |
+
if is_accelerate_available():
|
36 |
+
from accelerate import (
|
37 |
+
cpu_offload_with_hook,
|
38 |
+
load_checkpoint_and_dispatch,
|
39 |
+
)
|
40 |
+
from accelerate.hooks import remove_hook_from_module
|
41 |
+
|
42 |
+
if is_auto_gptq_available():
|
43 |
+
from auto_gptq import exllama_set_max_input_length
|
44 |
+
from auto_gptq.modeling._utils import autogptq_post_init
|
45 |
+
from auto_gptq.quantization import GPTQ
|
46 |
+
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
|
47 |
+
|
48 |
+
logger = getLogger(__name__)
|
49 |
+
|
50 |
+
|
51 |
+
class ExllamaVersion(int, Enum):
|
52 |
+
ONE = 1
|
53 |
+
TWO = 2
|
54 |
+
|
55 |
+
|
56 |
+
class GPTQQuantizer(object):
|
57 |
+
r"""
|
58 |
+
A simple API for GPTQ Quantization
|
59 |
+
"""
|
60 |
+
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
bits: int,
|
64 |
+
dataset: Optional[Union[List[str], str]] = None,
|
65 |
+
group_size: int = 128,
|
66 |
+
damp_percent: float = 0.1,
|
67 |
+
desc_act: bool = False,
|
68 |
+
sym: bool = True,
|
69 |
+
true_sequential: bool = True,
|
70 |
+
use_cuda_fp16: bool = False,
|
71 |
+
model_seqlen: Optional[int] = None,
|
72 |
+
block_name_to_quantize: Optional[str] = None,
|
73 |
+
module_name_preceding_first_block: Optional[List[str]] = None,
|
74 |
+
batch_size: int = 1,
|
75 |
+
pad_token_id: Optional[int] = None,
|
76 |
+
disable_exllama: bool = False,
|
77 |
+
exllama_config: Dict[str, Any] = None,
|
78 |
+
max_input_length: Optional[int] = None,
|
79 |
+
cache_block_outputs: Optional[bool] = True,
|
80 |
+
modules_in_block_to_quantize: Optional[List[List[str]]] = None,
|
81 |
+
*args,
|
82 |
+
**kwargs,
|
83 |
+
):
|
84 |
+
"""
|
85 |
+
Args:
|
86 |
+
bits (`int`):
|
87 |
+
The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
|
88 |
+
dataset (`Union[List[str], str, Any]`, defaults to `None`):
|
89 |
+
The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
|
90 |
+
(e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
|
91 |
+
or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
|
92 |
+
group_size (int, defaults to 128):
|
93 |
+
The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
|
94 |
+
damp_percent (`float`, defaults to `0.1`):
|
95 |
+
The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
|
96 |
+
desc_act (`bool`, defaults to `False`):
|
97 |
+
Whether to quantize columns in order of decreasing activation size.
|
98 |
+
Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
|
99 |
+
Also known as act-order.
|
100 |
+
sym (`bool`, defaults to `True`):
|
101 |
+
Whether to use symetric quantization.
|
102 |
+
true_sequential (`bool`, defaults to `True`):
|
103 |
+
Whether to perform sequential quantization even within a single Transformer block.
|
104 |
+
Instead of quantizing the entire block at once, we perform layer-wise quantization.
|
105 |
+
As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
|
106 |
+
use_cuda_fp16 (`bool`, defaults to `False`):
|
107 |
+
Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
|
108 |
+
model_seqlen (`Optional[int]`, defaults to `None`):
|
109 |
+
The maximum sequence length that the model can take.
|
110 |
+
block_name_to_quantize (`Optional[str]`, defaults to `None`):
|
111 |
+
The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
|
112 |
+
module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
|
113 |
+
The layers that are preceding the first Transformer block.
|
114 |
+
batch_size (`int`, defaults to `1`):
|
115 |
+
The batch size of the dataset
|
116 |
+
pad_token_id (`Optional[int]`, defaults to `None`):
|
117 |
+
The pad token id. Needed to prepare the dataset when `batch_size` > 1.
|
118 |
+
disable_exllama (`bool`, defaults to `False`):
|
119 |
+
Whether to use exllama backend. Only works with `bits` = 4.
|
120 |
+
exllama_config (`Dict[str, Any]`, *optional*):
|
121 |
+
The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
|
122 |
+
max_input_length (`Optional[int]`, defaults to `None`):
|
123 |
+
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
|
124 |
+
It is specific to the exllama backend with act-order.
|
125 |
+
cache_block_outputs (`bool`, defaults to `True`):
|
126 |
+
Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
|
127 |
+
(e.g. ChatGLM) but can require more time.
|
128 |
+
modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
|
129 |
+
List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
|
130 |
+
The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
|
131 |
+
If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
|
132 |
+
"""
|
133 |
+
|
134 |
+
self.bits = bits
|
135 |
+
self.dataset = dataset
|
136 |
+
self.group_size = group_size
|
137 |
+
self.damp_percent = damp_percent
|
138 |
+
self.desc_act = desc_act
|
139 |
+
self.sym = sym
|
140 |
+
self.true_sequential = true_sequential
|
141 |
+
self.use_cuda_fp16 = use_cuda_fp16
|
142 |
+
self.model_seqlen = model_seqlen
|
143 |
+
self.block_name_to_quantize = block_name_to_quantize
|
144 |
+
self.module_name_preceding_first_block = module_name_preceding_first_block
|
145 |
+
self.batch_size = batch_size
|
146 |
+
self.pad_token_id = pad_token_id
|
147 |
+
self.disable_exllama = disable_exllama
|
148 |
+
self.exllama_config = exllama_config
|
149 |
+
self.max_input_length = max_input_length
|
150 |
+
self.quant_method = QuantizationMethod.GPTQ
|
151 |
+
self.cache_block_outputs = cache_block_outputs
|
152 |
+
self.modules_in_block_to_quantize = modules_in_block_to_quantize
|
153 |
+
|
154 |
+
self.serialization_keys = [
|
155 |
+
"bits",
|
156 |
+
"dataset",
|
157 |
+
"group_size",
|
158 |
+
"damp_percent",
|
159 |
+
"desc_act",
|
160 |
+
"sym",
|
161 |
+
"true_sequential",
|
162 |
+
"quant_method",
|
163 |
+
"modules_in_block_to_quantize",
|
164 |
+
]
|
165 |
+
|
166 |
+
if self.bits not in [2, 3, 4, 8]:
|
167 |
+
raise ValueError("only support quantize to [2,3,4,8] bits.")
|
168 |
+
if self.group_size != -1 and self.group_size <= 0:
|
169 |
+
raise ValueError("group_size must be greater than 0 or equal to -1")
|
170 |
+
if not (0 < self.damp_percent < 1):
|
171 |
+
raise ValueError("damp_percent must between 0 and 1.")
|
172 |
+
|
173 |
+
if self.exllama_config is None:
|
174 |
+
self.exllama_config = {"version": ExllamaVersion.TWO}
|
175 |
+
else:
|
176 |
+
if "version" not in self.exllama_config:
|
177 |
+
raise ValueError("`exllama_config` needs to have a `version` key")
|
178 |
+
elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
|
179 |
+
version = self.exllama_config["version"]
|
180 |
+
raise ValueError(
|
181 |
+
f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
|
182 |
+
)
|
183 |
+
self.exllama_version = self.exllama_config["version"]
|
184 |
+
|
185 |
+
def to_dict(self):
|
186 |
+
"""
|
187 |
+
Returns the args in dict format.
|
188 |
+
"""
|
189 |
+
gptq_dict = {}
|
190 |
+
for key in self.serialization_keys:
|
191 |
+
gptq_dict[key] = getattr(self, key)
|
192 |
+
return gptq_dict
|
193 |
+
|
194 |
+
@classmethod
|
195 |
+
def from_dict(cls, config_dict: Dict[str, Any]):
|
196 |
+
"""
|
197 |
+
Instantiates a `GPTQQuantizer` using config_dict as kwargs
|
198 |
+
|
199 |
+
Args:
|
200 |
+
config_dict (`Dict[str,Any]`):
|
201 |
+
quantization config
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
`GPTQQuantizer`: The quantizer object instantiated from those parameters.
|
205 |
+
"""
|
206 |
+
return cls(**config_dict)
|
207 |
+
|
208 |
+
def convert_model(self, model: nn.Module):
|
209 |
+
"""
|
210 |
+
Convert the model to a GPTQ model by getting and replacing the layers.
|
211 |
+
|
212 |
+
Args:
|
213 |
+
model (`nn.Module`):
|
214 |
+
Model to be converted
|
215 |
+
|
216 |
+
"""
|
217 |
+
if self.block_name_to_quantize is None:
|
218 |
+
self.block_name_to_quantize = get_block_name_with_pattern(model)
|
219 |
+
block_name = self.block_name_to_quantize
|
220 |
+
layers_to_be_replaced = get_layers(model, prefix=block_name)
|
221 |
+
if self.modules_in_block_to_quantize is not None:
|
222 |
+
layers_to_keep = sum(self.modules_in_block_to_quantize, [])
|
223 |
+
for name in list(layers_to_be_replaced.keys()):
|
224 |
+
if not any(name.endswith(layer) for layer in layers_to_keep):
|
225 |
+
logger.info(
|
226 |
+
f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
|
227 |
+
)
|
228 |
+
del layers_to_be_replaced[name]
|
229 |
+
self._replace_by_quant_layers(model, layers_to_be_replaced)
|
230 |
+
return model
|
231 |
+
|
232 |
+
def get_no_split_module_classes(self, model):
|
233 |
+
"""
|
234 |
+
Get the modules that should not be split across multiple devices.
|
235 |
+
Args:
|
236 |
+
model (`nn.Module`):
|
237 |
+
The input model
|
238 |
+
"""
|
239 |
+
|
240 |
+
block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
|
241 |
+
no_split_module_classes = [block_class_name]
|
242 |
+
return no_split_module_classes
|
243 |
+
|
244 |
+
def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
|
245 |
+
"""
|
246 |
+
Replaces linear layers in `module` by `QuantLinear`
|
247 |
+
|
248 |
+
Args:
|
249 |
+
module (`nn.Module`):
|
250 |
+
Module to quantize
|
251 |
+
names (`List[str]`):
|
252 |
+
List of names of the module to quantize
|
253 |
+
name (`str`, defaults to `""`):
|
254 |
+
To keep track of the name of the current module
|
255 |
+
"""
|
256 |
+
QuantLinear = dynamically_import_QuantLinear(
|
257 |
+
use_triton=False,
|
258 |
+
desc_act=self.desc_act,
|
259 |
+
group_size=self.group_size,
|
260 |
+
bits=self.bits,
|
261 |
+
disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
|
262 |
+
disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
|
263 |
+
)
|
264 |
+
if isinstance(module, QuantLinear):
|
265 |
+
return
|
266 |
+
for attr in dir(module):
|
267 |
+
layer = getattr(module, attr)
|
268 |
+
name1 = name + "." + attr if name != "" else attr
|
269 |
+
if name1 in names:
|
270 |
+
device = get_device(layer)
|
271 |
+
delattr(module, attr)
|
272 |
+
if isinstance(layer, nn.Linear):
|
273 |
+
in_features = layer.in_features
|
274 |
+
out_features = layer.out_features
|
275 |
+
elif isinstance(layer, nn.Conv2d):
|
276 |
+
in_features = layer.in_channels
|
277 |
+
out_features = layer.out_channels
|
278 |
+
elif isinstance(layer, Conv1D):
|
279 |
+
in_features = layer.weight.shape[0]
|
280 |
+
out_features = layer.weight.shape[1]
|
281 |
+
bias = layer.bias is not None
|
282 |
+
if not (self.desc_act) or self.group_size == -1:
|
283 |
+
new_layer = QuantLinear(
|
284 |
+
self.bits,
|
285 |
+
self.group_size,
|
286 |
+
in_features,
|
287 |
+
out_features,
|
288 |
+
bias,
|
289 |
+
use_cuda_fp16=self.use_cuda_fp16,
|
290 |
+
weight_dtype=layer.weight.dtype,
|
291 |
+
)
|
292 |
+
else:
|
293 |
+
new_layer = QuantLinear(
|
294 |
+
self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
|
295 |
+
)
|
296 |
+
new_layer.device = device
|
297 |
+
setattr(module, attr, new_layer.to(device))
|
298 |
+
for name1, child in module.named_children():
|
299 |
+
self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
|
300 |
+
|
301 |
+
@torch.no_grad()
|
302 |
+
def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
|
303 |
+
"""
|
304 |
+
Quantizes the model using the dataset
|
305 |
+
|
306 |
+
Args:
|
307 |
+
model (`nn.Module`):
|
308 |
+
The model to quantize
|
309 |
+
tokenizer (Optional[`Any`], defaults to `None`):
|
310 |
+
The tokenizer to use in order to prepare the dataset. You can pass either:
|
311 |
+
- A custom tokenizer object.
|
312 |
+
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
|
313 |
+
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
314 |
+
user or organization name, like `dbmdz/bert-base-german-cased`.
|
315 |
+
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
|
316 |
+
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
|
317 |
+
Returns:
|
318 |
+
`nn.Module`: The quantized model
|
319 |
+
"""
|
320 |
+
|
321 |
+
if not is_auto_gptq_available():
|
322 |
+
raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
|
323 |
+
if not torch.cuda.is_available():
|
324 |
+
raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
|
325 |
+
|
326 |
+
model.eval()
|
327 |
+
|
328 |
+
# For Transformer model
|
329 |
+
has_config = False
|
330 |
+
has_device_map = False
|
331 |
+
if hasattr(model, "config"):
|
332 |
+
has_config = True
|
333 |
+
use_cache = model.config.use_cache
|
334 |
+
model.config.use_cache = False
|
335 |
+
|
336 |
+
# If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
|
337 |
+
if hasattr(model, "hf_device_map"):
|
338 |
+
devices = list(model.hf_device_map.values())
|
339 |
+
has_device_map = True
|
340 |
+
if "disk" in devices:
|
341 |
+
raise ValueError("disk offload is not supported with GPTQ quantization")
|
342 |
+
if "cpu" in devices or torch.device("cpu") in devices:
|
343 |
+
if len(model.hf_device_map) > 1:
|
344 |
+
logger.info("Cpu offload is not recommended. There might be some issues with the memory")
|
345 |
+
hook = None
|
346 |
+
for name, device in model.hf_device_map.items():
|
347 |
+
if device == "cpu":
|
348 |
+
module = recurse_getattr(model, name)
|
349 |
+
remove_hook_from_module(module, recurse=True)
|
350 |
+
module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
|
351 |
+
else:
|
352 |
+
has_device_map = False
|
353 |
+
|
354 |
+
if hasattr(model, "dtype"):
|
355 |
+
self.use_cuda_fp16 = model.dtype == torch.float16
|
356 |
+
|
357 |
+
if self.model_seqlen is None:
|
358 |
+
# We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
|
359 |
+
self.model_seqlen = min(4028, get_seqlen(model))
|
360 |
+
|
361 |
+
device = get_device(model)
|
362 |
+
|
363 |
+
# Step 1: Prepare the data
|
364 |
+
if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
|
365 |
+
dataset = self.dataset
|
366 |
+
logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
|
367 |
+
else:
|
368 |
+
if isinstance(tokenizer, str):
|
369 |
+
try:
|
370 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer)
|
371 |
+
except Exception:
|
372 |
+
raise ValueError(
|
373 |
+
f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
|
374 |
+
with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
|
375 |
+
For now, we only support quantization for text models. Support for vision, speech and multimodal models will come later."""
|
376 |
+
)
|
377 |
+
if self.dataset is None:
|
378 |
+
raise ValueError("You need to pass `dataset` in order to quantize your model")
|
379 |
+
elif isinstance(self.dataset, str):
|
380 |
+
dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
|
381 |
+
elif isinstance(self.dataset, list):
|
382 |
+
dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
|
383 |
+
else:
|
384 |
+
raise ValueError(
|
385 |
+
f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
|
386 |
+
)
|
387 |
+
|
388 |
+
dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
|
389 |
+
|
390 |
+
# Step 2: get the input of the 1st block
|
391 |
+
# To do that, we need to put the modules preceding the first block on the same device as the first block.
|
392 |
+
# Then we run the model and it will stop at the first block, since we added a pre-hook that raises an exception after storing the inputs.
|
393 |
+
|
394 |
+
layer_inputs = []
|
395 |
+
layer_outputs = []
|
396 |
+
layer_input_kwargs = []
|
397 |
+
|
398 |
+
if self.block_name_to_quantize is None:
|
399 |
+
self.block_name_to_quantize = get_block_name_with_pattern(model)
|
400 |
+
|
401 |
+
if self.module_name_preceding_first_block is None:
|
402 |
+
self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
|
403 |
+
|
404 |
+
blocks = recurse_getattr(model, self.block_name_to_quantize)
|
405 |
+
|
406 |
+
if not has_device_map:
|
407 |
+
# put modules from module_name_preceding_first_block on cuda
|
408 |
+
for module_name in self.module_name_preceding_first_block:
|
409 |
+
module = recurse_getattr(model, module_name)
|
410 |
+
if module is None:
|
411 |
+
raise ValueError(f"Module {module_name} was not found in model")
|
412 |
+
module = module.to(0)
|
413 |
+
blocks[0] = blocks[0].to(0)
|
414 |
+
|
415 |
+
def store_input_hook(_, input, *args):
|
416 |
+
kwargs = args[0]
|
417 |
+
if input is None:
|
418 |
+
if "hidden_states" in kwargs:
|
419 |
+
input = (kwargs["hidden_states"],)
|
420 |
+
else:
|
421 |
+
raise ValueError("No input value found in the foward pass")
|
422 |
+
layer_inputs.append(input)
|
423 |
+
other_kwargs = {}
|
424 |
+
for k, v in kwargs.items(): # make sure other arguments also be captured
|
425 |
+
if k not in ["hidden_states"]:
|
426 |
+
other_kwargs[k] = v
|
427 |
+
layer_input_kwargs.append(other_kwargs)
|
428 |
+
raise ValueError  # intentionally interrupt the forward pass once the block inputs have been captured
|
429 |
+
|
430 |
+
if self.cache_block_outputs:
|
431 |
+
handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
|
432 |
+
for data in dataset:
|
433 |
+
for k, v in data.items():
|
434 |
+
# put the data on gpu, we won't put them back to cpu
|
435 |
+
data[k] = v.to(0)
|
436 |
+
try:
|
437 |
+
model(**data)
|
438 |
+
except ValueError:
|
439 |
+
pass
|
440 |
+
handle.remove()
|
441 |
+
|
442 |
+
if not has_device_map:
|
443 |
+
blocks[0].to(device)
|
444 |
+
for module_name in self.module_name_preceding_first_block:
|
445 |
+
module = recurse_getattr(model, module_name)
|
446 |
+
if module is None:
|
447 |
+
raise ValueError(f"Module {module_name} was not found in model")
|
448 |
+
|
449 |
+
torch.cuda.empty_cache()
|
450 |
+
|
451 |
+
# Step 3: Quantize the blocks
|
452 |
+
quantizers = {}
|
453 |
+
for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
|
454 |
+
logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
|
455 |
+
|
456 |
+
if not self.cache_block_outputs:
|
457 |
+
handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
|
458 |
+
for data in dataset:
|
459 |
+
for k, v in data.items():
|
460 |
+
# put the data on gpu, we won't put them back to cpu
|
461 |
+
data[k] = v.to(0)
|
462 |
+
try:
|
463 |
+
model(**data)
|
464 |
+
except ValueError:
|
465 |
+
pass
|
466 |
+
handle.remove()
|
467 |
+
|
468 |
+
# move block to cuda if needed
|
469 |
+
# in case we have offload modules, we need to put them on cuda because of GPTQ object
|
470 |
+
if not has_device_map or get_device(block) == torch.device("cpu"):
|
471 |
+
block = block.to(0)
|
472 |
+
layers = get_layers(block)
|
473 |
+
if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
|
474 |
+
if self.true_sequential:
|
475 |
+
layers_name_list = self.modules_in_block_to_quantize
|
476 |
+
else:
|
477 |
+
layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
|
478 |
+
else:
|
479 |
+
if self.true_sequential:
|
480 |
+
# lazy sequential but works well
|
481 |
+
layers_name_list = [[key] for key in layers.keys()]
|
482 |
+
else:
|
483 |
+
layers_name_list = [list(layers.keys())]
|
484 |
+
logger.info(f"Module to quantize {layers_name_list}")
|
485 |
+
for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
|
486 |
+
subset_layers = {name: layers[name] for name in subset_name_list}
|
487 |
+
gptq = {}
|
488 |
+
handles = []
|
489 |
+
# add hook for each layer in subset_layers
|
490 |
+
for name in subset_layers:
|
491 |
+
gptq[name] = GPTQ(subset_layers[name])
|
492 |
+
gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
|
493 |
+
|
494 |
+
def add_batch(name):
|
495 |
+
def tmp(_, input, output):
|
496 |
+
gptq[name].add_batch(input[0].data, output.data)
|
497 |
+
|
498 |
+
return tmp
|
499 |
+
|
500 |
+
# because adding a hook will replace the old one.
|
501 |
+
handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
|
502 |
+
# update Hessian for each layer in subset_layers thanks to the hook
|
503 |
+
for j in range(len(dataset)):
|
504 |
+
# the args are already on the gpu
|
505 |
+
# don't need to store the output
|
506 |
+
block(*layer_inputs[j], **layer_input_kwargs[j])
|
507 |
+
# remove hook
|
508 |
+
for h in handles:
|
509 |
+
h.remove()
|
510 |
+
for name in subset_name_list:
|
511 |
+
logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
|
512 |
+
scale, zero, g_idx = gptq[name].fasterquant(
|
513 |
+
percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
|
514 |
+
)
|
515 |
+
quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
|
516 |
+
gptq[name].quantizer,
|
517 |
+
scale,
|
518 |
+
zero,
|
519 |
+
g_idx,
|
520 |
+
)
|
521 |
+
gptq[name].free()
|
522 |
+
del subset_layers
|
523 |
+
# we get the new output from the partial quantized block
|
524 |
+
if self.cache_block_outputs:
|
525 |
+
for j in range(len(dataset)):
|
526 |
+
layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
|
527 |
+
layer_outputs.append(layer_output)
|
528 |
+
|
529 |
+
# put back to device
|
530 |
+
if not has_device_map:
|
531 |
+
blocks[i] = block.to(device)
|
532 |
+
del layers
|
533 |
+
del layer_inputs
|
534 |
+
layer_inputs, layer_outputs = layer_outputs, []
|
535 |
+
else:
|
536 |
+
del layers
|
537 |
+
del layer_inputs
|
538 |
+
layer_inputs = []
|
539 |
+
torch.cuda.empty_cache()
|
540 |
+
if i == 5:  # debug: stop after quantizing the first 6 blocks
|
541 |
+
break
|
542 |
+
|
543 |
+
if self.bits == 4:
|
544 |
+
# device not on gpu
|
545 |
+
if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
|
546 |
+
if not self.disable_exllama:
|
547 |
+
logger.warning(
|
548 |
+
"Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
|
549 |
+
)
|
550 |
+
self.disable_exllama = True
|
551 |
+
# act order and exllama
|
552 |
+
elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
|
553 |
+
logger.warning(
|
554 |
+
"Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
|
555 |
+
"Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
|
556 |
+
)
|
557 |
+
self.disable_exllama = True
|
558 |
+
elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
|
559 |
+
logger.warning(
|
560 |
+
"Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
|
561 |
+
"Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
|
562 |
+
)
|
563 |
+
self.disable_exllama = True
|
564 |
+
# Step 4: Pack the model at the end (Replacing the layers)
|
565 |
+
self.pack_model(model=model, quantizers=quantizers)
|
566 |
+
|
567 |
+
model.is_quantized = True
|
568 |
+
model.quantization_method = QuantizationMethod.GPTQ
|
569 |
+
if has_config:
|
570 |
+
model.config.use_cache = use_cache
|
571 |
+
model.config.quantization_config = self.to_dict()
|
572 |
+
|
573 |
+
# Step 5: Any post-initialization that require device information, for example buffers initialization on device.
|
574 |
+
model = self.post_init_model(model)
|
575 |
+
|
576 |
+
torch.cuda.empty_cache()
|
577 |
+
return model
|
578 |
+
|
579 |
+
def post_init_model(self, model):
|
580 |
+
"""
|
581 |
+
Post-initialization that require device information, for example buffers initialization on device.
|
582 |
+
|
583 |
+
Args:
|
584 |
+
model (`nn.Module`):
|
585 |
+
The input model
|
586 |
+
"""
|
587 |
+
if self.bits == 4 and not self.disable_exllama:
|
588 |
+
if get_device(model) == torch.device("cpu") or (
|
589 |
+
hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
|
590 |
+
):
|
591 |
+
raise ValueError(
|
592 |
+
"Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
|
593 |
+
"You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
|
594 |
+
)
|
595 |
+
|
596 |
+
class StoreAttr(object):
|
597 |
+
pass
|
598 |
+
|
599 |
+
model.quantize_config = StoreAttr()
|
600 |
+
model.quantize_config.desc_act = self.desc_act
|
601 |
+
model = autogptq_post_init(model, use_act_order=self.desc_act)
|
602 |
+
if (
|
603 |
+
self.desc_act
|
604 |
+
and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
|
605 |
+
and self.max_input_length is not None
|
606 |
+
):
|
607 |
+
model = exllama_set_max_input_length(model, self.max_input_length)
|
608 |
+
return model
|
609 |
+
|
610 |
+
def pack_model(
|
611 |
+
self,
|
612 |
+
model: nn.Module,
|
613 |
+
quantizers: Dict[str, Tuple],
|
614 |
+
):
|
615 |
+
"""
|
616 |
+
Pack the model by replacing the layers by quantized layers
|
617 |
+
|
618 |
+
Args:
|
619 |
+
model (`nn.Module`):
|
620 |
+
The model to pack
|
621 |
+
quantizers (`Dict[str,Tuple]`):
|
622 |
+
A mapping of the layer name and the data needed to pack the layer
|
623 |
+
"""
|
624 |
+
QuantLinear = dynamically_import_QuantLinear(
|
625 |
+
use_triton=False,
|
626 |
+
desc_act=self.desc_act,
|
627 |
+
group_size=self.group_size,
|
628 |
+
bits=self.bits,
|
629 |
+
disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
|
630 |
+
disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
|
631 |
+
)
|
632 |
+
logger.info("Packing model...")
|
633 |
+
layers = get_layers(model)
|
634 |
+
layers = {n: layers[n] for n in quantizers}
|
635 |
+
self._replace_by_quant_layers(model, quantizers)
|
636 |
+
qlayers = get_layers(model, [QuantLinear])
|
637 |
+
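# collect, per layer, the raw tensors before packing ("prepack") and the packed buffers ("pack") so they can be inspected offline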
autogptq_blobs = OrderedDict()
|
638 |
+
for i, name in enumerate(qlayers):
|
639 |
+
logger.info(name)
|
640 |
+
quantizers[name], scale, zero, g_idx = quantizers[name]
|
641 |
+
# so far can only pack layer on CPU
|
642 |
+
layer_device = qlayers[name].device
|
643 |
+
qlayers[name].to("cpu")
|
644 |
+
layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
|
645 |
+
autogptq_blobs[name] = {
|
646 |
+
"prepack": dict(
|
647 |
+
w=layers[name].weight,
|
648 |
+
b=layers[name].bias,
|
649 |
+
scale=scale,
|
650 |
+
zero=zero,
|
651 |
+
g_idx=g_idx
|
652 |
+
)
|
653 |
+
}
|
654 |
+
qlayers[name].pack(layers[name], scale, zero, g_idx)
|
655 |
+
autogptq_blobs[name]["pack"] = dict(
|
656 |
+
qweight=qlayers[name].qweight,
|
657 |
+
bias=qlayers[name].bias,
|
658 |
+
scales=qlayers[name].scales,
|
659 |
+
qzeros=qlayers[name].qzeros,
|
660 |
+
g_idx=qlayers[name].g_idx,
|
661 |
+
intweight=qlayers[name].intweight
|
662 |
+
)
|
663 |
+
qlayers[name].to(layer_device)
|
664 |
+
if i == 5:  # debug: only dump the first 6 quantized layers
|
665 |
+
break
|
666 |
+
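# dump the captured pre-pack/packed tensors for offline unpacking and verification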
torch.save(autogptq_blobs, "./opt-125m-gptq4.pth")
|
667 |
+
exit()  # stop after dumping the debug blobs; the remaining packing steps are intentionally skipped
|
668 |
+
logger.info("Model packed.")
|
669 |
+
|
670 |
+
def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
|
671 |
+
"""
|
672 |
+
Save model state dict and configs
|
673 |
+
|
674 |
+
Args:
|
675 |
+
model (`nn.Module`):
|
676 |
+
Model to be saved. The model can be wrapped or unwrapped.
|
677 |
+
save_dir (`str`):
|
678 |
+
Directory to which to save. Will be created if it doesn't exist.
|
679 |
+
max_shard_size (`str`, defaults to `"10GB"`):
|
680 |
+
The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of size
|
681 |
+
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
|
682 |
+
<Tip warning={true}>
|
683 |
+
|
684 |
+
If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
|
685 |
+
which will be bigger than `max_shard_size`.
|
686 |
+
|
687 |
+
</Tip>
|
688 |
+
safe_serialization (`bool`, defaults to `True`):
|
689 |
+
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
|
690 |
+
|
691 |
+
"""
|
692 |
+
os.makedirs(save_dir, exist_ok=True)
|
693 |
+
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
|
694 |
+
with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
|
695 |
+
json.dump(self.to_dict(), f, indent=2)
|
696 |
+
|
697 |
+
|
698 |
+
def load_quantized_model(
|
699 |
+
model: nn.Module,
|
700 |
+
save_folder: str,
|
701 |
+
quant_config_name: str = GPTQ_CONFIG,
|
702 |
+
state_dict_name: Optional[str] = None,
|
703 |
+
device_map: Optional[str] = None,
|
704 |
+
max_memory: Optional[Dict] = None,
|
705 |
+
no_split_module_classes: Optional[Dict] = None,
|
706 |
+
offload_folder: Optional[str] = None,
|
707 |
+
offload_buffers: Optional[str] = None,
|
708 |
+
offload_state_dict: bool = False,
|
709 |
+
disable_exllama: bool = False,
|
710 |
+
exllama_config: Optional[Dict[str, Any]] = None,
|
711 |
+
max_input_length: Optional[int] = None,
|
712 |
+
):
|
713 |
+
"""
|
714 |
+
Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
|
715 |
+
|
716 |
+
Args:
|
717 |
+
model (`nn.Module`):
|
718 |
+
The model can be empty or not.
|
719 |
+
save_folder (`str`):
|
720 |
+
Directory from which to load the weights.
|
721 |
+
quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
|
722 |
+
Name of the quantization config file
|
723 |
+
state_dict_name (`Optional[str]`, defaults to `None`):
|
724 |
+
Name of the state dict file
|
725 |
+
device_map (`Optional[str]`, defaults to `None`):
|
726 |
+
A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
|
727 |
+
name, once a given module name is inside, every submodule of it will be sent to the same device.
|
728 |
+
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
|
729 |
+
max_memory (`Optional[Dict]`, defaults to `None`):
|
730 |
+
A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available for each GPU
|
731 |
+
and the available CPU RAM if unset.
|
732 |
+
no_split_module_classes (`Optional[Dict]`, defaults to `None`):
|
733 |
+
A list of layer class names that should never be split across devices (for instance any layer that has a
|
734 |
+
residual connection).
|
735 |
+
offload_folder (`Optional[str]`, defaults to `None`):
|
736 |
+
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
|
737 |
+
offload_buffers (`Optional[str]`, defaults to `None`):
|
738 |
+
In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
|
739 |
+
well as the parameters.
|
740 |
+
offload_state_dict (`bool`, defaults to `False`):
|
741 |
+
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
|
742 |
+
the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
|
743 |
+
picked contains `"disk"` values.
|
744 |
+
disable_exllama (`bool`, defaults to `False`):
|
745 |
+
Whether to use exllama backend. Only works with `bits` = 4.
|
746 |
+
exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
|
747 |
+
The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
|
748 |
+
max_input_length (`Optional[int]`, defaults to `None`):
|
749 |
+
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
|
750 |
+
It is specific to the exllama backend with act-order.
|
751 |
+
|
752 |
+
Returns:
|
753 |
+
`nn.Module`: The quantized model
|
754 |
+
"""
|
755 |
+
if not torch.cuda.is_available():
|
756 |
+
raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
|
757 |
+
if not is_auto_gptq_available():
|
758 |
+
raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
|
759 |
+
if not is_accelerate_available():
|
760 |
+
raise RuntimeError(
|
761 |
+
"You need to install accelerate in order to load and dispatch weights to"
|
762 |
+
"a quantized model. You can do it with `pip install accelerate`"
|
763 |
+
)
|
764 |
+
if device_map is None:
|
765 |
+
device_map = {"": torch.cuda.current_device()}
|
766 |
+
logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")
|
767 |
+
|
768 |
+
if exllama_config is None:
|
769 |
+
exllama_config = {"version": ExllamaVersion.TWO}
|
770 |
+
else:
|
771 |
+
if "version" not in exllama_config:
|
772 |
+
raise ValueError("`exllama_config` needs to have a `version` key")
|
773 |
+
elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
|
774 |
+
version = exllama_config["version"]
|
775 |
+
raise ValueError(
|
776 |
+
f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
|
777 |
+
)
|
778 |
+
|
779 |
+
# this branch will check if model is from huggingface
|
780 |
+
try:
|
781 |
+
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
|
782 |
+
quantize_config_dict = model.config.quantization_config.to_dict()
|
783 |
+
else:
|
784 |
+
with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
|
785 |
+
quantize_config_dict = json.load(f)
|
786 |
+
except Exception as err:
|
787 |
+
raise ValueError(
|
788 |
+
f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
|
789 |
+
) from err
|
790 |
+
quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
|
791 |
+
quantizer.disable_exllama = disable_exllama
|
792 |
+
quantizer.exllama_config = exllama_config
|
793 |
+
quantizer.exllama_version = quantizer.exllama_config["version"]
|
794 |
+
quantizer.max_input_length = max_input_length
|
795 |
+
|
796 |
+
model = quantizer.convert_model(model)
|
797 |
+
|
798 |
+
if no_split_module_classes is None:
|
799 |
+
no_split_module_classes = quantizer.get_no_split_module_classes(model)
|
800 |
+
|
801 |
+
model = load_checkpoint_and_dispatch(
|
802 |
+
model,
|
803 |
+
checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
|
804 |
+
device_map=device_map,
|
805 |
+
max_memory=max_memory,
|
806 |
+
no_split_module_classes=no_split_module_classes,
|
807 |
+
offload_folder=offload_folder,
|
808 |
+
offload_buffers=offload_buffers,
|
809 |
+
offload_state_dict=offload_state_dict,
|
810 |
+
)
|
811 |
+
|
812 |
+
model = quantizer.post_init_model(model)
|
813 |
+
model.is_quantized = True
|
814 |
+
model.quantization_method = QuantizationMethod.GPTQ
|
815 |
+
model.eval()
|
816 |
+
return model
|
internal/donttouch_unpacking_autogptq/quantizer.py.ori.py
ADDED
@@ -0,0 +1,793 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import json
|
16 |
+
import os
|
17 |
+
from enum import Enum
|
18 |
+
from logging import getLogger
|
19 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch import nn
|
23 |
+
from tqdm.auto import tqdm
|
24 |
+
from transformers import AutoTokenizer
|
25 |
+
from transformers.pytorch_utils import Conv1D
|
26 |
+
from transformers.utils.quantization_config import QuantizationMethod
|
27 |
+
|
28 |
+
from ..utils import is_accelerate_available, is_auto_gptq_available
|
29 |
+
from ..utils.modeling_utils import recurse_getattr
|
30 |
+
from .constants import GPTQ_CONFIG
|
31 |
+
from .data import get_dataset, prepare_dataset
|
32 |
+
from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
|
33 |
+
|
34 |
+
|
35 |
+
if is_accelerate_available():
|
36 |
+
from accelerate import (
|
37 |
+
cpu_offload_with_hook,
|
38 |
+
load_checkpoint_and_dispatch,
|
39 |
+
)
|
40 |
+
from accelerate.hooks import remove_hook_from_module
|
41 |
+
|
42 |
+
if is_auto_gptq_available():
|
43 |
+
from auto_gptq import exllama_set_max_input_length
|
44 |
+
from auto_gptq.modeling._utils import autogptq_post_init
|
45 |
+
from auto_gptq.quantization import GPTQ
|
46 |
+
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
|
47 |
+
|
48 |
+
logger = getLogger(__name__)
|
49 |
+
|
50 |
+
|
51 |
+
class ExllamaVersion(int, Enum):
|
52 |
+
ONE = 1
|
53 |
+
TWO = 2
|
54 |
+
|
55 |
+
|
56 |
+
class GPTQQuantizer(object):
|
57 |
+
r"""
|
58 |
+
A simple API for GPTQ Quantization
|
59 |
+
"""
|
60 |
+
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
bits: int,
|
64 |
+
dataset: Optional[Union[List[str], str]] = None,
|
65 |
+
group_size: int = 128,
|
66 |
+
damp_percent: float = 0.1,
|
67 |
+
desc_act: bool = False,
|
68 |
+
sym: bool = True,
|
69 |
+
true_sequential: bool = True,
|
70 |
+
use_cuda_fp16: bool = False,
|
71 |
+
model_seqlen: Optional[int] = None,
|
72 |
+
block_name_to_quantize: Optional[str] = None,
|
73 |
+
module_name_preceding_first_block: Optional[List[str]] = None,
|
74 |
+
batch_size: int = 1,
|
75 |
+
pad_token_id: Optional[int] = None,
|
76 |
+
disable_exllama: bool = False,
|
77 |
+
exllama_config: Dict[str, Any] = None,
|
78 |
+
max_input_length: Optional[int] = None,
|
79 |
+
cache_block_outputs: Optional[bool] = True,
|
80 |
+
modules_in_block_to_quantize: Optional[List[List[str]]] = None,
|
81 |
+
*args,
|
82 |
+
**kwargs,
|
83 |
+
):
|
84 |
+
"""
|
85 |
+
Args:
|
86 |
+
bits (`int`):
|
87 |
+
The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
|
88 |
+
dataset (`Union[List[str], str, Any]`, defaults to `None`):
|
89 |
+
The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
|
90 |
+
(e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
|
91 |
+
or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
|
92 |
+
group_size (int, defaults to 128):
|
93 |
+
The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
|
94 |
+
damp_percent (`float`, defaults to `0.1`):
|
95 |
+
The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
|
96 |
+
desc_act (`bool`, defaults to `False`):
|
97 |
+
Whether to quantize columns in order of decreasing activation size.
|
98 |
+
Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
|
99 |
+
Also known as act-order.
|
100 |
+
sym (`bool`, defaults to `True`):
|
101 |
+
Whether to use symmetric quantization.
|
102 |
+
true_sequential (`bool`, defaults to `True`):
|
103 |
+
Whether to perform sequential quantization even within a single Transformer block.
|
104 |
+
Instead of quantizing the entire block at once, we perform layer-wise quantization.
|
105 |
+
As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
|
106 |
+
use_cuda_fp16 (`bool`, defaults to `False`):
|
107 |
+
Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
|
108 |
+
model_seqlen (`Optional[int]`, defaults to `None`):
|
109 |
+
The maximum sequence length that the model can take.
|
110 |
+
block_name_to_quantize (`Optional[str]`, defaults to `None`):
|
111 |
+
The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
|
112 |
+
module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
|
113 |
+
The layers that are preceding the first Transformer block.
|
114 |
+
batch_size (`int`, defaults to `1`):
|
115 |
+
The batch size of the dataset
|
116 |
+
pad_token_id (`Optional[int]`, defaults to `None`):
|
117 |
+
The pad token id. Needed to prepare the dataset when `batch_size` > 1.
|
118 |
+
disable_exllama (`bool`, defaults to `False`):
|
119 |
+
Whether to use exllama backend. Only works with `bits` = 4.
|
120 |
+
exllama_config (`Dict[str, Any]`, *optional*):
|
121 |
+
The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
|
122 |
+
max_input_length (`Optional[int]`, defaults to `None`):
|
123 |
+
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
|
124 |
+
It is specific to the exllama backend with act-order.
|
125 |
+
cache_block_outputs (`bool`, defaults to `True`):
|
126 |
+
Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
|
127 |
+
(e.g. ChatGLM) but can require more time.
|
128 |
+
modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
|
129 |
+
List of lists of module names to quantize in the specified block. This argument is useful to exclude certain linear modules from being quantized.
|
130 |
+
The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
|
131 |
+
If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
|
132 |
+
"""
|
133 |
+
|
134 |
+
self.bits = bits
|
135 |
+
self.dataset = dataset
|
136 |
+
self.group_size = group_size
|
137 |
+
self.damp_percent = damp_percent
|
138 |
+
self.desc_act = desc_act
|
139 |
+
self.sym = sym
|
140 |
+
self.true_sequential = true_sequential
|
141 |
+
self.use_cuda_fp16 = use_cuda_fp16
|
142 |
+
self.model_seqlen = model_seqlen
|
143 |
+
self.block_name_to_quantize = block_name_to_quantize
|
144 |
+
self.module_name_preceding_first_block = module_name_preceding_first_block
|
145 |
+
self.batch_size = batch_size
|
146 |
+
self.pad_token_id = pad_token_id
|
147 |
+
self.disable_exllama = disable_exllama
|
148 |
+
self.exllama_config = exllama_config
|
149 |
+
self.max_input_length = max_input_length
|
150 |
+
self.quant_method = QuantizationMethod.GPTQ
|
151 |
+
self.cache_block_outputs = cache_block_outputs
|
152 |
+
self.modules_in_block_to_quantize = modules_in_block_to_quantize
|
153 |
+
|
154 |
+
self.serialization_keys = [
|
155 |
+
"bits",
|
156 |
+
"dataset",
|
157 |
+
"group_size",
|
158 |
+
"damp_percent",
|
159 |
+
"desc_act",
|
160 |
+
"sym",
|
161 |
+
"true_sequential",
|
162 |
+
"quant_method",
|
163 |
+
"modules_in_block_to_quantize",
|
164 |
+
]
|
165 |
+
|
166 |
+
if self.bits not in [2, 3, 4, 8]:
|
167 |
+
raise ValueError("only support quantize to [2,3,4,8] bits.")
|
168 |
+
if self.group_size != -1 and self.group_size <= 0:
|
169 |
+
raise ValueError("group_size must be greater than 0 or equal to -1")
|
170 |
+
if not (0 < self.damp_percent < 1):
|
171 |
+
raise ValueError("damp_percent must between 0 and 1.")
|
172 |
+
|
173 |
+
if self.exllama_config is None:
|
174 |
+
self.exllama_config = {"version": ExllamaVersion.TWO}
|
175 |
+
else:
|
176 |
+
if "version" not in self.exllama_config:
|
177 |
+
raise ValueError("`exllama_config` needs to have a `version` key")
|
178 |
+
elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
|
179 |
+
version = self.exllama_config["version"]
|
180 |
+
raise ValueError(
|
181 |
+
f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
|
182 |
+
)
|
183 |
+
self.exllama_version = self.exllama_config["version"]
|
184 |
+
|
185 |
+
def to_dict(self):
|
186 |
+
"""
|
187 |
+
Returns the args in dict format.
|
188 |
+
"""
|
189 |
+
gptq_dict = {}
|
190 |
+
for key in self.serialization_keys:
|
191 |
+
gptq_dict[key] = getattr(self, key)
|
192 |
+
return gptq_dict
|
193 |
+
|
194 |
+
@classmethod
|
195 |
+
def from_dict(cls, config_dict: Dict[str, Any]):
|
196 |
+
"""
|
197 |
+
Instantiates a `GPTQQuantizer` using config_dict as kwargs
|
198 |
+
|
199 |
+
Args:
|
200 |
+
config_dict (`Dict[str,Any]`):
|
201 |
+
quantization config
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
`GPTQQuantizer`: The quantizer object instantiated from those parameters.
|
205 |
+
"""
|
206 |
+
return cls(**config_dict)
|
207 |
+
|
208 |
+
def convert_model(self, model: nn.Module):
|
209 |
+
"""
|
210 |
+
Convert the model to a GPTQ model by getting and replacing the layers.
|
211 |
+
|
212 |
+
Args:
|
213 |
+
model (`nn.Module`):
|
214 |
+
Model to be converted
|
215 |
+
|
216 |
+
"""
|
217 |
+
if self.block_name_to_quantize is None:
|
218 |
+
self.block_name_to_quantize = get_block_name_with_pattern(model)
|
219 |
+
block_name = self.block_name_to_quantize
|
220 |
+
layers_to_be_replaced = get_layers(model, prefix=block_name)
|
221 |
+
if self.modules_in_block_to_quantize is not None:
|
222 |
+
layers_to_keep = sum(self.modules_in_block_to_quantize, [])
|
223 |
+
for name in list(layers_to_be_replaced.keys()):
|
224 |
+
if not any(name.endswith(layer) for layer in layers_to_keep):
|
225 |
+
logger.info(
|
226 |
+
f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
|
227 |
+
)
|
228 |
+
del layers_to_be_replaced[name]
|
229 |
+
self._replace_by_quant_layers(model, layers_to_be_replaced)
|
230 |
+
return model
|
231 |
+
|
232 |
+
def get_no_split_module_classes(self, model):
|
233 |
+
"""
|
234 |
+
Get the modules that should not be split across multiple devices.
|
235 |
+
Args:
|
236 |
+
model (`nn.Module`):
|
237 |
+
The input model
|
238 |
+
"""
|
239 |
+
|
240 |
+
block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
|
241 |
+
no_split_module_classes = [block_class_name]
|
242 |
+
return no_split_module_classes
|
243 |
+
|
244 |
+
def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
|
245 |
+
"""
|
246 |
+
Replaces linear layers in `module` by `QuantLinear`
|
247 |
+
|
248 |
+
Args:
|
249 |
+
module (`nn.Module`):
|
250 |
+
Module to quantize
|
251 |
+
names (`List[str]`):
|
252 |
+
List of names of the module to quantize
|
253 |
+
name (`str`, defaults to `""`):
|
254 |
+
To keep track of the name of the current module
|
255 |
+
"""
|
256 |
+
QuantLinear = dynamically_import_QuantLinear(
|
257 |
+
use_triton=False,
|
258 |
+
desc_act=self.desc_act,
|
259 |
+
group_size=self.group_size,
|
260 |
+
bits=self.bits,
|
261 |
+
disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
|
262 |
+
disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
|
263 |
+
)
|
264 |
+
if isinstance(module, QuantLinear):
|
265 |
+
return
|
266 |
+
for attr in dir(module):
|
267 |
+
layer = getattr(module, attr)
|
268 |
+
name1 = name + "." + attr if name != "" else attr
|
269 |
+
if name1 in names:
|
270 |
+
device = get_device(layer)
|
271 |
+
delattr(module, attr)
|
272 |
+
if isinstance(layer, nn.Linear):
|
273 |
+
in_features = layer.in_features
|
274 |
+
out_features = layer.out_features
|
275 |
+
elif isinstance(layer, nn.Conv2d):
|
276 |
+
in_features = layer.in_channels
|
277 |
+
out_features = layer.out_channels
|
278 |
+
elif isinstance(layer, Conv1D):
|
279 |
+
in_features = layer.weight.shape[0]
|
280 |
+
out_features = layer.weight.shape[1]
|
281 |
+
bias = layer.bias is not None
|
282 |
+
if not (self.desc_act) or self.group_size == -1:
|
283 |
+
new_layer = QuantLinear(
|
284 |
+
self.bits,
|
285 |
+
self.group_size,
|
286 |
+
in_features,
|
287 |
+
out_features,
|
288 |
+
bias,
|
289 |
+
use_cuda_fp16=self.use_cuda_fp16,
|
290 |
+
weight_dtype=layer.weight.dtype,
|
291 |
+
)
|
292 |
+
else:
|
293 |
+
new_layer = QuantLinear(
|
294 |
+
self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
|
295 |
+
)
|
296 |
+
new_layer.device = device
|
297 |
+
setattr(module, attr, new_layer.to(device))
|
298 |
+
for name1, child in module.named_children():
|
299 |
+
self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
|
300 |
+
|
301 |
+
@torch.no_grad()
|
302 |
+
def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
|
303 |
+
"""
|
304 |
+
Quantizes the model using the dataset
|
305 |
+
|
306 |
+
Args:
|
307 |
+
model (`nn.Module`):
|
308 |
+
The model to quantize
|
309 |
+
tokenizer (Optional[`Any`], defaults to `None`):
|
310 |
+
The tokenizer to use in order to prepare the dataset. You can pass either:
|
311 |
+
- A custom tokenizer object.
|
312 |
+
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
|
313 |
+
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
314 |
+
user or organization name, like `dbmdz/bert-base-german-cased`.
|
315 |
+
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
|
316 |
+
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
|
317 |
+
Returns:
|
318 |
+
`nn.Module`: The quantized model
|
319 |
+
"""
|
320 |
+
|
321 |
+
if not is_auto_gptq_available():
|
322 |
+
raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
|
323 |
+
if not torch.cuda.is_available():
|
324 |
+
raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
|
325 |
+
|
326 |
+
model.eval()
|
327 |
+
|
328 |
+
# For Transformer model
|
329 |
+
has_config = False
|
330 |
+
has_device_map = False
|
331 |
+
if hasattr(model, "config"):
|
332 |
+
has_config = True
|
333 |
+
use_cache = model.config.use_cache
|
334 |
+
model.config.use_cache = False
|
335 |
+
|
336 |
+
# If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
|
337 |
+
if hasattr(model, "hf_device_map"):
|
338 |
+
devices = list(model.hf_device_map.values())
|
339 |
+
has_device_map = True
|
340 |
+
if "disk" in devices:
|
341 |
+
raise ValueError("disk offload is not supported with GPTQ quantization")
|
342 |
+
if "cpu" in devices or torch.device("cpu") in devices:
|
343 |
+
if len(model.hf_device_map) > 1:
|
344 |
+
logger.info("Cpu offload is not recommended. There might be some issues with the memory")
|
345 |
+
hook = None
|
346 |
+
for name, device in model.hf_device_map.items():
|
347 |
+
if device == "cpu":
|
348 |
+
module = recurse_getattr(model, name)
|
349 |
+
remove_hook_from_module(module, recurse=True)
|
350 |
+
module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
|
351 |
+
else:
|
352 |
+
has_device_map = False
|
353 |
+
|
354 |
+
if hasattr(model, "dtype"):
|
355 |
+
self.use_cuda_fp16 = model.dtype == torch.float16
|
356 |
+
|
357 |
+
if self.model_seqlen is None:
|
358 |
+
# We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
|
359 |
+
self.model_seqlen = min(4028, get_seqlen(model))
|
360 |
+
|
361 |
+
device = get_device(model)
|
362 |
+
|
363 |
+
# Step 1: Prepare the data
|
364 |
+
if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
|
365 |
+
dataset = self.dataset
|
366 |
+
logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
|
367 |
+
else:
|
368 |
+
if isinstance(tokenizer, str):
|
369 |
+
try:
|
370 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer)
|
371 |
+
except Exception:
|
372 |
+
raise ValueError(
|
373 |
+
f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
|
374 |
+
with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
|
375 |
+
For now, we only support quantization for text model. Support for vision, speech and multimodel will come later."""
|
376 |
+
)
|
377 |
+
if self.dataset is None:
|
378 |
+
raise ValueError("You need to pass `dataset` in order to quantize your model")
|
379 |
+
elif isinstance(self.dataset, str):
|
380 |
+
dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
|
381 |
+
elif isinstance(self.dataset, list):
|
382 |
+
dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
|
383 |
+
else:
|
384 |
+
raise ValueError(
|
385 |
+
f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
|
386 |
+
)
|
387 |
+
|
388 |
+
dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
|
389 |
+
|
390 |
+
# Step 2: get the input of the 1st block
|
391 |
+
# To do that, we need to put the modules preceding the first block on the same device as the first block.
|
392 |
+
# Then we run the model and it will stop at the first block, since we added a pre-hook that raises an exception after storing the inputs.
|
393 |
+
|
394 |
+
layer_inputs = []
|
395 |
+
layer_outputs = []
|
396 |
+
layer_input_kwargs = []
|
397 |
+
|
398 |
+
if self.block_name_to_quantize is None:
|
399 |
+
self.block_name_to_quantize = get_block_name_with_pattern(model)
|
400 |
+
|
401 |
+
if self.module_name_preceding_first_block is None:
|
402 |
+
self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
|
403 |
+
|
404 |
+
blocks = recurse_getattr(model, self.block_name_to_quantize)
|
405 |
+
|
406 |
+
if not has_device_map:
|
407 |
+
# put modules from module_name_preceding_first_block on cuda
|
408 |
+
for module_name in self.module_name_preceding_first_block:
|
409 |
+
module = recurse_getattr(model, module_name)
|
410 |
+
if module is None:
|
411 |
+
raise ValueError(f"Module {module_name} was not found in model")
|
412 |
+
module = module.to(0)
|
413 |
+
blocks[0] = blocks[0].to(0)
|
414 |
+
|
415 |
+
def store_input_hook(_, input, *args):
|
416 |
+
kwargs = args[0]
|
417 |
+
if input is None:
|
418 |
+
if "hidden_states" in kwargs:
|
419 |
+
input = (kwargs["hidden_states"],)
|
420 |
+
else:
|
421 |
+
raise ValueError("No input value found in the foward pass")
|
422 |
+
layer_inputs.append(input)
|
423 |
+
other_kwargs = {}
|
424 |
+
for k, v in kwargs.items(): # make sure other arguments also be captured
|
425 |
+
if k not in ["hidden_states"]:
|
426 |
+
other_kwargs[k] = v
|
427 |
+
layer_input_kwargs.append(other_kwargs)
|
428 |
+
raise ValueError
|
429 |
+
|
430 |
+
if self.cache_block_outputs:
|
431 |
+
handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
|
432 |
+
for data in dataset:
|
433 |
+
for k, v in data.items():
|
434 |
+
# put the data on gpu, we won't put them back to cpu
|
435 |
+
data[k] = v.to(0)
|
436 |
+
try:
|
437 |
+
model(**data)
|
438 |
+
except ValueError:
|
439 |
+
pass
|
440 |
+
handle.remove()
|
441 |
+
|
442 |
+
if not has_device_map:
|
443 |
+
blocks[0].to(device)
|
444 |
+
for module_name in self.module_name_preceding_first_block:
|
445 |
+
module = recurse_getattr(model, module_name)
|
446 |
+
if module is None:
|
447 |
+
raise ValueError(f"Module {module_name} was not found in model")
|
448 |
+
|
449 |
+
torch.cuda.empty_cache()
|
450 |
+
|
451 |
+
# Step 3: Quantize the blocks
|
452 |
+
quantizers = {}
|
453 |
+
for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
|
454 |
+
logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
|
455 |
+
|
456 |
+
if not self.cache_block_outputs:
|
457 |
+
handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
|
458 |
+
for data in dataset:
|
459 |
+
for k, v in data.items():
|
460 |
+
# put the data on gpu, we won't put them back to cpu
|
461 |
+
data[k] = v.to(0)
|
462 |
+
try:
|
463 |
+
model(**data)
|
464 |
+
except ValueError:
|
465 |
+
pass
|
466 |
+
handle.remove()
|
467 |
+
|
468 |
+
# move block to cuda if needed
|
469 |
+
# in case we have offload modules, we need to put them on cuda because of GPTQ object
|
470 |
+
if not has_device_map or get_device(block) == torch.device("cpu"):
|
471 |
+
block = block.to(0)
|
472 |
+
layers = get_layers(block)
|
473 |
+
if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
|
474 |
+
if self.true_sequential:
|
475 |
+
layers_name_list = self.modules_in_block_to_quantize
|
476 |
+
else:
|
477 |
+
layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
|
478 |
+
else:
|
479 |
+
if self.true_sequential:
|
480 |
+
# lazy sequential but works well
|
481 |
+
layers_name_list = [[key] for key in layers.keys()]
|
482 |
+
else:
|
483 |
+
layers_name_list = [list(layers.keys())]
|
484 |
+
logger.info(f"Module to quantize {layers_name_list}")
|
485 |
+
for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
|
486 |
+
subset_layers = {name: layers[name] for name in subset_name_list}
|
487 |
+
gptq = {}
|
488 |
+
handles = []
|
489 |
+
# add hook for each layer in subset_layers
|
490 |
+
for name in subset_layers:
|
491 |
+
gptq[name] = GPTQ(subset_layers[name])
|
492 |
+
gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
|
493 |
+
|
494 |
+
def add_batch(name):
|
495 |
+
def tmp(_, input, output):
|
496 |
+
gptq[name].add_batch(input[0].data, output.data)
|
497 |
+
|
498 |
+
return tmp
|
499 |
+
|
500 |
+
# because it adding a hook will replace the old one.
|
501 |
+
handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
|
502 |
+
# update Hessian for each layer in subset_layers thanks to the hook
|
503 |
+
for j in range(len(dataset)):
|
504 |
+
# the args are already on the gpu
|
505 |
+
# don't need to store the output
|
506 |
+
block(*layer_inputs[j], **layer_input_kwargs[j])
|
507 |
+
# remove hook
|
508 |
+
for h in handles:
|
509 |
+
h.remove()
|
510 |
+
for name in subset_name_list:
|
511 |
+
logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
|
512 |
+
scale, zero, g_idx = gptq[name].fasterquant(
|
513 |
+
percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
|
514 |
+
)
|
515 |
+
quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
|
516 |
+
gptq[name].quantizer,
|
517 |
+
scale,
|
518 |
+
zero,
|
519 |
+
g_idx,
|
520 |
+
)
|
521 |
+
gptq[name].free()
|
522 |
+
del subset_layers
|
523 |
+
# we get the new output from the partial quantized block
|
524 |
+
if self.cache_block_outputs:
|
525 |
+
for j in range(len(dataset)):
|
526 |
+
layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
|
527 |
+
layer_outputs.append(layer_output)
|
528 |
+
|
529 |
+
# put back to device
|
530 |
+
if not has_device_map:
|
531 |
+
blocks[i] = block.to(device)
|
532 |
+
del layers
|
533 |
+
del layer_inputs
|
534 |
+
layer_inputs, layer_outputs = layer_outputs, []
|
535 |
+
else:
|
536 |
+
del layers
|
537 |
+
del layer_inputs
|
538 |
+
layer_inputs = []
|
539 |
+
torch.cuda.empty_cache()
|
540 |
+
|
541 |
+
if self.bits == 4:
|
542 |
+
# device not on gpu
|
543 |
+
if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
|
544 |
+
if not self.disable_exllama:
|
545 |
+
logger.warning(
|
546 |
+
"Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
|
547 |
+
)
|
548 |
+
self.disable_exllama = True
|
549 |
+
# act order and exllama
|
550 |
+
elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
|
551 |
+
logger.warning(
|
552 |
+
"Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
|
553 |
+
"Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
|
554 |
+
)
|
555 |
+
self.disable_exllama = True
|
556 |
+
elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
|
557 |
+
logger.warning(
|
558 |
+
"Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
|
559 |
+
"Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
|
560 |
+
)
|
561 |
+
self.disable_exllama = True
|
562 |
+
# Step 4: Pack the model at the end (Replacing the layers)
|
563 |
+
self.pack_model(model=model, quantizers=quantizers)
|
564 |
+
|
565 |
+
model.is_quantized = True
|
566 |
+
model.quantization_method = QuantizationMethod.GPTQ
|
567 |
+
if has_config:
|
568 |
+
model.config.use_cache = use_cache
|
569 |
+
model.config.quantization_config = self.to_dict()
|
570 |
+
|
571 |
+
# Step 5: Any post-initialization that require device information, for example buffers initialization on device.
|
572 |
+
model = self.post_init_model(model)
|
573 |
+
|
574 |
+
torch.cuda.empty_cache()
|
575 |
+
return model
|
576 |
+
|
577 |
+
def post_init_model(self, model):
|
578 |
+
"""
|
579 |
+
Post-initialization that require device information, for example buffers initialization on device.
|
580 |
+
|
581 |
+
Args:
|
582 |
+
model (`nn.Module`):
|
583 |
+
The input model
|
584 |
+
"""
|
585 |
+
if self.bits == 4 and not self.disable_exllama:
|
586 |
+
if get_device(model) == torch.device("cpu") or (
|
587 |
+
hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
|
588 |
+
):
|
589 |
+
raise ValueError(
|
590 |
+
"Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
|
591 |
+
"You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
|
592 |
+
)
|
593 |
+
|
594 |
+
class StoreAttr(object):
|
595 |
+
pass
|
596 |
+
|
597 |
+
model.quantize_config = StoreAttr()
|
598 |
+
model.quantize_config.desc_act = self.desc_act
|
599 |
+
model = autogptq_post_init(model, use_act_order=self.desc_act)
|
600 |
+
if (
|
601 |
+
self.desc_act
|
602 |
+
and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
|
603 |
+
and self.max_input_length is not None
|
604 |
+
):
|
605 |
+
model = exllama_set_max_input_length(model, self.max_input_length)
|
606 |
+
return model
|
607 |
+
|
608 |
+
def pack_model(
|
609 |
+
self,
|
610 |
+
model: nn.Module,
|
611 |
+
quantizers: Dict[str, Tuple],
|
612 |
+
):
|
613 |
+
"""
|
614 |
+
Pack the model by replacing the layers by quantized layers
|
615 |
+
|
616 |
+
Args:
|
617 |
+
model (`nn.Module`):
|
618 |
+
The model to pack
|
619 |
+
quantizers (`Dict[str,Tuple]`):
|
620 |
+
A mapping of the layer name and the data needed to pack the layer
|
621 |
+
"""
|
622 |
+
QuantLinear = dynamically_import_QuantLinear(
|
623 |
+
use_triton=False,
|
624 |
+
desc_act=self.desc_act,
|
625 |
+
group_size=self.group_size,
|
626 |
+
bits=self.bits,
|
627 |
+
disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
|
628 |
+
disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
|
629 |
+
)
|
630 |
+
logger.info("Packing model...")
|
631 |
+
layers = get_layers(model)
|
632 |
+
layers = {n: layers[n] for n in quantizers}
|
633 |
+
self._replace_by_quant_layers(model, quantizers)
|
634 |
+
qlayers = get_layers(model, [QuantLinear])
|
635 |
+
for name in qlayers:
|
636 |
+
logger.info(name)
|
637 |
+
quantizers[name], scale, zero, g_idx = quantizers[name]
|
638 |
+
# so far can only pack layer on CPU
|
639 |
+
layer_device = qlayers[name].device
|
640 |
+
qlayers[name].to("cpu")
|
641 |
+
layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
|
642 |
+
qlayers[name].pack(layers[name], scale, zero, g_idx)
|
643 |
+
qlayers[name].to(layer_device)
|
644 |
+
|
645 |
+
logger.info("Model packed.")
|
646 |
+
|
647 |
+
def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
|
648 |
+
"""
|
649 |
+
Save model state dict and configs
|
650 |
+
|
651 |
+
Args:
|
652 |
+
model (`nn.Module`):
|
653 |
+
Model to be saved. The model can be wrapped or unwrapped.
|
654 |
+
save_dir (`str`):
|
655 |
+
Directory to which to save. Will be created if it doesn't exist.
|
656 |
+
max_shard_size (`str`, defaults to `"10GB"`):
|
657 |
+
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
|
658 |
+
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
|
659 |
+
<Tip warning={true}>
|
660 |
+
|
661 |
+
If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
|
662 |
+
which will be bigger than `max_shard_size`.
|
663 |
+
|
664 |
+
</Tip>
|
665 |
+
safe_serialization (`bool`, defaults to `True`):
|
666 |
+
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
|
667 |
+
|
668 |
+
"""
|
669 |
+
os.makedirs(save_dir, exist_ok=True)
|
670 |
+
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
|
671 |
+
with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
|
672 |
+
json.dump(self.to_dict(), f, indent=2)
|
673 |
+
|
674 |
+
|
675 |
+
def load_quantized_model(
|
676 |
+
model: nn.Module,
|
677 |
+
save_folder: str,
|
678 |
+
quant_config_name: str = GPTQ_CONFIG,
|
679 |
+
state_dict_name: Optional[str] = None,
|
680 |
+
device_map: Optional[str] = None,
|
681 |
+
max_memory: Optional[Dict] = None,
|
682 |
+
no_split_module_classes: Optional[Dict] = None,
|
683 |
+
offload_folder: Optional[str] = None,
|
684 |
+
offload_buffers: Optional[str] = None,
|
685 |
+
offload_state_dict: bool = False,
|
686 |
+
disable_exllama: bool = False,
|
687 |
+
exllama_config: Optional[Dict[str, Any]] = None,
|
688 |
+
max_input_length: Optional[int] = None,
|
689 |
+
):
|
690 |
+
"""
|
691 |
+
Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
|
692 |
+
|
693 |
+
Args:
|
694 |
+
model (`nn.Module`):
|
695 |
+
The model can be enpty or not.
|
696 |
+
save_folder (`str`):
|
697 |
+
Directory to which to load the weights.
|
698 |
+
quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
|
699 |
+
Name of the quantization config file
|
700 |
+
state_dict_name (`Optional[str]`, defaults to `None`):
|
701 |
+
Name of the state dict file
|
702 |
+
device_map (`Optional[str]`, defaults to `None`):
|
703 |
+
A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
|
704 |
+
name, once a given module name is inside, every submodule of it will be sent to the same device.
|
705 |
+
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
|
706 |
+
max_memory (`Optional[Dict]`, defaults to `None`):
|
707 |
+
A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
|
708 |
+
and the available CPU RAM if unset.
|
709 |
+
no_split_module_classes (`Optional[Dict]`, defaults to `None`):
|
710 |
+
A list of layer class names that should never be split across device (for instance any layer that has a
|
711 |
+
residual connection).
|
712 |
+
offload_folder (`Optional[str]`, defaults to `None`):
|
713 |
+
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
|
714 |
+
offload_buffers (`Optional[str]`, defaults to `None`):
|
715 |
+
In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
|
716 |
+
well as the parameters.
|
717 |
+
offload_state_dict (`bool`, defaults to `False`):
|
718 |
+
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
|
719 |
+
the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
|
720 |
+
picked contains `"disk"` values.
|
721 |
+
disable_exllama (`Optional[bool]`, defaults to `None`):
|
722 |
+
Whether to use exllama backend. Only works with `bits` = 4.
|
723 |
+
exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
|
724 |
+
The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
|
725 |
+
max_input_length (`Optional[int]`, defaults to `None`):
|
726 |
+
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
|
727 |
+
It is specific to the exllama backend with act-order.
|
728 |
+
|
729 |
+
Returns:
|
730 |
+
`nn.Module`: The quantized model
|
731 |
+
"""
|
732 |
+
if not torch.cuda.is_available():
|
733 |
+
raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
|
734 |
+
if not is_auto_gptq_available():
|
735 |
+
raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
|
736 |
+
if not is_accelerate_available():
|
737 |
+
raise RuntimeError(
|
738 |
+
"You need to install accelerate in order to load and dispatch weights to"
|
739 |
+
"a quantized model. You can do it with `pip install accelerate`"
|
740 |
+
)
|
741 |
+
if device_map is None:
|
742 |
+
device_map = {"": torch.cuda.current_device()}
|
743 |
+
logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")
|
744 |
+
|
745 |
+
if exllama_config is None:
|
746 |
+
exllama_config = {"version": ExllamaVersion.TWO}
|
747 |
+
else:
|
748 |
+
if "version" not in exllama_config:
|
749 |
+
raise ValueError("`exllama_config` needs to have a `version` key")
|
750 |
+
elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
|
751 |
+
version = exllama_config["version"]
|
752 |
+
raise ValueError(
|
753 |
+
f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
|
754 |
+
)
|
755 |
+
|
756 |
+
# this branch will check if model is from huggingface
|
757 |
+
try:
|
758 |
+
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
|
759 |
+
quantize_config_dict = model.config.quantization_config.to_dict()
|
760 |
+
else:
|
761 |
+
with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
|
762 |
+
quantize_config_dict = json.load(f)
|
763 |
+
except Exception as err:
|
764 |
+
raise ValueError(
|
765 |
+
f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
|
766 |
+
) from err
|
767 |
+
quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
|
768 |
+
quantizer.disable_exllama = disable_exllama
|
769 |
+
quantizer.exllama_config = exllama_config
|
770 |
+
quantizer.exllama_version = quantizer.exllama_config["version"]
|
771 |
+
quantizer.max_input_length = max_input_length
|
772 |
+
|
773 |
+
model = quantizer.convert_model(model)
|
774 |
+
|
775 |
+
if no_split_module_classes is None:
|
776 |
+
no_split_module_classes = quantizer.get_no_split_module_classes(model)
|
777 |
+
|
778 |
+
model = load_checkpoint_and_dispatch(
|
779 |
+
model,
|
780 |
+
checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
|
781 |
+
device_map=device_map,
|
782 |
+
max_memory=max_memory,
|
783 |
+
no_split_module_classes=no_split_module_classes,
|
784 |
+
offload_folder=offload_folder,
|
785 |
+
offload_buffers=offload_buffers,
|
786 |
+
offload_state_dict=offload_state_dict,
|
787 |
+
)
|
788 |
+
|
789 |
+
model = quantizer.post_init_model(model)
|
790 |
+
model.is_quantized = True
|
791 |
+
model.quantization_method = QuantizationMethod.GPTQ
|
792 |
+
model.eval()
|
793 |
+
return model
|
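The `load_quantized_model` entry point above is normally driven through Accelerate's empty-weights flow; a minimal sketch of such a call (the checkpoint directory is a placeholder and is assumed to contain weights saved with `save()` plus the GPTQ config):

```python
import torch
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

ckpt_dir = "path/to/gptq-checkpoint"  # placeholder path

# build an empty (meta-device) model of the right architecture, then let
# load_quantized_model convert it to QuantLinear layers and dispatch the packed weights
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(ckpt_dir))
empty_model.tie_weights()

quantized_model = load_quantized_model(empty_model, save_folder=ckpt_dir, device_map="auto")
```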
internal/donttouch_unpacking_autogptq/readme.md
ADDED
@@ -0,0 +1,12 @@
Use autogpt_sample.py to dump opt-125m-gptq4.pth,
but before that we need to hack (patch) a few files.

Patch the following according to the delta:
/data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/optimum/gptq/quantizer.py
/data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/auto_gptq/nn_modules/qlinear/

Then use blob_manipulate.py,
followed by verify_unpacking_logic.py
and fake_dequantize.py.
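For a quick look at what the dumped blob contains, a minimal sketch (assuming the per-layer `prepack`/`pack` dictionaries that verify_unpacking_logic.py below consumes):

```python
import torch

blob = torch.load("./opt-125m-gptq4.pth")
for layer, lblob in blob.items():
    print(f"--> {layer}")
    for group in ("prepack", "pack"):
        for k, v in lblob[group].items():
            try:
                print(f"  {group}['{k}']: {tuple(v.shape)}")  # tensors / arrays
            except AttributeError:
                print(f"  {group}['{k}']: {v}")               # non-tensor entries, if any
```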
internal/donttouch_unpacking_autogptq/run_sqft.py
ADDED
@@ -0,0 +1,101 @@
import transformers
import torch
import torch.nn as nn
import numpy as np
from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from fake_dequantize import fake_dequantize
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear

DEBUG = False

class SparseCompressLinear(nn.Linear):
    def __init__(self, in_features, out_features, bias=True, verbose=DEBUG):
        super(SparseCompressLinear, self).__init__(in_features, out_features, bias)
        self.verbose = verbose  # for debug

    def forward(self, input):
        if self.verbose is True:
            print("SparseCompressLinear Forward!")
        return super(SparseCompressLinear, self).forward(input)

    def __repr__(self):
        # Custom print out
        return f"SparseCompressLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None})"


def make_linear_from_QuantLinear(QuantLinearObj):
    device = QuantLinearObj.scales.device

    qweight = QuantLinearObj.qweight
    scales = QuantLinearObj.scales
    qzeros = QuantLinearObj.qzeros

    with torch.no_grad():
        W, scales, zeros = fake_dequantize(qweight, scales, qzeros)
        IC, OC = W.shape

    linear = SparseCompressLinear(in_features=IC, out_features=OC, bias=(QuantLinearObj.bias is not None))

    assert linear.weight.shape == W.t().shape, "Logical Error"
    linear.weight.data = W.t().contiguous()

    if QuantLinearObj.bias is not None:
        linear.bias.data = QuantLinearObj.bias

    linear.register_buffer("scales", scales)
    linear.register_buffer("zeros", zeros)

    return linear.to(device)


def replace_QuantLinear_with_SparseCompressLinear(model):
    for name, module in model.named_children():
        if isinstance(module, QuantLinear):
            if DEBUG is True:
                print(f"Restoring {name}")
            restored_linear = make_linear_from_QuantLinear(module)
            restored_linear = restored_linear.to(torch.float16)  # TODO: Hardcoding
            setattr(model, name, restored_linear)
        else:
            # Recursively apply to child modules
            replace_QuantLinear_with_SparseCompressLinear(module)
    return model


if __name__ == "__main__":

    # model_id = "/data4/vchua/hf-model/Meta-Llama-3-8B-Instruct"
    # model_id = "/data4/vchua/hf-model/Meta-Llama-3-70B"

    model_id = "/home/vchua/sqft-qa-sparsepeft-llama-3-8b-50-gptq-gsm8k"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    prompt = "Alan Turing theorized that computers would one day become"
    input_ids = tokenizer([prompt]).input_ids
    input_ids = torch.as_tensor(input_ids)

    # -----------------------------------------
    output_ids = model.generate(
        input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
    )

    output_sqft = tokenizer.batch_decode(output_ids.cpu())
    print(f"\n++ Baseline sqft output:\n\n{output_sqft[0]}\n\n")

    # -----------------------------------------
    replace_QuantLinear_with_SparseCompressLinear(model)
    output_ids = model.generate(
        input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
    )

    output_fake_dequantize = tokenizer.batch_decode(output_ids.cpu())
    print(f"\n++ fake dequantize sqft output:\n\n{output_fake_dequantize[0]}\n\n")

    tx1mlp = model.model.layers[0].mlp
    torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
    # -----------------------------------------
    print()


    # torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
internal/donttouch_unpacking_autogptq/verify_unpacking_logic.py
ADDED
@@ -0,0 +1,67 @@
import torch
import numpy as np

blob = torch.load("./opt-125m-gptq4.pth")

def verify_unpack_logic(prepack, pack, nbit=4):
    numel_per_int32 = 32//nbit

    qweight = pack['qweight'].numpy()
    scales = pack['scales'].numpy()  # (ngroup, OC)
    qzeros = pack['qzeros'].numpy()  # (ngroup, OC//numel_per_int32)

    IC = qweight.shape[0]*numel_per_int32
    OC = qweight.shape[1]
    group_size = IC//scales.shape[0]

    qweight_unpack = np.zeros((IC, OC), dtype=np.float32)
    for row in range(0, qweight.shape[0]):
        for k in range(0, numel_per_int32):
            qweight_unpack[row*numel_per_int32+k, :] = ((qweight[row] >> k*nbit) & 0xF).astype(np.float32)  # read as int32 and cast to float32

    intweight_match = torch.allclose(
        torch.from_numpy(qweight_unpack).to(torch.int32),
        torch.from_numpy(pack['intweight'].astype(np.int32))
    )

    assert intweight_match, "intweight and qweight_unpack do not match! pls debug"

    scales_float = scales.astype(np.float32)

    # TODO: verify with asym zero point. sym zero points are all identical
    qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
    for i in range(0, numel_per_int32):
        # shift multiplier
        shift_multiplier = numel_per_int32 - 1 - i
        shift_by = shift_multiplier * nbit
        qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32)  # read as int32 and cast to float32
    qzeros_unpack += 1  # for some reason they minus 1

    qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
    qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
    scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)

    deqweight_unpack = torch.zeros((IC, OC), dtype=torch.float16)
    for i in range(IC):
        gid = i//group_size
        deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]

    simulated_match = torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005)

    assert simulated_match, "prepack['w'] and deqweight_unpack do not match! pls debug"

    print(f"intweight_match: {intweight_match}, simulated_match: {simulated_match}")


for layer, lblob in blob.items():
    print(f"\n\n--> {layer}")
    prepack = lblob['prepack']
    pack = lblob['pack']

    # for k, v in prepack.items():
    #     print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")

    # for k, v in pack.items():
    #     print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")

    verify_unpack_logic(prepack, pack)
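The check above reduces to the usual group-wise GPTQ dequantization, w_hat = (q - zero) * scale, with the zero point fixed at 8 for symmetric W4 (the unpacked qzeros value plus 1); a compact restatement of just that step (a sketch, not one of the repo scripts):

```python
import numpy as np

def dequantize_group(q, zero, scale):
    # w_hat = (q - zero) * scale, applied per group of input channels;
    # `zero` is the unpacked qzeros value + 1, as in verify_unpacking_logic.py
    return (q.astype(np.float32) - zero) * scale

# toy example: eight 4-bit codes from one group, zero point 8 (symmetric W4), scale 0.01
q = np.array([0, 4, 8, 12, 15, 8, 8, 1], dtype=np.int32)
print(dequantize_group(q, zero=8.0, scale=0.01))  # codes equal to 8 dequantize to exactly 0.0
```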
internal/pack_sparse_linear.py
ADDED
@@ -0,0 +1,251 @@
import torch
import numpy as np
import os

def calc_sparsity(tensor):
    if isinstance(tensor, torch.Tensor):
        nnz = tensor.count_nonzero()
        rate = 1-(nnz/tensor.numel())
        return rate.item(), nnz
    else:
        nnz = np.count_nonzero(tensor)
        rate = 1-(nnz/tensor.size)
        return rate, nnz

if __name__ == "__main__":
    sd = torch.load("./sqft_llama3_8B_gptq_tx1_mlp.pth")

    for k, v in sd.items():
        print(k)

    weight = sd['up_proj.weight']  # OC x IC
    scales = sd['up_proj.scales']  # n_group x OC
    zeros = sd['up_proj.zeros']    # n_group x OC

    nbit = 4
    OC, IC = weight.shape
    numel_per_int32 = 32//nbit
    # 16x128B tile
    stride_oc = 16
    stride_ic = 128 * 8 // nbit

    # always make contiguous!
    weight = weight.contiguous()      # OC x IC
    scales = scales.t().contiguous()  # OC x n_group
    zeros = zeros.t().contiguous()    # OC x n_group

    # TODO: hardcoding, temporary. Livia requires a group size of 32, but our model uses 128, so we repeat the values
    group_size = 32
    scales = scales.repeat_interleave(4, dim=1)
    zeros = zeros.repeat_interleave(4, dim=1)

    # Tile weight into target block size
    tiled_weight = weight.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic, stride_ic)
    tiled_scales = scales.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
    tiled_zeros = zeros.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)

    assert tiled_weight.shape[:2] == tiled_scales.shape[:2], "pls debug"
    assert tiled_weight.shape[:2] == tiled_zeros.shape[:2], "pls debug"

    tiled_qweight = torch.zeros_like(tiled_weight)
    tiled_bitmap = torch.zeros_like(tiled_weight).to(torch.bool)
    tiled_nnz = torch.zeros(tiled_weight.shape[:2]).to(torch.int16)

    non_zero_removed_tiled_qweight = torch.zeros_like(tiled_weight)  # for debug
    for tile_r in range(0, tiled_weight.shape[0]):
        for tile_c in range(0, tiled_weight.shape[1]):

            # metadata: number of non-zero elements (nnz)
            sparsity, nnz = calc_sparsity(tiled_weight[tile_r, tile_c])
            print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")

            # metadata: generate bitmask
            nonzero_bool = (tiled_weight[tile_r, tile_c] != 0)
            assert nonzero_bool.sum() == nnz, "pls debug"
            tiled_bitmap[tile_r, tile_c] = nonzero_bool
            tiled_nnz[tile_r, tile_c] = nnz

            r = tile_r
            c = tile_c

            # get quantized values
            w = tiled_weight[r, c]
            qw = torch.zeros_like(tiled_weight[r, c])
            s = tiled_scales[r, c]
            z = tiled_zeros[r, c]

            # for every column of groups
            for col in range(tiled_scales.shape[-1]):
                sidx = col*group_size
                eidx = (col+1)*group_size

                # unsqueeze is needed to make the vector a column
                qw[:, sidx:eidx] = ( w[:, sidx:eidx] + (s[:,col]*z[:,col]).unsqueeze(-1) ) / s[:,col].unsqueeze(-1)

            # for debug
            non_zero_removed_tiled_qweight[r, c] = qw

            # Zero removal and pad to tile length (per Livia's request)
            assert len(qw[nonzero_bool]) == nnz, "pls debug"
            compress_qw = (torch.ones_like(qw)*8).reshape(-1)  # because zero is 8; in this manner we achieve the padding effect
            compress_qw[:nnz] = qw[nonzero_bool]
            assert (compress_qw != 8).sum() == nnz, "pls debug"
            compress_qw = compress_qw.reshape(qw.shape)

            tiled_qweight[r, c] = compress_qw
            # nnz
            # scale
            # zeros

    tiled_qweight = tiled_qweight.to(torch.int32).contiguous()
    tiled_zeros = tiled_zeros.to(torch.int32).contiguous()
    tiled_scales = tiled_scales.to(torch.float16).contiguous()
    tiled_bitmap = tiled_bitmap.to(torch.int32).contiguous()
    tiled_nnz = tiled_nnz.to(torch.int16).contiguous()


    linear_nnz = tiled_nnz
    linear_scales = tiled_scales.reshape(-1)

    linear_qweight = tiled_qweight.reshape(-1).reshape(-1, 8).cpu().numpy()
    linear_qweight_pack = np.zeros((linear_qweight.shape[0], 1), dtype=np.int32)
    for i in range(0, numel_per_int32):
        linear_qweight_pack[:, 0] |= linear_qweight[:, i] << (numel_per_int32 - 1 - i)*nbit
    linear_qweight_pack = linear_qweight_pack.reshape(-1)

    linear_zeros = tiled_zeros.reshape(-1).reshape(-1, 8).cpu().numpy()
    linear_zeros_pack = np.zeros((linear_zeros.shape[0], 1), dtype=np.int32)
    for i in range(0, numel_per_int32):
        linear_zeros_pack[:, 0] |= linear_zeros[:, i] << (numel_per_int32 - 1 - i)*nbit
    linear_zeros_pack = linear_zeros_pack.reshape(-1)

    linear_bitmap = tiled_bitmap.reshape(-1).reshape(-1, 32).cpu().numpy()  # why 32? 32 bitmask bits per int32
    linear_bitmap_pack = np.zeros((linear_bitmap.shape[0], 1), dtype=np.int32)
    for i in range(0, 32):
        linear_bitmap_pack[:, 0] |= linear_bitmap[:, i] << (32 - 1 - i)
    linear_bitmap_pack = linear_bitmap_pack.reshape(-1)

    os.makedirs("sparse_w4", exist_ok=True)
    linear_qweight_pack.tofile('sparse_w4/linear_compressed_qweight_int32.bin')
    linear_zeros_pack.tofile('sparse_w4/linear_zeros_int32.bin')
    linear_scales.cpu().contiguous().numpy().tofile('sparse_w4/linear_scales_float16.bin')
    linear_bitmap_pack.tofile('sparse_w4/linear_bitmap_int32.bin')
    linear_nnz.cpu().contiguous().numpy().tofile('sparse_w4/linear_nnz_int16.bin')

    print("joto")

    loaded_linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
    loaded_tiled_nnz = loaded_linear_nnz.reshape(896, 16)

    assert torch.all(torch.from_numpy(loaded_tiled_nnz) == tiled_nnz), "pls debug"

    loaded_linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
    loaded_tiled_scales = loaded_linear_scales.reshape(896, 16, 16, 8)

    assert torch.all(torch.from_numpy(loaded_tiled_scales).to("cuda") == tiled_scales), "pls debug"

    loaded_linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
    loaded_linear_bitmap_pack = np.expand_dims(loaded_linear_bitmap_pack, axis=-1)
    loaded_linear_bitmap = np.zeros((loaded_linear_bitmap_pack.shape[0], 32), dtype=np.int32)
    for i in range(0, 32):
        loaded_linear_bitmap[:, i] = ( loaded_linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
    loaded_tiled_bitmap = loaded_linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)

    assert torch.all(torch.from_numpy(loaded_tiled_bitmap).to("cuda") == tiled_bitmap), "pls debug"

    loaded_linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
    loaded_linear_qweight_pack = np.expand_dims(loaded_linear_qweight_pack, axis=-1)
    loaded_linear_qweight = np.zeros((loaded_linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
    for i in range(0, numel_per_int32):
        loaded_linear_qweight[:, i] = ( loaded_linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
    loaded_tiled_qweight = loaded_linear_qweight.reshape(-1).reshape(896, 16, 16, 256)

    assert torch.all(torch.from_numpy(loaded_tiled_qweight).to("cuda") == tiled_qweight), "pls debug"

    loaded_linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
    loaded_linear_zeros_pack = np.expand_dims(loaded_linear_zeros_pack, axis=-1)
    loaded_linear_zeros = np.zeros((loaded_linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
    for i in range(0, numel_per_int32):
        loaded_linear_zeros[:, i] = ( loaded_linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
    loaded_tiled_zeros = loaded_linear_zeros.reshape(-1).reshape(896, 16, 16, 8)

    assert torch.all(torch.from_numpy(loaded_tiled_zeros).to("cuda") == tiled_zeros), "pls debug"

    zero_recovered_tiles = np.ones_like(loaded_tiled_qweight)*8  # zero is represented by the value 8
    for r in range(0, loaded_tiled_qweight.shape[0]):
        for c in range(0, loaded_tiled_qweight.shape[1]):
            zero_removed_padded_tile = loaded_tiled_qweight[r, c]
            nnz = loaded_tiled_nnz[r, c]
            tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
            nnz_indices = np.nonzero(loaded_tiled_bitmap[r, c])
            zero_recovered_tiles[r, c][nnz_indices] = tile_values

    assert torch.all(non_zero_removed_tiled_qweight.to(torch.int32) == torch.from_numpy(zero_recovered_tiles).to("cuda")), "pls debug"

    dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)

    zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
    loaded_tiled_zeros = loaded_tiled_zeros.astype(np.float16)
    loaded_tiled_scales = loaded_tiled_scales.astype(np.float16)
    for i in range(0, zero_recovered_tiles.shape[-1], group_size):
        gid = i//group_size
        dequantized_tiles[:, :, :, i:i+group_size] = \
            ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
              np.expand_dims(loaded_tiled_zeros[:, :, :, gid], axis=-1) ) * \
            np.expand_dims(loaded_tiled_scales[:, :, :, gid], axis=-1)

    print("joto")
    # torch.allclose(linear_tiled_W[0], tiled_W[0,0])
    # torch.allclose(linear_tiled_W[1], tiled_W[0,1])
    # torch.allclose(linear_tiled_W[12], tiled_W[1,0])
    # torch.allclose(linear_tiled_W[26], tiled_W[2,2])
    # torch.allclose(linear_tiled_W[-1], tiled_W[-1,-1])
    # In [18]: torch.allclose(tiled_W[0,1], W[0:16, 256:512])
    # Out[18]: True

    # In [19]: torch.allclose(tiled_W[1,1], W[16:32, 256:512])
    # Out[19]: True

    # In [20]: torch.allclose(tiled_W[-1,-1], W[(768-16):768, (3072-256):3072])
    # Out[20]: True



    # If you want to serialize the tensor such that a single bit indicates if an element is zero or non-zero, you can achieve this by creating a byte array where each bit corresponds to the zero/non-zero status of each element. Here's how you can do it:

    # Convert the tensor to a boolean tensor indicating zero or non-zero.
    # Flatten the boolean tensor.
    # Pack the boolean values into bytes.
    # Here's a step-by-step example:

    # import torch

    # # Example tensor
    # tensor = torch.tensor([[0, 1, 2], [3, 0, 4], [5, 6, 0]])

    # # Step 1: Create a boolean tensor indicating zero or non-zero values
    # zero_indicator = torch.eq(tensor, 0)

    # # Step 2: Flatten the boolean tensor
    # flat_zero_indicator = zero_indicator.flatten()

    # # Step 3: Convert boolean tensor to a list of bytes
    # byte_array = []
    # byte = 0
    # for i, bit in enumerate(flat_zero_indicator):
    #     if bit:
    #         byte |= 1 << (i % 8)
    #     if (i % 8) == 7:
    #         byte_array.append(byte)
    #         byte = 0

    # # Append the last byte if necessary
    # if (len(flat_zero_indicator) % 8) != 0:
    #     byte_array.append(byte)

    # # Convert to bytearray
    # result = bytearray(byte_array)

    # print(result)
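For orientation, the hard-coded tile grid and blob shapes used above (and again in unpack_blobs.py) follow directly from the layer shape stated in the top-level README (oc=14336, ic=4096), the 16x256 tile of int4 values (128 bytes per tile row block), and the group size of 32; a small sanity check of that arithmetic:

```python
# sanity check of the tile grid implied by pack_sparse_linear.py (shapes taken from this repo)
OC, IC = 14336, 4096              # up_proj of llama3-8B
stride_oc, stride_ic = 16, 256    # 16 output rows x 256 int4 values per tile
group_size = 32

n_tile_rows = OC // stride_oc              # 896
n_tile_cols = IC // stride_ic              # 16
groups_per_tile = stride_ic // group_size  # 8

print(n_tile_rows, n_tile_cols, groups_per_tile)
# 896 16 8 -> qweight/bitmap tiles of shape (896, 16, 16, 256), scales/zeros of shape (896, 16, 16, 8)
```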
internal/sqft_llama3_8B_gptq_tx1_mlp.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e4b052cf767df68cde1e08ab4c5e1adf19d821d64b6f9ff5727ef5b615f97a7
size 357830528
sparse_w4/linear_bitmap_int32.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1db9c9466c5e2f5efdb426685b479794520c35f196e6811e175cb5066b9b874b
size 7340032
sparse_w4/linear_compressed_qweight_int32.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f20a8d23c239a5d002686ff8c0867bb49ffc0daec5480fedef4a5163877ca7f
size 29360128
sparse_w4/linear_nnz_int16.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f8d24ef1e4e2af4d04f7ef8e3f52d2023b916336c1bd013a4256f8d96805736
size 28672
sparse_w4/linear_scales_float16.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f825735214928e40a0c5850f95f5b55bc8de1b31bf7c1a67974df544f247b45
size 3670016
sparse_w4/linear_zeros_int32.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cf1b625d7d0b024b60e69eae10e8f7bf74ec7d6a249ab6e0e2dee6c482123946
size 917504
unpack_blobs.py
ADDED
@@ -0,0 +1,77 @@
import numpy as np

# Python sample to recover the zero-compressed W4 blobs

nbit = 4
numel_per_int32 = 32//nbit
group_size = 32

linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
tiled_nnz = linear_nnz.reshape(896, 16)


linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
tiled_scales = linear_scales.reshape(896, 16, 16, 8)


linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
linear_bitmap_pack = np.expand_dims(linear_bitmap_pack, axis=-1)
linear_bitmap = np.zeros((linear_bitmap_pack.shape[0], 32), dtype=np.int32)
for i in range(0, 32):
    linear_bitmap[:, i] = ( linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
tiled_bitmap = linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)


linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
linear_qweight_pack = np.expand_dims(linear_qweight_pack, axis=-1)
linear_qweight = np.zeros((linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
for i in range(0, numel_per_int32):
    linear_qweight[:, i] = ( linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
tiled_qweight = linear_qweight.reshape(-1).reshape(896, 16, 16, 256)


linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
linear_zeros_pack = np.expand_dims(linear_zeros_pack, axis=-1)
linear_zeros = np.zeros((linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
for i in range(0, numel_per_int32):
    linear_zeros[:, i] = ( linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
tiled_zeros = linear_zeros.reshape(-1).reshape(896, 16, 16, 8)

# ------------------------------------------------------------
# Decompress the tile, recover the zero locations
zero_recovered_tiles = np.ones_like(tiled_qweight)*8  # zero is represented by the value 8
for r in range(0, tiled_qweight.shape[0]):
    for c in range(0, tiled_qweight.shape[1]):
        zero_removed_padded_tile = tiled_qweight[r, c]
        nnz = tiled_nnz[r, c]
        tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
        nnz_indices = np.nonzero(tiled_bitmap[r, c])
        zero_recovered_tiles[r, c][nnz_indices] = tile_values

# ------------------------------------------------------------
# Simulate dequantization of 4-bit weights to floating-point values
dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)

zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
tiled_zeros = tiled_zeros.astype(np.float16)
tiled_scales = tiled_scales.astype(np.float16)
for i in range(0, zero_recovered_tiles.shape[-1], group_size):
    gid = i//group_size
    dequantized_tiles[:, :, :, i:i+group_size] = \
        ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
          np.expand_dims(tiled_zeros[:, :, :, gid], axis=-1) ) * \
        np.expand_dims(tiled_scales[:, :, :, gid], axis=-1)

# ------------------------------------------------------------
# Check sparsity per tile
def calc_sparsity(tensor):
    nnz = np.count_nonzero(tensor)
    rate = 1-(nnz/tensor.size)
    return rate, nnz

for tile_r in range(0, dequantized_tiles.shape[0]):
    for tile_c in range(0, dequantized_tiles.shape[1]):
        sparsity, nnz = calc_sparsity(dequantized_tiles[tile_r, tile_c])
        print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")

print("end.")
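To go from the per-tile view back to a full (OC, IC) weight matrix, the 16x256 tiling can be undone with a transpose and reshape; a minimal sketch assuming `dequantized_tiles` from the script above is still in scope:

```python
# tiles are laid out as (row tiles, col tiles, rows per tile, cols per tile) = (896, 16, 16, 256)
full_weight = dequantized_tiles.transpose(0, 2, 1, 3).reshape(896 * 16, 16 * 256)
assert full_weight.shape == (14336, 4096)  # (OC, IC) of the up_proj layer described in the README
```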