Vui Seng Chua committed on
Commit cfb9114 • 1 Parent(s): e3cc684

Add content

README.md ADDED
@@ -0,0 +1,20 @@
+
+ This repo contains serialized blobs of the up-projection layer of Llama-3-8B (oc=14336, ic=4096).
+ The linear layer has been quantized (GPTQ W4 symmetric, group size 32) and sparsified to 50% sparsity.
+
+ ```
+ ├── sparse_w4
+ │   ├── linear_bitmap_int32.bin
+ │   ├── linear_compressed_qweight_int32.bin
+ │   ├── linear_nnz_int16.bin
+ │   ├── linear_scales_float16.bin
+ │   └── linear_zeros_int32.bin
+ ```
+
+ ### Usage
+ The following script shows how to process the blobs in Python: it covers unpacking, recovery of the zeroed weight locations, and weight dequantization.
+ ```bash
+ python unpack_blobs.py
+ ```
+
+ > You can ignore `internal/`.
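For reference, below is a minimal sketch of how the raw blobs might be inspected with NumPy. It only assumes the dtypes encoded in the filenames and the quantization settings stated above; the exact shapes and the 4-bit/bitmap packing layout are defined by `unpack_blobs.py`, which remains the source of truth.

```python
# Hypothetical helper: report the element count of each serialized blob.
# Dtypes are taken from the filename suffixes; shapes and packing are NOT
# assumed here and should be recovered as done in unpack_blobs.py.
from pathlib import Path
import numpy as np

blob_dir = Path("sparse_w4")
blob_dtypes = {
    "linear_bitmap_int32.bin": np.int32,
    "linear_compressed_qweight_int32.bin": np.int32,
    "linear_nnz_int16.bin": np.int16,
    "linear_scales_float16.bin": np.float16,
    "linear_zeros_int32.bin": np.int32,
}

for fname, dtype in blob_dtypes.items():
    arr = np.fromfile(blob_dir / fname, dtype=dtype)
    print(f"{fname:40s} {arr.size:>12,d} elements of {np.dtype(dtype).name}")

# Dequantization then follows the usual GPTQ form used by the scripts in this
# repo: w = (q - zero) * scale, with one (scale, zero) pair per group of 32
# input channels, while the bitmap/nnz blobs locate the 50% pruned positions.
```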
internal/donttouch_unpacking_autogptq/__pycache__/fake_dequantize.cpython-311.pyc ADDED
Binary file (3.75 kB).
 
internal/donttouch_unpacking_autogptq/autogpt_sample.py ADDED
@@ -0,0 +1,13 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+ model_id = "facebook/opt-125m"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ quantization_config = GPTQConfig(bits=4, sym=True, dataset='wikitext2', tokenizer=tokenizer, group_size=128, desc_act=False, use_exllama=False)
+
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)
+
+ print("joto")
+
+
+
+
internal/donttouch_unpacking_autogptq/blob_manipulate.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ import numpy as np
+
+ blob = torch.load("./opt-125m-gptq4.pth")
+
+ for layer, lblob in blob.items():
+     if 'model.decoder.layers.0.fc1' in layer:
+         print(f"--> {layer}")
+         prepack = lblob['prepack']
+         pack = lblob['pack']
+
+         for k, v in prepack.items():
+             print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
+
+         for k, v in pack.items():
+             print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
+         break
+
+ qweight = pack['qweight'].numpy()
+ scales = pack['scales'].numpy()    # (ngroup, OC)
+ qzeros = pack['qzeros'].numpy()    # (ngroup, OC//numel_per_int32)
+
+ nbit = 4
+ numel_per_int32 = 32 // nbit
+ IC = qweight.shape[0] * numel_per_int32
+ OC = qweight.shape[1]
+ group_size = IC // scales.shape[0]
+
+ # unpack eight 4-bit values from each int32 row (read as int32, cast to float32)
+ qweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+ for row in range(0, qweight.shape[0]):
+     for k in range(0, numel_per_int32):
+         qweight_unpack[row * numel_per_int32 + k, :] = ((qweight[row] >> k * nbit) & 0xF).astype(np.float32)
+
+ torch.allclose(
+     torch.from_numpy(qweight_unpack).to(torch.int32),
+     torch.from_numpy(pack['intweight'].astype(np.int32))
+ )
+
+ scales_float = scales.astype(np.float32)
+
+ # TODO: verify with asymmetric zero points; symmetric zero points are all identical
+ qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
+ for i in range(0, numel_per_int32):
+     shift_multiplier = numel_per_int32 - 1 - i
+     shift_by = shift_multiplier * nbit
+     qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32)
+ qzeros_unpack += 1  # AutoGPTQ packs (zeros - 1), so add 1 back
+
+ qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
+ qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
+ scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
+
+ deqweight_unpack = torch.zeros((IC, OC), dtype=torch.float16)
+ for i in range(IC):
+     gid = i // group_size
+     deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+ print(torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005))
+ print("temp")
+
+ # Numpy path
+ # deqweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+ # for i in range(IC):
+ #     gid = i // group_size
+ #     deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+ # deqweight_unpack = torch.from_numpy(deqweight_unpack).to(torch.float16)
+
+ # torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005)  # was `dequant_float`, which is undefined
+ print("blob")
internal/donttouch_unpacking_autogptq/fake_dequantize.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import numpy as np
+
+
+ def fake_dequantize(qweight, scales, qzeros):
+     nbit = 4
+     numel_per_int32 = 32 // nbit
+
+     qweight = qweight.cpu().numpy()
+     scales = scales.cpu().numpy()    # (ngroup, OC)
+     qzeros = qzeros.cpu().numpy()    # (ngroup, OC//numel_per_int32)
+
+     IC = qweight.shape[0] * numel_per_int32
+     OC = qweight.shape[1]
+     group_size = IC // scales.shape[0]
+
+     # unpack eight 4-bit values from each int32 row (read as int32, cast to float32)
+     qweight_unpack = np.zeros((IC, OC), dtype=np.float32)
+     for row in range(0, qweight.shape[0]):
+         for k in range(0, numel_per_int32):
+             qweight_unpack[row * numel_per_int32 + k, :] = ((qweight[row] >> k * nbit) & 0xF).astype(np.float32)
+
+     scales_float = scales.astype(np.float32)
+
+     qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
+     for i in range(0, numel_per_int32):
+         shift_multiplier = numel_per_int32 - 1 - i
+         shift_by = shift_multiplier * nbit
+         qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32)
+     qzeros_unpack += 1  # AutoGPTQ packs (zeros - 1), so add 1 back
+
+     qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
+     qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
+     scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
+
+     deqweight_unpack = torch.zeros((IC, OC), dtype=torch.float16)
+     for i in range(IC):
+         gid = i // group_size
+         deqweight_unpack[i, :] = (qweight_unpack[i, :] - qzeros_unpack[gid, :]) * scales_float[gid, :]
+
+     return deqweight_unpack, scales_float, qzeros_unpack
+
+
+ if __name__ == "__main__":
+     blob = torch.load("./opt-125m-gptq4.pth")
+
+     for layer, lblob in blob.items():
+         print(f"\n\n--> {layer}")
+         prepack = lblob['prepack']
+         pack = lblob['pack']
+
+         # for k, v in prepack.items():
+         #     print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
+
+         # for k, v in pack.items():
+         #     print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
+
+         W, _, _ = fake_dequantize(pack['qweight'], pack['scales'], pack['qzeros'])
+
+         simulated_match = torch.allclose(W, prepack['w'].t(), atol=0.0005)
+
+         print(f"simulated_match? {simulated_match}")
internal/donttouch_unpacking_autogptq/opt-125m-gptq4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0269cabd58cd27261fde469502a01e84760a413b16ffa7989f395c53c65e46f4
+ size 46688098
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py ADDED
@@ -0,0 +1,359 @@
1
+ import math
2
+ from logging import getLogger
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import transformers
8
+
9
+
10
+ logger = getLogger(__name__)
11
+ try:
12
+ import autogptq_cuda_64
13
+ import autogptq_cuda_256
14
+
15
+ _autogptq_cuda_available = True
16
+ except ImportError:
17
+ logger.warning("CUDA extension not installed.")
18
+ autogptq_cuda_256 = None
19
+ autogptq_cuda_64 = None
20
+ _autogptq_cuda_available = False
21
+
22
+
23
+ class QuantLinear(nn.Module):
24
+ QUANT_TYPE = "cuda-old"
25
+
26
+ def __init__(
27
+ self,
28
+ bits,
29
+ group_size,
30
+ infeatures,
31
+ outfeatures,
32
+ bias,
33
+ use_cuda_fp16=True,
34
+ kernel_switch_threshold=128,
35
+ trainable=False,
36
+ weight_dtype=torch.float16,
37
+ ):
38
+ super().__init__()
39
+ global _autogptq_cuda_available
40
+ if bits not in [2, 3, 4, 8]:
41
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
42
+ if trainable:
43
+ _autogptq_cuda_available = False
44
+ self.infeatures = infeatures
45
+ self.outfeatures = outfeatures
46
+ self.bits = bits
47
+ self.group_size = group_size if group_size != -1 else infeatures
48
+ self.maxq = 2**self.bits - 1
49
+
50
+ self.register_buffer(
51
+ "qweight",
52
+ torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
53
+ )
54
+ self.register_buffer(
55
+ "qzeros",
56
+ torch.zeros(
57
+ (
58
+ math.ceil(infeatures / self.group_size),
59
+ outfeatures // 32 * self.bits,
60
+ ),
61
+ dtype=torch.int32,
62
+ ),
63
+ )
64
+ self.register_buffer(
65
+ "scales",
66
+ torch.zeros(
67
+ (math.ceil(infeatures / self.group_size), outfeatures),
68
+ dtype=weight_dtype,
69
+ ),
70
+ )
71
+ self.register_buffer(
72
+ "g_idx",
73
+ torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
74
+ )
75
+
76
+ if bias:
77
+ self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
78
+ else:
79
+ self.bias = None
80
+ self.half_indim = self.infeatures // 2
81
+
82
+ self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
83
+
84
+ # is performed by unpacking the weights and using torch.matmul
85
+ if self.bits in [2, 4, 8]:
86
+ self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
87
+ elif self.bits == 3:
88
+ self.wf = torch.tensor(
89
+ [
90
+ [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
91
+ [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
92
+ [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
93
+ ],
94
+ dtype=torch.int32,
95
+ ).reshape(1, 3, 12)
96
+
97
+ self.kernel_switch_threshold = kernel_switch_threshold
98
+ self.autogptq_cuda_available = _autogptq_cuda_available
99
+ self.autogptq_cuda = autogptq_cuda_256
100
+ if infeatures % 256 != 0 or outfeatures % 256 != 0:
101
+ self.autogptq_cuda = autogptq_cuda_64
102
+ if infeatures % 64 != 0 or outfeatures % 64 != 0:
103
+ self.autogptq_cuda_available = False
104
+
105
+ self.trainable = trainable
106
+
107
+ def post_init(self):
108
+ pass
109
+
110
+ def pack(self, linear, scales, zeros, g_idx):
111
+ W = linear.weight.data.clone()
112
+ if isinstance(linear, nn.Conv2d):
113
+ W = W.flatten(1)
114
+ if isinstance(linear, transformers.pytorch_utils.Conv1D):
115
+ W = W.t()
116
+
117
+ scales = scales.t().contiguous()
118
+ zeros = zeros.t().contiguous()
119
+ scale_zeros = zeros * scales
120
+ self.scales = scales.clone().to(dtype=linear.weight.dtype)
121
+ if linear.bias is not None:
122
+ self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
123
+
124
+ intweight = []
125
+ for idx in range(self.infeatures):
126
+ g_idx = idx // self.group_size
127
+ intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
128
+ intweight = torch.cat(intweight, dim=1)
129
+ intweight = intweight.t().contiguous()
130
+ intweight = intweight.numpy().astype(np.uint32)
131
+ self.intweight = intweight
132
+
133
+ i = 0
134
+ row = 0
135
+ qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
136
+ while row < qweight.shape[0]:
137
+ if self.bits in [2, 4, 8]:
138
+ for j in range(i, i + (32 // self.bits)):
139
+ qweight[row] |= intweight[j] << (self.bits * (j - i))
140
+ i += 32 // self.bits
141
+ row += 1
142
+ elif self.bits == 3:
143
+ for j in range(i, i + 10):
144
+ qweight[row] |= intweight[j] << (3 * (j - i))
145
+ i += 10
146
+ qweight[row] |= intweight[i] << 30
147
+ row += 1
148
+ qweight[row] |= (intweight[i] >> 2) & 1
149
+ i += 1
150
+ for j in range(i, i + 10):
151
+ qweight[row] |= intweight[j] << (3 * (j - i) + 1)
152
+ i += 10
153
+ qweight[row] |= intweight[i] << 31
154
+ row += 1
155
+ qweight[row] |= (intweight[i] >> 1) & 0x3
156
+ i += 1
157
+ for j in range(i, i + 10):
158
+ qweight[row] |= intweight[j] << (3 * (j - i) + 2)
159
+ i += 10
160
+ row += 1
161
+ else:
162
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
163
+
164
+ qweight = qweight.astype(np.int32)
165
+ self.qweight = torch.from_numpy(qweight)
166
+
167
+ zeros -= 1
168
+ zeros = zeros.numpy().astype(np.uint32)
169
+ qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
170
+ i = 0
171
+ col = 0
172
+ while col < qzeros.shape[1]:
173
+ if self.bits in [2, 4, 8]:
174
+ for j in range(i, i + (32 // self.bits)):
175
+ qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
176
+ i += 32 // self.bits
177
+ col += 1
178
+ elif self.bits == 3:
179
+ for j in range(i, i + 10):
180
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
181
+ i += 10
182
+ qzeros[:, col] |= zeros[:, i] << 30
183
+ col += 1
184
+ qzeros[:, col] |= (zeros[:, i] >> 2) & 1
185
+ i += 1
186
+ for j in range(i, i + 10):
187
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
188
+ i += 10
189
+ qzeros[:, col] |= zeros[:, i] << 31
190
+ col += 1
191
+ qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
192
+ i += 1
193
+ for j in range(i, i + 10):
194
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
195
+ i += 10
196
+ col += 1
197
+ else:
198
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
199
+
200
+ qzeros = qzeros.astype(np.int32)
201
+ self.qzeros = torch.from_numpy(qzeros)
202
+
203
+ def forward(self, x):
204
+ x_dtype = x.dtype
205
+ out_shape = x.shape[:-1] + (self.outfeatures,)
206
+ x = x.reshape(-1, x.shape[-1])
207
+ if (
208
+ x.device.type == "cuda"
209
+ and self.autogptq_cuda_available is True
210
+ and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
211
+ ):
212
+ out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
213
+ if self.use_cuda_fp16:
214
+ if x_dtype != torch.float16:
215
+ logger.warning_once(
216
+ f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
217
+ )
218
+
219
+ if self.bits == 2:
220
+ self.autogptq_cuda.vecquant2matmul_faster_old(
221
+ x,
222
+ self.qweight,
223
+ out,
224
+ self.scales.float(),
225
+ self.qzeros,
226
+ self.group_size,
227
+ self.half_indim,
228
+ )
229
+ elif self.bits == 3:
230
+ self.autogptq_cuda.vecquant3matmul_faster_old(
231
+ x,
232
+ self.qweight,
233
+ out,
234
+ self.scales.float(),
235
+ self.qzeros,
236
+ self.group_size,
237
+ self.half_indim,
238
+ )
239
+ elif self.bits == 4:
240
+ self.autogptq_cuda.vecquant4matmul_faster_old(
241
+ x,
242
+ self.qweight,
243
+ out,
244
+ self.scales.float(),
245
+ self.qzeros,
246
+ self.group_size,
247
+ self.half_indim,
248
+ )
249
+
250
+ else:
251
+ raise NotImplementedError("Only 2,3,4 bits are supported.")
252
+ else:
253
+ x = x.to(torch.float32) # This is required for autocast compatibility.
254
+ if self.bits == 2:
255
+ self.autogptq_cuda.vecquant2matmul_old(
256
+ x,
257
+ self.qweight,
258
+ out,
259
+ self.scales.float(),
260
+ self.qzeros,
261
+ self.group_size,
262
+ )
263
+ elif self.bits == 3:
264
+ self.autogptq_cuda.vecquant3matmul_old(
265
+ x,
266
+ self.qweight,
267
+ out,
268
+ self.scales.float(),
269
+ self.qzeros,
270
+ self.group_size,
271
+ )
272
+ elif self.bits == 4:
273
+ self.autogptq_cuda.vecquant4matmul_old(
274
+ x,
275
+ self.qweight,
276
+ out,
277
+ self.scales.float(),
278
+ self.qzeros,
279
+ self.group_size,
280
+ )
281
+ elif self.bits == 8:
282
+ self.autogptq_cuda.vecquant8matmul_old(
283
+ x,
284
+ self.qweight,
285
+ out,
286
+ self.scales.float(),
287
+ self.qzeros,
288
+ self.group_size,
289
+ )
290
+ else:
291
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
292
+ else:
293
+ if self.wf.device != self.qzeros.device:
294
+ self.wf = self.wf.to(self.qzeros.device)
295
+
296
+ if self.bits in [2, 4, 8]:
297
+ zeros = torch.bitwise_right_shift(
298
+ torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
299
+ self.wf.unsqueeze(0),
300
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
301
+
302
+ zeros = zeros + 1
303
+ zeros = torch.bitwise_and(
304
+ zeros, (2**self.bits) - 1
305
+ ) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
306
+
307
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
308
+
309
+ scales = self.scales
310
+ scales = scales.reshape(-1, 1, scales.shape[-1])
311
+
312
+ weight = torch.bitwise_right_shift(
313
+ torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
314
+ self.wf.unsqueeze(-1),
315
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
316
+ weight = torch.bitwise_and(weight, (2**self.bits) - 1)
317
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
318
+ elif self.bits == 3:
319
+ zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
320
+ -1, -1, -1, 12
321
+ )
322
+ zeros = zeros >> self.wf.unsqueeze(0)
323
+ zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
324
+ zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
325
+ zeros = zeros & 0x7
326
+ zeros = torch.cat(
327
+ [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
328
+ dim=2,
329
+ )
330
+
331
+ zeros = zeros + 1
332
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
333
+
334
+ scales = self.scales
335
+ scales = scales.reshape(-1, 1, scales.shape[-1])
336
+
337
+ weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
338
+ -1, -1, 12, -1
339
+ )
340
+ weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
341
+ weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
342
+ weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
343
+ weight = weight & 0x7
344
+ weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
345
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
346
+ else:
347
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
348
+
349
+ weight = scales * (weight - zeros)
350
+ weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
351
+ out = torch.matmul(x, weight)
352
+ out = out.to(dtype=x_dtype).reshape(
353
+ out_shape
354
+ ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
355
+ out = out + self.bias if self.bias is not None else out
356
+ return out
357
+
358
+
359
+ __all__ = ["QuantLinear"]
internal/donttouch_unpacking_autogptq/qlinear_cuda_old.py.ori.py ADDED
@@ -0,0 +1,358 @@
1
+ import math
2
+ from logging import getLogger
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import transformers
8
+
9
+
10
+ logger = getLogger(__name__)
11
+ try:
12
+ import autogptq_cuda_64
13
+ import autogptq_cuda_256
14
+
15
+ _autogptq_cuda_available = True
16
+ except ImportError:
17
+ logger.warning("CUDA extension not installed.")
18
+ autogptq_cuda_256 = None
19
+ autogptq_cuda_64 = None
20
+ _autogptq_cuda_available = False
21
+
22
+
23
+ class QuantLinear(nn.Module):
24
+ QUANT_TYPE = "cuda-old"
25
+
26
+ def __init__(
27
+ self,
28
+ bits,
29
+ group_size,
30
+ infeatures,
31
+ outfeatures,
32
+ bias,
33
+ use_cuda_fp16=True,
34
+ kernel_switch_threshold=128,
35
+ trainable=False,
36
+ weight_dtype=torch.float16,
37
+ ):
38
+ super().__init__()
39
+ global _autogptq_cuda_available
40
+ if bits not in [2, 3, 4, 8]:
41
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
42
+ if trainable:
43
+ _autogptq_cuda_available = False
44
+ self.infeatures = infeatures
45
+ self.outfeatures = outfeatures
46
+ self.bits = bits
47
+ self.group_size = group_size if group_size != -1 else infeatures
48
+ self.maxq = 2**self.bits - 1
49
+
50
+ self.register_buffer(
51
+ "qweight",
52
+ torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
53
+ )
54
+ self.register_buffer(
55
+ "qzeros",
56
+ torch.zeros(
57
+ (
58
+ math.ceil(infeatures / self.group_size),
59
+ outfeatures // 32 * self.bits,
60
+ ),
61
+ dtype=torch.int32,
62
+ ),
63
+ )
64
+ self.register_buffer(
65
+ "scales",
66
+ torch.zeros(
67
+ (math.ceil(infeatures / self.group_size), outfeatures),
68
+ dtype=weight_dtype,
69
+ ),
70
+ )
71
+ self.register_buffer(
72
+ "g_idx",
73
+ torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
74
+ )
75
+
76
+ if bias:
77
+ self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
78
+ else:
79
+ self.bias = None
80
+ self.half_indim = self.infeatures // 2
81
+
82
+ self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
83
+
84
+ # is performed by unpacking the weights and using torch.matmul
85
+ if self.bits in [2, 4, 8]:
86
+ self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
87
+ elif self.bits == 3:
88
+ self.wf = torch.tensor(
89
+ [
90
+ [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
91
+ [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
92
+ [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
93
+ ],
94
+ dtype=torch.int32,
95
+ ).reshape(1, 3, 12)
96
+
97
+ self.kernel_switch_threshold = kernel_switch_threshold
98
+ self.autogptq_cuda_available = _autogptq_cuda_available
99
+ self.autogptq_cuda = autogptq_cuda_256
100
+ if infeatures % 256 != 0 or outfeatures % 256 != 0:
101
+ self.autogptq_cuda = autogptq_cuda_64
102
+ if infeatures % 64 != 0 or outfeatures % 64 != 0:
103
+ self.autogptq_cuda_available = False
104
+
105
+ self.trainable = trainable
106
+
107
+ def post_init(self):
108
+ pass
109
+
110
+ def pack(self, linear, scales, zeros, g_idx):
111
+ W = linear.weight.data.clone()
112
+ if isinstance(linear, nn.Conv2d):
113
+ W = W.flatten(1)
114
+ if isinstance(linear, transformers.pytorch_utils.Conv1D):
115
+ W = W.t()
116
+
117
+ scales = scales.t().contiguous()
118
+ zeros = zeros.t().contiguous()
119
+ scale_zeros = zeros * scales
120
+ self.scales = scales.clone().to(dtype=linear.weight.dtype)
121
+ if linear.bias is not None:
122
+ self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
123
+
124
+ intweight = []
125
+ for idx in range(self.infeatures):
126
+ g_idx = idx // self.group_size
127
+ intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
128
+ intweight = torch.cat(intweight, dim=1)
129
+ intweight = intweight.t().contiguous()
130
+ intweight = intweight.numpy().astype(np.uint32)
131
+
132
+ i = 0
133
+ row = 0
134
+ qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
135
+ while row < qweight.shape[0]:
136
+ if self.bits in [2, 4, 8]:
137
+ for j in range(i, i + (32 // self.bits)):
138
+ qweight[row] |= intweight[j] << (self.bits * (j - i))
139
+ i += 32 // self.bits
140
+ row += 1
141
+ elif self.bits == 3:
142
+ for j in range(i, i + 10):
143
+ qweight[row] |= intweight[j] << (3 * (j - i))
144
+ i += 10
145
+ qweight[row] |= intweight[i] << 30
146
+ row += 1
147
+ qweight[row] |= (intweight[i] >> 2) & 1
148
+ i += 1
149
+ for j in range(i, i + 10):
150
+ qweight[row] |= intweight[j] << (3 * (j - i) + 1)
151
+ i += 10
152
+ qweight[row] |= intweight[i] << 31
153
+ row += 1
154
+ qweight[row] |= (intweight[i] >> 1) & 0x3
155
+ i += 1
156
+ for j in range(i, i + 10):
157
+ qweight[row] |= intweight[j] << (3 * (j - i) + 2)
158
+ i += 10
159
+ row += 1
160
+ else:
161
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
162
+
163
+ qweight = qweight.astype(np.int32)
164
+ self.qweight = torch.from_numpy(qweight)
165
+
166
+ zeros -= 1
167
+ zeros = zeros.numpy().astype(np.uint32)
168
+ qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
169
+ i = 0
170
+ col = 0
171
+ while col < qzeros.shape[1]:
172
+ if self.bits in [2, 4, 8]:
173
+ for j in range(i, i + (32 // self.bits)):
174
+ qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
175
+ i += 32 // self.bits
176
+ col += 1
177
+ elif self.bits == 3:
178
+ for j in range(i, i + 10):
179
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
180
+ i += 10
181
+ qzeros[:, col] |= zeros[:, i] << 30
182
+ col += 1
183
+ qzeros[:, col] |= (zeros[:, i] >> 2) & 1
184
+ i += 1
185
+ for j in range(i, i + 10):
186
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
187
+ i += 10
188
+ qzeros[:, col] |= zeros[:, i] << 31
189
+ col += 1
190
+ qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
191
+ i += 1
192
+ for j in range(i, i + 10):
193
+ qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
194
+ i += 10
195
+ col += 1
196
+ else:
197
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
198
+
199
+ qzeros = qzeros.astype(np.int32)
200
+ self.qzeros = torch.from_numpy(qzeros)
201
+
202
+ def forward(self, x):
203
+ x_dtype = x.dtype
204
+ out_shape = x.shape[:-1] + (self.outfeatures,)
205
+ x = x.reshape(-1, x.shape[-1])
206
+ if (
207
+ x.device.type == "cuda"
208
+ and self.autogptq_cuda_available is True
209
+ and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
210
+ ):
211
+ out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
212
+ if self.use_cuda_fp16:
213
+ if x_dtype != torch.float16:
214
+ logger.warning_once(
215
+ f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
216
+ )
217
+
218
+ if self.bits == 2:
219
+ self.autogptq_cuda.vecquant2matmul_faster_old(
220
+ x,
221
+ self.qweight,
222
+ out,
223
+ self.scales.float(),
224
+ self.qzeros,
225
+ self.group_size,
226
+ self.half_indim,
227
+ )
228
+ elif self.bits == 3:
229
+ self.autogptq_cuda.vecquant3matmul_faster_old(
230
+ x,
231
+ self.qweight,
232
+ out,
233
+ self.scales.float(),
234
+ self.qzeros,
235
+ self.group_size,
236
+ self.half_indim,
237
+ )
238
+ elif self.bits == 4:
239
+ self.autogptq_cuda.vecquant4matmul_faster_old(
240
+ x,
241
+ self.qweight,
242
+ out,
243
+ self.scales.float(),
244
+ self.qzeros,
245
+ self.group_size,
246
+ self.half_indim,
247
+ )
248
+
249
+ else:
250
+ raise NotImplementedError("Only 2,3,4 bits are supported.")
251
+ else:
252
+ x = x.to(torch.float32) # This is required for autocast compatibility.
253
+ if self.bits == 2:
254
+ self.autogptq_cuda.vecquant2matmul_old(
255
+ x,
256
+ self.qweight,
257
+ out,
258
+ self.scales.float(),
259
+ self.qzeros,
260
+ self.group_size,
261
+ )
262
+ elif self.bits == 3:
263
+ self.autogptq_cuda.vecquant3matmul_old(
264
+ x,
265
+ self.qweight,
266
+ out,
267
+ self.scales.float(),
268
+ self.qzeros,
269
+ self.group_size,
270
+ )
271
+ elif self.bits == 4:
272
+ self.autogptq_cuda.vecquant4matmul_old(
273
+ x,
274
+ self.qweight,
275
+ out,
276
+ self.scales.float(),
277
+ self.qzeros,
278
+ self.group_size,
279
+ )
280
+ elif self.bits == 8:
281
+ self.autogptq_cuda.vecquant8matmul_old(
282
+ x,
283
+ self.qweight,
284
+ out,
285
+ self.scales.float(),
286
+ self.qzeros,
287
+ self.group_size,
288
+ )
289
+ else:
290
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
291
+ else:
292
+ if self.wf.device != self.qzeros.device:
293
+ self.wf = self.wf.to(self.qzeros.device)
294
+
295
+ if self.bits in [2, 4, 8]:
296
+ zeros = torch.bitwise_right_shift(
297
+ torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
298
+ self.wf.unsqueeze(0),
299
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
300
+
301
+ zeros = zeros + 1
302
+ zeros = torch.bitwise_and(
303
+ zeros, (2**self.bits) - 1
304
+ ) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
305
+
306
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
307
+
308
+ scales = self.scales
309
+ scales = scales.reshape(-1, 1, scales.shape[-1])
310
+
311
+ weight = torch.bitwise_right_shift(
312
+ torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
313
+ self.wf.unsqueeze(-1),
314
+ ).to(torch.int16 if self.bits == 8 else torch.int8)
315
+ weight = torch.bitwise_and(weight, (2**self.bits) - 1)
316
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
317
+ elif self.bits == 3:
318
+ zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
319
+ -1, -1, -1, 12
320
+ )
321
+ zeros = zeros >> self.wf.unsqueeze(0)
322
+ zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
323
+ zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
324
+ zeros = zeros & 0x7
325
+ zeros = torch.cat(
326
+ [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
327
+ dim=2,
328
+ )
329
+
330
+ zeros = zeros + 1
331
+ zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
332
+
333
+ scales = self.scales
334
+ scales = scales.reshape(-1, 1, scales.shape[-1])
335
+
336
+ weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
337
+ -1, -1, 12, -1
338
+ )
339
+ weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
340
+ weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
341
+ weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
342
+ weight = weight & 0x7
343
+ weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
344
+ weight = weight.reshape(-1, self.group_size, weight.shape[2])
345
+ else:
346
+ raise NotImplementedError("Only 2,3,4,8 bits are supported.")
347
+
348
+ weight = scales * (weight - zeros)
349
+ weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
350
+ out = torch.matmul(x, weight)
351
+ out = out.to(dtype=x_dtype).reshape(
352
+ out_shape
353
+ ) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
354
+ out = out + self.bias if self.bias is not None else out
355
+ return out
356
+
357
+
358
+ __all__ = ["QuantLinear"]
internal/donttouch_unpacking_autogptq/quantizer.py ADDED
@@ -0,0 +1,816 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ import os
17
+ from enum import Enum
18
+ from logging import getLogger
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import nn
23
+ from tqdm.auto import tqdm
24
+ from transformers import AutoTokenizer
25
+ from transformers.pytorch_utils import Conv1D
26
+ from transformers.utils.quantization_config import QuantizationMethod
27
+
28
+ from ..utils import is_accelerate_available, is_auto_gptq_available
29
+ from ..utils.modeling_utils import recurse_getattr
30
+ from .constants import GPTQ_CONFIG
31
+ from .data import get_dataset, prepare_dataset
32
+ from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
33
+ from collections import OrderedDict
34
+
35
+ if is_accelerate_available():
36
+ from accelerate import (
37
+ cpu_offload_with_hook,
38
+ load_checkpoint_and_dispatch,
39
+ )
40
+ from accelerate.hooks import remove_hook_from_module
41
+
42
+ if is_auto_gptq_available():
43
+ from auto_gptq import exllama_set_max_input_length
44
+ from auto_gptq.modeling._utils import autogptq_post_init
45
+ from auto_gptq.quantization import GPTQ
46
+ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
47
+
48
+ logger = getLogger(__name__)
49
+
50
+
51
+ class ExllamaVersion(int, Enum):
52
+ ONE = 1
53
+ TWO = 2
54
+
55
+
56
+ class GPTQQuantizer(object):
57
+ r"""
58
+ A simple API for GPTQ Quantization
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ bits: int,
64
+ dataset: Optional[Union[List[str], str]] = None,
65
+ group_size: int = 128,
66
+ damp_percent: float = 0.1,
67
+ desc_act: bool = False,
68
+ sym: bool = True,
69
+ true_sequential: bool = True,
70
+ use_cuda_fp16: bool = False,
71
+ model_seqlen: Optional[int] = None,
72
+ block_name_to_quantize: Optional[str] = None,
73
+ module_name_preceding_first_block: Optional[List[str]] = None,
74
+ batch_size: int = 1,
75
+ pad_token_id: Optional[int] = None,
76
+ disable_exllama: bool = False,
77
+ exllama_config: Dict[str, Any] = None,
78
+ max_input_length: Optional[int] = None,
79
+ cache_block_outputs: Optional[bool] = True,
80
+ modules_in_block_to_quantize: Optional[List[List[str]]] = None,
81
+ *args,
82
+ **kwargs,
83
+ ):
84
+ """
85
+ Args:
86
+ bits (`int`):
87
+ The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
88
+ dataset (`Union[List[str], str, Any]`, defaults to `None`):
89
+ The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
90
+ (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
91
+ or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
92
+ group_size (int, defaults to 128):
93
+ The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
94
+ damp_percent (`float`, defaults to `0.1`):
95
+ The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
96
+ desc_act (`bool`, defaults to `False`):
97
+ Whether to quantize columns in order of decreasing activation size.
98
+ Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
99
+ Also known as act-order.
100
+ sym (`bool`, defaults to `True`):
101
+ Whether to use symetric quantization.
102
+ true_sequential (`bool`, defaults to `True`):
103
+ Whether to perform sequential quantization even within a single Transformer block.
104
+ Instead of quantizing the entire block at once, we perform layer-wise quantization.
105
+ As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
106
+ use_cuda_fp16 (`bool`, defaults to `False`):
107
+ Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
108
+ model_seqlen (`Optional[int]`, defaults to `None`):
109
+ The maximum sequence length that the model can take.
110
+ block_name_to_quantize (`Optional[str]`, defaults to `None`):
111
+ The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
112
+ module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
113
+ The layers that are preceding the first Transformer block.
114
+ batch_size (`int`, defaults to `1`):
115
+ The batch size of the dataset
116
+ pad_token_id (`Optional[int]`, defaults to `None`):
117
+ The pad token id. Needed to prepare the dataset when `batch_size` > 1.
118
+ disable_exllama (`bool`, defaults to `False`):
119
+ Whether to use exllama backend. Only works with `bits` = 4.
120
+ exllama_config (`Dict[str, Any]`, *optional*):
121
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
122
+ max_input_length (`Optional[int]`, defaults to `None`):
123
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
124
+ It is specific to the exllama backend with act-order.
125
+ cache_block_outputs (`bool`, defaults to `True`):
126
+ Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
127
+ (e.g. ChatGLM) but can require more time.
128
+ modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
129
+ List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
130
+ The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
131
+ If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
132
+ """
133
+
134
+ self.bits = bits
135
+ self.dataset = dataset
136
+ self.group_size = group_size
137
+ self.damp_percent = damp_percent
138
+ self.desc_act = desc_act
139
+ self.sym = sym
140
+ self.true_sequential = true_sequential
141
+ self.use_cuda_fp16 = use_cuda_fp16
142
+ self.model_seqlen = model_seqlen
143
+ self.block_name_to_quantize = block_name_to_quantize
144
+ self.module_name_preceding_first_block = module_name_preceding_first_block
145
+ self.batch_size = batch_size
146
+ self.pad_token_id = pad_token_id
147
+ self.disable_exllama = disable_exllama
148
+ self.exllama_config = exllama_config
149
+ self.max_input_length = max_input_length
150
+ self.quant_method = QuantizationMethod.GPTQ
151
+ self.cache_block_outputs = cache_block_outputs
152
+ self.modules_in_block_to_quantize = modules_in_block_to_quantize
153
+
154
+ self.serialization_keys = [
155
+ "bits",
156
+ "dataset",
157
+ "group_size",
158
+ "damp_percent",
159
+ "desc_act",
160
+ "sym",
161
+ "true_sequential",
162
+ "quant_method",
163
+ "modules_in_block_to_quantize",
164
+ ]
165
+
166
+ if self.bits not in [2, 3, 4, 8]:
167
+ raise ValueError("only support quantize to [2,3,4,8] bits.")
168
+ if self.group_size != -1 and self.group_size <= 0:
169
+ raise ValueError("group_size must be greater than 0 or equal to -1")
170
+ if not (0 < self.damp_percent < 1):
171
+ raise ValueError("damp_percent must between 0 and 1.")
172
+
173
+ if self.exllama_config is None:
174
+ self.exllama_config = {"version": ExllamaVersion.TWO}
175
+ else:
176
+ if "version" not in self.exllama_config:
177
+ raise ValueError("`exllama_config` needs to have a `version` key")
178
+ elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
179
+ version = self.exllama_config["version"]
180
+ raise ValueError(
181
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
182
+ )
183
+ self.exllama_version = self.exllama_config["version"]
184
+
185
+ def to_dict(self):
186
+ """
187
+ Returns the args in dict format.
188
+ """
189
+ gptq_dict = {}
190
+ for key in self.serialization_keys:
191
+ gptq_dict[key] = getattr(self, key)
192
+ return gptq_dict
193
+
194
+ @classmethod
195
+ def from_dict(cls, config_dict: Dict[str, Any]):
196
+ """
197
+ Instantiates a `GPTQQuantizer` using config_dict as kwargs
198
+
199
+ Args:
200
+ config_dict (`Dict[str,Any]`):
201
+ quantization config
202
+
203
+ Returns:
204
+ `GPTQQuantizer`: The quantizer object instantiated from those parameters.
205
+ """
206
+ return cls(**config_dict)
207
+
208
+ def convert_model(self, model: nn.Module):
209
+ """
210
+ Convert the model to a GPTQ model by getting and replacing the layers.
211
+
212
+ Args:
213
+ model (`nn.Module`):
214
+ Model to be converted
215
+
216
+ """
217
+ if self.block_name_to_quantize is None:
218
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
219
+ block_name = self.block_name_to_quantize
220
+ layers_to_be_replaced = get_layers(model, prefix=block_name)
221
+ if self.modules_in_block_to_quantize is not None:
222
+ layers_to_keep = sum(self.modules_in_block_to_quantize, [])
223
+ for name in list(layers_to_be_replaced.keys()):
224
+ if not any(name.endswith(layer) for layer in layers_to_keep):
225
+ logger.info(
226
+ f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
227
+ )
228
+ del layers_to_be_replaced[name]
229
+ self._replace_by_quant_layers(model, layers_to_be_replaced)
230
+ return model
231
+
232
+ def get_no_split_module_classes(self, model):
233
+ """
234
+ Get the modules that should not be split across multiple devices.
235
+ Args:
236
+ model (`nn.Module`):
237
+ The input model
238
+ """
239
+
240
+ block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
241
+ no_split_module_classes = [block_class_name]
242
+ return no_split_module_classes
243
+
244
+ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
245
+ """
246
+ Replaces linear layers in `module` by `QuantLinear`
247
+
248
+ Args:
249
+ module (`nn.Module`):
250
+ Module to quantize
251
+ names (`List[str]`):
252
+ List of names of the module to quantize
253
+ name (`str`, defaults to `""`):
254
+ To keep track of the name of the current module
255
+ """
256
+ QuantLinear = dynamically_import_QuantLinear(
257
+ use_triton=False,
258
+ desc_act=self.desc_act,
259
+ group_size=self.group_size,
260
+ bits=self.bits,
261
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
262
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
263
+ )
264
+ if isinstance(module, QuantLinear):
265
+ return
266
+ for attr in dir(module):
267
+ layer = getattr(module, attr)
268
+ name1 = name + "." + attr if name != "" else attr
269
+ if name1 in names:
270
+ device = get_device(layer)
271
+ delattr(module, attr)
272
+ if isinstance(layer, nn.Linear):
273
+ in_features = layer.in_features
274
+ out_features = layer.out_features
275
+ elif isinstance(layer, nn.Conv2d):
276
+ in_features = layer.in_channels
277
+ out_features = layer.out_channels
278
+ elif isinstance(layer, Conv1D):
279
+ in_features = layer.weight.shape[0]
280
+ out_features = layer.weight.shape[1]
281
+ bias = layer.bias is not None
282
+ if not (self.desc_act) or self.group_size == -1:
283
+ new_layer = QuantLinear(
284
+ self.bits,
285
+ self.group_size,
286
+ in_features,
287
+ out_features,
288
+ bias,
289
+ use_cuda_fp16=self.use_cuda_fp16,
290
+ weight_dtype=layer.weight.dtype,
291
+ )
292
+ else:
293
+ new_layer = QuantLinear(
294
+ self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
295
+ )
296
+ new_layer.device = device
297
+ setattr(module, attr, new_layer.to(device))
298
+ for name1, child in module.named_children():
299
+ self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
300
+
301
+ @torch.no_grad()
302
+ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
303
+ """
304
+ Quantizes the model using the dataset
305
+
306
+ Args:
307
+ model (`nn.Module`):
308
+ The model to quantize
309
+ tokenizer (Optional[`Any`], defaults to `None`):
310
+ The tokenizer to use in order to prepare the dataset. You can pass either:
311
+ - A custom tokenizer object.
312
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
313
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
314
+ user or organization name, like `dbmdz/bert-base-german-cased`.
315
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
316
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
317
+ Returns:
318
+ `nn.Module`: The quantized model
319
+ """
320
+
321
+ if not is_auto_gptq_available():
322
+ raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
323
+ if not torch.cuda.is_available():
324
+ raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
325
+
326
+ model.eval()
327
+
328
+ # For Transformer model
329
+ has_config = False
330
+ has_device_map = False
331
+ if hasattr(model, "config"):
332
+ has_config = True
333
+ use_cache = model.config.use_cache
334
+ model.config.use_cache = False
335
+
336
+ # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
337
+ if hasattr(model, "hf_device_map"):
338
+ devices = list(model.hf_device_map.values())
339
+ has_device_map = True
340
+ if "disk" in devices:
341
+ raise ValueError("disk offload is not supported with GPTQ quantization")
342
+ if "cpu" in devices or torch.device("cpu") in devices:
343
+ if len(model.hf_device_map) > 1:
344
+ logger.info("Cpu offload is not recommended. There might be some issues with the memory")
345
+ hook = None
346
+ for name, device in model.hf_device_map.items():
347
+ if device == "cpu":
348
+ module = recurse_getattr(model, name)
349
+ remove_hook_from_module(module, recurse=True)
350
+ module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
351
+ else:
352
+ has_device_map = False
353
+
354
+ if hasattr(model, "dtype"):
355
+ self.use_cuda_fp16 = model.dtype == torch.float16
356
+
357
+ if self.model_seqlen is None:
358
+ # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
359
+ self.model_seqlen = min(4028, get_seqlen(model))
360
+
361
+ device = get_device(model)
362
+
363
+ # Step 1: Prepare the data
364
+ if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
365
+ dataset = self.dataset
366
+ logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
367
+ else:
368
+ if isinstance(tokenizer, str):
369
+ try:
370
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
371
+ except Exception:
372
+ raise ValueError(
373
+ f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
374
+ with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
375
+ For now, we only support quantization for text model. Support for vision, speech and multimodel will come later."""
376
+ )
377
+ if self.dataset is None:
378
+ raise ValueError("You need to pass `dataset` in order to quantize your model")
379
+ elif isinstance(self.dataset, str):
380
+ dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
381
+ elif isinstance(self.dataset, list):
382
+ dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
383
+ else:
384
+ raise ValueError(
385
+ f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
386
+ )
387
+
388
+ dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
389
+
390
+ # Step 2: get the input of the 1st block
391
+ # To do that, we need to put the modules preceding the first block on the same device as the first bloc.
392
+ # Then we run the model and it will stop at the first bloc as we added a prehook that raise an Exception after storing the inputs.
393
+
394
+ layer_inputs = []
395
+ layer_outputs = []
396
+ layer_input_kwargs = []
397
+
398
+ if self.block_name_to_quantize is None:
399
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
400
+
401
+ if self.module_name_preceding_first_block is None:
402
+ self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
403
+
404
+ blocks = recurse_getattr(model, self.block_name_to_quantize)
405
+
406
+ if not has_device_map:
407
+ # put modules from module_name_preceding_first_block on cuda
408
+ for module_name in self.module_name_preceding_first_block:
409
+ module = recurse_getattr(model, module_name)
410
+ if module is None:
411
+ raise ValueError(f"Module {module_name} was not found in model")
412
+ module = module.to(0)
413
+ blocks[0] = blocks[0].to(0)
414
+
415
+ def store_input_hook(_, input, *args):
416
+ kwargs = args[0]
417
+ if input is None:
418
+ if "hidden_states" in kwargs:
419
+ input = (kwargs["hidden_states"],)
420
+ else:
421
+ raise ValueError("No input value found in the foward pass")
422
+ layer_inputs.append(input)
423
+ other_kwargs = {}
424
+ for k, v in kwargs.items(): # make sure other arguments also be captured
425
+ if k not in ["hidden_states"]:
426
+ other_kwargs[k] = v
427
+ layer_input_kwargs.append(other_kwargs)
428
+ raise ValueError
429
+
430
+ if self.cache_block_outputs:
431
+ handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
432
+ for data in dataset:
433
+ for k, v in data.items():
434
+ # put the data on gpu, we won't put them back to cpu
435
+ data[k] = v.to(0)
436
+ try:
437
+ model(**data)
438
+ except ValueError:
439
+ pass
440
+ handle.remove()
441
+
442
+ if not has_device_map:
443
+ blocks[0].to(device)
444
+ for module_name in self.module_name_preceding_first_block:
445
+ module = recurse_getattr(model, module_name)
446
+ if module is None:
447
+ raise ValueError(f"Module {module_name} was not found in model")
448
+
449
+ torch.cuda.empty_cache()
450
+
451
+ # Step 3: Quantize the blocks
452
+ quantizers = {}
453
+ for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
454
+ logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
455
+
456
+ if not self.cache_block_outputs:
457
+ handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
458
+ for data in dataset:
459
+ for k, v in data.items():
460
+ # put the data on gpu, we won't put them back to cpu
461
+ data[k] = v.to(0)
462
+ try:
463
+ model(**data)
464
+ except ValueError:
465
+ pass
466
+ handle.remove()
467
+
468
+ # move block to cuda if needed
469
+ # in case we have offload modules, we need to put them on cuda because of GPTQ object
470
+ if not has_device_map or get_device(block) == torch.device("cpu"):
471
+ block = block.to(0)
472
+ layers = get_layers(block)
473
+ if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
474
+ if self.true_sequential:
475
+ layers_name_list = self.modules_in_block_to_quantize
476
+ else:
477
+ layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
478
+ else:
479
+ if self.true_sequential:
480
+ # lazy sequential but works well
481
+ layers_name_list = [[key] for key in layers.keys()]
482
+ else:
483
+ layers_name_list = [list(layers.keys())]
484
+ logger.info(f"Module to quantize {layers_name_list}")
485
+ for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
486
+ subset_layers = {name: layers[name] for name in subset_name_list}
487
+ gptq = {}
488
+ handles = []
489
+ # add hook for each layer in subset_layers
490
+ for name in subset_layers:
491
+ gptq[name] = GPTQ(subset_layers[name])
492
+ gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
493
+
494
+ def add_batch(name):
495
+ def tmp(_, input, output):
496
+ gptq[name].add_batch(input[0].data, output.data)
497
+
498
+ return tmp
499
+
500
+ # because it adding a hook will replace the old one.
501
+ handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
502
+ # update Hessian for each layer in subset_layers thanks to the hook
503
+ for j in range(len(dataset)):
504
+ # the args are already on the gpu
505
+ # don't need to store the output
506
+ block(*layer_inputs[j], **layer_input_kwargs[j])
507
+ # remove hook
508
+ for h in handles:
509
+ h.remove()
510
+ for name in subset_name_list:
511
+ logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
512
+ scale, zero, g_idx = gptq[name].fasterquant(
513
+ percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
514
+ )
515
+ quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
516
+ gptq[name].quantizer,
517
+ scale,
518
+ zero,
519
+ g_idx,
520
+ )
521
+ gptq[name].free()
522
+ del subset_layers
523
+ # we get the new output from the partial quantized block
524
+ if self.cache_block_outputs:
525
+ for j in range(len(dataset)):
526
+ layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
527
+ layer_outputs.append(layer_output)
528
+
529
+ # put back to device
530
+ if not has_device_map:
531
+ blocks[i] = block.to(device)
532
+ del layers
533
+ del layer_inputs
534
+ layer_inputs, layer_outputs = layer_outputs, []
535
+ else:
536
+ del layers
537
+ del layer_inputs
538
+ layer_inputs = []
539
+ torch.cuda.empty_cache()
540
+ if i==5:
541
+ break
542
+
543
+ if self.bits == 4:
544
+ # device not on gpu
545
+ if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
546
+ if not self.disable_exllama:
547
+ logger.warning(
548
+ "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
549
+ )
550
+ self.disable_exllama = True
551
+ # act order and exllama
552
+ elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
553
+ logger.warning(
554
+ "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
555
+ "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
556
+ )
557
+ self.disable_exllama = True
558
+ elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
559
+ logger.warning(
560
+ "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights. "
561
+ "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
562
+ )
563
+ self.disable_exllama = True
564
+ # Step 4: Pack the model at the end (Replacing the layers)
565
+ self.pack_model(model=model, quantizers=quantizers)
566
+
567
+ model.is_quantized = True
568
+ model.quantization_method = QuantizationMethod.GPTQ
569
+ if has_config:
570
+ model.config.use_cache = use_cache
571
+ model.config.quantization_config = self.to_dict()
572
+
573
+ # Step 5: Any post-initialization that require device information, for example buffers initialization on device.
574
+ model = self.post_init_model(model)
575
+
576
+ torch.cuda.empty_cache()
577
+ return model
578
+
579
+ def post_init_model(self, model):
580
+ """
581
+ Post-initialization that require device information, for example buffers initialization on device.
582
+
583
+ Args:
584
+ model (`nn.Module`):
585
+ The input model
586
+ """
587
+ if self.bits == 4 and not self.disable_exllama:
588
+ if get_device(model) == torch.device("cpu") or (
589
+ hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
590
+ ):
591
+ raise ValueError(
592
+ "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU. "
593
+ "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
594
+ )
595
+
596
+ class StoreAttr(object):
597
+ pass
598
+
599
+ model.quantize_config = StoreAttr()
600
+ model.quantize_config.desc_act = self.desc_act
601
+ model = autogptq_post_init(model, use_act_order=self.desc_act)
602
+ if (
603
+ self.desc_act
604
+ and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
605
+ and self.max_input_length is not None
606
+ ):
607
+ model = exllama_set_max_input_length(model, self.max_input_length)
608
+ return model
609
+
610
+ def pack_model(
611
+ self,
612
+ model: nn.Module,
613
+ quantizers: Dict[str, Tuple],
614
+ ):
615
+ """
616
+ Pack the model by replacing the layers by quantized layers
617
+
618
+ Args:
619
+ model (`nn.Module`):
620
+ The model to pack
621
+ quantizers (`Dict[str,Tuple]`):
622
+ A mapping of the layer name and the data needed to pack the layer
623
+ """
624
+ QuantLinear = dynamically_import_QuantLinear(
625
+ use_triton=False,
626
+ desc_act=self.desc_act,
627
+ group_size=self.group_size,
628
+ bits=self.bits,
629
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
630
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
631
+ )
632
+ logger.info("Packing model...")
633
+ layers = get_layers(model)
634
+ layers = {n: layers[n] for n in quantizers}
635
+ self._replace_by_quant_layers(model, quantizers)
636
+ qlayers = get_layers(model, [QuantLinear])
637
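+ # hack: capture both the pre-pack tensors (fp weight, bias, scale, zero, g_idx) and the packed buffers for offline unpacking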
+ autogptq_blobs = OrderedDict()
638
+ for i, name in enumerate(qlayers):
639
+ logger.info(name)
640
+ quantizers[name], scale, zero, g_idx = quantizers[name]
641
+ # so far can only pack layer on CPU
642
+ layer_device = qlayers[name].device
643
+ qlayers[name].to("cpu")
644
+ layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
645
+ autogptq_blobs[name] = {
646
+ "prepack": dict(
647
+ w=layers[name].weight,
648
+ b=layers[name].bias,
649
+ scale=scale,
650
+ zero=zero,
651
+ g_idx=g_idx
652
+ )
653
+ }
654
+ qlayers[name].pack(layers[name], scale, zero, g_idx)
655
+ autogptq_blobs[name]["pack"] = dict(
656
+ qweight=qlayers[name].qweight,
657
+ bias=qlayers[name].bias,
658
+ scales=qlayers[name].scales,
659
+ qzeros=qlayers[name].qzeros,
660
+ g_idx=qlayers[name].g_idx,
661
+ intweight=qlayers[name].intweight
662
+ )
663
+ qlayers[name].to(layer_device)
664
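+ # hack: pack only the first 6 layers, then dump the captured blobs below and exit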
+ if i==5:
665
+ break
666
+ torch.save(autogptq_blobs, "./opt-125m-gptq4.pth")
667
+ exit()
668
+ logger.info("Model packed.")
669
+
670
+ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
671
+ """
672
+ Save model state dict and configs
673
+
674
+ Args:
675
+ model (`nn.Module`):
676
+ Model to be saved. The model can be wrapped or unwrapped.
677
+ save_dir (`str`):
678
+ Directory to which to save. Will be created if it doesn't exist.
679
+ max_shard_size (`str`, defaults to `"10GB"`):
680
+ The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be
681
+ smaller than this size. If expressed as a string, it needs to be digits followed by a unit (like `"5MB"`).
682
+ <Tip warning={true}>
683
+
684
+ If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
685
+ which will be bigger than `max_shard_size`.
686
+
687
+ </Tip>
688
+ safe_serialization (`bool`, defaults to `True`):
689
+ Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
690
+
691
+ """
692
+ os.makedirs(save_dir, exist_ok=True)
693
+ model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
694
+ with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
695
+ json.dump(self.to_dict(), f, indent=2)
696
+
697
+
698
+ def load_quantized_model(
699
+ model: nn.Module,
700
+ save_folder: str,
701
+ quant_config_name: str = GPTQ_CONFIG,
702
+ state_dict_name: Optional[str] = None,
703
+ device_map: Optional[str] = None,
704
+ max_memory: Optional[Dict] = None,
705
+ no_split_module_classes: Optional[Dict] = None,
706
+ offload_folder: Optional[str] = None,
707
+ offload_buffers: Optional[str] = None,
708
+ offload_state_dict: bool = False,
709
+ disable_exllama: bool = False,
710
+ exllama_config: Optional[Dict[str, Any]] = None,
711
+ max_input_length: Optional[int] = None,
712
+ ):
713
+ """
714
+ Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
715
+
716
+ Args:
717
+ model (`nn.Module`):
718
+ The model can be empty or not.
719
+ save_folder (`str`):
720
+ Directory from which to load the weights.
721
+ quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
722
+ Name of the quantization config file
723
+ state_dict_name (`Optional[str]`, defaults to `None`):
724
+ Name of the state dict file
725
+ device_map (`Optional[str]`, defaults to `None`):
726
+ A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
727
+ name, once a given module name is inside, every submodule of it will be sent to the same device.
728
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
729
+ max_memory (`Optional[Dict]`, defaults to `None`):
730
+ A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available for each GPU
731
+ and the available CPU RAM if unset.
732
+ no_split_module_classes (`Optional[Dict]`, defaults to `None`):
733
+ A list of layer class names that should never be split across devices (for instance any layer that has a
734
+ residual connection).
735
+ offload_folder (`Optional[str]`, defaults to `None`):
736
+ If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
737
+ offload_buffers (`Optional[str]`, defaults to `None`):
738
+ In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
739
+ well as the parameters.
740
+ offload_state_dict (`bool`, defaults to `False`):
741
+ If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
742
+ the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
743
+ picked contains `"disk"` values.
744
+ disable_exllama (`bool`, defaults to `False`):
745
+ Whether to disable the exllama backend. Only works with `bits` = 4.
746
+ exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
747
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
748
+ max_input_length (`Optional[int]`, defaults to `None`):
749
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
750
+ It is specific to the exllama backend with act-order.
751
+
752
+ Returns:
753
+ `nn.Module`: The quantized model
754
+ """
755
+ if not torch.cuda.is_available():
756
+ raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
757
+ if not is_auto_gptq_available():
758
+ raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
759
+ if not is_accelerate_available():
760
+ raise RuntimeError(
761
+ "You need to install accelerate in order to load and dispatch weights to "
762
+ "a quantized model. You can do it with `pip install accelerate`"
763
+ )
764
+ if device_map is None:
765
+ device_map = {"": torch.cuda.current_device()}
766
+ logger.info("The device_map was not initialized. " "Setting device_map to `{'':torch.cuda.current_device()}`.")
767
+
768
+ if exllama_config is None:
769
+ exllama_config = {"version": ExllamaVersion.TWO}
770
+ else:
771
+ if "version" not in exllama_config:
772
+ raise ValueError("`exllama_config` needs to have a `version` key")
773
+ elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
774
+ version = exllama_config["version"]
775
+ raise ValueError(
776
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
777
+ )
778
+
779
+ # this branch will check if model is from huggingface
780
+ try:
781
+ if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
782
+ quantize_config_dict = model.config.quantization_config.to_dict()
783
+ else:
784
+ with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
785
+ quantize_config_dict = json.load(f)
786
+ except Exception as err:
787
+ raise ValueError(
788
+ f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
789
+ ) from err
790
+ quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
791
+ quantizer.disable_exllama = disable_exllama
792
+ quantizer.exllama_config = exllama_config
793
+ quantizer.exllama_version = quantizer.exllama_config["version"]
794
+ quantizer.max_input_length = max_input_length
795
+
796
+ model = quantizer.convert_model(model)
797
+
798
+ if no_split_module_classes is None:
799
+ no_split_module_classes = quantizer.get_no_split_module_classes(model)
800
+
801
+ model = load_checkpoint_and_dispatch(
802
+ model,
803
+ checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
804
+ device_map=device_map,
805
+ max_memory=max_memory,
806
+ no_split_module_classes=no_split_module_classes,
807
+ offload_folder=offload_folder,
808
+ offload_buffers=offload_buffers,
809
+ offload_state_dict=offload_state_dict,
810
+ )
811
+
812
+ model = quantizer.post_init_model(model)
813
+ model.is_quantized = True
814
+ model.quantization_method = QuantizationMethod.GPTQ
815
+ model.eval()
816
+ return model
internal/donttouch_unpacking_autogptq/quantizer.py.ori.py ADDED
@@ -0,0 +1,793 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ import os
17
+ from enum import Enum
18
+ from logging import getLogger
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import nn
23
+ from tqdm.auto import tqdm
24
+ from transformers import AutoTokenizer
25
+ from transformers.pytorch_utils import Conv1D
26
+ from transformers.utils.quantization_config import QuantizationMethod
27
+
28
+ from ..utils import is_accelerate_available, is_auto_gptq_available
29
+ from ..utils.modeling_utils import recurse_getattr
30
+ from .constants import GPTQ_CONFIG
31
+ from .data import get_dataset, prepare_dataset
32
+ from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
33
+
34
+
35
+ if is_accelerate_available():
36
+ from accelerate import (
37
+ cpu_offload_with_hook,
38
+ load_checkpoint_and_dispatch,
39
+ )
40
+ from accelerate.hooks import remove_hook_from_module
41
+
42
+ if is_auto_gptq_available():
43
+ from auto_gptq import exllama_set_max_input_length
44
+ from auto_gptq.modeling._utils import autogptq_post_init
45
+ from auto_gptq.quantization import GPTQ
46
+ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
47
+
48
+ logger = getLogger(__name__)
49
+
50
+
51
+ class ExllamaVersion(int, Enum):
52
+ ONE = 1
53
+ TWO = 2
54
+
55
+
56
+ class GPTQQuantizer(object):
57
+ r"""
58
+ A simple API for GPTQ Quantization
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ bits: int,
64
+ dataset: Optional[Union[List[str], str]] = None,
65
+ group_size: int = 128,
66
+ damp_percent: float = 0.1,
67
+ desc_act: bool = False,
68
+ sym: bool = True,
69
+ true_sequential: bool = True,
70
+ use_cuda_fp16: bool = False,
71
+ model_seqlen: Optional[int] = None,
72
+ block_name_to_quantize: Optional[str] = None,
73
+ module_name_preceding_first_block: Optional[List[str]] = None,
74
+ batch_size: int = 1,
75
+ pad_token_id: Optional[int] = None,
76
+ disable_exllama: bool = False,
77
+ exllama_config: Dict[str, Any] = None,
78
+ max_input_length: Optional[int] = None,
79
+ cache_block_outputs: Optional[bool] = True,
80
+ modules_in_block_to_quantize: Optional[List[List[str]]] = None,
81
+ *args,
82
+ **kwargs,
83
+ ):
84
+ """
85
+ Args:
86
+ bits (`int`):
87
+ The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
88
+ dataset (`Union[List[str], str, Any]`, defaults to `None`):
89
+ The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
90
+ (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
91
+ or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
92
+ group_size (int, defaults to 128):
93
+ The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
94
+ damp_percent (`float`, defaults to `0.1`):
95
+ The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
96
+ desc_act (`bool`, defaults to `False`):
97
+ Whether to quantize columns in order of decreasing activation size.
98
+ Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
99
+ Also known as act-order.
100
+ sym (`bool`, defaults to `True`):
101
+ Whether to use symetric quantization.
102
+ true_sequential (`bool`, defaults to `True`):
103
+ Whether to perform sequential quantization even within a single Transformer block.
104
+ Instead of quantizing the entire block at once, we perform layer-wise quantization.
105
+ As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
106
+ use_cuda_fp16 (`bool`, defaults to `False`):
107
+ Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
108
+ model_seqlen (`Optional[int]`, defaults to `None`):
109
+ The maximum sequence length that the model can take.
110
+ block_name_to_quantize (`Optional[str]`, defaults to `None`):
111
+ The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
112
+ module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
113
+ The layers that are preceding the first Transformer block.
114
+ batch_size (`int`, defaults to `1`):
115
+ The batch size of the dataset
116
+ pad_token_id (`Optional[int]`, defaults to `None`):
117
+ The pad token id. Needed to prepare the dataset when `batch_size` > 1.
118
+ disable_exllama (`bool`, defaults to `False`):
119
+ Whether to use exllama backend. Only works with `bits` = 4.
120
+ exllama_config (`Dict[str, Any]`, *optional*):
121
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
122
+ max_input_length (`Optional[int]`, defaults to `None`):
123
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
124
+ It is specific to the exllama backend with act-order.
125
+ cache_block_outputs (`bool`, defaults to `True`):
126
+ Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
127
+ (e.g. ChatGLM) but can require more time.
128
+ modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
129
+ List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
130
+ The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
131
+ If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
132
+ """
133
+
134
+ self.bits = bits
135
+ self.dataset = dataset
136
+ self.group_size = group_size
137
+ self.damp_percent = damp_percent
138
+ self.desc_act = desc_act
139
+ self.sym = sym
140
+ self.true_sequential = true_sequential
141
+ self.use_cuda_fp16 = use_cuda_fp16
142
+ self.model_seqlen = model_seqlen
143
+ self.block_name_to_quantize = block_name_to_quantize
144
+ self.module_name_preceding_first_block = module_name_preceding_first_block
145
+ self.batch_size = batch_size
146
+ self.pad_token_id = pad_token_id
147
+ self.disable_exllama = disable_exllama
148
+ self.exllama_config = exllama_config
149
+ self.max_input_length = max_input_length
150
+ self.quant_method = QuantizationMethod.GPTQ
151
+ self.cache_block_outputs = cache_block_outputs
152
+ self.modules_in_block_to_quantize = modules_in_block_to_quantize
153
+
154
+ self.serialization_keys = [
155
+ "bits",
156
+ "dataset",
157
+ "group_size",
158
+ "damp_percent",
159
+ "desc_act",
160
+ "sym",
161
+ "true_sequential",
162
+ "quant_method",
163
+ "modules_in_block_to_quantize",
164
+ ]
165
+
166
+ if self.bits not in [2, 3, 4, 8]:
167
+ raise ValueError("only support quantize to [2,3,4,8] bits.")
168
+ if self.group_size != -1 and self.group_size <= 0:
169
+ raise ValueError("group_size must be greater than 0 or equal to -1")
170
+ if not (0 < self.damp_percent < 1):
171
+ raise ValueError("damp_percent must between 0 and 1.")
172
+
173
+ if self.exllama_config is None:
174
+ self.exllama_config = {"version": ExllamaVersion.TWO}
175
+ else:
176
+ if "version" not in self.exllama_config:
177
+ raise ValueError("`exllama_config` needs to have a `version` key")
178
+ elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
179
+ version = self.exllama_config["version"]
180
+ raise ValueError(
181
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
182
+ )
183
+ self.exllama_version = self.exllama_config["version"]
184
+
185
+ def to_dict(self):
186
+ """
187
+ Returns the args in dict format.
188
+ """
189
+ gptq_dict = {}
190
+ for key in self.serialization_keys:
191
+ gptq_dict[key] = getattr(self, key)
192
+ return gptq_dict
193
+
194
+ @classmethod
195
+ def from_dict(cls, config_dict: Dict[str, Any]):
196
+ """
197
+ Instantiates a `GPTQQuantizer` using config_dict as kwargs
198
+
199
+ Args:
200
+ config_dict (`Dict[str,Any]`):
201
+ quantization config
202
+
203
+ Returns:
204
+ `GPTQQuantizer`: The quantizer object instantiated from those parameters.
205
+ """
206
+ return cls(**config_dict)
207
+
208
+ def convert_model(self, model: nn.Module):
209
+ """
210
+ Convert the model to a GPTQ model by getting and replacing the layers.
211
+
212
+ Args:
213
+ model (`nn.Module`):
214
+ Model to be converted
215
+
216
+ """
217
+ if self.block_name_to_quantize is None:
218
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
219
+ block_name = self.block_name_to_quantize
220
+ layers_to_be_replaced = get_layers(model, prefix=block_name)
221
+ if self.modules_in_block_to_quantize is not None:
222
+ layers_to_keep = sum(self.modules_in_block_to_quantize, [])
223
+ for name in list(layers_to_be_replaced.keys()):
224
+ if not any(name.endswith(layer) for layer in layers_to_keep):
225
+ logger.info(
226
+ f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
227
+ )
228
+ del layers_to_be_replaced[name]
229
+ self._replace_by_quant_layers(model, layers_to_be_replaced)
230
+ return model
231
+
232
+ def get_no_split_module_classes(self, model):
233
+ """
234
+ Get the modules that should not be split across multiple devices.
235
+ Args:
236
+ model (`nn.Module`):
237
+ The input model
238
+ """
239
+
240
+ block_class_name = recurse_getattr(model, self.block_name_to_quantize)[0].__class__.__name__
241
+ no_split_module_classes = [block_class_name]
242
+ return no_split_module_classes
243
+
244
+ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: str = ""):
245
+ """
246
+ Replaces linear layers in `module` by `QuantLinear`
247
+
248
+ Args:
249
+ module (`nn.Module`):
250
+ Module to quantize
251
+ names (`List[str]`):
252
+ List of names of the module to quantize
253
+ name (`str`, defaults to `""`):
254
+ To keep track of the name of the current module
255
+ """
256
+ QuantLinear = dynamically_import_QuantLinear(
257
+ use_triton=False,
258
+ desc_act=self.desc_act,
259
+ group_size=self.group_size,
260
+ bits=self.bits,
261
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
262
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
263
+ )
264
+ if isinstance(module, QuantLinear):
265
+ return
266
+ for attr in dir(module):
267
+ layer = getattr(module, attr)
268
+ name1 = name + "." + attr if name != "" else attr
269
+ if name1 in names:
270
+ device = get_device(layer)
271
+ delattr(module, attr)
272
+ if isinstance(layer, nn.Linear):
273
+ in_features = layer.in_features
274
+ out_features = layer.out_features
275
+ elif isinstance(layer, nn.Conv2d):
276
+ in_features = layer.in_channels
277
+ out_features = layer.out_channels
278
+ elif isinstance(layer, Conv1D):
279
+ in_features = layer.weight.shape[0]
280
+ out_features = layer.weight.shape[1]
281
+ bias = layer.bias is not None
282
+ if not (self.desc_act) or self.group_size == -1:
283
+ new_layer = QuantLinear(
284
+ self.bits,
285
+ self.group_size,
286
+ in_features,
287
+ out_features,
288
+ bias,
289
+ use_cuda_fp16=self.use_cuda_fp16,
290
+ weight_dtype=layer.weight.dtype,
291
+ )
292
+ else:
293
+ new_layer = QuantLinear(
294
+ self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
295
+ )
296
+ new_layer.device = device
297
+ setattr(module, attr, new_layer.to(device))
298
+ for name1, child in module.named_children():
299
+ self._replace_by_quant_layers(child, names, name + "." + name1 if name != "" else name1)
300
+
301
+ @torch.no_grad()
302
+ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
303
+ """
304
+ Quantizes the model using the dataset
305
+
306
+ Args:
307
+ model (`nn.Module`):
308
+ The model to quantize
309
+ tokenizer (Optional[`Any`], defaults to `None`):
310
+ The tokenizer to use in order to prepare the dataset. You can pass either:
311
+ - A custom tokenizer object.
312
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
313
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
314
+ user or organization name, like `dbmdz/bert-base-german-cased`.
315
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
316
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
317
+ Returns:
318
+ `nn.Module`: The quantized model
319
+ """
320
+
321
+ if not is_auto_gptq_available():
322
+ raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
323
+ if not torch.cuda.is_available():
324
+ raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
325
+
326
+ model.eval()
327
+
328
+ # For Transformer model
329
+ has_config = False
330
+ has_device_map = False
331
+ if hasattr(model, "config"):
332
+ has_config = True
333
+ use_cache = model.config.use_cache
334
+ model.config.use_cache = False
335
+
336
+ # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
337
+ if hasattr(model, "hf_device_map"):
338
+ devices = list(model.hf_device_map.values())
339
+ has_device_map = True
340
+ if "disk" in devices:
341
+ raise ValueError("disk offload is not supported with GPTQ quantization")
342
+ if "cpu" in devices or torch.device("cpu") in devices:
343
+ if len(model.hf_device_map) > 1:
344
+ logger.info("Cpu offload is not recommended. There might be some issues with the memory")
345
+ hook = None
346
+ for name, device in model.hf_device_map.items():
347
+ if device == "cpu":
348
+ module = recurse_getattr(model, name)
349
+ remove_hook_from_module(module, recurse=True)
350
+ module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
351
+ else:
352
+ has_device_map = False
353
+
354
+ if hasattr(model, "dtype"):
355
+ self.use_cuda_fp16 = model.dtype == torch.float16
356
+
357
+ if self.model_seqlen is None:
358
+ # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
359
+ self.model_seqlen = min(4028, get_seqlen(model))
360
+
361
+ device = get_device(model)
362
+
363
+ # Step 1: Prepare the data
364
+ if isinstance(self.dataset, list) and not isinstance(self.dataset[0], str):
365
+ dataset = self.dataset
366
+ logger.info("GPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.")
367
+ else:
368
+ if isinstance(tokenizer, str):
369
+ try:
370
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
371
+ except Exception:
372
+ raise ValueError(
373
+ f"""We were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
374
+ with the string that you have passed {tokenizer}. If you have a custom tokenizer, you can pass it as input.
375
+ For now, we only support quantization for text model. Support for vision, speech and multimodel will come later."""
376
+ )
377
+ if self.dataset is None:
378
+ raise ValueError("You need to pass `dataset` in order to quantize your model")
379
+ elif isinstance(self.dataset, str):
380
+ dataset = get_dataset(self.dataset, tokenizer, seqlen=self.model_seqlen, split="train")
381
+ elif isinstance(self.dataset, list):
382
+ dataset = [tokenizer(data, return_tensors="pt") for data in self.dataset]
383
+ else:
384
+ raise ValueError(
385
+ f"You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: {type(self.dataset)}."
386
+ )
387
+
388
+ dataset = prepare_dataset(dataset, pad_token_id=self.pad_token_id, batch_size=self.batch_size)
389
+
390
+ # Step 2: get the input of the 1st block
391
+ # To do that, we need to put the modules preceding the first block on the same device as the first bloc.
392
+ # Then we run the model and it will stop at the first bloc as we added a prehook that raise an Exception after storing the inputs.
393
+
394
+ layer_inputs = []
395
+ layer_outputs = []
396
+ layer_input_kwargs = []
397
+
398
+ if self.block_name_to_quantize is None:
399
+ self.block_name_to_quantize = get_block_name_with_pattern(model)
400
+
401
+ if self.module_name_preceding_first_block is None:
402
+ self.module_name_preceding_first_block = get_preceding_modules(model, self.block_name_to_quantize)
403
+
404
+ blocks = recurse_getattr(model, self.block_name_to_quantize)
405
+
406
+ if not has_device_map:
407
+ # put modules from module_name_preceding_first_block on cuda
408
+ for module_name in self.module_name_preceding_first_block:
409
+ module = recurse_getattr(model, module_name)
410
+ if module is None:
411
+ raise ValueError(f"Module {module_name} was not found in model")
412
+ module = module.to(0)
413
+ blocks[0] = blocks[0].to(0)
414
+
415
+ def store_input_hook(_, input, *args):
416
+ kwargs = args[0]
417
+ if input is None:
418
+ if "hidden_states" in kwargs:
419
+ input = (kwargs["hidden_states"],)
420
+ else:
421
+ raise ValueError("No input value found in the foward pass")
422
+ layer_inputs.append(input)
423
+ other_kwargs = {}
424
+ for k, v in kwargs.items(): # make sure other arguments also be captured
425
+ if k not in ["hidden_states"]:
426
+ other_kwargs[k] = v
427
+ layer_input_kwargs.append(other_kwargs)
428
+ raise ValueError
429
+
430
+ if self.cache_block_outputs:
431
+ handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
432
+ for data in dataset:
433
+ for k, v in data.items():
434
+ # put the data on gpu, we won't put them back to cpu
435
+ data[k] = v.to(0)
436
+ try:
437
+ model(**data)
438
+ except ValueError:
439
+ pass
440
+ handle.remove()
441
+
442
+ if not has_device_map:
443
+ blocks[0].to(device)
444
+ for module_name in self.module_name_preceding_first_block:
445
+ module = recurse_getattr(model, module_name)
446
+ if module is None:
447
+ raise ValueError(f"Module {module_name} was not found in model")
448
+
449
+ torch.cuda.empty_cache()
450
+
451
+ # Step 3: Quantize the blocks
452
+ quantizers = {}
453
+ for i, block in enumerate(tqdm(blocks, desc=f"Quantizing {self.block_name_to_quantize} blocks ")):
454
+ logger.info(f"Start quantizing block {self.block_name_to_quantize} {i + 1}/{len(blocks)}")
455
+
456
+ if not self.cache_block_outputs:
457
+ handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True)
458
+ for data in dataset:
459
+ for k, v in data.items():
460
+ # put the data on gpu, we won't put them back to cpu
461
+ data[k] = v.to(0)
462
+ try:
463
+ model(**data)
464
+ except ValueError:
465
+ pass
466
+ handle.remove()
467
+
468
+ # move block to cuda if needed
469
+ # in case we have offload modules, we need to put them on cuda because of GPTQ object
470
+ if not has_device_map or get_device(block) == torch.device("cpu"):
471
+ block = block.to(0)
472
+ layers = get_layers(block)
473
+ if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
474
+ if self.true_sequential:
475
+ layers_name_list = self.modules_in_block_to_quantize
476
+ else:
477
+ layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
478
+ else:
479
+ if self.true_sequential:
480
+ # lazy sequential but works well
481
+ layers_name_list = [[key] for key in layers.keys()]
482
+ else:
483
+ layers_name_list = [list(layers.keys())]
484
+ logger.info(f"Module to quantize {layers_name_list}")
485
+ for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
486
+ subset_layers = {name: layers[name] for name in subset_name_list}
487
+ gptq = {}
488
+ handles = []
489
+ # add hook for each layer in subset_layers
490
+ for name in subset_layers:
491
+ gptq[name] = GPTQ(subset_layers[name])
492
+ gptq[name].quantizer.configure(bits=self.bits, sym=self.sym, perchannel=True)
493
+
494
+ def add_batch(name):
495
+ def tmp(_, input, output):
496
+ gptq[name].add_batch(input[0].data, output.data)
497
+
498
+ return tmp
499
+
500
+ # because it adding a hook will replace the old one.
501
+ handles.append(subset_layers[name].register_forward_hook(add_batch(name)))
502
+ # update Hessian for each layer in subset_layers thanks to the hook
503
+ for j in range(len(dataset)):
504
+ # the args are already on the gpu
505
+ # don't need to store the output
506
+ block(*layer_inputs[j], **layer_input_kwargs[j])
507
+ # remove hook
508
+ for h in handles:
509
+ h.remove()
510
+ for name in subset_name_list:
511
+ logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
512
+ scale, zero, g_idx = gptq[name].fasterquant(
513
+ percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
514
+ )
515
+ quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = (
516
+ gptq[name].quantizer,
517
+ scale,
518
+ zero,
519
+ g_idx,
520
+ )
521
+ gptq[name].free()
522
+ del subset_layers
523
+ # we get the new output from the partial quantized block
524
+ if self.cache_block_outputs:
525
+ for j in range(len(dataset)):
526
+ layer_output = block(*layer_inputs[j], **layer_input_kwargs[j])
527
+ layer_outputs.append(layer_output)
528
+
529
+ # put back to device
530
+ if not has_device_map:
531
+ blocks[i] = block.to(device)
532
+ del layers
533
+ del layer_inputs
534
+ layer_inputs, layer_outputs = layer_outputs, []
535
+ else:
536
+ del layers
537
+ del layer_inputs
538
+ layer_inputs = []
539
+ torch.cuda.empty_cache()
540
+
541
+ if self.bits == 4:
542
+ # device not on gpu
543
+ if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
544
+ if not self.disable_exllama:
545
+ logger.warning(
546
+ "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
547
+ )
548
+ self.disable_exllama = True
549
+ # act order and exllama
550
+ elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE:
551
+ logger.warning(
552
+ "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights."
553
+ "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
554
+ )
555
+ self.disable_exllama = True
556
+ elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO:
557
+ logger.warning(
558
+ "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
559
+ "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. "
560
+ )
561
+ self.disable_exllama = True
562
+ # Step 4: Pack the model at the end (Replacing the layers)
563
+ self.pack_model(model=model, quantizers=quantizers)
564
+
565
+ model.is_quantized = True
566
+ model.quantization_method = QuantizationMethod.GPTQ
567
+ if has_config:
568
+ model.config.use_cache = use_cache
569
+ model.config.quantization_config = self.to_dict()
570
+
571
+ # Step 5: Any post-initialization that require device information, for example buffers initialization on device.
572
+ model = self.post_init_model(model)
573
+
574
+ torch.cuda.empty_cache()
575
+ return model
576
+
577
+ def post_init_model(self, model):
578
+ """
579
+ Post-initialization that require device information, for example buffers initialization on device.
580
+
581
+ Args:
582
+ model (`nn.Module`):
583
+ The input model
584
+ """
585
+ if self.bits == 4 and not self.disable_exllama:
586
+ if get_device(model) == torch.device("cpu") or (
587
+ hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
588
+ ):
589
+ raise ValueError(
590
+ "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
591
+ "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
592
+ )
593
+
594
+ class StoreAttr(object):
595
+ pass
596
+
597
+ model.quantize_config = StoreAttr()
598
+ model.quantize_config.desc_act = self.desc_act
599
+ model = autogptq_post_init(model, use_act_order=self.desc_act)
600
+ if (
601
+ self.desc_act
602
+ and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
603
+ and self.max_input_length is not None
604
+ ):
605
+ model = exllama_set_max_input_length(model, self.max_input_length)
606
+ return model
607
+
608
+ def pack_model(
609
+ self,
610
+ model: nn.Module,
611
+ quantizers: Dict[str, Tuple],
612
+ ):
613
+ """
614
+ Pack the model by replacing the layers by quantized layers
615
+
616
+ Args:
617
+ model (`nn.Module`):
618
+ The model to pack
619
+ quantizers (`Dict[str,Tuple]`):
620
+ A mapping of the layer name and the data needed to pack the layer
621
+ """
622
+ QuantLinear = dynamically_import_QuantLinear(
623
+ use_triton=False,
624
+ desc_act=self.desc_act,
625
+ group_size=self.group_size,
626
+ bits=self.bits,
627
+ disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE,
628
+ disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO,
629
+ )
630
+ logger.info("Packing model...")
631
+ layers = get_layers(model)
632
+ layers = {n: layers[n] for n in quantizers}
633
+ self._replace_by_quant_layers(model, quantizers)
634
+ qlayers = get_layers(model, [QuantLinear])
635
+ for name in qlayers:
636
+ logger.info(name)
637
+ quantizers[name], scale, zero, g_idx = quantizers[name]
638
+ # so far can only pack layer on CPU
639
+ layer_device = qlayers[name].device
640
+ qlayers[name].to("cpu")
641
+ layers[name], scale, zero, g_idx = layers[name].to("cpu"), scale.to("cpu"), zero.to("cpu"), g_idx.to("cpu")
642
+ qlayers[name].pack(layers[name], scale, zero, g_idx)
643
+ qlayers[name].to(layer_device)
644
+
645
+ logger.info("Model packed.")
646
+
647
+ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
648
+ """
649
+ Save model state dict and configs
650
+
651
+ Args:
652
+ model (`nn.Module`):
653
+ Model to be saved. The model can be wrapped or unwraped.
654
+ save_dir (`str`):
655
+ Directory to which to save. Will be created if it doesn't exist.
656
+ max_shard_size (`str`, defaults to `"10GB"`):
657
+ The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
658
+ lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
659
+ <Tip warning={true}>
660
+
661
+ If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
662
+ which will be bigger than `max_shard_size`.
663
+
664
+ </Tip>
665
+ safe_serialization (`bool`, defaults to `True`):
666
+ Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
667
+
668
+ """
669
+ os.makedirs(save_dir, exist_ok=True)
670
+ model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
671
+ with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f:
672
+ json.dump(self.to_dict(), f, indent=2)
673
+
674
+
675
+ def load_quantized_model(
676
+ model: nn.Module,
677
+ save_folder: str,
678
+ quant_config_name: str = GPTQ_CONFIG,
679
+ state_dict_name: Optional[str] = None,
680
+ device_map: Optional[str] = None,
681
+ max_memory: Optional[Dict] = None,
682
+ no_split_module_classes: Optional[Dict] = None,
683
+ offload_folder: Optional[str] = None,
684
+ offload_buffers: Optional[str] = None,
685
+ offload_state_dict: bool = False,
686
+ disable_exllama: bool = False,
687
+ exllama_config: Optional[Dict[str, Any]] = None,
688
+ max_input_length: Optional[int] = None,
689
+ ):
690
+ """
691
+ Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
692
+
693
+ Args:
694
+ model (`nn.Module`):
695
+ The model can be enpty or not.
696
+ save_folder (`str`):
697
+ Directory to which to load the weights.
698
+ quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
699
+ Name of the quantization config file
700
+ state_dict_name (`Optional[str]`, defaults to `None`):
701
+ Name of the state dict file
702
+ device_map (`Optional[str]`, defaults to `None`):
703
+ A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
704
+ name, once a given module name is inside, every submodule of it will be sent to the same device.
705
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
706
+ max_memory (`Optional[Dict]`, defaults to `None`):
707
+ A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
708
+ and the available CPU RAM if unset.
709
+ no_split_module_classes (`Optional[Dict]`, defaults to `None`):
710
+ A list of layer class names that should never be split across device (for instance any layer that has a
711
+ residual connection).
712
+ offload_folder (`Optional[str]`, defaults to `None`):
713
+ If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
714
+ offload_buffers (`Optional[str]`, defaults to `None`):
715
+ In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
716
+ well as the parameters.
717
+ offload_state_dict (`bool`, defaults to `False`):
718
+ If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
719
+ the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
720
+ picked contains `"disk"` values.
721
+ disable_exllama (`Optional[bool]`, defaults to `None`):
722
+ Whether to use exllama backend. Only works with `bits` = 4.
723
+ exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
724
+ The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
725
+ max_input_length (`Optional[int]`, defaults to `None`):
726
+ The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
727
+ It is specific to the exllama backend with act-order.
728
+
729
+ Returns:
730
+ `nn.Module`: The quantized model
731
+ """
732
+ if not torch.cuda.is_available():
733
+ raise RuntimeError("No GPU found. A GPU is needed to run quantized model.")
734
+ if not is_auto_gptq_available():
735
+ raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`")
736
+ if not is_accelerate_available():
737
+ raise RuntimeError(
738
+ "You need to install accelerate in order to load and dispatch weights to"
739
+ "a quantized model. You can do it with `pip install accelerate`"
740
+ )
741
+ if device_map is None:
742
+ device_map = {"": torch.cuda.current_device()}
743
+ logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")
744
+
745
+ if exllama_config is None:
746
+ exllama_config = {"version": ExllamaVersion.TWO}
747
+ else:
748
+ if "version" not in exllama_config:
749
+ raise ValueError("`exllama_config` needs to have a `version` key")
750
+ elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
751
+ version = exllama_config["version"]
752
+ raise ValueError(
753
+ f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
754
+ )
755
+
756
+ # this branch will check if model is from huggingface
757
+ try:
758
+ if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
759
+ quantize_config_dict = model.config.quantization_config.to_dict()
760
+ else:
761
+ with open(os.path.join(save_folder, quant_config_name), "r", encoding="utf-8") as f:
762
+ quantize_config_dict = json.load(f)
763
+ except Exception as err:
764
+ raise ValueError(
765
+ f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
766
+ ) from err
767
+ quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
768
+ quantizer.disable_exllama = disable_exllama
769
+ quantizer.exllama_config = exllama_config
770
+ quantizer.exllama_version = quantizer.exllama_config["version"]
771
+ quantizer.max_input_length = max_input_length
772
+
773
+ model = quantizer.convert_model(model)
774
+
775
+ if no_split_module_classes is None:
776
+ no_split_module_classes = quantizer.get_no_split_module_classes(model)
777
+
778
+ model = load_checkpoint_and_dispatch(
779
+ model,
780
+ checkpoint=os.path.join(save_folder, state_dict_name) if state_dict_name is not None else save_folder,
781
+ device_map=device_map,
782
+ max_memory=max_memory,
783
+ no_split_module_classes=no_split_module_classes,
784
+ offload_folder=offload_folder,
785
+ offload_buffers=offload_buffers,
786
+ offload_state_dict=offload_state_dict,
787
+ )
788
+
789
+ model = quantizer.post_init_model(model)
790
+ model.is_quantized = True
791
+ model.quantization_method = QuantizationMethod.GPTQ
792
+ model.eval()
793
+ return model
internal/donttouch_unpacking_autogptq/readme.md ADDED
@@ -0,0 +1,12 @@
1
+ use autogpt_sample.py to dump opt-125m-gptq4.pth
2
+ but before that, a few files need to be hacked:
3
+
4
+ patch the files below according to the delta
5
+ /data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/optimum/gptq/quantizer.py
6
+ /data/vchua/miniconda3/envs/240531-hgx1-hf-clm/lib/python3.11/site-packages/auto_gptq/nn_modules/qlinear/
7
+
8
+ then use blob_manipulate.py
9
+
10
+ verify_unpacking_logic.py
11
+
12
+ fake_dequantize.py
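+
+ rough order of commands (a sketch; assumes the patched environment above is active):
+
+     python autogpt_sample.py          # dumps ./opt-125m-gptq4.pth via the hacked pack_model()
+     python blob_manipulate.py         # unpacks and inspects one layer's prepack/pack tensors
+     python verify_unpacking_logic.py  # checks unpacking and dequantization against prepack['w']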
internal/donttouch_unpacking_autogptq/run_sqft.py ADDED
@@ -0,0 +1,101 @@
1
+ import transformers
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
6
+ from fake_dequantize import fake_dequantize
7
+ from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear
8
+
9
+ DEBUG=False
10
+
11
+ class SparseCompressLinear(nn.Linear):
12
+ def __init__(self, in_features, out_features, bias=True, verbose=DEBUG):
13
+ super(SparseCompressLinear, self).__init__(in_features, out_features, bias)
14
+ self.verbose = verbose # for debug
15
+
16
+ def forward(self, input):
17
+ if self.verbose is True:
18
+ print("SparseCompressLinear Forward!")
19
+ return super(SparseCompressLinear, self).forward(input)
20
+
21
+ def __repr__(self):
22
+ # Custom print out
23
+ return f"SparseCompressLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None})"
24
+
25
+
26
+ def make_linear_from_QuantLinear(QuantLinearObj):
27
+ device = QuantLinearObj.scales.device
28
+
29
+ qweight = QuantLinearObj.qweight
30
+ scales = QuantLinearObj.scales
31
+ qzeros = QuantLinearObj.qzeros
32
+
33
+ with torch.no_grad():
34
+ W, scales, zeros = fake_dequantize(qweight, scales, qzeros)
35
+ IC, OC = W.shape
36
+
37
+ linear = SparseCompressLinear(in_features=IC, out_features=OC, bias=(QuantLinearObj.bias is not None))
38
+
39
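+ # nn.Linear stores weight as (out_features, in_features) while W comes back as (IC, OC), hence the transpose below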
+ assert linear.weight.shape == W.t().shape, "Logical Error"
40
+ linear.weight.data = W.t().contiguous()
41
+
42
+ if QuantLinearObj.bias is not None:
43
+ linear.bias.data = QuantLinearObj.bias
44
+
45
+ linear.register_buffer("scales", scales)
46
+ linear.register_buffer("zeros", zeros)
47
+
48
+ return linear.to(device)
49
+
50
+
51
+ def replace_QuantLinear_with_SparseCompressLinear(model):
52
+ for name, module in model.named_children():
53
+ if isinstance(module, QuantLinear):
54
+ if DEBUG is True:
55
+ print(f"Restoring {name}")
56
+ restored_linear = make_linear_from_QuantLinear(module)
57
+ restored_linear = restored_linear.to(torch.float16) #TODO: Hardcoding
58
+ setattr(model, name, restored_linear)
59
+ else:
60
+ # Recursively apply to child modules
61
+ replace_QuantLinear_with_SparseCompressLinear(module)
62
+ return model
63
+
64
+
65
+ if __name__ == "__main__":
66
+
67
+ # model_id = "/data4/vchua/hf-model/Meta-Llama-3-8B-Instruct"
68
+ # model_id = "/data4/vchua/hf-model/Meta-Llama-3-70B"
69
+
70
+ model_id = "/home/vchua/sqft-qa-sparsepeft-llama-3-8b-50-gptq-gsm8k"
71
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")
72
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
73
+
74
+ prompt = "Alan Turing theorized that computers would one day become"
75
+ input_ids = tokenizer([prompt]).input_ids
76
+ input_ids = torch.as_tensor(input_ids)
77
+
78
+ # -----------------------------------------
79
+ output_ids = model.generate(
80
+ input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
81
+ )
82
+
83
+ output_sqft = tokenizer.batch_decode(output_ids.cpu())
84
+ print(f"\n++ Baseline sqft output:\n\n{output_sqft[0]}\n\n")
85
+
86
+ # -----------------------------------------
87
+ replace_QuantLinear_with_SparseCompressLinear(model)
88
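+ # sanity check: after swapping QuantLinear for fp16 Linear built from fake-dequantized weights, generation should match the baseline above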
+ output_ids = model.generate(
89
+ input_ids.cuda(), do_sample=False, top_p=None, num_beams=1, max_new_tokens=256
90
+ )
91
+
92
+ output_fake_dequantize = tokenizer.batch_decode(output_ids.cpu())
93
+ print(f"\n++ fake dequantize sqft output:\n\n{output_fake_dequantize[0]}\n\n")
94
+
95
+ tx1mlp = model.model.layers[0].mlp
96
+ torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
97
+ # -----------------------------------------
98
+ print()
99
+
100
+
101
+ # torch.save(tx1mlp.state_dict(), "./sqft_llama3_8B_gptq_tx1_mlp.pth")
internal/donttouch_unpacking_autogptq/verify_unpacking_logic.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ blob = torch.load("./opt-125m-gptq4.pth")
5
+
6
+ def verify_unpack_logic(prepack, pack, nbit=4):
7
+ numel_per_int32 = 32//nbit
8
+
9
+ qweight = pack['qweight'].numpy()
10
+ scales = pack['scales'].numpy() #(ngroup, OC)
11
+ qzeros = pack['qzeros'].numpy() #(ngroup, OC//numel_per_int32)
12
+
13
+ IC = qweight.shape[0]*numel_per_int32
14
+ OC = qweight.shape[1]
15
+ group_size = IC//scales.shape[0]
16
+
17
+ qweight_unpack = np.zeros((IC,OC), dtype=np.float32)
18
+ for row in range(0, qweight.shape[0]):
19
+ for k in range(0, numel_per_int32):
20
+ qweight_unpack[row*numel_per_int32+k, :] = ((qweight[row] >> k*nbit) & 0xF).astype(np.float32) # read as int32 and cast to float32
21
+
22
+ intweight_match = torch.allclose(
23
+ torch.from_numpy(qweight_unpack).to(torch.int32),
24
+ torch.from_numpy(pack['intweight'].astype(np.int32))
25
+ )
26
+
27
+ assert intweight_match, "intweight and qweight_unpack do not match! pls debug"
28
+
29
+ scales_float = scales.astype(np.float32)
30
+
31
+ # TODO: verify with asym zero point. sym zero points are all identical
32
+ qzeros_unpack = np.zeros(list(scales.shape), dtype=np.float32)
33
+ for i in range(0, numel_per_int32):
34
+ # shift multiplier
35
+ shift_multiplier = numel_per_int32 - 1 - i
36
+ shift_by = shift_multiplier * nbit
37
+ qzeros_unpack[:, i::numel_per_int32] = ((qzeros >> shift_by) & 0xF).astype(np.float32) # read as int32 and cast to float32
38
+ qzeros_unpack += 1 # AutoGPTQ stores (zero - 1) in qzeros, so add 1 back to recover the zero point
39
+
40
+ qweight_unpack = torch.from_numpy(qweight_unpack).to('cuda').to(torch.float16)
41
+ qzeros_unpack = torch.from_numpy(qzeros_unpack).to('cuda').to(torch.float16)
42
+ scales_float = torch.from_numpy(scales_float).to('cuda').to(torch.float16)
43
+
44
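+ # dequantize row by row: w[i] = (q[i] - zero[g]) * scale[g], where g = i // group_size is the group of row i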
+ deqweight_unpack = torch.zeros((IC,OC), dtype=torch.float16)
45
+ for i in range(IC):
46
+ gid = i//group_size
47
+ deqweight_unpack[i, :] = (qweight_unpack[i, :]-qzeros_unpack[gid, :]) * scales_float[gid, :]
48
+
49
+ simulated_match = torch.allclose(deqweight_unpack, prepack['w'].t(), atol=0.0005)
50
+
51
+ assert simulated_match, "prepack['w'] and deqweight_unpack do not match! pls debug"
52
+
53
+ print(f"intweight_match: {intweight_match}, simulated_match: {simulated_match}")
54
+
55
+
56
+ for layer, lblob in blob.items():
57
+ print(f"\n\n--> {layer}")
58
+ prepack = lblob['prepack']
59
+ pack = lblob['pack']
60
+
61
+ # for k, v in prepack.items():
62
+ # print(f"prepack['{k:10}'] : {str(tuple(v.shape)):<20}")
63
+
64
+ # for k, v in pack.items():
65
+ # print(f"pack['{k:13}'] : {str(tuple(v.shape)):<20}")
66
+
67
+ verify_unpack_logic(prepack, pack)
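Note: the row-by-row loop in `verify_unpack_logic` is easy to read but slow for a large layer. A vectorized NumPy equivalent of the same 4-bit `qweight` unpacking is sketched below for illustration only; it is not one of the repo scripts, and `unpack_int4_qweight` is a hypothetical helper name.

```python
import numpy as np

def unpack_int4_qweight(qweight: np.ndarray, nbit: int = 4) -> np.ndarray:
    """Vectorized form of the per-row loop: (IC//8, OC) int32 -> (IC, OC) float32 codes."""
    numel_per_int32 = 32 // nbit
    shifts = np.arange(numel_per_int32, dtype=np.int32) * nbit  # [0, 4, 8, ..., 28]
    # broadcast to (rows, 8, OC): entry [r, k, :] equals (qweight[r] >> k*nbit) & 0xF
    nibbles = (qweight[:, None, :] >> shifts[None, :, None]) & 0xF
    return nibbles.reshape(-1, qweight.shape[1]).astype(np.float32)
```

It should reproduce `qweight_unpack` above, e.g. `np.array_equal(unpack_int4_qweight(pack['qweight'].numpy()), qweight_unpack)`.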
internal/pack_sparse_linear.py ADDED
@@ -0,0 +1,251 @@
1
+ import torch
2
+ import numpy as np
3
+ import os
4
+
5
+ def calc_sparsity(tensor):
6
+ if isinstance(tensor, torch.Tensor):
7
+ nnz = tensor.count_nonzero()
8
+ rate = 1-(nnz/tensor.numel())
9
+ return rate.item(), nnz
10
+ else:
11
+ nnz = np.count_nonzero(tensor)
12
+ rate = 1-(nnz/tensor.size)
13
+ return rate, nnz
14
+
15
+ if __name__ == "__main__":
16
+ sd = torch.load("./sqft_llama3_8B_gptq_tx1_mlp.pth")
17
+
18
+ for k,v in sd.items():
19
+ print(k)
20
+
21
+ weight = sd['up_proj.weight'] # OC x IC
22
+ scales = sd['up_proj.scales'] # n_group x OC
23
+ zeros = sd['up_proj.zeros'] # n_group x OC
24
+
25
+ nbit=4
26
+ OC, IC = weight.shape
27
+ numel_per_int32 = 32//nbit
28
+ # 16x128B tile: 16 output channels by 128 bytes (128*8/nbit = 256 int4 values) per row
29
+ stride_oc = 16
30
+ stride_ic = 128 * 8 // nbit
31
+
32
+ # always make contiguous!
33
+ weight = weight.contiguous() # OC x IC
34
+ scales = scales.t().contiguous() # OC x n_group
35
+ zeros = zeros.t().contiguous() # OC x n_group
36
+
37
+ # TODO: hardcoded/temporary. Livia requires a group size of 32 but our model uses 128, so we repeat each scale/zero value 4x
38
+ group_size = 32
39
+ scales = scales.repeat_interleave(4, dim=1)
40
+ zeros = zeros.repeat_interleave(4, dim=1)
41
+
42
+ # Tile weight into target block size
43
+ tiled_weight = weight.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic, stride_ic)
44
+ tiled_scales = scales.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
45
+ tiled_zeros = zeros.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
46
+
47
+ assert tiled_weight.shape[:2] == tiled_scales.shape[:2], "pls debug"
48
+ assert tiled_weight.shape[:2] == tiled_zeros.shape[:2], "pls debug"
49
+
50
+ tiled_qweight = torch.zeros_like(tiled_weight)
51
+ tiled_bitmap = torch.zeros_like(tiled_weight).to(torch.bool)
52
+ tiled_nnz = torch.zeros(tiled_weight.shape[:2]).to(torch.int16)
53
+
54
+     non_zero_removed_tiled_qweight = torch.zeros_like(tiled_weight) # quantized tiles before zero removal, kept for later verification
55
+ for tile_r in range(0, tiled_weight.shape[0]):
56
+ for tile_c in range(0, tiled_weight.shape[1]):
57
+
58
+ # metadata: number of non-zero elements (nnz)
59
+ sparsity, nnz = calc_sparsity(tiled_weight[tile_r, tile_c])
60
+ print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")
61
+
62
+ # metadata: generate bitmask
63
+ nonzero_bool = (tiled_weight[tile_r, tile_c] != 0)
64
+ assert nonzero_bool.sum() == nnz, "pls debug"
65
+ tiled_bitmap[tile_r, tile_c] = nonzero_bool
66
+ tiled_nnz[tile_r, tile_c] = nnz
67
+
68
+ r = tile_r
69
+ c = tile_c
70
+
71
+ # get quantize val
72
+ w = tiled_weight[r, c]
73
+ qw = torch.zeros_like(tiled_weight[r, c])
74
+ s = tiled_scales[r, c]
75
+ z = tiled_zeros[r, c]
76
+
77
+ # for every column of groups
78
+ for col in range(tiled_scales.shape[-1]):
79
+ sidx = col*group_size
80
+ eidx = (col+1)*group_size
81
+
82
+                 # unsqueeze is needed to broadcast the per-group scale/zero as a column vector
83
+ qw[:, sidx:eidx] = ( w[:, sidx:eidx] + (s[:,col]*z[:,col]).unsqueeze(-1) ) / s[:,col].unsqueeze(-1)
84
+
85
+ #for debug
86
+ non_zero_removed_tiled_qweight[r, c]=qw
87
+
88
+ # Zero Removal and pad to tile length (per Livia's request)
89
+ assert len(qw[nonzero_bool]) == nnz, "pls debug"
90
+             compress_qw = (torch.ones_like(qw)*8).reshape(-1) # the int4 code 8 dequantizes to 0.0 (symmetric W4), so filling with 8 gives the padding effect
91
+ compress_qw[:nnz] = qw[nonzero_bool]
92
+ assert (compress_qw != 8).sum() == nnz, "pls debug"
93
+ compress_qw = compress_qw.reshape(qw.shape)
94
+
95
+ tiled_qweight[r, c] = compress_qw
96
+ # nnz
97
+ # scale
98
+ # zeros
99
+
100
+ tiled_qweight = tiled_qweight.to(torch.int32).contiguous()
101
+ tiled_zeros = tiled_zeros.to(torch.int32).contiguous()
102
+ tiled_scales = tiled_scales.to(torch.float16).contiguous()
103
+ tiled_bitmap = tiled_bitmap.to(torch.int32).contiguous()
104
+ tiled_nnz = tiled_nnz.to(torch.int16).contiguous()
105
+
106
+
107
+ linear_nnz = tiled_nnz
108
+ linear_scales = tiled_scales.reshape(-1)
109
+
110
+ linear_qweight = tiled_qweight.reshape(-1).reshape(-1, 8).cpu().numpy()
111
+ linear_qweight_pack = np.zeros((linear_qweight.shape[0], 1), dtype=np.int32)
112
+ for i in range(0, numel_per_int32):
113
+ linear_qweight_pack[:, 0] |= linear_qweight[:, i] << (numel_per_int32 - 1 - i)*nbit
114
+ linear_qweight_pack = linear_qweight_pack.reshape(-1)
115
+
116
+ linear_zeros = tiled_zeros.reshape(-1).reshape(-1, 8).cpu().numpy()
117
+ linear_zeros_pack = np.zeros((linear_zeros.shape[0], 1), dtype=np.int32)
118
+ for i in range(0, numel_per_int32):
119
+ linear_zeros_pack[:, 0] |= linear_zeros[:, i] << (numel_per_int32 - 1 - i)*nbit
120
+ linear_zeros_pack = linear_zeros_pack.reshape(-1)
121
+
122
+     linear_bitmap = tiled_bitmap.reshape(-1).reshape(-1, 32).cpu().numpy() # 32 bitmap bits are packed into each int32 word
123
+ linear_bitmap_pack = np.zeros((linear_bitmap.shape[0], 1), dtype=np.int32)
124
+ for i in range(0, 32):
125
+ linear_bitmap_pack[:, 0] |= linear_bitmap[:, i] << (32 - 1 - i)
126
+ linear_bitmap_pack = linear_bitmap_pack.reshape(-1)
127
+
128
+ os.makedirs("sparse_w4", exist_ok=True)
129
+ linear_qweight_pack.tofile('sparse_w4/linear_compressed_qweight_int32.bin')
130
+ linear_zeros_pack.tofile('sparse_w4/linear_zeros_int32.bin')
131
+ linear_scales.cpu().contiguous().numpy().tofile('sparse_w4/linear_scales_float16.bin')
132
+ linear_bitmap_pack.tofile('sparse_w4/linear_bitmap_int32.bin')
133
+ linear_nnz.cpu().contiguous().numpy().tofile('sparse_w4/linear_nnz_int16.bin')
134
+
135
+     print("serialized blobs written to sparse_w4/")
136
+
137
+ loaded_linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
138
+ loaded_tiled_nnz = loaded_linear_nnz.reshape(896,16)
139
+
140
+ assert torch.all(torch.from_numpy(loaded_tiled_nnz) == tiled_nnz), "pls debug"
141
+
142
+ loaded_linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
143
+ loaded_tiled_scales = loaded_linear_scales.reshape(896, 16, 16, 8)
144
+
145
+ assert torch.all(torch.from_numpy(loaded_tiled_scales).to("cuda") == tiled_scales), "pls debug"
146
+
147
+ loaded_linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
148
+ loaded_linear_bitmap_pack = np.expand_dims(loaded_linear_bitmap_pack, axis=-1)
149
+ loaded_linear_bitmap = np.zeros((loaded_linear_bitmap_pack.shape[0], 32), dtype=np.int32)
150
+ for i in range(0, 32):
151
+ loaded_linear_bitmap[:, i] = ( loaded_linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
152
+ loaded_tiled_bitmap = loaded_linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)
153
+
154
+ assert torch.all(torch.from_numpy(loaded_tiled_bitmap).to("cuda") == tiled_bitmap), "pls debug"
155
+
156
+ loaded_linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
157
+ loaded_linear_qweight_pack = np.expand_dims(loaded_linear_qweight_pack, axis=-1)
158
+ loaded_linear_qweight = np.zeros((loaded_linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
159
+ for i in range(0, numel_per_int32):
160
+ loaded_linear_qweight[:, i] = ( loaded_linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
161
+ loaded_tiled_qweight = loaded_linear_qweight.reshape(-1).reshape(896, 16, 16, 256)
162
+
163
+ assert torch.all(torch.from_numpy(loaded_tiled_qweight).to("cuda") == tiled_qweight), "pls debug"
164
+
165
+ loaded_linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
166
+ loaded_linear_zeros_pack = np.expand_dims(loaded_linear_zeros_pack, axis=-1)
167
+ loaded_linear_zeros = np.zeros((loaded_linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
168
+ for i in range(0, numel_per_int32):
169
+ loaded_linear_zeros[:, i] = ( loaded_linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
170
+ loaded_tiled_zeros = loaded_linear_zeros.reshape(-1).reshape(896, 16, 16, 8)
171
+
172
+ assert torch.all(torch.from_numpy(loaded_tiled_zeros).to("cuda") == tiled_zeros), "pls debug"
173
+
174
+ zero_recovered_tiles = np.ones_like(loaded_tiled_qweight)*8 # zero is represented by value of 8
175
+ for r in range(0, loaded_tiled_qweight.shape[0]):
176
+ for c in range(0, loaded_tiled_qweight.shape[1]):
177
+ zero_removed_padded_tile = loaded_tiled_qweight[r, c]
178
+ nnz=loaded_tiled_nnz[r, c]
179
+ tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
180
+ nnz_indices = np.nonzero(loaded_tiled_bitmap[r, c])
181
+ zero_recovered_tiles[r, c][nnz_indices] = tile_values
182
+
183
+ assert torch.all(non_zero_removed_tiled_qweight.to(torch.int32) == torch.from_numpy(zero_recovered_tiles).to("cuda")), "pls debug"
184
+
185
+ dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)
186
+
187
+ zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
188
+ loaded_tiled_zeros = loaded_tiled_zeros.astype(np.float16)
189
+ loaded_tiled_scales = loaded_tiled_scales.astype(np.float16)
190
+ for i in range(0, zero_recovered_tiles.shape[-1], group_size):
191
+ gid = i//group_size
192
+ dequantized_tiles[:, :, :, i:i+group_size] = \
193
+ ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
194
+ np.expand_dims(loaded_tiled_zeros[:, :, :, gid], axis=-1) ) * \
195
+ np.expand_dims(loaded_tiled_scales[:, :, :, gid], axis=-1)
196
+
197
+     print("round-trip verification of serialized blobs done.")
198
+ # torch.allclose(linear_tiled_W[0], tiled_W[0,0])
199
+ # torch.allclose(linear_tiled_W[1], tiled_W[0,1])
200
+ # torch.allclose(linear_tiled_W[12], tiled_W[1,0])
201
+ # torch.allclose(linear_tiled_W[26], tiled_W[2,2])
202
+ # torch.allclose(linear_tiled_W[-1], tiled_W[-1,-1])
203
+ # In [18]: torch.allclose(tiled_W[0,1], W[0:16, 256:512])
204
+ # Out[18]: True
205
+
206
+ # In [19]: torch.allclose(tiled_W[1,1], W[16:32, 256:512])
207
+ # Out[19]: True
208
+
209
+ # In [20]: torch.allclose(tiled_W[-1,-1], W[(768-16):768, (3072-256):3072])
210
+ # Out[20]: True
211
+
212
+
213
+
214
+ # If you want to serialize the tensor such that a single bit indicates if an element is zero or non-zero, you can achieve this by creating a byte array where each bit corresponds to the zero/non-zero status of each element. Here’s how you can do it:
215
+
216
+ # Convert the tensor to a boolean tensor indicating zero or non-zero.
217
+ # Flatten the boolean tensor.
218
+ # Pack the boolean values into bytes.
219
+ # Here’s a step-by-step example:
220
+
221
+ # python
222
+ # Copy code
223
+ # import torch
224
+
225
+ # # Example tensor
226
+ # tensor = torch.tensor([[0, 1, 2], [3, 0, 4], [5, 6, 0]])
227
+
228
+ # # Step 1: Create a boolean tensor indicating zero or non-zero values
229
+ # zero_indicator = torch.eq(tensor, 0)
230
+
231
+ # # Step 2: Flatten the boolean tensor
232
+ # flat_zero_indicator = zero_indicator.flatten()
233
+
234
+ # # Step 3: Convert boolean tensor to a list of bytes
235
+ # byte_array = []
236
+ # byte = 0
237
+ # for i, bit in enumerate(flat_zero_indicator):
238
+ # if bit:
239
+ # byte |= 1 << (i % 8)
240
+ # if (i % 8) == 7:
241
+ # byte_array.append(byte)
242
+ # byte = 0
243
+
244
+ # # Append the last byte if necessary
245
+ # if (len(flat_zero_indicator) % 8) != 0:
246
+ # byte_array.append(byte)
247
+
248
+ # # Convert to bytearray
249
+ # result = bytearray(byte_array)
250
+
251
+ # print(result)
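To make the per-tile compaction above easier to follow, here is a small self-contained sketch on a toy tile (not the real 16x256 tiles): zero removal, padding with the code 8, and the bitmap/nnz bookkeeping, followed by the recovery step that unpack_blobs.py performs. In this toy, 0 simply marks a pruned position; in the real pipeline the bitmap comes from the sparsified weights and kept positions carry their int4 codes.

```python
import numpy as np

tile = np.array([[3, 0, 15, 0],
                 [0, 7,  0, 1]], dtype=np.int32)    # toy quantized tile; 0 marks pruned weights

bitmap = (tile != 0)                                 # metadata: one bit per element
nnz = int(bitmap.sum())                              # metadata: number of kept values

compressed = np.full(tile.size, 8, dtype=np.int32)   # code 8 dequantizes to 0.0, so it acts as padding
compressed[:nnz] = tile[bitmap]                      # kept codes packed to the front, row-major

# recovery (mirrors the zero-location recovery loop in unpack_blobs.py)
recovered = np.full(tile.size, 8, dtype=np.int32)
recovered[np.flatnonzero(bitmap)] = compressed[:nnz]
recovered = recovered.reshape(tile.shape)

assert np.array_equal(recovered, np.where(tile == 0, 8, tile))
```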
internal/sqft_llama3_8B_gptq_tx1_mlp.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4b052cf767df68cde1e08ab4c5e1adf19d821d64b6f9ff5727ef5b615f97a7
3
+ size 357830528
sparse_w4/linear_bitmap_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db9c9466c5e2f5efdb426685b479794520c35f196e6811e175cb5066b9b874b
3
+ size 7340032
sparse_w4/linear_compressed_qweight_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f20a8d23c239a5d002686ff8c0867bb49ffc0daec5480fedef4a5163877ca7f
3
+ size 29360128
sparse_w4/linear_nnz_int16.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8d24ef1e4e2af4d04f7ef8e3f52d2023b916336c1bd013a4256f8d96805736
3
+ size 28672
sparse_w4/linear_scales_float16.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f825735214928e40a0c5850f95f5b55bc8de1b31bf7c1a67974df544f247b45
3
+ size 3670016
sparse_w4/linear_zeros_int32.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf1b625d7d0b024b60e69eae10e8f7bf74ec7d6a249ab6e0e2dee6c482123946
3
+ size 917504
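The blob sizes above can be sanity-checked against the layer geometry (oc=14336, ic=4096, 4-bit weights, group size 32, 16x256 tiles). A small check along these lines, assuming the blobs sit under sparse_w4/ as listed, could be run before unpacking:

```python
import os
import numpy as np

OC, IC, nbit, group_size = 14336, 4096, 4, 32
codes_per_word = 32 // nbit
n_tiles = (OC // 16) * (IC // 256)                    # 896 x 16 tile grid

expected = {
    "sparse_w4/linear_compressed_qweight_int32.bin": (np.int32,   OC * IC // codes_per_word),
    "sparse_w4/linear_bitmap_int32.bin":             (np.int32,   OC * IC // 32),
    "sparse_w4/linear_nnz_int16.bin":                (np.int16,   n_tiles),
    "sparse_w4/linear_scales_float16.bin":           (np.float16, OC * IC // group_size),
    "sparse_w4/linear_zeros_int32.bin":              (np.int32,   OC * IC // group_size // codes_per_word),
}

for path, (dtype, n_elem) in expected.items():
    n_found = os.path.getsize(path) // np.dtype(dtype).itemsize
    print(f"{path}: found {n_found} elements, expected {n_elem}, ok={n_found == n_elem}")
```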
unpack_blobs.py ADDED
@@ -0,0 +1,77 @@
1
+ import numpy as np
2
+
3
+ # Python sample showing how to unpack and recover the zero-compressed W4 blobs
4
+
5
+ nbit=4
6
+ numel_per_int32 = 32//nbit
7
+ group_size=32
8
+
9
+ linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
10
+ tiled_nnz = linear_nnz.reshape(896,16)  # one nnz count per tile: (14336//16, 4096//256) tile grid
11
+
12
+
13
+ linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
14
+ tiled_scales = linear_scales.reshape(896, 16, 16, 8)  # (tile_r, tile_c, 16 rows, 8 groups of 32 columns)
15
+
16
+
17
+ linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
18
+ linear_bitmap_pack = np.expand_dims(linear_bitmap_pack, axis=-1)
19
+ linear_bitmap = np.zeros((linear_bitmap_pack.shape[0], 32), dtype=np.int32)
20
+ for i in range(0, 32):
21
+ linear_bitmap[:, i] = ( linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
22
+ tiled_bitmap = linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)  # (tile_r, tile_c, 16 rows, 256 columns)
23
+
24
+
25
+ linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
26
+ linear_qweight_pack = np.expand_dims(linear_qweight_pack, axis=-1)
27
+ linear_qweight = np.zeros((linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
28
+ for i in range(0, numel_per_int32):
29
+ linear_qweight[:, i] = ( linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
30
+ tiled_qweight = linear_qweight.reshape(-1).reshape(896, 16, 16, 256)
31
+
32
+
33
+ linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
34
+ linear_zeros_pack = np.expand_dims(linear_zeros_pack, axis=-1)
35
+ linear_zeros = np.zeros((linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
36
+ for i in range(0, numel_per_int32):
37
+ linear_zeros[:, i] = ( linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
38
+ tiled_zeros = linear_zeros.reshape(-1).reshape(896, 16, 16, 8)
39
+
40
+ # ------------------------------------------------------------
41
+ # Decompress the tile, recover the zero locations
42
+ zero_recovered_tiles = np.ones_like(tiled_qweight)*8 # zero is represented by value of 8
43
+ for r in range(0, tiled_qweight.shape[0]):
44
+ for c in range(0, tiled_qweight.shape[1]):
45
+ zero_removed_padded_tile = tiled_qweight[r, c]
46
+ nnz=tiled_nnz[r, c]
47
+ tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
48
+ nnz_indices = np.nonzero(tiled_bitmap[r, c])
49
+ zero_recovered_tiles[r, c][nnz_indices] = tile_values
50
+
51
+ # ------------------------------------------------------------
52
+ # Simulate dequantization of 4-bit weight to floating value
53
+ dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)
54
+
55
+ zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
56
+ tiled_zeros = tiled_zeros.astype(np.float16)
57
+ tiled_scales = tiled_scales.astype(np.float16)
58
+ for i in range(0, zero_recovered_tiles.shape[-1], group_size):
59
+ gid = i//group_size
60
+ dequantized_tiles[:, :, :, i:i+group_size] = \
61
+ ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
62
+ np.expand_dims(tiled_zeros[:, :, :, gid], axis=-1) ) * \
63
+ np.expand_dims(tiled_scales[:, :, :, gid], axis=-1)
64
+
65
+ # ------------------------------------------------------------
66
+ # Check sparsity per tile
67
+ def calc_sparsity(tensor):
68
+ nnz = np.count_nonzero(tensor)
69
+ rate = 1-(nnz/tensor.size)
70
+ return rate, nnz
71
+
72
+ for tile_r in range(0, dequantized_tiles.shape[0]):
73
+ for tile_c in range(0, dequantized_tiles.shape[1]):
74
+ sparsity, nnz = calc_sparsity(dequantized_tiles[tile_r, tile_c])
75
+ print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")
76
+
77
+ print("end.")
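unpack_blobs.py stops at per-tile tensors. If a dense matrix is needed, the unfold-based tiling from the packing script can be reversed; the helper below is a possible follow-up (not part of the repo) that stitches dequantized_tiles of shape (896, 16, 16, 256) back into the full (14336, 4096) layout, so that dense[r, c] == tiles[r // 16, c // 256, r % 16, c % 256].

```python
import numpy as np

def tiles_to_dense(tiles: np.ndarray, stride_oc: int = 16, stride_ic: int = 256) -> np.ndarray:
    # tiles: (OC // stride_oc, IC // stride_ic, stride_oc, stride_ic)
    n_tile_r, n_tile_c = tiles.shape[:2]
    # reorder to (tile_r, row-in-tile, tile_c, col-in-tile), then collapse to (OC, IC)
    return tiles.transpose(0, 2, 1, 3).reshape(n_tile_r * stride_oc, n_tile_c * stride_ic)

# e.g. dense_w = tiles_to_dense(dequantized_tiles)  # -> (14336, 4096) float16
```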