|
import torch |
|
import numpy as np |
|
import os |
|
|
|
def calc_sparsity(tensor):
    """Return ``(rate, nnz)`` for a ``torch.Tensor`` or numpy array.

    ``rate`` is the fraction of zero-valued elements (Python float in
    ``[0, 1]``); ``nnz`` is the count of nonzero elements (Python int).

    Fix vs. original: the torch branch returned ``nnz`` as a 0-dim tensor
    while the numpy branch returned a plain int — both branches now return
    a Python int so callers get a consistent type.
    """
    if isinstance(tensor, torch.Tensor):
        # .item() converts the 0-dim count tensor to a plain int,
        # matching the numpy branch below.
        nnz = tensor.count_nonzero().item()
        rate = 1 - (nnz / tensor.numel())
        return rate, nnz
    else:
        nnz = np.count_nonzero(tensor)
        rate = 1 - (nnz / tensor.size)
        return rate, nnz
|
|
|
if __name__ == "__main__":
    # One-off conversion script: load a GPTQ-style 4-bit checkpoint of a
    # llama3-8B MLP up_proj layer, tile it, compress the sparse quantized
    # codes per tile, bit-pack everything into int32 words, dump raw .bin
    # files, then reload them and verify the round trip end to end.
    sd = torch.load("./sqft_llama3_8B_gptq_tx1_mlp.pth")

    # List every key in the checkpoint (manual inspection aid).
    for k,v in sd.items():
        print(k)

    # Dequantized weight plus its per-group quantization parameters.
    weight = sd['up_proj.weight']
    scales = sd['up_proj.scales']
    zeros = sd['up_proj.zeros']

    nbit=4                        # quantization bit-width
    OC, IC = weight.shape         # output channels x input channels
    numel_per_int32 = 32//nbit    # 8 four-bit codes per int32 word

    # Tile geometry: 16 output channels x 256 input channels per tile.
    stride_oc = 16
    stride_ic = 128 * 8 // nbit   # = 256

    weight = weight.contiguous()
    # scales/zeros are stored transposed in the checkpoint; bring them to
    # (OC, n_groups) layout so their row axis matches `weight`'s.
    scales = scales.t().contiguous()
    zeros = zeros.t().contiguous()

    # NOTE(review): repeating each scale/zero 4x along the group axis turns a
    # (presumed) checkpoint group size of 128 into an effective group size of
    # 32 — TODO confirm the checkpoint was quantized with group size 128.
    group_size = 32
    scales = scales.repeat_interleave(4, dim=1)
    zeros = zeros.repeat_interleave(4, dim=1)

    # Non-overlapping tiling via unfold (stride == window size):
    #   tiled_weight:             (OC/16, IC/256, 16, 256)
    #   tiled_scales/tiled_zeros: (OC/16, IC/256, 16, 256/32 = 8)
    # i.e. one scale/zero per 32-column group inside each tile.
    tiled_weight = weight.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic, stride_ic)
    tiled_scales = scales.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)
    tiled_zeros = zeros.unfold(0, stride_oc, stride_oc).unfold(1, stride_ic//group_size, stride_ic//group_size)

    assert tiled_weight.shape[:2] == tiled_scales.shape[:2], "pls debug"
    assert tiled_weight.shape[:2] == tiled_zeros.shape[:2], "pls debug"

    # Per-tile outputs: compressed integer codes, nonzero bitmap, nonzero count.
    tiled_qweight = torch.zeros_like(tiled_weight)
    tiled_bitmap = torch.zeros_like(tiled_weight).to(torch.bool)
    tiled_nnz = torch.zeros(tiled_weight.shape[:2]).to(torch.int16)

    # Uncompressed re-quantized codes, kept only to validate the round trip.
    non_zero_removed_tiled_qweight = torch.zeros_like(tiled_weight)
    for tile_r in range(0, tiled_weight.shape[0]):
        for tile_c in range(0, tiled_weight.shape[1]):

            sparsity, nnz = calc_sparsity(tiled_weight[tile_r, tile_c])
            print(f"tile [{tile_r:4},{tile_c:4}], sparsity: {sparsity*100:4.1f}%, nnz: {nnz:5}")

            # Record which positions in this tile hold nonzero weights.
            nonzero_bool = (tiled_weight[tile_r, tile_c] != 0)
            assert nonzero_bool.sum() == nnz, "pls debug"
            tiled_bitmap[tile_r, tile_c] = nonzero_bool
            tiled_nnz[tile_r, tile_c] = nnz

            r = tile_r
            c = tile_c

            w = tiled_weight[r, c]
            qw = torch.zeros_like(tiled_weight[r, c])
            s = tiled_scales[r, c]
            z = tiled_zeros[r, c]

            # Re-quantize group by group: invert w = (q - z) * s, i.e.
            # q = w/s + z. A zero weight therefore maps back to code z.
            for col in range(tiled_scales.shape[-1]):
                sidx = col*group_size
                eidx = (col+1)*group_size

                qw[:, sidx:eidx] = ( w[:, sidx:eidx] + (s[:,col]*z[:,col]).unsqueeze(-1) ) / s[:,col].unsqueeze(-1)

            non_zero_removed_tiled_qweight[r, c]=qw

            # Compress: pack the codes at nonzero-weight positions to the
            # front of the flattened tile; pad the tail with the constant 8.
            assert len(qw[nonzero_bool]) == nnz, "pls debug"
            compress_qw = (torch.ones_like(qw)*8).reshape(-1)
            compress_qw[:nnz] = qw[nonzero_bool]
            assert (compress_qw != 8).sum() == nnz, "pls debug"
            compress_qw = compress_qw.reshape(qw.shape)

            tiled_qweight[r, c] = compress_qw

    # Cast everything to the dtypes used in the binary dump below.
    tiled_qweight = tiled_qweight.to(torch.int32).contiguous()
    tiled_zeros = tiled_zeros.to(torch.int32).contiguous()
    tiled_scales = tiled_scales.to(torch.float16).contiguous()
    tiled_bitmap = tiled_bitmap.to(torch.int32).contiguous()
    tiled_nnz = tiled_nnz.to(torch.int16).contiguous()

    linear_nnz = tiled_nnz
    linear_scales = tiled_scales.reshape(-1)

    # Pack 8 four-bit codes into each int32; the first code of every group
    # of 8 lands in the most-significant nibble (MSB-first).
    linear_qweight = tiled_qweight.reshape(-1).reshape(-1, 8).cpu().numpy()
    linear_qweight_pack = np.zeros((linear_qweight.shape[0], 1), dtype=np.int32)
    for i in range(0, numel_per_int32):
        linear_qweight_pack[:, 0] |= linear_qweight[:, i] << (numel_per_int32 - 1 - i)*nbit
    linear_qweight_pack = linear_qweight_pack.reshape(-1)

    # Same MSB-first nibble packing for the zero-points.
    linear_zeros = tiled_zeros.reshape(-1).reshape(-1, 8).cpu().numpy()
    linear_zeros_pack = np.zeros((linear_zeros.shape[0], 1), dtype=np.int32)
    for i in range(0, numel_per_int32):
        linear_zeros_pack[:, 0] |= linear_zeros[:, i] << (numel_per_int32 - 1 - i)*nbit
    linear_zeros_pack = linear_zeros_pack.reshape(-1)

    # Pack the nonzero bitmap 32 bits per int32, MSB-first.
    linear_bitmap = tiled_bitmap.reshape(-1).reshape(-1, 32).cpu().numpy()
    linear_bitmap_pack = np.zeros((linear_bitmap.shape[0], 1), dtype=np.int32)
    for i in range(0, 32):
        linear_bitmap_pack[:, 0] |= linear_bitmap[:, i] << (32 - 1 - i)
    linear_bitmap_pack = linear_bitmap_pack.reshape(-1)

    # Dump all five arrays as raw binaries (numpy native byte order).
    os.makedirs("sparse_w4", exist_ok=True)
    linear_qweight_pack.tofile('sparse_w4/linear_compressed_qweight_int32.bin')
    linear_zeros_pack.tofile('sparse_w4/linear_zeros_int32.bin')
    linear_scales.cpu().contiguous().numpy().tofile('sparse_w4/linear_scales_float16.bin')
    linear_bitmap_pack.tofile('sparse_w4/linear_bitmap_int32.bin')
    linear_nnz.cpu().contiguous().numpy().tofile('sparse_w4/linear_nnz_int16.bin')

    print("joto")

    # ---- Round-trip verification ------------------------------------------
    # The hardcoded (896, 16) tile grid implies OC = 896*16 = 14336 and
    # IC = 16*256 = 4096 (tile sizes fixed at 16 x 256 above).
    loaded_linear_nnz = np.fromfile("sparse_w4/linear_nnz_int16.bin", dtype=np.int16)
    loaded_tiled_nnz = loaded_linear_nnz.reshape(896,16)

    assert torch.all(torch.from_numpy(loaded_tiled_nnz) == tiled_nnz), "pls debug"

    loaded_linear_scales = np.fromfile("sparse_w4/linear_scales_float16.bin", dtype=np.float16)
    loaded_tiled_scales = loaded_linear_scales.reshape(896, 16, 16, 8)

    # NOTE(review): the .to("cuda") implies the checkpoint tensors live on the
    # GPU — both comparison sides must share a device. TODO confirm.
    assert torch.all(torch.from_numpy(loaded_tiled_scales).to("cuda") == tiled_scales), "pls debug"

    # Unpack the bitmap (exact inverse of the MSB-first packing above).
    loaded_linear_bitmap_pack = np.fromfile('sparse_w4/linear_bitmap_int32.bin', dtype=np.int32)
    loaded_linear_bitmap_pack = np.expand_dims(loaded_linear_bitmap_pack, axis=-1)
    loaded_linear_bitmap = np.zeros((loaded_linear_bitmap_pack.shape[0], 32), dtype=np.int32)
    for i in range(0, 32):
        loaded_linear_bitmap[:, i] = ( loaded_linear_bitmap_pack[:, 0] >> (32 - 1 - i) ) & 0x1
    loaded_tiled_bitmap = loaded_linear_bitmap.reshape(-1).reshape(896, 16, 16, 256)

    assert torch.all(torch.from_numpy(loaded_tiled_bitmap).to("cuda") == tiled_bitmap), "pls debug"

    # Unpack the compressed 4-bit codes (mask 0xF isolates one nibble).
    loaded_linear_qweight_pack = np.fromfile('sparse_w4/linear_compressed_qweight_int32.bin', dtype=np.int32)
    loaded_linear_qweight_pack = np.expand_dims(loaded_linear_qweight_pack, axis=-1)
    loaded_linear_qweight = np.zeros((loaded_linear_qweight_pack.shape[0], numel_per_int32), dtype=np.int32)
    for i in range(0, numel_per_int32):
        loaded_linear_qweight[:, i] = ( loaded_linear_qweight_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
    loaded_tiled_qweight = loaded_linear_qweight.reshape(-1).reshape(896, 16, 16, 256)

    assert torch.all(torch.from_numpy(loaded_tiled_qweight).to("cuda") == tiled_qweight), "pls debug"

    # Unpack the zero-points the same way.
    loaded_linear_zeros_pack = np.fromfile('sparse_w4/linear_zeros_int32.bin', dtype=np.int32)
    loaded_linear_zeros_pack = np.expand_dims(loaded_linear_zeros_pack, axis=-1)
    loaded_linear_zeros = np.zeros((loaded_linear_zeros_pack.shape[0], numel_per_int32), dtype=np.int32)
    for i in range(0, numel_per_int32):
        loaded_linear_zeros[:, i] = ( loaded_linear_zeros_pack[:, 0] >> (numel_per_int32 - 1 - i)*nbit ) & 0xF
    loaded_tiled_zeros = loaded_linear_zeros.reshape(-1).reshape(896, 16, 16, 8)

    assert torch.all(torch.from_numpy(loaded_tiled_zeros).to("cuda") == tiled_zeros), "pls debug"

    # Scatter the compressed codes back to their original positions using the
    # bitmap; positions that held zero weights keep the filler code 8.
    zero_recovered_tiles = np.ones_like(loaded_tiled_qweight)*8
    for r in range(0, loaded_tiled_qweight.shape[0]):
        for c in range(0, loaded_tiled_qweight.shape[1]):
            zero_removed_padded_tile = loaded_tiled_qweight[r, c]
            nnz=loaded_tiled_nnz[r, c]
            tile_values = zero_removed_padded_tile.reshape(-1)[0:nnz]
            nnz_indices = np.nonzero(loaded_tiled_bitmap[r, c])
            zero_recovered_tiles[r, c][nnz_indices] = tile_values

    # NOTE(review): zero weights re-quantize to code z (see q = w/s + z above)
    # but recover here as filler code 8, so this equality only holds if the
    # zero-point is 8 wherever the weight is zero — TODO confirm symmetric
    # 4-bit quantization (z == 8) for this checkpoint.
    assert torch.all(non_zero_removed_tiled_qweight.to(torch.int32) == torch.from_numpy(zero_recovered_tiles).to("cuda")), "pls debug"

    # Dequantize the recovered codes group by group: w = (q - z) * s.
    dequantized_tiles = np.zeros_like(zero_recovered_tiles, dtype=np.float16)

    zero_recovered_tiles = zero_recovered_tiles.astype(np.float16)
    loaded_tiled_zeros = loaded_tiled_zeros.astype(np.float16)
    loaded_tiled_scales = loaded_tiled_scales.astype(np.float16)
    for i in range(0, zero_recovered_tiles.shape[-1], group_size):
        gid = i//group_size
        dequantized_tiles[:, :, :, i:i+group_size] = \
            ( zero_recovered_tiles[:, :, :, i:i+group_size] - \
            np.expand_dims(loaded_tiled_zeros[:, :, :, gid], axis=-1) ) * \
            np.expand_dims(loaded_tiled_scales[:, :, :, gid], axis=-1)

    print("joto")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|