import numpy as np import torch import torch.nn as nn import math def quantize(x, scale, zero, maxq): q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) return scale * (q - zero) class Quantizer(nn.Module): def __init__(self, shape=1): super(Quantizer, self).__init__() self.register_buffer('maxq', torch.tensor(0)) self.register_buffer('scale', torch.zeros(shape)) self.register_buffer('zero', torch.zeros(shape)) def configure( self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8 ): self.maxq = torch.tensor(2 ** bits - 1) self.perchannel = perchannel self.sym = sym self.mse = mse self.norm = norm self.grid = grid self.maxshrink = maxshrink def find_params(self, x, weight=False): dev = x.device self.maxq = self.maxq.to(dev) shape = x.shape if self.perchannel: if weight: x = x.flatten(1) else: if len(shape) == 4: x = x.permute([1, 0, 2, 3]) x = x.flatten(1) if len(shape) == 3: x = x.reshape((-1, shape[-1])).t() if len(shape) == 2: x = x.t() else: x = x.flatten().unsqueeze(0) tmp = torch.zeros(x.shape[0], device=dev) xmin = torch.minimum(x.min(1)[0], tmp) xmax = torch.maximum(x.max(1)[0], tmp) if self.sym: xmax = torch.maximum(torch.abs(xmin), xmax) tmp = xmin < 0 if torch.any(tmp): xmin[tmp] = -xmax[tmp] tmp = (xmin == 0) & (xmax == 0) xmin[tmp] = -1 xmax[tmp] = +1 self.scale = (xmax - xmin) / self.maxq if self.sym: self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) else: self.zero = torch.round(-xmin / self.scale) if self.mse: best = torch.full([x.shape[0]], float('inf'), device=dev) for i in range(int(self.maxshrink * self.grid)): p = 1 - i / self.grid xmin1 = p * xmin xmax1 = p * xmax scale1 = (xmax1 - xmin1) / self.maxq zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) q -= x q.abs_() q.pow_(self.norm) err = torch.sum(q, 1) tmp = err < best if torch.any(tmp): best[tmp] = err[tmp] self.scale[tmp] = scale1[tmp] self.zero[tmp] = zero1[tmp] if not self.perchannel: if weight: tmp = shape[0] else: tmp = shape[1] if len(shape) != 3 else shape[2] self.scale = self.scale.repeat(tmp) self.zero = self.zero.repeat(tmp) if weight: shape = [-1] + [1] * (len(shape) - 1) self.scale = self.scale.reshape(shape) self.zero = self.zero.reshape(shape) return if len(shape) == 4: self.scale = self.scale.reshape((1, -1, 1, 1)) self.zero = self.zero.reshape((1, -1, 1, 1)) if len(shape) == 3: self.scale = self.scale.reshape((1, 1, -1)) self.zero = self.zero.reshape((1, 1, -1)) if len(shape) == 2: self.scale = self.scale.unsqueeze(0) self.zero = self.zero.unsqueeze(0) def quantize(self, x): if self.ready(): return quantize(x, self.scale, self.zero, self.maxq) return x def enabled(self): return self.maxq > 0 def ready(self): return torch.all(self.scale != 0) try: import importlib quant_cuda = importlib.import_module("quant_cuda") except: import os import sys argv = sys.argv sys.argv = ['quant.py','install'] dir_path = os.path.dirname(os.path.realpath(__file__)) from setuptools import setup, Extension from torch.utils import cpp_extension os.chdir(dir_path) cucode = ''' #include #include #include #include // atomicAdd for double-precision floating-point numbers on hardware with // compute capability < 6.0 from: // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 __device__ double atomicAdd( double* address, double val ) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS( address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)) ); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } #endif template __global__ void VecQuant2MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ); template __global__ void VecQuant3MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ); template __global__ void VecQuant4MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ); template __global__ void VecQuant8MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ); const int BLOCKWIDTH = 256; const int BLOCKHEIGHT2 = 16; const int BLOCKHEIGHT3 = 24; const int BLOCKHEIGHT4 = 32; const int BLOCKHEIGHT8 = 64; __device__ inline unsigned int as_unsigned(int i) { return *reinterpret_cast(&i); } void vecquant2matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { int batch = vec.size(0); int vec_height = vec.size(1); int height = mat.size(0); int width = mat.size(1); int zero_width = zeros.size(1); dim3 blocks( (height + BLOCKHEIGHT2 - 1) / BLOCKHEIGHT2, (width + BLOCKWIDTH - 1) / BLOCKWIDTH, batch ); dim3 threads(BLOCKWIDTH); AT_DISPATCH_FLOATING_TYPES( vec.type(), "vecquant2matmul_cuda", ([&] { VecQuant2MatMulKernel<<>>( vec.data(), mat.data(), mul.data(), scales.data(), zeros.data(), batch, vec_height, height, width, zero_width, groupsize ); }) ); } template __global__ void VecQuant2MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ) { int b = blockIdx.z; int h = BLOCKHEIGHT2 * blockIdx.x; int w = BLOCKWIDTH * blockIdx.y + threadIdx.x; __shared__ scalar_t blockvec[BLOCKWIDTH]; blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x]; __syncthreads(); scalar_t res = 0; int i = width * h + w; int g_h = h * 16; int k = 0; int z_w = w / 16; int z_mod = (w % 16) * 2; unsigned int tmp; while (k < BLOCKWIDTH) { tmp = as_unsigned(mat[i]); int g = (g_h + k) / groupsize; scalar_t scale = scales[g * width + w]; scalar_t zero = scale * scalar_t((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod & 0x3) + 1); res += (scale * scalar_t((tmp >> 0) & 0x3) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp >> 2) & 0x3) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp >> 4) & 0x3) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp >> 6) & 0x3) - zero) * blockvec[k + 3]; res += (scale * scalar_t((tmp >> 8) & 0x3) - zero) * blockvec[k + 4]; res += (scale * scalar_t((tmp >> 10) & 0x3) - zero) * blockvec[k + 5]; res += (scale * scalar_t((tmp >> 12) & 0x3) - zero) * blockvec[k + 6]; res += (scale * scalar_t((tmp >> 14) & 0x3) - zero) * blockvec[k + 7]; res += (scale * scalar_t((tmp >> 16) & 0x3) - zero) * blockvec[k + 8]; res += (scale * scalar_t((tmp >> 18) & 0x3) - zero) * blockvec[k + 9]; res += (scale * scalar_t((tmp >> 20) & 0x3) - zero) * blockvec[k + 10]; res += (scale * scalar_t((tmp >> 22) & 0x3) - zero) * blockvec[k + 11]; res += (scale * scalar_t((tmp >> 24) & 0x3) - zero) * blockvec[k + 12]; res += (scale * scalar_t((tmp >> 26) & 0x3) - zero) * blockvec[k + 13]; res += (scale * scalar_t((tmp >> 28) & 0x3) - zero) * blockvec[k + 14]; res += (scale * scalar_t((tmp >> 30) & 0x3) - zero) * blockvec[k + 15]; i += width; k += 16; } atomicAdd(&mul[b * width + w], res); } void vecquant3matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { int batch = vec.size(0); int vec_height = vec.size(1); int height = mat.size(0); int width = mat.size(1); int zero_width = zeros.size(1); dim3 blocks( (height + BLOCKHEIGHT3 - 1) / BLOCKHEIGHT3, (width + BLOCKWIDTH - 1) / BLOCKWIDTH, batch ); dim3 threads(BLOCKWIDTH); AT_DISPATCH_FLOATING_TYPES( vec.type(), "vecquant3matmul_cuda", ([&] { VecQuant3MatMulKernel<<>>( vec.data(), mat.data(), mul.data(), scales.data(), zeros.data(), batch, vec_height, height, width, zero_width, groupsize ); }) ); } template __global__ void VecQuant3MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ) { int b = blockIdx.z; int h = BLOCKHEIGHT3 * blockIdx.x; int w = BLOCKWIDTH * blockIdx.y + threadIdx.x; __shared__ scalar_t blockvec[BLOCKWIDTH]; blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x]; __syncthreads(); scalar_t res = 0; int i = width * h + w; int g_h = (h / 3) * 32; int k = 0; int z_w = (w / 32) * 3; // ((w / 256) * 24) / 3 int z_mod = w % 32; int z_bit; if (z_mod != 10){ if (z_mod != 21){ z_bit = z_mod; if (z_bit > 21){ z_bit -= 22; z_bit *= 3; z_bit += 2; z_w += 2; } else if (z_bit > 10){ z_bit -= 11; z_bit *= 3; z_bit += 1; z_w += 1; } else { z_bit *= 3; } } else { z_w += 1; } } unsigned int tmp1; unsigned int tmp2; unsigned int tmp; unsigned int z_tmp; while (k < BLOCKWIDTH) { tmp1 = as_unsigned(mat[i]); int g = (g_h + k) / groupsize; scalar_t scale = scales[g * width + w]; scalar_t zero; if (z_mod == 10) { z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 30) | ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 2) & 0x4); zero = scale * scalar_t((z_tmp) + 1); } else if (z_mod == 21){ z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 31) | ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 1) & 0x6); zero = scale * scalar_t((z_tmp) + 1); } else { zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_bit) & 0x7) + 1); } res += (scale * scalar_t((tmp1 >> 0) & 0x7) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp1 >> 3) & 0x7) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp1 >> 6) & 0x7) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp1 >> 9) & 0x7) - zero) * blockvec[k + 3]; res += (scale * scalar_t((tmp1 >> 12) & 0x7) - zero) * blockvec[k + 4]; res += (scale * scalar_t((tmp1 >> 15) & 0x7) - zero) * blockvec[k + 5]; res += (scale * scalar_t((tmp1 >> 18) & 0x7) - zero) * blockvec[k + 6]; res += (scale * scalar_t((tmp1 >> 21) & 0x7) - zero) * blockvec[k + 7]; res += (scale * scalar_t((tmp1 >> 24) & 0x7) - zero) * blockvec[k + 8]; res += (scale * scalar_t((tmp1 >> 27) & 0x7) - zero) * blockvec[k + 9]; i += width; tmp2 = as_unsigned(mat[i]); tmp = (tmp1 >> 30) | ((tmp2 << 2) & 0x4); tmp2 >>= 1; res += (scale * scalar_t(tmp) - zero) * blockvec[k + 10]; k += 11; res += (scale * scalar_t((tmp2 >> 0) & 0x7) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp2 >> 3) & 0x7) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp2 >> 6) & 0x7) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp2 >> 9) & 0x7) - zero) * blockvec[k + 3]; res += (scale * scalar_t((tmp2 >> 12) & 0x7) - zero) * blockvec[k + 4]; res += (scale * scalar_t((tmp2 >> 15) & 0x7) - zero) * blockvec[k + 5]; res += (scale * scalar_t((tmp2 >> 18) & 0x7) - zero) * blockvec[k + 6]; res += (scale * scalar_t((tmp2 >> 21) & 0x7) - zero) * blockvec[k + 7]; res += (scale * scalar_t((tmp2 >> 24) & 0x7) - zero) * blockvec[k + 8]; res += (scale * scalar_t((tmp2 >> 27) & 0x7) - zero) * blockvec[k + 9]; i += width; tmp1 = as_unsigned(mat[i]); tmp = (tmp2 >> 30) | ((tmp1 << 1) & 0x6); tmp1 >>= 2; res += (scale * scalar_t(tmp) - zero) * blockvec[k + 10]; k += 11; res += (scale * scalar_t((tmp1 >> 0) & 0x7) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp1 >> 3) & 0x7) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp1 >> 6) & 0x7) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp1 >> 9) & 0x7) - zero) * blockvec[k + 3]; res += (scale * scalar_t((tmp1 >> 12) & 0x7) - zero) * blockvec[k + 4]; res += (scale * scalar_t((tmp1 >> 15) & 0x7) - zero) * blockvec[k + 5]; res += (scale * scalar_t((tmp1 >> 18) & 0x7) - zero) * blockvec[k + 6]; res += (scale * scalar_t((tmp1 >> 21) & 0x7) - zero) * blockvec[k + 7]; res += (scale * scalar_t((tmp1 >> 24) & 0x7) - zero) * blockvec[k + 8]; res += (scale * scalar_t((tmp1 >> 27) & 0x7) - zero) * blockvec[k + 9]; i += width; k += 10; } atomicAdd(&mul[b * width + w], res); } void vecquant4matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { int batch = vec.size(0); int vec_height = vec.size(1); int height = mat.size(0); int width = mat.size(1); int zero_width = zeros.size(1); dim3 blocks( (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, (width + BLOCKWIDTH - 1) / BLOCKWIDTH, batch ); dim3 threads(BLOCKWIDTH); AT_DISPATCH_FLOATING_TYPES( vec.type(), "vecquant4matmul_cuda", ([&] { VecQuant4MatMulKernel<<>>( vec.data(), mat.data(), mul.data(), scales.data(), zeros.data(), batch, vec_height, height, width, zero_width, groupsize ); }) ); } template __global__ void VecQuant4MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ) { int b = blockIdx.z; int h = BLOCKHEIGHT4 * blockIdx.x; int w = BLOCKWIDTH * blockIdx.y + threadIdx.x; __shared__ scalar_t blockvec[BLOCKWIDTH]; blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x]; __syncthreads(); scalar_t res = 0; int i = width * h + w; int g_h = h * 8; int k = 0; int z_w = w / 8; int z_mod = (w % 8) * 4; unsigned int tmp; while (k < BLOCKWIDTH) { tmp = as_unsigned(mat[i]); int g = (g_h + k) / groupsize; scalar_t scale = scales[g * width + w]; scalar_t zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xF) + 1); res += (scale * scalar_t((tmp >> 0) & 0xF) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp >> 4) & 0xF) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp >> 8) & 0xF) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp >> 12) & 0xF) - zero) * blockvec[k + 3]; res += (scale * scalar_t((tmp >> 16) & 0xF) - zero) * blockvec[k + 4]; res += (scale * scalar_t((tmp >> 20) & 0xF) - zero) * blockvec[k + 5]; res += (scale * scalar_t((tmp >> 24) & 0xF) - zero) * blockvec[k + 6]; res += (scale * scalar_t((tmp >> 28) & 0xF) - zero) * blockvec[k + 7]; i += width; k += 8; } atomicAdd(&mul[b * width + w], res); } void vecquant8matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { int batch = vec.size(0); int vec_height = vec.size(1); int height = mat.size(0); int width = mat.size(1); int zero_width = zeros.size(1); dim3 blocks( (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8, (width + BLOCKWIDTH - 1) / BLOCKWIDTH, batch ); dim3 threads(BLOCKWIDTH); AT_DISPATCH_FLOATING_TYPES( vec.type(), "vecquant8matmul_cuda", ([&] { VecQuant8MatMulKernel<<>>( vec.data(), mat.data(), mul.data(), scales.data(), zeros.data(), batch, vec_height, height, width, zero_width, groupsize ); }) ); } template __global__ void VecQuant8MatMulKernel( const scalar_t* __restrict__ vec, const int* __restrict__ mat, scalar_t* __restrict__ mul, const scalar_t* __restrict__ scales, const int* __restrict__ zeros, int batch, int vec_height, int height, int width, int zero_width, int groupsize ) { int b = blockIdx.z; int h = BLOCKHEIGHT8 * blockIdx.x; int w = BLOCKWIDTH * blockIdx.y + threadIdx.x; __shared__ scalar_t blockvec[BLOCKWIDTH]; blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x]; __syncthreads(); scalar_t res = 0; int i = width * h + w; int g_h = h * 4; int k = 0; int z_w = w / 4; int z_mod = (w % 4) * 8; unsigned int tmp; while (k < BLOCKWIDTH) { tmp = as_unsigned(mat[i]); int g = (g_h + k) / groupsize; scalar_t scale = scales[g * width + w]; scalar_t zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xFF) + 1); res += (scale * scalar_t((tmp >> 0) & 0xFF) - zero) * blockvec[k + 0]; res += (scale * scalar_t((tmp >> 8) & 0xFF) - zero) * blockvec[k + 1]; res += (scale * scalar_t((tmp >> 16) & 0xFF) - zero) * blockvec[k + 2]; res += (scale * scalar_t((tmp >> 24) & 0xFF) - zero) * blockvec[k + 3]; i += width; k += 4; } atomicAdd(&mul[b * width + w], res); } ''' with open("quant_cuda_kernel.cu","w") as f: f.write(cucode) cppcode = ''' #include #include #include void vecquant2matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ); void vecquant2matmul( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); vecquant2matmul_cuda(vec, mat, mul, scales, zeros,groupsize); } void vecquant3matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ); void vecquant3matmul( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); vecquant3matmul_cuda(vec, mat, mul, scales, zeros, groupsize); } void vecquant4matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ); void vecquant4matmul( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); vecquant4matmul_cuda(vec, mat, mul, scales, zeros, groupsize); } void vecquant8matmul_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ); void vecquant8matmul( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros, int groupsize ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); vecquant8matmul_cuda(vec, mat, mul, scales, zeros, groupsize); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)"); m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)"); m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)"); m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)"); } ''' with open("quant_cuda.cpp","w") as f: f.write(cppcode) setup( name='quant_cuda', ext_modules=[cpp_extension.CUDAExtension( 'quant_cuda', ['quant_cuda.cpp', 'quant_cuda_kernel.cu'] )], cmdclass={'build_ext': cpp_extension.BuildExtension} ) os.chdir(os.getcwd()) sys.argv = argv for i in sys.path: if i.endswith("site-packages"): for j in os.listdir(i): if j.find("quant_cuda") != -1: sys.path.append(os.path.join(i,j)) break break import importlib quant_cuda = importlib.import_module("quant_cuda") # Assumes layer is perfectly divisible into 256 * 256 blocks class QuantLinear(nn.Module): def __init__(self, bits, groupsize, infeatures, outfeatures): super().__init__() if bits not in [2,3,4,8]: raise NotImplementedError("Only 2,3,4,8 bits are supported.") self.infeatures = infeatures self.outfeatures = outfeatures self.bits = bits if groupsize != -1 and groupsize < 32 and groupsize != int(math.pow(2,int(math.log2(groupsize)))): raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)") groupsize = groupsize if groupsize != -1 else infeatures self.groupsize = groupsize self.register_buffer('qzeros', torch.zeros((math.ceil(infeatures/groupsize),outfeatures // 256 * (bits * 8)), dtype=torch.int)) self.register_buffer('scales', torch.zeros((math.ceil(infeatures/groupsize),outfeatures))) self.register_buffer('bias', torch.zeros(outfeatures)) self.register_buffer( 'qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int) ) self._initialized_quant_state = False def pack(self, linear, scales, zeros): scales = scales.t().contiguous() zeros = zeros.t().contiguous() scale_zeros = zeros * scales self.scales = scales.clone() if linear.bias is not None: self.bias = linear.bias.clone() intweight = [] for idx in range(self.infeatures): g_idx = idx // self.groupsize intweight.append(torch.round((linear.weight.data[:,idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:,None]) intweight = torch.cat(intweight,dim=1) intweight = intweight.t().contiguous() intweight = intweight.numpy().astype(np.uint32) qweight = np.zeros( (intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32 ) i = 0 row = 0 while row < qweight.shape[0]: if self.bits in [2,4,8]: for j in range(i, i + (32//self.bits)): qweight[row] |= intweight[j] << (self.bits * (j - i)) i += 32//self.bits row += 1 elif self.bits == 3: for j in range(i, i + 10): qweight[row] |= intweight[j] << (3 * (j - i)) i += 10 qweight[row] |= intweight[i] << 30 row += 1 qweight[row] |= (intweight[i] >> 2) & 1 i += 1 for j in range(i, i + 10): qweight[row] |= intweight[j] << (3 * (j - i) + 1) i += 10 qweight[row] |= intweight[i] << 31 row += 1 qweight[row] |= (intweight[i] >> 1) & 0x3 i += 1 for j in range(i, i + 10): qweight[row] |= intweight[j] << (3 * (j - i) + 2) i += 10 row += 1 else: raise NotImplementedError("Only 2,3,4,8 bits are supported.") qweight = qweight.astype(np.int32) self.qweight = torch.from_numpy(qweight) zeros -= 1; zeros = zeros.numpy().astype(np.uint32) qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32) i = 0 col = 0 while col < qzeros.shape[1]: if self.bits in [2,4,8]: for j in range(i, i + (32//self.bits)): qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) i += 32//self.bits col += 1 elif self.bits == 3: for j in range(i, i + 10): qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) i += 10 qzeros[:, col] |= zeros[:, i] << 30 col += 1 qzeros[:, col] |= (zeros[:, i] >> 2) & 1 i += 1 for j in range(i, i + 10): qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) i += 10 qzeros[:, col] |= zeros[:, i] << 31 col += 1 qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 i += 1 for j in range(i, i + 10): qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) i += 10 col += 1 else: raise NotImplementedError("Only 2,3,4,8 bits are supported.") qzeros = qzeros.astype(np.int32) self.qzeros = torch.from_numpy(qzeros) def forward(self, x): intermediate_dtype = torch.float32 if not self._initialized_quant_state: # Do we even have a bias? Check for at least one non-zero element. if self.bias is not None and bool(torch.any(self.bias != 0)): # Then make sure it's the right type. self.bias.data = self.bias.data.to(intermediate_dtype) else: self.bias = None outshape = list(x.shape) outshape[-1] = self.outfeatures x = x.reshape(-1, x.shape[-1]) if self.bias is None: y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device) else: y = self.bias.clone().repeat(x.shape[0], 1) output_dtype = x.dtype x = x.to(intermediate_dtype) if self.bits == 2: quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) elif self.bits == 3: quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) elif self.bits == 4: quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) elif self.bits == 8: quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) else: raise NotImplementedError("Only 2,3,4,8 bits are supported.") y = y.to(output_dtype) return y.reshape(outshape) def make_quant(module, names, bits, groupsize, name=''): if isinstance(module, QuantLinear): return for attr in dir(module): tmp = getattr(module, attr) name1 = name + '.' + attr if name != '' else attr if name1 in names: setattr( module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features) ) for name1, child in module.named_children(): make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)