Spaces:

Charris71
/

text-generation-webui

Build error

App Files Files Community

text-generation-webui / repositories /GPTQ-for-LLaMa /quant_cuda_kernel.cu

Charris71

Upload folder using huggingface_hub

a1bea4a over 1 year ago

raw

history blame contribute delete

27.4 kB

	#include <torch/all.h>
	#include <torch/python.h>
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <cuda_fp16.h>

	// atomicAdd for double-precision floating-point numbers on hardware with
	// compute capability < 6.0 from:
	// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
	#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
	__device__ double atomicAdd(
	double* address,
	double val
	) {
	unsigned long long int* address_as_ull = (unsigned long long int*)address;
	unsigned long long int old = *address_as_ull, assumed;

	do {
	assumed = old;
	old = atomicCAS(
	address_as_ull,
	assumed,
	__double_as_longlong(val + __longlong_as_double(assumed))
	);

	// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
	} while (assumed != old);

	return __longlong_as_double(old);
	}
	#endif

	template <typename scalar_t>
	__global__ void VecQuant2MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	template <typename scalar_t>
	__global__ void VecQuant3MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	template <typename scalar_t>
	__global__ void VecQuant4MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	template <typename scalar_t>
	__global__ void VecQuant8MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	__global__ void VecQuant2MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	__global__ void VecQuant3MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	__global__ void VecQuant4MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	);

	const int BLOCKWIDTH = 256;
	const int BLOCKHEIGHT2 = 16;
	const int BLOCKHEIGHT3 = 24;
	const int BLOCKHEIGHT4 = 32;
	const int BLOCKHEIGHT8 = 64;

	__device__ inline unsigned int as_unsigned(int i) {
	return reinterpret_cast<unsigned int>(&i);
	}

	void vecquant2matmul_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize
	) {
	int batch = vec.size(0);
	int vec_height = vec.size(1);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT2 - 1) / BLOCKHEIGHT2,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	AT_DISPATCH_FLOATING_TYPES(
	vec.type(), "vecquant2matmul_cuda", ([&] {
	VecQuant2MatMulKernel<<<blocks, threads>>>(
	vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
	scales.data<scalar_t>(), zeros.data<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	})
	);
	}

	template <typename scalar_t>
	__global__ void VecQuant2MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	int b = blockIdx.z;
	int h = BLOCKHEIGHT2 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ scalar_t blockvec[BLOCKWIDTH];
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
	__syncthreads();

	scalar_t res = 0;
	int i = width * h + w;
	int g_h = h * 16;
	int k = 0;

	int z_w = w / 16;
	int z_mod = (w % 16) * 2;

	unsigned int tmp;

	while (k < BLOCKWIDTH) {
	tmp = as_unsigned(mat[i]);

	int g = (g_h + k) / groupsize;
	scalar_t scale = scales[g * width + w];
	scalar_t zero = scale * scalar_t((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod & 0x3) + 1);

	res += (scale * scalar_t((tmp >> 0) & 0x3) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp >> 2) & 0x3) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp >> 4) & 0x3) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp >> 6) & 0x3) - zero) * blockvec[k + 3];
	res += (scale * scalar_t((tmp >> 8) & 0x3) - zero) * blockvec[k + 4];
	res += (scale * scalar_t((tmp >> 10) & 0x3) - zero) * blockvec[k + 5];
	res += (scale * scalar_t((tmp >> 12) & 0x3) - zero) * blockvec[k + 6];
	res += (scale * scalar_t((tmp >> 14) & 0x3) - zero) * blockvec[k + 7];
	res += (scale * scalar_t((tmp >> 16) & 0x3) - zero) * blockvec[k + 8];
	res += (scale * scalar_t((tmp >> 18) & 0x3) - zero) * blockvec[k + 9];
	res += (scale * scalar_t((tmp >> 20) & 0x3) - zero) * blockvec[k + 10];
	res += (scale * scalar_t((tmp >> 22) & 0x3) - zero) * blockvec[k + 11];
	res += (scale * scalar_t((tmp >> 24) & 0x3) - zero) * blockvec[k + 12];
	res += (scale * scalar_t((tmp >> 26) & 0x3) - zero) * blockvec[k + 13];
	res += (scale * scalar_t((tmp >> 28) & 0x3) - zero) * blockvec[k + 14];
	res += (scale * scalar_t((tmp >> 30) & 0x3) - zero) * blockvec[k + 15];

	i += width;
	k += 16;
	}

	atomicAdd(&mul[b * width + w], res);
	}

	void vecquant3matmul_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize
	) {
	int batch = vec.size(0);
	int vec_height = vec.size(1);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT3 - 1) / BLOCKHEIGHT3,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	AT_DISPATCH_FLOATING_TYPES(
	vec.type(), "vecquant3matmul_cuda", ([&] {
	VecQuant3MatMulKernel<<<blocks, threads>>>(
	vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
	scales.data<scalar_t>(), zeros.data<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	})
	);
	}

	template <typename scalar_t>
	__global__ void VecQuant3MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	int b = blockIdx.z;
	int h = BLOCKHEIGHT3 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ scalar_t blockvec[BLOCKWIDTH];
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
	__syncthreads();

	scalar_t res = 0;
	int i = width * h + w;
	int g_h = (h / 3) * 32;
	int k = 0;

	int z_w = (w / 32) * 3;
	int z_mod = w % 32;
	int z_bit;

	if (z_mod != 10){
	if (z_mod != 21){
	z_bit = z_mod;
	if (z_bit > 21){
	z_bit -= 22;
	z_bit *= 3;
	z_bit += 2;
	z_w += 2;
	} else if (z_bit > 10){
	z_bit -= 11;
	z_bit *= 3;
	z_bit += 1;
	z_w += 1;
	} else {
	z_bit *= 3;
	}
	} else {
	z_w += 1;
	}
	}

	unsigned int tmp1;
	unsigned int tmp2;
	unsigned int tmp;
	unsigned int z_tmp;

	while (k < BLOCKWIDTH) {
	tmp1 = as_unsigned(mat[i]);

	int g = (g_h + k) / groupsize;
	scalar_t scale = scales[g * width + w];
	scalar_t zero;
	if (z_mod == 10) {
	z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 30) \| ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 2) & 0x4);
	zero = scale * scalar_t((z_tmp) + 1);
	} else if (z_mod == 21){
	z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 31) \| ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 1) & 0x6);
	zero = scale * scalar_t((z_tmp) + 1);
	} else {
	zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_bit) & 0x7) + 1);
	}

	res += (scale * scalar_t((tmp1 >> 0) & 0x7) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp1 >> 3) & 0x7) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp1 >> 6) & 0x7) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp1 >> 9) & 0x7) - zero) * blockvec[k + 3];
	res += (scale * scalar_t((tmp1 >> 12) & 0x7) - zero) * blockvec[k + 4];
	res += (scale * scalar_t((tmp1 >> 15) & 0x7) - zero) * blockvec[k + 5];
	res += (scale * scalar_t((tmp1 >> 18) & 0x7) - zero) * blockvec[k + 6];
	res += (scale * scalar_t((tmp1 >> 21) & 0x7) - zero) * blockvec[k + 7];
	res += (scale * scalar_t((tmp1 >> 24) & 0x7) - zero) * blockvec[k + 8];
	res += (scale * scalar_t((tmp1 >> 27) & 0x7) - zero) * blockvec[k + 9];

	i += width;
	tmp2 = as_unsigned(mat[i]);
	tmp = (tmp1 >> 30) \| ((tmp2 << 2) & 0x4);
	tmp2 >>= 1;
	res += (scale * scalar_t(tmp) - zero) * blockvec[k + 10];
	k += 11;

	res += (scale * scalar_t((tmp2 >> 0) & 0x7) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp2 >> 3) & 0x7) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp2 >> 6) & 0x7) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp2 >> 9) & 0x7) - zero) * blockvec[k + 3];
	res += (scale * scalar_t((tmp2 >> 12) & 0x7) - zero) * blockvec[k + 4];
	res += (scale * scalar_t((tmp2 >> 15) & 0x7) - zero) * blockvec[k + 5];
	res += (scale * scalar_t((tmp2 >> 18) & 0x7) - zero) * blockvec[k + 6];
	res += (scale * scalar_t((tmp2 >> 21) & 0x7) - zero) * blockvec[k + 7];
	res += (scale * scalar_t((tmp2 >> 24) & 0x7) - zero) * blockvec[k + 8];
	res += (scale * scalar_t((tmp2 >> 27) & 0x7) - zero) * blockvec[k + 9];

	i += width;
	tmp1 = as_unsigned(mat[i]);
	tmp = (tmp2 >> 30) \| ((tmp1 << 1) & 0x6);
	tmp1 >>= 2;
	res += (scale * scalar_t(tmp) - zero) * blockvec[k + 10];
	k += 11;

	res += (scale * scalar_t((tmp1 >> 0) & 0x7) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp1 >> 3) & 0x7) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp1 >> 6) & 0x7) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp1 >> 9) & 0x7) - zero) * blockvec[k + 3];
	res += (scale * scalar_t((tmp1 >> 12) & 0x7) - zero) * blockvec[k + 4];
	res += (scale * scalar_t((tmp1 >> 15) & 0x7) - zero) * blockvec[k + 5];
	res += (scale * scalar_t((tmp1 >> 18) & 0x7) - zero) * blockvec[k + 6];
	res += (scale * scalar_t((tmp1 >> 21) & 0x7) - zero) * blockvec[k + 7];
	res += (scale * scalar_t((tmp1 >> 24) & 0x7) - zero) * blockvec[k + 8];
	res += (scale * scalar_t((tmp1 >> 27) & 0x7) - zero) * blockvec[k + 9];

	i += width;
	k += 10;
	}

	atomicAdd(&mul[b * width + w], res);
	}

	void vecquant4matmul_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize
	) {
	int batch = vec.size(0);
	int vec_height = vec.size(1);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	AT_DISPATCH_FLOATING_TYPES(
	vec.type(), "vecquant4matmul_cuda", ([&] {
	VecQuant4MatMulKernel<<<blocks, threads>>>(
	vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
	scales.data<scalar_t>(), zeros.data<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	})
	);
	}

	template <typename scalar_t>
	__global__ void VecQuant4MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	int b = blockIdx.z;
	int h = BLOCKHEIGHT4 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ scalar_t blockvec[BLOCKWIDTH];
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
	__syncthreads();

	scalar_t res = 0;
	int i = width * h + w;
	int g_h = h * 8;
	int k = 0;

	int z_w = w / 8;
	int z_mod = (w % 8) * 4;

	unsigned int tmp;

	while (k < BLOCKWIDTH) {
	tmp = as_unsigned(mat[i]);

	int g = (g_h + k) / groupsize;
	scalar_t scale = scales[g * width + w];
	scalar_t zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xF) + 1);

	res += (scale * scalar_t((tmp >> 0) & 0xF) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp >> 4) & 0xF) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp >> 8) & 0xF) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp >> 12) & 0xF) - zero) * blockvec[k + 3];
	res += (scale * scalar_t((tmp >> 16) & 0xF) - zero) * blockvec[k + 4];
	res += (scale * scalar_t((tmp >> 20) & 0xF) - zero) * blockvec[k + 5];
	res += (scale * scalar_t((tmp >> 24) & 0xF) - zero) * blockvec[k + 6];
	res += (scale * scalar_t((tmp >> 28) & 0xF) - zero) * blockvec[k + 7];

	i += width;
	k += 8;
	}

	atomicAdd(&mul[b * width + w], res);
	}

	void vecquant8matmul_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize
	) {
	int batch = vec.size(0);
	int vec_height = vec.size(1);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	AT_DISPATCH_FLOATING_TYPES(
	vec.type(), "vecquant8matmul_cuda", ([&] {
	VecQuant8MatMulKernel<<<blocks, threads>>>(
	vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
	scales.data<scalar_t>(), zeros.data<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	})
	);
	}

	template <typename scalar_t>
	__global__ void VecQuant8MatMulKernel(
	const scalar_t* __restrict__ vec,
	const int* __restrict__ mat,
	scalar_t* __restrict__ mul,
	const scalar_t* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	int b = blockIdx.z;
	int h = BLOCKHEIGHT8 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ scalar_t blockvec[BLOCKWIDTH];
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
	__syncthreads();

	scalar_t res = 0;
	int i = width * h + w;
	int g_h = h * 4;
	int k = 0;

	int z_w = w / 4;
	int z_mod = (w % 4) * 8;

	unsigned int tmp;

	while (k < BLOCKWIDTH) {
	tmp = as_unsigned(mat[i]);

	int g = (g_h + k) / groupsize;
	scalar_t scale = scales[g * width + w];
	scalar_t zero = scale * scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xFF) + 1);

	res += (scale * scalar_t((tmp >> 0) & 0xFF) - zero) * blockvec[k + 0];
	res += (scale * scalar_t((tmp >> 8) & 0xFF) - zero) * blockvec[k + 1];
	res += (scale * scalar_t((tmp >> 16) & 0xFF) - zero) * blockvec[k + 2];
	res += (scale * scalar_t((tmp >> 24) & 0xFF) - zero) * blockvec[k + 3];

	i += width;
	k += 4;
	}

	atomicAdd(&mul[b * width + w], res);
	}


	void vecquant2matmul_faster_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize,
	int vec_height
	) {
	int batch = vec.size(0);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT2 - 1) / BLOCKHEIGHT2,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	VecQuant2MatMulKernelFaster<<<blocks, threads>>>(
	(half2*) vec.data_ptr(),
	mat.data_ptr<int>(),
	mul.data_ptr<float>(),
	scales.data_ptr<float>(),
	zeros.data_ptr<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	}

	__global__ void VecQuant2MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	const int blockwidth2 = BLOCKWIDTH / 2;
	int b = blockIdx.z;
	int h = BLOCKHEIGHT2 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ half2 blockvec[blockwidth2];
	if (threadIdx.x < blockwidth2)
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * blockwidth2 + threadIdx.x];

	__shared__ half2 deq2[16][16];
	int val = threadIdx.x / 16;
	int off = threadIdx.x % 16;
	for (; val < 16; val += BLOCKWIDTH / 16) {
	deq2[val][off] = __halves2half2(
	__int2half_rn(val & 0x3), __int2half_rn(val >> 2)
	);
	}

	int i = width * h + w;
	int g_h = h * 16;
	int k = 0;

	int z_w = w / 16;
	int z_mod = (w % 16) * 2;

	float res = 0;
	half2 res2;

	unsigned int tmp;

	__syncthreads();

	while (k < blockwidth2) {
	int g = (g_h + (k * 2)) / groupsize;
	float scale_f = scales[g * width + w];
	half2 scale = __float2half2_rn(scale_f);
	half2 zero = __float2half2_rn(-(scale_f * (((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0x3) + 1)));

	res2 = {};
	tmp = as_unsigned(mat[i]);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xf][off], scale, zero), blockvec[k + 0], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 4) & 0xf][off], scale, zero), blockvec[k + 1], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xf][off], scale, zero), blockvec[k + 2], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 12) & 0xf][off], scale, zero), blockvec[k + 3], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xf][off], scale, zero), blockvec[k + 4], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 20) & 0xf][off], scale, zero), blockvec[k + 5], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xf][off], scale, zero), blockvec[k + 6], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 28) & 0xf][off], scale, zero), blockvec[k + 7], res2);
	i += width;
	k += 8;
	res += __half2float(res2.x) + __half2float(res2.y);
	}

	atomicAdd(&mul[b * width + w], res);
	}

	void vecquant3matmul_faster_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize,
	int vec_height
	) {
	int batch = vec.size(0);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT3 - 1) / BLOCKHEIGHT3,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	VecQuant3MatMulKernelFaster<<<blocks, threads>>>(
	(half2*) vec.data_ptr(),
	mat.data_ptr<int>(),
	mul.data_ptr<float>(),
	scales.data_ptr<float>(),
	zeros.data_ptr<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	}

	__global__ void VecQuant3MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	const int blockwidth2 = BLOCKWIDTH / 2;
	int b = blockIdx.z;
	int h = BLOCKHEIGHT3 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ half2 blockvec[blockwidth2];
	if (threadIdx.x < blockwidth2)
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * blockwidth2 + threadIdx.x];

	__shared__ half2 deq2[64][32];
	int val = threadIdx.x / 32;
	int off = threadIdx.x % 32;
	for (; val < 64; val += BLOCKWIDTH / 32) {
	deq2[val][off] = __halves2half2(
	__int2half_rn(val & 0x7), __int2half_rn(val >> 3)
	);
	}

	int i = width * h + w;
	int g_h = (h / 3) * 32;
	int k = 0;

	int z_w = (w / 32) * 3;
	int z_mod = w % 32;
	int z_bit;

	if (z_mod != 10){
	if (z_mod != 21){
	z_bit = z_mod;
	if (z_bit > 21){
	z_bit -= 22;
	z_bit *= 3;
	z_bit += 2;
	z_w += 2;
	} else if (z_bit > 10){
	z_bit -= 11;
	z_bit *= 3;
	z_bit += 1;
	z_w += 1;
	} else {
	z_bit *= 3;
	}
	} else {
	z_w += 1;
	}
	}

	float res = 0;
	half2 res2;

	unsigned int tmp1;
	unsigned int tmp2;
	unsigned int tmp;
	unsigned int z_tmp;

	__syncthreads();

	while (k < blockwidth2) {
	int g = (g_h + (k * 2)) / groupsize;
	float scale_f = scales[g * width + w];
	half2 scale = __float2half2_rn(scale_f);
	half2 zero;
	if (z_mod == 10) {
	z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 30) \| ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 2) & 0x4);
	zero = __float2half2_rn(-(scale_f * ((z_tmp) + 1)));
	} else if (z_mod == 21){
	z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 31) \| ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 1) & 0x6);
	zero = __float2half2_rn(-(scale_f * ((z_tmp) + 1)));
	} else {
	zero = __float2half2_rn(-(scale_f * (((as_unsigned(zeros[g * zero_width + z_w]) >> z_bit) & 0x7) + 1)));
	}

	res2 = {};
	tmp1 = as_unsigned(mat[i]);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 24) & 0x3f][off], scale, zero), blockvec[k + 4], res2);
	i += width;
	tmp2 = as_unsigned(mat[i]);
	tmp = (tmp1 >> 30) \| ((tmp2 << 2) & 0x3c);
	res2 = __hfma2(__hfma2(deq2[tmp][off], scale, zero), blockvec[k + 5], res2);
	tmp2 >>= 4;
	k += 6;
	res2 = __hfma2(__hfma2(deq2[(tmp2 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp2 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp2 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp2 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
	i += width;
	tmp1 = as_unsigned(mat[i]);
	tmp = (tmp2 >> 24) \| ((tmp1 << 4) & 0x30);
	res2 = __hfma2(__hfma2(deq2[tmp][off], scale, zero), blockvec[k + 4], res2);
	tmp1 >>= 2;
	k += 5;
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp1 >> 24) & 0x3f][off], scale, zero), blockvec[k + 4], res2);
	i += width;
	k += 5;
	res += __half2float(res2.x) + __half2float(res2.y);
	}

	atomicAdd(&mul[b * width + w], res);
	}

	void vecquant4matmul_faster_cuda(
	torch::Tensor vec,
	torch::Tensor mat,
	torch::Tensor mul,
	torch::Tensor scales,
	torch::Tensor zeros,
	int groupsize,
	int vec_height
	) {
	int batch = vec.size(0);
	int height = mat.size(0);
	int width = mat.size(1);
	int zero_width = zeros.size(1);

	dim3 blocks(
	(height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
	(width + BLOCKWIDTH - 1) / BLOCKWIDTH,
	batch
	);
	dim3 threads(BLOCKWIDTH);

	VecQuant4MatMulKernelFaster<<<blocks, threads>>>(
	(half2*) vec.data_ptr(),
	mat.data_ptr<int>(),
	mul.data_ptr<float>(),
	scales.data_ptr<float>(),
	zeros.data_ptr<int>(),
	batch, vec_height, height, width, zero_width, groupsize
	);
	}

	__global__ void VecQuant4MatMulKernelFaster(
	const half2* __restrict__ vec,
	const int* __restrict__ mat,
	float* __restrict__ mul,
	const float* __restrict__ scales,
	const int* __restrict__ zeros,
	int batch,
	int vec_height,
	int height,
	int width,
	int zero_width,
	int groupsize
	) {
	const int blockwidth2 = BLOCKWIDTH / 2;
	int b = blockIdx.z;
	int h = BLOCKHEIGHT4 * blockIdx.x;
	int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;

	__shared__ half2 blockvec[blockwidth2];
	if (threadIdx.x < blockwidth2)
	blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * blockwidth2 + threadIdx.x];

	__shared__ half2 deq2[256][8];
	int val = threadIdx.x / 8;
	int off = threadIdx.x % 8;
	for (; val < 256; val += BLOCKWIDTH / 8) {
	deq2[val][off] = __halves2half2(
	__int2half_rn(val & 0xF), __int2half_rn(val >> 4)
	);
	}

	int i = width * h + w;
	int g_h = h * 8;
	int k = 0;

	int z_w = w / 8;
	int z_mod = (w % 8) * 4;

	float res = 0;
	half2 res2;

	unsigned int tmp;

	__syncthreads();

	while (k < blockwidth2) {
	int g = (g_h + (k * 2)) / groupsize;
	float scale_f = scales[g * width + w];
	half2 scale = __float2half2_rn(scale_f);
	half2 zero = __float2half2_rn(-(scale_f * (((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xF) + 1)));

	res2 = {};
	tmp = as_unsigned(mat[i]);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scale, zero), blockvec[k + 0], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scale, zero), blockvec[k + 1], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scale, zero), blockvec[k + 2], res2);
	res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scale, zero), blockvec[k + 3], res2);
	i += width;
	k += 4;
	res += __half2float(res2.x) + __half2float(res2.y);
	}

	atomicAdd(&mul[b * width + w], res);
	}