// PUMP: core/cuda_deepm/kernels.cu
// Copyright 2022-present NAVER Corp.
// CC BY-NC-SA 4.0
// Available only for non-commercial use
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) < (y) ? (y) : (x))
#define inf std::numeric_limits<float>::infinity()
#define CHECK_CUDA(tensor) { \
    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
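// Block-scoped atomics (atomicMax_block) only exist on compute capability 6.0+; fall back
// to the device-wide atomicMax on older architectures (and when __CUDA_ARCH__ is undefined).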
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
#define atomicMax_block atomicMax
#endif
template <typename scalar_t>
__global__ void forward_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = LH2;
    const auto UW2 = LW2;

    // decompose the flat thread index into (uh1, uw1, uh2, uw2), row-major over `upper`
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // then, accumulate the 4 children
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;
        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2 = uh2 + 1 - 2*v;
        if (lh2 < 0 || lh2 >= LH2) continue;
        const int lw2 = uw2 + 1 - 2*u;
        if (lw2 < 0 || lw2 >= LW2) continue;

        res += weight * lower[lh1][lw1][lh2][lw2];
        nrm += weight;
    }

    // normalize output: when some children fell outside the correlation volume (nrm < sumw),
    // the divisor sumw*(nrm/sumw)^norm interpolates between nrm (norm=1) and sumw (norm=0)
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);

    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}
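/* Aggregate a 4D correlation volume from the lower pyramid level into the upper one
   (written in place into `upper`).  `lower` has shape (LH1, LW1, LH2, LW2); `upper` must be
   pre-allocated, with its first two dims enlarged by one cell when level == 1.  `weights`,
   if given, holds one float per (lh1, lw1) cell; the returned tensor contains the summed
   child weights for each upper cell. */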
torch::Tensor forward_agg_cuda( int level, float norm, const torch::Tensor lower,
                                const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    // one thread per upper cell
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_agg_cuda", ([&] {
        forward_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
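/* Illustrative call (not part of this file): for level 1 the parent grid grows by one cell
   in the first two dimensions while the correlation dimensions stay the same.  The shapes
   and the `norm` value below are placeholders, not values used by the repository.

     auto lower = torch::rand({LH1, LW1, LH2, LW2}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
     auto upper = torch::zeros({LH1 + 1, LW1 + 1, LH2, LW2}, lower.options());
     auto new_w = forward_agg_cuda(1, 0.9f, lower, {}, upper);  // {} = no per-cell weights
*/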
template <typename scalar_t>
__global__ void forward_pool_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        // const int UH1, const int UW1, const int UH2, const int UW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = (LH2-1)/2 + 1; // the correlation planes are max-pooled by 2
    const auto UW2 = (LW2-1)/2 + 1;

    // decompose the flat thread index into (uh1, uw1, uh2, uw2), row-major over `upper`
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // then, accumulate the 4 children
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;
        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2_ = 2*(uh2 + 1 - 2*v); // position in lower
        const int lw2_ = 2*(uw2 + 1 - 2*u);

        // 3x3 max-pool around (lh2_, lw2_) in the lower correlation plane
        float lower_max = -inf;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
            const int lh2 = lh2_ + j;
            if (lh2 < 0 || lh2 >= LH2) continue;
            #pragma unroll
            for (int i = -1; i <= 1; i++) {
                const int lw2 = lw2_ + i;
                if (lw2 < 0 || lw2 >= LW2) continue;
                float l = lower[lh1][lw1][lh2][lw2];
                lower_max = MAX(lower_max, l);
            }}
        if (lower_max == -inf) continue;

        res += weight * lower_max;
        nrm += weight;
    }

    // normalize output (same scheme as forward_agg_cuda_kernel)
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);

    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}
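/* Fused variant of forward_agg_cuda: aggregates the four children exactly as above, but each
   child contribution is the 3x3 maximum around the stride-2 position in `lower`, so the
   correlation planes of `upper` are ((L-1)/2 + 1) in size, i.e. half those of `lower`. */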
torch::Tensor forward_pool_agg_cuda( int level, float norm, const torch::Tensor lower,
                                     const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    TORCH_CHECK( UH2 == (LH2-1)/2+1 && UW2 == (LW2-1)/2+1, "lower level should be twice as big" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    // one thread per upper cell
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_pool_agg_cuda", ([&] {
        forward_pool_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            // UH1, UW1, UH2, UW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
// true iff lower <= var < upper
__device__ inline int in(int lower, int var, int upper) {
    return lower <= var && var < upper;
}
// map a boolean to a sign: true -> +1, false -> -1
__device__ inline int sl(bool b) {
    return b ? 1 : -1;
}
__device__ short atomicMaxShort(short* address, short val) {
    unsigned int *base_address = (unsigned int *)((size_t)address & ~3); // multiple of 4
    unsigned int order_from[] = {0x0010, 0x0032}; // either bytes[0:2] or bytes[2:4]
    unsigned int from = order_from[((size_t)address & 3) / 2];
    unsigned int order_back[] = {0x3254, 0x5410}; // right-to-left
    unsigned int back = order_back[((size_t)address & 3) / 2];

    unsigned int old, assumed, max_, new_;
    old = *base_address;
    do {
        assumed = old;
        max_ = max(val, (short)__byte_perm(old, 0, from)); // extract word
        new_ = __byte_perm(old, max_, back);               // replace word
        old = atomicCAS(base_address, assumed, new_);
    } while (assumed != old);
    return old;
}
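/* How the selectors above work: __byte_perm(x, y, s) builds a 32-bit result whose byte k is
   picked by nibble k of s from the byte pool {x.b0..x.b3 = 0..3, y.b0..y.b3 = 4..7}.
   For a short at offset 0 within the aligned word, from = 0x0010 copies old bytes 0..1 into
   the low half-word (which the cast to short keeps), and back = 0x3254 writes max_'s low
   half-word while preserving old bytes 2..3; the 0x0032 / 0x5410 pair does the same for a
   short at offset 2.  The atomicCAS loop retries until the containing 32-bit word was not
   modified concurrently. */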
template <typename scalar_t>
__device__ inline void TplAtomicMax_block( scalar_t* before, scalar_t after ) { assert(!"atomicMax not implemented for this dtype"); }
template <>
__device__ inline void TplAtomicMax_block( at::Half* before, at::Half after ) { atomicMaxShort( (int16_t*)before, *(int16_t*)&after ); }
template <>
__device__ inline void TplAtomicMax_block( float* before, float after ) { atomicMax_block( (int32_t*)before, *(int32_t*)&after ); }
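/* The float specialization reinterprets the value as a 32-bit integer: for non-negative
   IEEE-754 floats the integer ordering matches the float ordering, so an integer atomicMax
   implements a float max.  The half specialization relies on the same property through
   atomicMaxShort.  (The correlation scores handled here are assumed to be non-negative.) */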
template <typename scalar_t>
__global__ void backward_agg_unpool_cuda_kernel(
        const int UH1, const int UW1,
        const int UH2, const int UW2,
        const int LH2, const int LW2,
        const int gap_left, const int gap_right,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower ) {
    /* Each block takes care of a single layer lower[lh1,lw1,:,:], with one thread per
       stride-2 cell.  Thread 0 allocates a scratch buffer from the device heap (global
       memory) and frees it at the end; only the pointer lives in shared memory.
    */
    // const int LH1 = gridDim.y;
    // const int LW1 = gridDim.x;
    const int lh1 = blockIdx.y;
    const int lw1 = blockIdx.x;
    const int UHW2 = UH2 * UW2; // upper layer size

    __shared__ float* _shared_addr;
    if (threadIdx.x == 0)
        do { _shared_addr = new float [2*UHW2]; } // for each upper place, we have (best, bestp)
        while (!_shared_addr);                    // waiting for memory to be available...
    __syncthreads();
    float* layer_best  = _shared_addr;            // values stored at even slots
    int*   layer_bestp = (int*)(_shared_addr+1);  // positions stored at odd slots (interleaved)
    assert( layer_best );

    /* First pass: we recover the position and values of all local maxima in the layer */
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;
        const int lx = 2*ux; // lower pos from upper pos
        const int ly = 2*uy;

        // arg-max over the 3x3 neighbourhood: local maximum and its position
        float best = -inf;
        int bestp = 0;
        #pragma unroll
        for (int j_ = -1; j_ <= 1; j_++) {
            const int j = ly + j_;
            if (j < 0 || j >= LH2) continue;
            #pragma unroll
            for (int i_ = -1; i_ <= 1; i_++) {
                const int i = lx + i_;
                if (i < 0 || i >= LW2) continue;
                float cur = lower[lh1][lw1][j][i];
                if (cur > best) { best = cur; bestp = j*LW2+i; }
            }}
        layer_best [2*idx] = best;
        layer_bestp[2*idx] = bestp;
    }
    __syncthreads();

    /* Second pass: we update the local maxima according to the upper layer */
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;

        // max-pool the additional value from the upper layer
        scalar_t add = 0;
        for (int v = -gap_left; v <= gap_right; v += gap_right+gap_left) {
            for (int u = -gap_left; u <= gap_right; u += gap_right+gap_left) {
                const int uh1 = lh1 + v, uw1 = lw1 + u;
                const int uh2 = uy+sl(v>0), uw2 = ux+sl(u>0);
                if (in(0, uh1, UH1) && in(0, uw1, UW1) && in(0, uh2, UH2) && in(0, uw2, UW2))
                    add = MAX(add, upper[uh1][uw1][uh2][uw2]);
            }}

        // grab local maxima
        float best = layer_best[2*idx];
        int bestp = layer_bestp[2*idx];
        const int lx = bestp % LW2;
        const int ly = bestp / LW2;
        // printf("UH=%d,UW=%d: uy=%d,ux=%d --> best=%g at ly=%d,lx=%d\n", UH,UW, uy,ux, best, ly,lx);

        scalar_t* before = & lower[lh1][lw1][ly][lx];
        scalar_t after = best + add;
        TplAtomicMax_block<scalar_t>( before, after );
    }
    __syncthreads();
    if (threadIdx.x == 0)
        delete[] _shared_addr; // array form matches the new[] above
}
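/* Back-propagate scores from the upper pyramid level into `lower` (modified in place):
   for every stride-2 cell of each lower layer, take the maximum of the (up to) four
   corresponding upper cells, add it to the value of the local 3x3 argmax in `lower`, and
   write the result back with a block-scoped atomic max.  With exclude_borders set, the
   argmax cannot reach the last row and column. */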
void backward_agg_unpool_cuda( int level, const torch::Tensor upper, torch::Tensor lower, bool exclude_borders ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );

    const int xb = exclude_borders; // local_argmax cannot reach the bottom and right borders
    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int64_t MAX_THREADS = 1024;
    const int64_t THREADS_PER_LAYER = MIN(UH2*UW2, MAX_THREADS);

    // one block for each layer, one thread per local-max
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(upper.scalar_type(), "backward_agg_unpool_cuda", ([&] {
        backward_agg_unpool_cuda_kernel<<<dim3(LW1,LH1), THREADS_PER_LAYER>>>(
            UH1, UW1, UH2, UW2, LH2-xb, LW2-xb,
            gap_left, gap_right,
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}
template <typename scalar_t>
__global__ void max_pool3d_cuda_kernel(
        const int BS, const int NC, const int IH, const int IW, const int OH, const int OW,
        const int ks, const int stride,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> tensor,
        torch::PackedTensorAccessor64<scalar_t,3,torch::RestrictPtrTraits> maxima,
        torch::PackedTensorAccessor64<int64_t, 3,torch::RestrictPtrTraits> indices ) {
    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % OW; idx /= OW;
    const int y = idx % OH; idx /= OH;
    const int b = idx;
    if (b >= BS) return;

    float best = -inf;
    int64_t best_pos = 0;
    for (int64_t c = 0; c < NC; c++) {
        for (int j = stride*y; j < stride*y+ks; j++) {
            for (int i = stride*x; i < stride*x+ks; i++) {
                // assert( b < BS and c < NC and j < IH and i < IW );
                float cur = tensor[b][c][j][i];
                if (cur > best) { best = cur; best_pos = (c*IH + j)*IW + i; }
            }}}

    // assert( b < BS and y < OH and x < OW );
    maxima [b][y][x] = best;
    indices[b][y][x] = best_pos;
}
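/* Max-pool a BxCxHxW tensor jointly over the channel dimension and a ks x ks spatial window
   (applied with the given stride), writing the best value into `maxima` (BxOHxOW) and its
   flat position within the CxIHxIW volume into `indices`.  A flat index decodes as
   c = idx / (IH*IW),  y = (idx / IW) % IH,  x = idx % IW. */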
void max_pool3d_cuda( const torch::Tensor tensor, const int kernel_size, const int stride,
                      torch::Tensor maxima, torch::Tensor indices ) {
    CHECK_CUDA(tensor);
    TORCH_CHECK(tensor.dim() == 4, "tensor should be 4-dimensional: BxCxHxW");
    const int BS = tensor.size(0);
    const int NC = tensor.size(1);
    const int IH = tensor.size(2); // input height
    const int IW = tensor.size(3); // input width

    // output size
    TORCH_CHECK( maxima.sizes() == indices.sizes(), "maxima and indices should have the same shape" );
    TORCH_CHECK( BS == maxima.size(0), "bad batch size" );
    const int OH = maxima.size(1);
    const int OW = maxima.size(2);

    const int64_t THREADS_PER_LAYER = 512;
    const int64_t N_BLOCKS = (BS*OH*OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    // one thread per output cell
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.scalar_type(), "max_pool3d_cuda", ([&] {
        max_pool3d_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            BS, NC, IH, IW, OH, OW, kernel_size, stride,
            tensor. packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            maxima. packed_accessor64<scalar_t,3,torch::RestrictPtrTraits>(),
            indices.packed_accessor64<int64_t,3,torch::RestrictPtrTraits>());
    }));
}
// apply one row of a 2x3 affine matrix m to the point (x, y)
__device__ inline float ptdot( const float* m, float x, float y ) {
    return x*m[0] + y*m[1] + m[2];
}
__device__ inline float sqr(float v) {
    return v*v;
}
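/* Merge correspondences computed on a rescaled/rotated image into the reference grid
   `all_corres`.  Each thread handles one reference bin: it maps the bin center through the
   2x3 matrix `inv_rot`, scans the 3x3 neighbouring source bins for the smallest squared
   distance between their stored position (channels offset, offset+1) and the bin center
   (clamped to dmax^2), then copies over any neighbouring correspondence whose squared
   distance is within a factor tol2 of that best and whose score (channel 4) beats the one
   currently stored. */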
__global__ void merge_corres_cuda_kernel(
        const int OH, const int OW, const int OZ, const int IH, const int IW,
        const float dmax2, int offset, const float* inv_rot, const int all_step,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> corres_a,
        torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> all_corres_a ) {
    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = idx % OW; idx /= OW;
    const int j = idx;
    if (j >= OH) return;

    const float tol2 = 2*2; // squared tolerance on the reprojection distance
    auto all_cor = all_corres_a[j][i];

    // center of the bin in the reference frame
    float x = i*all_step + all_step/2;
    float y = j*all_step + all_step/2;

    // center of the bin on the rescaled+rotated image
    float xr = ptdot( inv_rot + 0, x, y );
    float yr = ptdot( inv_rot + 3, x, y );

    // iterate on the nearby bins
    int xb = (int)(0.5 + xr/4); // rescaled+rotated desc always has step 4
    int yb = (int)(0.5 + yr/4);

    // first pass: smallest squared distance to the bin center (clamped to dmax2)
    float best = dmax2;
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d < best) best = d;
        }}

    // second pass: merge all correspondences that are close enough and better scored
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d <= tol2*best) { // spatially close
                // merge correspondence if its score is better than the current one
                if (cor[4] > all_cor[4])
                    for (int k = 0; k < OZ; k++) all_cor[k] = cor[k];
            }
        }}
}
void merge_corres_cuda( const torch::Tensor corres, const int offset, const torch::Tensor _inv_rot,
                        const float dmax, torch::Tensor all_corres, const int all_step ) {
    CHECK_CUDA( corres );
    CHECK_CUDA( all_corres );
    CHECK_CUDA( _inv_rot );
    TORCH_CHECK(_inv_rot.is_contiguous(), "inv_rot should be contiguous" );

    const int IH = corres.size(0);
    const int IW = corres.size(1);
    const int IZ = corres.size(2);
    const int OH = all_corres.size(0);
    const int OW = all_corres.size(1);
    const int OZ = all_corres.size(2);
    TORCH_CHECK( IZ == OZ, "corres and all_corres should have the same shape[2]" );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = (OH * OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    merge_corres_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
        OH, OW, OZ, IH, IW, dmax*dmax, offset, _inv_rot.data_ptr<float>(), all_step,
        corres.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
        all_corres.packed_accessor32<float,3,torch::RestrictPtrTraits>());
    CHECK_KERNEL();
}
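/* Radially re-weight each correlation layer corr[j,i,:,:] around its target center
   (targets[j][i] = (cx, cy), NaN meaning "no target"): cells within `radius` of the center
   are multiplied by max(alpha, 1), while cells at squared distance d2 > radius^2 are
   multiplied by 1 - min(alpha, 1) * (1 - radius^2 / d2), which decays smoothly from 1 at
   the radius towards 1 - min(alpha, 1) far away. */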
template <typename scalar_t>
__global__ void mask_correlations_radial_cuda_kernel(
        float radius, const float alpha,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> targets,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> corr ) {
    // shape of the 4D correlation volume (macros, so they also work on the host tensor below)
    #define H1 ((int)corr.size(0))
    #define W1 ((int)corr.size(1))
    #define H2 ((int)corr.size(2))
    #define W2 ((int)corr.size(3))

    // each block takes care of one layer corr[j,i,:,:]
    const int j = blockIdx.x / W1;
    const int i = blockIdx.x % W1;
    if (j >= H1) return;

    // read the target center
    const float cx = targets[j][i][0];
    const float cy = targets[j][i][1];
    if (cx != cx || cy != cy) return; // undefined center (NaN)

    radius *= radius; // squared
    const float alpha_out = (alpha > 1 ? 1 : alpha);
    const float alpha_in  = (alpha < 1 ? 1 : alpha);

    for (int idx = threadIdx.x; idx < H2*W2; idx += blockDim.x) {
        const int v = idx / W2;
        const int u = idx % W2;
        // compute weighting
        float dis2 = sqr(u - cx) + sqr(v - cy);
        float mul = alpha_in;
        if (dis2 > radius)
            mul = 1 - alpha_out*(1 - radius / dis2);
        corr[j][i][v][u] *= mul;
    }
}
void mask_correlations_radial_cuda( torch::Tensor corr, const torch::Tensor targets,
                                    const float radius, const float alpha) {
    CHECK_CUDA( corr );
    CHECK_CUDA( targets );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = H1*W1; // one block per (j,i) layer, using the macros defined above
    #undef H1
    #undef W1
    #undef H2
    #undef W2

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(corr.scalar_type(), "mask_correlations_radial_cuda", ([&] {
        mask_correlations_radial_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            radius, alpha,
            targets.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
            corr.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}
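/* The Python-facing bindings for these entry points are defined elsewhere in the extension
   (not shown in this file).  The sketch below is only illustrative of what such a binding
   could look like; the exposed names and docstrings here are assumptions, not the
   repository's actual API.

     #include <torch/extension.h>
     PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         m.def("forward_agg",              &forward_agg_cuda,              "aggregate correlation pyramid level (CUDA)");
         m.def("forward_pool_agg",         &forward_pool_agg_cuda,         "max-pool + aggregate (CUDA)");
         m.def("backward_agg_unpool",      &backward_agg_unpool_cuda,      "back-propagate scores to lower level (CUDA)");
         m.def("max_pool3d",               &max_pool3d_cuda,               "max-pool over channels and space (CUDA)");
         m.def("merge_corres",             &merge_corres_cuda,             "merge correspondences into reference grid (CUDA)");
         m.def("mask_correlations_radial", &mask_correlations_radial_cuda, "radial re-weighting of correlations (CUDA)");
     }
*/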