// Copyright 2022-present NAVER Corp.
// CC BY-NC-SA 4.0
// Available only for non-commercial use

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

#define MIN(x, y)           ((x) < (y) ? (x) : (y))
#define MAX(x, y)           ((x) < (y) ? (y) : (x))
#define inf std::numeric_limits<float>::infinity()

#define CHECK_CUDA(tensor) {\
    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}


#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
#define atomicMax_block atomicMax
#endif
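
// Note: the *_block atomics (block-scoped) only exist for compute capability >= 6.0,
// hence the fallback above to the device-wide atomicMax on older architectures.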


template <typename scalar_t>
__global__ void forward_agg_cuda_kernel( 
        const int LH1, const int LW1, const int LH2, const int LW2, 
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
              torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = LH2;
    const auto UW2 = LW2;

    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // accumulate the 4 children from the lower level
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;
        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2 = uh2 + 1 - 2*v;
        if (lh2 < 0 || lh2 >= LH2) continue;
        const int lw2 = uw2 + 1 - 2*u;
        if (lw2 < 0 || lw2 >= LW2) continue;

        res += weight * lower[lh1][lw1][lh2][lw2];
        nrm += weight;
    }

    // normalize output
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

torch::Tensor forward_agg_cuda( int level, float norm, const torch::Tensor lower, 
                                const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0; // 0, 1, 2, 4, ...
    const int gap_right= 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    // flat 1-D launch: one thread per output cell of `upper`
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_agg_cuda", ([&] {
        forward_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2, 
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
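
/* Illustrative usage sketch (comment only, not part of the build): assuming this
   function is exposed through the extension as-is, with example shapes and values.
   The 4D tensors are correlation volumes laid out as (H1, W1, H2, W2); `upper` must
   be pre-allocated with H1/W1 grown by one when level == 1 (see the TORCH_CHECK above).

     const int level = 1;  const float norm = 0.5f;   // example values
     const int LH1 = 32, LW1 = 32, LH2 = 64, LW2 = 64;
     auto opts  = torch::dtype(torch::kFloat32).device(torch::kCUDA);
     auto lower = torch::rand({LH1, LW1, LH2, LW2}, opts);
     auto upper = torch::zeros({LH1 + (level==1), LW1 + (level==1), LH2, LW2}, opts);
     auto new_w = forward_agg_cuda(level, norm, lower, c10::nullopt, upper);
     // new_w has shape (UH1, UW1): the per-cell sum of child weights, presumably
     // passed back as `weights` at the next level.
*/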

template <typename scalar_t>
__global__ void forward_pool_agg_cuda_kernel( 
        const int LH1, const int LW1, const int LH2, const int LW2, 
        // const int UH1, const int UW1, const int UH2, const int UW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
              torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = (LH2-1)/2 + 1;
    const auto UW2 = (LW2-1)/2 + 1;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // accumulate the 4 children from the lower level
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;
        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2_ = 2*(uh2 + 1 - 2*v); // position in lower
        const int lw2_ = 2*(uw2 + 1 - 2*u);
        float lower_max = -inf;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
          const int lh2 = lh2_ + j;
          if (lh2 < 0 || lh2 >= LH2) continue;
          #pragma unroll
          for (int i = -1; i <= 1; i++) {
            const int lw2 = lw2_ + i;
            if (lw2 < 0 || lw2 >= LW2) continue;
            float l = lower[lh1][lw1][lh2][lw2];
            lower_max = MAX(lower_max, l);
        }}
        if (lower_max == -inf) continue;

        res += weight * lower_max;
        nrm += weight;
    }

    // normalize output
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

torch::Tensor forward_pool_agg_cuda( int level, float norm, const torch::Tensor lower, 
                                     const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    TORCH_CHECK( UH2 == (LH2-1)/2+1 && UW2 == (LW2-1)/2+1, "lower level should be twice as big" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0; // 0, 1, 2, 4, ...
    const int gap_right= 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat));
    
    // flat 1-D launch: one thread per output cell of `upper`
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_pool_agg_cuda", ([&] {
        forward_pool_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2, 
            // UH1, UW1, UH2, UW2, 
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
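
// forward_pool_agg_cuda fuses a stride-2, 3x3 max-pool over the (H2, W2) dimensions
// into the same aggregation as forward_agg_cuda, so the upper volume is half-resolution
// in its last two dimensions: UH2 = (LH2-1)/2 + 1, UW2 = (LW2-1)/2 + 1.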

__device__ inline int in(int lower, int var, int upper) {
    return lower <= var && var < upper;
}
__device__ inline int sl(bool b) {
    return b ? 1 : -1;
}

__device__ short atomicMaxShort(short* address, short val) {
    unsigned int *base_address = (unsigned int *)((size_t)address & ~3); // multiple of 4
    
    unsigned int order_from[] = {0x0010, 0x0032}; // either bytes[0:2] or bytes[2:4]
    unsigned int from = order_from[((size_t)address & 3) / 2];
    
    unsigned int order_back[] = {0x3254, 0x5410}; // right-to-left 
    unsigned int back = order_back[((size_t)address & 3) / 2];
    unsigned int old, assumed, max_, new_;

    old = *base_address;
    do {
        assumed = old;
        max_ = max(val, (short)__byte_perm(old, 0, from)); // extract word
        new_ = __byte_perm(old, max_, back); // replace word
        old = atomicCAS(base_address, assumed, new_);
    } while (assumed != old);
    return old;
}
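
// atomicMaxShort emulates a 16-bit atomic max: it runs an atomicCAS loop on the
// 4-byte-aligned word containing `address`, using __byte_perm to extract the target
// half-word, take the max, and splice the result back into the right half.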

template <typename scalar_t>
__device__ inline void TplAtomicMax_block( scalar_t* before, scalar_t after ) { assert(!"atomicMax not implemented for this dtype"); }
template <>
__device__ inline void TplAtomicMax_block( at::Half* before, at::Half after ) { atomicMaxShort( (int16_t*)before, *(int16_t*)&after ); }
template <>
__device__ inline void TplAtomicMax_block( float* before, float after ) { atomicMax_block( (int32_t*)before, *(int32_t*)&after ); }
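// Caveat: both specializations compare float/half values through their signed integer
// bit patterns, which matches the true numeric ordering only for non-negative values;
// the code therefore implicitly assumes the aggregated scores are non-negative.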

template <typename scalar_t>
__global__ void backward_agg_unpool_cuda_kernel( 
        const int UH1, const int UW1, 
        const int UH2, const int UW2, 
        const int LH2, const int LW2, 
        const int gap_left, const int gap_right,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
              torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower ) {

    /* Each block takes care of a single layer, i.e. lower[:,:,0::2,0::2].
       The first thread allocates some global (device-heap) memory and frees it at the end.
    */
    // const int LH1 = gridDim.x;
    // const int LW1 = gridDim.y;
    const int lh1 = blockIdx.y;
    const int lw1 = blockIdx.x;
    const int UHW2 = UH2 * UW2; // upper layer size

    __shared__ float* _shared_addr;
    if (threadIdx.x == 0)
        do{ _shared_addr = new float [2*UHW2]; } // for each upper place, we have (best, bestp)
        while(!_shared_addr); // waiting for memory to be available...
    __syncthreads();

    float * layer_best = _shared_addr;
    int * layer_bestp = (int*)(_shared_addr+1); // interleaved with layer_best: best at 2*idx, bestp at 2*idx+1
    assert( layer_best );

    /* First pass: we recover the position and values of all local maxima in the layer
    */ 
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;
        const int lx = 2*ux; // lower pos from upper pos
        const int ly = 2*uy;

        // argmax over the local 3x3 neighborhood
        float best = -inf;
        int bestp = 0;
        #pragma unroll
        for (int j_= -1; j_<= 1; j_++) {
          const int j = ly + j_;
          if (j < 0 || j >= LH2) continue;
          #pragma unroll
          for (int i_= -1; i_<= 1; i_++) {
            const int i = lx + i_;
            if (i < 0 || i >= LW2) continue;
            float cur = lower[lh1][lw1][j][i];
            if (cur > best) { best = cur; bestp = j*LW2+i; }
        }}
        layer_best[2*idx] = best;
        layer_bestp[2*idx] = bestp;
    }
    
    __syncthreads();
    
    /* Second pass: we update the local maxima according to the upper layer
    */ 
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;

        // max-pool the additional value from the upper layer
        scalar_t add = 0;
        for (int v = -gap_left; v <= gap_right; v += gap_right+gap_left) {
          for (int u = -gap_left; u <= gap_right; u += gap_right+gap_left) {
            const int uh1 = lh1 + v, uw1 = lw1 + u;
            const int uh2 = uy+sl(v>0), uw2 = ux+sl(u>0);
            if (in(0, uh1, UH1) && in(0, uw1, UW1) && in(0, uh2, UH2) && in(0, uw2, UW2))
                add = MAX(add, upper[uh1][uw1][uh2][uw2]);
        }}

        // grab local maxima
        float best = layer_best[2*idx];
        int bestp = layer_bestp[2*idx];
        const int lx = bestp % LW2;
        const int ly = bestp / LW2;

        // printf("UH=%d,UW=%d: uy=%d,ux=%d --> best=%g at ly=%d,lx=%d\n", UH,UW, uy,ux, best, ly,lx);
        scalar_t* before = & lower[lh1][lw1][ly][lx];
        scalar_t  after  = best + add;
        TplAtomicMax_block<scalar_t>( before, after );
    }

    __syncthreads();

    if (threadIdx.x == 0) 
        delete [] _shared_addr; // allocated with new[]
}
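
// In short: for every coarse cell (uy, ux) of each layer (lh1, lw1), the kernel
// (i) finds the argmax of lower[lh1][lw1] in the 3x3 window around (2*uy, 2*ux),
// (ii) max-pools the corresponding entries of the upper level, and (iii) writes
// best + pooled value back to the argmax location with an atomic max. Atomics are
// needed because neighbouring 3x3 windows overlap, so several threads may update
// the same lower-level cell.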

void backward_agg_unpool_cuda( int level, const torch::Tensor upper, torch::Tensor lower, bool exclude_borders ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    const int xb = exclude_borders; // local_argmax cannot reach the bottom and right borders

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0; // 0, 1, 2, 4, ...
    const int gap_right= 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int64_t MAX_THREADS = 1024;
    const int64_t THREADS_PER_LAYER = MIN(UH2*UW2, MAX_THREADS);

    // one block for each layer, one thread per local-max
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(upper.scalar_type(), "backward_agg_unpool_cuda", ([&] {
        backward_agg_unpool_cuda_kernel<<<dim3(LW1,LH1), THREADS_PER_LAYER>>>(
            UH1, UW1, UH2, UW2, LH2-xb, LW2-xb, 
            gap_left, gap_right,
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}

template <typename scalar_t>
__global__ void max_pool3d_cuda_kernel( 
        const int BS, const int NC, const int IH, const int IW, const int OH, const int OW, 
        const int ks, const int stride,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> tensor,
              torch::PackedTensorAccessor64<scalar_t,3,torch::RestrictPtrTraits> maxima,
              torch::PackedTensorAccessor64<int64_t,    3,torch::RestrictPtrTraits> indices ) {

    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % OW; idx /= OW;
    const int y = idx % OH; idx /= OH;
    const int b = idx;
    if (b >= BS) return;

    float best = -inf;
    int64_t best_pos = 0;
    for (int64_t c = 0; c < NC; c++) {
      for (int j = stride*y; j < stride*y+ks; j++) {
        for (int i = stride*x; i < stride*x+ks; i++) {
            // assert( b < BS and c < NC and j < IH and i < IW );
            float cur = tensor[b][c][j][i];
            if (cur > best) {best = cur; best_pos = (c*IH + j)*IW+ i; }
    }}}

    // assert( b < BS and y < OH and x < OW );
    maxima [b][y][x] = best;
    indices[b][y][x] = best_pos;
}

void max_pool3d_cuda( const torch::Tensor tensor, const int kernel_size, const int stride,
                            torch::Tensor maxima, torch::Tensor indices ) {
    CHECK_CUDA(tensor);
    TORCH_CHECK(tensor.dim() == 4, "tensor should be 4-dimensional: BxCxHxW");
    const int BS = tensor.size(0);
    const int NC = tensor.size(1);
    const int IH = tensor.size(2); // input height
    const int IW = tensor.size(3); // input width

    // output size
    TORCH_CHECK( maxima.sizes() == indices.sizes(), "maxima and indices should have the same shape" );
    TORCH_CHECK( BS == maxima.size(0), "bad batch size" );
    const int OH = maxima.size(1);
    const int OW = maxima.size(2);

    const int64_t THREADS_PER_LAYER = 512;
    const int64_t N_BLOCKS = (BS*OH*OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;
    
    // flat 1-D launch: one thread per output element
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.scalar_type(), "max_pool3d_cuda", ([&] {
       max_pool3d_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            BS, NC, IH, IW, OH, OW, kernel_size, stride,
            tensor. packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            maxima. packed_accessor64<scalar_t,3,torch::RestrictPtrTraits>(),
            indices.packed_accessor64<int64_t,3,torch::RestrictPtrTraits>());
    }));
}
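
/* Illustrative usage sketch (comment only): each output (b, y, x) is the maximum of
   tensor[b, :, stride*y : stride*y+ks, stride*x : stride*x+ks]. The kernel does not
   bound-check the window, so the outputs must be sized such that the window stays
   inside the input, e.g. (example shapes and values):

     auto input = torch::rand({4, 128, 480, 640}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
     const int ks = 8, stride = 8;
     const int OH = (480 - ks) / stride + 1, OW = (640 - ks) / stride + 1;
     auto maxima  = torch::empty({4, OH, OW}, input.options());
     auto indices = torch::empty({4, OH, OW}, input.options().dtype(torch::kInt64));
     max_pool3d_cuda(input, ks, stride, maxima, indices);
     // indices hold flat offsets (c*IH + j)*IW + i into each (C, H, W) slice.
*/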


__device__ inline float ptdot( const float* m, float x, float y ) {
  return x*m[0] + y*m[1] + m[2];
}

__device__ inline float sqr(float v) {
    return v*v;
}


__global__ void merge_corres_cuda_kernel( 
            const int OH, const int OW, const int OZ, const int IH, const int IW, 
            const float dmax2, int offset, const float* inv_rot, const int all_step,
            const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> corres_a,
                  torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> all_corres_a ) {

    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = idx % OW; idx /= OW;
    const int j = idx;
    if (j >= OH) return;

    const float tol2 = 2*2; // squared
    auto all_cor = all_corres_a[j][i];
    
    // center of the bin in the reference frame
    float x = i*all_step + all_step/2;
    float y = j*all_step + all_step/2;

    // center of the bin on the rescaled+rotated image
    float xr = ptdot( inv_rot + 0, x, y ); 
    float yr = ptdot( inv_rot + 3, x, y );

    // iterate on the nearby bins
    int xb = (int)(0.5+ xr/4); // rescaled+rotated desc always has step 4
    int yb = (int)(0.5+ yr/4);
    
    float best = dmax2;
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
      #pragma unroll
      for (int _u = -1; _u <= 1; _u++) {
        const int v = yb+_v, u = xb+_u;
        if (!(in(0, v, IH) && in(0, u, IW))) continue;
        auto cor = corres_a[v][u];
        float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
        if (d < best)  best = d;
    }}

    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
      #pragma unroll
      for (int _u = -1; _u <= 1; _u++) {
        const int v = yb+_v, u = xb+_u;
        if (!(in(0, v, IH) && in(0, u, IW))) continue;
        auto cor = corres_a[v][u];
        float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
        if (d <= tol2*best) { // spatially close
            // merge correspondence if score is better than actual
            if (cor[4] > all_cor[4])
              for (int k = 0; k < OZ; k++) all_cor[k] = cor[k];
          }
    }}
}

void merge_corres_cuda( const torch::Tensor corres, const int offset, const torch::Tensor _inv_rot, 
                        const float dmax, torch::Tensor all_corres, const int all_step ) {
    CHECK_CUDA( corres );
    CHECK_CUDA( all_corres );
    CHECK_CUDA( _inv_rot );
    TORCH_CHECK(_inv_rot.is_contiguous(), "inv_rot should be contiguous" );

    const int IH = corres.size(0);
    const int IW = corres.size(1);
    const int IZ = corres.size(2);
    const int OH = all_corres.size(0);
    const int OW = all_corres.size(1);
    const int OZ = all_corres.size(2);
    TORCH_CHECK( IZ == OZ, "corres and all_corres should have the same shape[2]" );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = (OH * OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;
    
    merge_corres_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
        OH, OW, OZ, IH, IW, dmax*dmax, offset, _inv_rot.data_ptr<float>(), all_step,
                corres.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
            all_corres.packed_accessor32<float,3,torch::RestrictPtrTraits>());
    CHECK_KERNEL();
}
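
// Merge strategy: for each bin of the full-resolution map `all_corres`, the kernel first
// finds, among the 3x3 neighbouring bins of the rescaled/rotated map `corres`, the
// correspondence endpoint closest to the bin centre (capped at dmax^2); it then copies
// into the bin any nearby correspondence whose squared distance is within a factor
// tol^2 of that best distance and whose confidence (column 4) beats the one currently stored.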


template <typename scalar_t>
__global__ void mask_correlations_radial_cuda_kernel( 
            float radius, const float alpha,
            const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> targets,
                  torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> corr ) {

    // shape of the 4D correlation volume; these macros are also used in the host launcher
    // below (which #undef's them), since both the accessor and the Tensor expose .size()
    #define H1 ((int)corr.size(0))
    #define W1 ((int)corr.size(1))
    #define H2 ((int)corr.size(2))
    #define W2 ((int)corr.size(3))

    // each block takes care of one layer corr[j,i,:,:]
    const int j = blockIdx.x / W1;
    const int i = blockIdx.x % W1;
    if (j >= H1) return;

    // read the target center
    const float cx = targets[j][i][0];
    const float cy = targets[j][i][1];
    if (cx != cx || cy != cy) return; // undefined center
    radius *= radius; // squared
    const float alpha_out = (alpha > 1 ? 1 : alpha);
    const float alpha_in = (alpha < 1 ? 1 : alpha);
    
    for (int idx = threadIdx.x; idx < H2*W2; idx += blockDim.x) {
        const int v = idx / W2;
        const int u = idx % W2;

        // compute weighting
        float dis2 = sqr(u - cx) + sqr(v - cy);
        float mul = alpha_in;
        if (dis2 > radius) 
            mul = 1 - alpha_out*(1 - radius / dis2);

        corr[j][i][v][u] *= mul; 
    }
}
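
// Effect: for each source cell (j, i) with a defined target centre, correlation values
// inside `radius` are scaled by max(alpha, 1), while values outside are scaled by
// 1 - min(alpha, 1) * (1 - radius^2 / dist^2). So alpha < 1 attenuates far-away
// responses towards (1 - alpha), while alpha > 1 boosts responses near the target
// and decays the rest towards 0.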

void mask_correlations_radial_cuda( torch::Tensor corr, const torch::Tensor targets, 
                                    const float radius, const float alpha) {
    CHECK_CUDA( corr );
    CHECK_CUDA( targets );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = H1*W1;

    #undef H1
    #undef W1
    #undef H2
    #undef W2

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(corr.scalar_type(), "mask_correlations_radial_cuda", ([&] {
        mask_correlations_radial_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            radius, alpha,
            targets.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
               corr.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}