#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <limits>

#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) < (y) ? (y) : (x))
#define inf std::numeric_limits<float>::infinity()

#define CHECK_CUDA(tensor) { \
    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}

// atomicMax_block requires compute capability >= 6.0; fall back to the device-wide atomicMax otherwise
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
#define atomicMax_block atomicMax
#endif

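// One step of pyramid aggregation: each upper-level cell (uh1, uw1, uh2, uw2) accumulates a
// weighted average of the correlations of its (up to) 4 children in the lower level; cells whose
// children fall partially outside the grid are penalized via `norm`. The summed child weights are
// also written to `new_weights` (once per (uh1, uw1)).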
template <typename scalar_t>
__global__ void forward_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    // the upper level grows by one cell in the first two dims when there is no left gap (level 1)
    const auto UH1 = LH1 + bool(!gap_left);
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = LH2;
    const auto UW2 = LW2;

    // one thread per upper-level cell (uh1, uw1, uh2, uw2)
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    float sumw = 0, nrm = 0, res = 0;

    // accumulate the 4 children of this cell in the lower level
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2 = uh2 + 1 - 2*v;
        if (lh2 < 0 || lh2 >= LH2) continue;
        const int lw2 = uw2 + 1 - 2*u;
        if (lw2 < 0 || lw2 >= LW2) continue;

        res += weight * lower[lh1][lw1][lh2][lw2];
        nrm += weight;
    }

    // down-weight cells whose children are partially out of bounds
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

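// Host wrapper: checks shapes, derives the child offsets (gap_left / gap_right) for this pyramid
// level, launches one thread per upper-level cell and returns the updated per-cell weight map.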
torch::Tensor forward_agg_cuda( int level, float norm, const torch::Tensor lower,
                                const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );

    const int gap_left  = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right = 1 << MAX(0, level-2);

    const int MAX_THREADS = 512;
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_agg_cuda", ([&] {
        forward_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}

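// Fused max-pooling + aggregation: same as forward_agg_cuda_kernel, except that each child value
// is first max-pooled over a 3x3 window (stride 2) of the lower level, so the last two dimensions
// of the upper level are half the size of the lower level.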
template <typename scalar_t>
__global__ void forward_pool_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left);
    const auto UW1 = LW1 + bool(!gap_left);
    // last two dims are subsampled by 2 with respect to the lower level
    const auto UH2 = (LH2-1)/2 + 1;
    const auto UW2 = (LW2-1)/2 + 1;

    // one thread per upper-level cell (uh1, uw1, uh2, uw2)
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    float sumw = 0, nrm = 0, res = 0;

    // accumulate the 4 children of this cell in the lower level
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        // max-pool a 3x3 window of the lower level centered at (lh2_, lw2_)
        const int lh2_ = 2*(uh2 + 1 - 2*v);
        const int lw2_ = 2*(uw2 + 1 - 2*u);
        float lower_max = -inf;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
            const int lh2 = lh2_ + j;
            if (lh2 < 0 || lh2 >= LH2) continue;
            #pragma unroll
            for (int i = -1; i <= 1; i++) {
                const int lw2 = lw2_ + i;
                if (lw2 < 0 || lw2 >= LW2) continue;
                float l = lower[lh1][lw1][lh2][lw2];
                lower_max = MAX(lower_max, l);
            }}
        if (lower_max == -inf) continue;

        res += weight * lower_max;
        nrm += weight;
    }

    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

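// Host wrapper for the fused pooling + aggregation kernel; the lower level must be (roughly)
// twice as large as the upper level in its last two dimensions.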
torch::Tensor forward_pool_agg_cuda( int level, float norm, const torch::Tensor lower,
                                     const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    TORCH_CHECK( UH2 == (LH2-1)/2+1 && UW2 == (LW2-1)/2+1, "lower level should be twice as big" );

    const int gap_left  = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right = 1 << MAX(0, level-2);

    const int MAX_THREADS = 512;
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_pool_agg_cuda", ([&] {
        forward_pool_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}

// true iff lower <= var < upper
__device__ inline int in(int lower, int var, int upper) {
    return lower <= var && var < upper;
}

// sign associated with a boolean: +1 if true, -1 otherwise
__device__ inline int sl(bool b) {
    return b ? 1 : -1;
}

// atomicMax on a 16-bit value, emulated with atomicCAS on the enclosing aligned 32-bit word
__device__ short atomicMaxShort(short* address, short val) {
    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);

    // __byte_perm selectors to extract / re-insert the addressed half-word
    unsigned int order_from[] = {0x0010, 0x0032};
    unsigned int from = order_from[((size_t)address & 3) / 2];

    unsigned int order_back[] = {0x3254, 0x5410};
    unsigned int back = order_back[((size_t)address & 3) / 2];
    unsigned int old, assumed, max_, new_;

    old = *base_address;
    do {
        assumed = old;
        max_ = max(val, (short)__byte_perm(old, 0, from));
        new_ = __byte_perm(old, max_, back);
        old = atomicCAS(base_address, assumed, new_);
    } while (assumed != old);
    return old;
}

// dtype-dispatched atomic max; the integer reinterpretation of half/float bits preserves the
// ordering of non-negative values
template <typename scalar_t>
__device__ inline void TplAtomicMax_block( scalar_t* before, scalar_t after ) { assert(!"atomicMax not implemented for this dtype"); }
template <>
__device__ inline void TplAtomicMax_block( at::Half* before, at::Half after ) { atomicMaxShort( (int16_t*)before, *(int16_t*)&after ); }
template <>
__device__ inline void TplAtomicMax_block( float* before, float after ) { atomicMax_block( (int32_t*)before, *(int32_t*)&after ); }

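// Backward pass of the pooled aggregation: one block handles one (lh1, lw1) slice of the lower
// level. A first pass recomputes, for every upper cell, the max and argmax of the corresponding
// 3x3 pooling window; a second pass adds the best parent score from the upper level to that
// maximum and writes it back onto the argmax cell. Several upper cells may select the same lower
// cell, hence the per-dtype atomic max.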
template <typename scalar_t>
__global__ void backward_agg_unpool_cuda_kernel(
        const int UH1, const int UW1,
        const int UH2, const int UW2,
        const int LH2, const int LW2,
        const int gap_left, const int gap_right,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower ) {

    const int lh1 = blockIdx.y;
    const int lw1 = blockIdx.x;
    const int UHW2 = UH2 * UW2;

    // per-block scratch buffer on the device heap (only the pointer lives in shared memory);
    // it stores interleaved (max value, argmax position) pairs, one per upper-level cell
    __shared__ float* _shared_addr;
    if (threadIdx.x == 0)
        do { _shared_addr = new float [2*UHW2]; }
        while (!_shared_addr);
    __syncthreads();

    float * layer_best  = _shared_addr;           // even slots: pooled maxima
    int   * layer_bestp = (int*)(_shared_addr+1); // odd slots: argmax positions
    assert( layer_best );

    // first pass: max and argmax of each 3x3 pooling window of the lower level
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;
        const int lx = 2*ux;
        const int ly = 2*uy;

        float best = -inf;
        int bestp = 0;
        #pragma unroll
        for (int j_= -1; j_<= 1; j_++) {
            const int j = ly + j_;
            if (j < 0 || j >= LH2) continue;
            #pragma unroll
            for (int i_= -1; i_<= 1; i_++) {
                const int i = lx + i_;
                if (i < 0 || i >= LW2) continue;
                float cur = lower[lh1][lw1][j][i];
                if (cur > best) { best = cur; bestp = j*LW2+i; }
            }}
        layer_best[2*idx] = best;
        layer_bestp[2*idx] = bestp;
    }

    __syncthreads();

    // second pass: add the best parent score from the upper level onto each argmax cell
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;

        scalar_t add = 0;
        for (int v = -gap_left; v <= gap_right; v += gap_right+gap_left) {
            for (int u = -gap_left; u <= gap_right; u += gap_right+gap_left) {
                const int uh1 = lh1 + v, uw1 = lw1 + u;
                const int uh2 = uy+sl(v>0), uw2 = ux+sl(u>0);
                if (in(0, uh1, UH1) && in(0, uw1, UW1) && in(0, uh2, UH2) && in(0, uw2, UW2))
                    add = MAX(add, upper[uh1][uw1][uh2][uw2]);
            }}

        float best = layer_best[2*idx];
        int bestp = layer_bestp[2*idx];
        const int lx = bestp % LW2;
        const int ly = bestp / LW2;

        // overlapping windows can target the same lower cell, so update with an atomic max
        scalar_t* before = & lower[lh1][lw1][ly][lx];
        scalar_t after = best + add;
        TplAtomicMax_block<scalar_t>( before, after );
    }

    __syncthreads();

    if (threadIdx.x == 0)
        delete [] _shared_addr;  // allocated with new[], so release with delete[]
}

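// Host wrapper: one CUDA block per (lh1, lw1) cell of the lower level; when exclude_borders is
// set, the last row and column of the lower correlation maps are excluded from the pooling.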
void backward_agg_unpool_cuda( int level, const torch::Tensor upper, torch::Tensor lower, bool exclude_borders ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    const int xb = exclude_borders;

    const int gap_left  = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right = 1 << MAX(0, level-2);

    const int64_t MAX_THREADS = 1024;
    const int64_t THREADS_PER_LAYER = MIN(UH2*UW2, MAX_THREADS);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(upper.scalar_type(), "backward_agg_unpool_cuda", ([&] {
        backward_agg_unpool_cuda_kernel<<<dim3(LW1,LH1), THREADS_PER_LAYER>>>(
            UH1, UW1, UH2, UW2, LH2-xb, LW2-xb,
            gap_left, gap_right,
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}

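// 3D max-pooling: for each output location, take the maximum over all channels and a ks x ks
// spatial window (with the given stride), returning both the value and the flattened argmax
// index into the (C, IH, IW) volume.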
template <typename scalar_t>
__global__ void max_pool3d_cuda_kernel(
        const int BS, const int NC, const int IH, const int IW, const int OH, const int OW,
        const int ks, const int stride,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> tensor,
        torch::PackedTensorAccessor64<scalar_t,3,torch::RestrictPtrTraits> maxima,
        torch::PackedTensorAccessor64<int64_t, 3,torch::RestrictPtrTraits> indices ) {

    // one thread per output location (b, y, x)
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % OW; idx /= OW;
    const int y = idx % OH; idx /= OH;
    const int b = idx;
    if (b >= BS) return;

    float best = -inf;
    int64_t best_pos = 0;
    for (int64_t c = 0; c < NC; c++) {
        for (int j = stride*y; j < stride*y+ks; j++) {
            for (int i = stride*x; i < stride*x+ks; i++) {

                float cur = tensor[b][c][j][i];
                if (cur > best) { best = cur; best_pos = (c*IH + j)*IW + i; }
            }}}

    maxima [b][y][x] = best;
    indices[b][y][x] = best_pos;
}

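// Host wrapper: the shape of `maxima` / `indices` determines the pooled output grid; the caller
// is responsible for making it consistent with the input size, kernel_size and stride.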
void max_pool3d_cuda( const torch::Tensor tensor, const int kernel_size, const int stride,
                      torch::Tensor maxima, torch::Tensor indices ) {
    CHECK_CUDA(tensor);
    TORCH_CHECK(tensor.dim() == 4, "tensor should be 4-dimensional: BxCxHxW");
    const int BS = tensor.size(0);
    const int NC = tensor.size(1);
    const int IH = tensor.size(2);
    const int IW = tensor.size(3);

    TORCH_CHECK( maxima.sizes() == indices.sizes(), "maxima and indices should have the same shape" );
    TORCH_CHECK( BS == maxima.size(0), "bad batch size" );
    const int OH = maxima.size(1);
    const int OW = maxima.size(2);

    const int64_t THREADS_PER_LAYER = 512;
    const int64_t N_BLOCKS = (BS*OH*OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.scalar_type(), "max_pool3d_cuda", ([&] {
        max_pool3d_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            BS, NC, IH, IW, OH, OW, kernel_size, stride,
            tensor. packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            maxima. packed_accessor64<scalar_t,3,torch::RestrictPtrTraits>(),
            indices.packed_accessor64<int64_t,3,torch::RestrictPtrTraits>());
    }));
}

// apply one row of a row-major 2x3 (affine) transform to the point (x, y)
__device__ inline float ptdot( const float* m, float x, float y ) {
    return x*m[0] + y*m[1] + m[2];
}

__device__ inline float sqr(float v) {
    return v*v;
}

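// Merge a low-resolution correspondence map `corres_a` into the accumulated full-resolution map
// `all_corres_a`: each output cell is mapped back through `inv_rot` onto the corres grid, nearby
// candidates within dmax2 are gathered, and among those within a small distance tolerance the
// best-scoring one (largest cor[4]) replaces the current entry if its score is higher.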
__global__ void merge_corres_cuda_kernel(
        const int OH, const int OW, const int OZ, const int IH, const int IW,
        const float dmax2, int offset, const float* inv_rot, const int all_step,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> corres_a,
        torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> all_corres_a ) {

    // one thread per cell of the full-resolution correspondence map
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = idx % OW; idx /= OW;
    const int j = idx;
    if (j >= OH) return;

    const float tol2 = 2*2;  // squared tolerance factor on the best distance
    auto all_cor = all_corres_a[j][i];

    // center of the target cell in image coordinates
    float x = i*all_step + all_step/2;
    float y = j*all_step + all_step/2;

    // map back to the source frame of the low-resolution corres map
    float xr = ptdot( inv_rot + 0, x, y );
    float yr = ptdot( inv_rot + 3, x, y );

    // nearest cell of the corres grid (its step is hard-coded to 4 here)
    int xb = (int)(0.5+ xr/4);
    int yb = (int)(0.5+ yr/4);

    // first pass: distance of the closest candidate, clamped by dmax2
    float best = dmax2;
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d < best) best = d;
        }}

    // second pass: among candidates within tolerance of the best distance, keep the one with the
    // highest score (cor[4]) if it beats the current entry
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d <= tol2*best) {
                if (cor[4] > all_cor[4])
                    for (int k = 0; k < OZ; k++) all_cor[k] = cor[k];
            }
        }}
}

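// Host wrapper: `offset` selects which (x, y) pair inside a correspondence row is compared to the
// target position, `dmax` is the maximum matching distance in pixels (squared before being passed
// to the kernel), and `all_step` is the grid step of `all_corres`.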
void merge_corres_cuda( const torch::Tensor corres, const int offset, const torch::Tensor _inv_rot,
                        const float dmax, torch::Tensor all_corres, const int all_step ) {
    CHECK_CUDA( corres );
    CHECK_CUDA( all_corres );
    CHECK_CUDA( _inv_rot );
    TORCH_CHECK(_inv_rot.is_contiguous(), "inv_rot should be contiguous" );

    const int IH = corres.size(0);
    const int IW = corres.size(1);
    const int IZ = corres.size(2);
    const int OH = all_corres.size(0);
    const int OW = all_corres.size(1);
    const int OZ = all_corres.size(2);
    TORCH_CHECK( IZ == OZ, "corres and all_corres should have the same shape[2]" );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = (OH * OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    merge_corres_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
        OH, OW, OZ, IH, IW, dmax*dmax, offset, _inv_rot.data_ptr<float>(), all_step,
        corres.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
        all_corres.packed_accessor32<float,3,torch::RestrictPtrTraits>());
    CHECK_KERNEL();
}

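// Radial masking of a 4D correlation volume: for each source cell (j, i) with a valid target
// position, correlations within `radius` of the target are scaled by max(alpha, 1) and those
// outside are smoothly attenuated; cells with a NaN target are left untouched.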
template <typename scalar_t>
__global__ void mask_correlations_radial_cuda_kernel(
        float radius, const float alpha,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> targets,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> corr ) {

    // shorthands for the correlation shape (also used by the host wrapper below)
    #define H1 ((int)corr.size(0))
    #define W1 ((int)corr.size(1))
    #define H2 ((int)corr.size(2))
    #define W2 ((int)corr.size(3))

    // one block per source cell (j, i)
    const int j = blockIdx.x / W1;
    const int i = blockIdx.x % W1;
    if (j >= H1) return;

    // target position for this cell; NaN means no target
    const float cx = targets[j][i][0];
    const float cy = targets[j][i][1];
    if (cx != cx || cy != cy) return;
    radius *= radius;
    const float alpha_out = (alpha > 1 ? 1 : alpha);
    const float alpha_in  = (alpha < 1 ? 1 : alpha);

    for (int idx = threadIdx.x; idx < H2*W2; idx += blockDim.x) {
        const int v = idx / W2;
        const int u = idx % W2;

        // amplify correlations inside the radius, attenuate them outside (decaying with distance)
        float dis2 = sqr(u - cx) + sqr(v - cy);
        float mul = alpha_in;
        if (dis2 > radius)
            mul = 1 - alpha_out*(1 - radius / dis2);

        corr[j][i][v][u] *= mul;
    }
}

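// Host wrapper: launches one block per source cell. Note that N_BLOCKS below uses the H1 / W1
// macros defined above the kernel, which expand here to corr.size(0) and corr.size(1).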
void mask_correlations_radial_cuda( torch::Tensor corr, const torch::Tensor targets,
                                    const float radius, const float alpha) {
    CHECK_CUDA( corr );
    CHECK_CUDA( targets );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = H1*W1;

    #undef H1
    #undef W1
    #undef H2
    #undef W2

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(corr.scalar_type(), "mask_correlations_radial_cuda", ([&] {
        mask_correlations_radial_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            radius, alpha,
            targets.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
            corr.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}