// Copyright 2022-present NAVER Corp.
// CC BY-NC-SA 4.0
// Available only for non-commercial use

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <limits>

#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) < (y) ? (y) : (x))
#define inf std::numeric_limits<float>::infinity()

#define CHECK_CUDA(tensor) {\
    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
#define atomicMax_block atomicMax
#endif


template <typename scalar_t>
__global__ void forward_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4> lower,
              torch::PackedTensorAccessor64<scalar_t,4> upper,
        const float* weights, float* new_weights )
{
    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = LH2;
    const auto UW2 = LW2;

    // recover the 4D output coordinates from the flat thread index
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // then, accumulate the 4 children
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2 = uh2 + 1 - 2*v;
        if (lh2 < 0 || lh2 >= LH2) continue;
        const int lw2 = uw2 + 1 - 2*u;
        if (lw2 < 0 || lw2 >= LW2) continue;

        res += weight * lower[lh1][lw1][lh2][lw2];
        nrm += weight;
    }

    // normalize output
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);

    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

torch::Tensor forward_agg_cuda( int level, float norm, const torch::Tensor lower,
                                const at::optional<torch::Tensor> weights, torch::Tensor upper )
{
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1),
                 "inconsistent lower and upper shapes" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    // one thread per upper-level cell
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.type(), "forward_agg_cuda", ([&] {
        forward_agg_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2, gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4>(),
            upper.packed_accessor64<scalar_t,4>(),
            weights ? weights->data_ptr<float>() : nullptr,
            new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
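// Minimal usage sketch for forward_agg_cuda (illustrative only: the level,
// norm value and tensor shapes are hypothetical, and example_forward_agg is
// not part of the extension's API). For level >= 2 the upper pyramid level
// has the same shape as the lower one, so zeros_like() is enough.
static torch::Tensor example_forward_agg( const torch::Tensor& lower /* (LH1,LW1,LH2,LW2), CUDA, contiguous */ )
{
    const int level = 2;      // hypothetical pyramid level
    const float norm = 0.9f;  // hypothetical normalization exponent
    torch::Tensor upper = torch::zeros_like(lower);
    torch::Tensor weights = torch::ones({lower.size(0), lower.size(1)},
                                        lower.options().dtype(torch::kFloat32));
    torch::Tensor new_weights = forward_agg_cuda(level, norm, lower, weights, upper);
    return new_weights;       // (UH1, UW1) sum of child weights per upper cell
}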
template <typename scalar_t>
__global__ void forward_pool_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        // const int UH1, const int UW1, const int UH2, const int UW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4> lower,
              torch::PackedTensorAccessor64<scalar_t,4> upper,
        const float* weights, float* new_weights )
{
    const auto UH1 = LH1 + bool(!gap_left); // level 0 is smaller than other levels
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = (LH2-1)/2 + 1;
    const auto UW2 = (LW2-1)/2 + 1;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    // then, accumulate the 4 children
    float sumw = 0, nrm = 0, res = 0;
    // #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        // source pixel
        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        // load weight even if (lh2,lw2) are invalid
        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2_ = 2*(uh2 + 1 - 2*v); // position in lower
        const int lw2_ = 2*(uw2 + 1 - 2*u);

        // 3x3 max-pooling around (lh2_, lw2_)
        float lower_max = -inf;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
            const int lh2 = lh2_ + j;
            if (lh2 < 0 || lh2 >= LH2) continue;
            #pragma unroll
            for (int i = -1; i <= 1; i++) {
                const int lw2 = lw2_ + i;
                if (lw2 < 0 || lw2 >= LW2) continue;
                float l = lower[lh1][lw1][lh2][lw2];
                lower_max = MAX(lower_max, l);
        }}
        if (lower_max == -inf) continue;

        res += weight * lower_max;
        nrm += weight;
    }

    // normalize output
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);

    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

torch::Tensor forward_pool_agg_cuda( int level, float norm, const torch::Tensor lower,
                                     const at::optional<torch::Tensor> weights, torch::Tensor upper )
{
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1),
                 "inconsistent lower and upper shapes" );
    TORCH_CHECK( UH2 == (LH2-1)/2+1 && UW2 == (LW2-1)/2+1,
                 "lower level should be twice as big" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int MAX_THREADS = 512; // faster than 1024 (higher SM occupancy)
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat));

    // one thread per upper-level cell
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.type(), "forward_pool_agg_cuda", ([&] {
        forward_pool_agg_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            // UH1, UW1, UH2, UW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4>(),
            upper.packed_accessor64<scalar_t,4>(),
            weights ? weights->data_ptr<float>() : nullptr,
            new_weights.data_ptr<float>() );
    }));
    return new_weights;
}
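/* Worked shape example for forward_pool_agg_cuda (hypothetical sizes): with
   lower of shape (LH1, LW1, LH2, LW2) = (32, 32, 9, 9) and level = 2,
   - gap_left = gap_right = 1, so UH1 = LH1 = 32 and UW1 = LW1 = 32;
   - the last two dimensions are additionally max-pooled with a 3x3 window
     and stride 2, so UH2 = (LH2-1)/2 + 1 = 5 and UW2 = (LW2-1)/2 + 1 = 5;
   hence `upper` must be pre-allocated with shape (32, 32, 5, 5). */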
__device__ inline int in(int lower, int var, int upper) {
    return lower <= var && var < upper;
}

__device__ inline int sl(bool b) {
    return b ? 1 : -1;
}

__device__ short atomicMaxShort(short* address, short val) {
    unsigned int *base_address = (unsigned int *)((size_t)address & ~3); // multiple of 4
    unsigned int order_from[] = {0x0010, 0x0032}; // either bytes[0:2] or bytes[2:4]
    unsigned int from = order_from[((size_t)address & 3) / 2];
    unsigned int order_back[] = {0x3254, 0x5410}; // right-to-left
    unsigned int back = order_back[((size_t)address & 3) / 2];

    unsigned int old, assumed, max_, new_;
    old = *base_address;
    do {
        assumed = old;
        max_ = max(val, (short)__byte_perm(old, 0, from)); // extract word
        new_ = __byte_perm(old, max_, back); // replace word
        old = atomicCAS(base_address, assumed, new_);
    } while (assumed != old);
    return old;
}

template <typename scalar_t>
__device__ inline void TplAtomicMax_block( scalar_t* before, scalar_t after ) {
    assert(!"atomicMax not implemented for this dtype");
}
template <>
__device__ inline void TplAtomicMax_block( at::Half* before, at::Half after ) {
    atomicMaxShort( (int16_t*)before, *(int16_t*)&after );
}
template <>
__device__ inline void TplAtomicMax_block( float* before, float after ) {
    atomicMax_block( (int32_t*)before, *(int32_t*)&after );
}
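/* Notes on the atomic helpers above:
   - atomicMaxShort emulates a 16-bit atomic max with atomicCAS on the
     4-byte-aligned word containing the target value: __byte_perm extracts
     the relevant 16 bits, max() updates them, __byte_perm splices them back,
     and the CAS loop retries until no other thread has touched the word.
   - the at::Half and float specializations of TplAtomicMax_block compare the
     raw bit patterns as signed integers; this matches floating-point ordering
     only for non-negative finite values, which the calling kernels are
     assumed to guarantee for the scores they update. */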
template <typename scalar_t>
__global__ void backward_agg_unpool_cuda_kernel(
        const int UH1, const int UW1, const int UH2, const int UW2,
        const int LH2, const int LW2,
        const int gap_left, const int gap_right,
        const torch::PackedTensorAccessor64<scalar_t,4> upper,
              torch::PackedTensorAccessor64<scalar_t,4> lower )
{
    /* Each block takes care of a single layer, i.e. lower[:,:,0::2,0::2].
       The first thread allocates some global memory and frees it later.
    */
    // const int LH1 = gridDim.x;
    // const int LW1 = gridDim.y;
    const int lh1 = blockIdx.y;
    const int lw1 = blockIdx.x;
    const int UHW2 = UH2 * UW2; // upper layer size

    __shared__ float* _shared_addr;
    if (threadIdx.x == 0)
        do { _shared_addr = new float [2*UHW2]; } // for each upper place, we have (best, bestp)
        while (!_shared_addr); // waiting for memory to be available...
    __syncthreads();
    float * layer_best  = _shared_addr;
    int   * layer_bestp = (int*)(_shared_addr+1); //UHW);
    assert( layer_best );

    /* First pass: we recover the position and values of all local maxima in the layer */
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;
        const int lx = 2*ux; // lower pos from upper pos
        const int ly = 2*uy;

        // argmax over my local neighborhood
        float best = -inf;
        int bestp = 0;
        #pragma unroll
        for (int j_= -1; j_<= 1; j_++) {
            const int j = ly + j_;
            if (j < 0 || j >= LH2) continue;
            #pragma unroll
            for (int i_= -1; i_<= 1; i_++) {
                const int i = lx + i_;
                if (i < 0 || i >= LW2) continue;
                float cur = lower[lh1][lw1][j][i];
                if (cur > best) { best = cur; bestp = j*LW2+i; }
        }}
        layer_best [2*idx] = best;
        layer_bestp[2*idx] = bestp;
    }
    __syncthreads();

    /* Second pass: we update the local maxima according to the upper layer */
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;

        // max-pool the additional value from the upper layer
        scalar_t add = 0;
        for (int v = -gap_left; v <= gap_right; v += gap_right+gap_left) {
            for (int u = -gap_left; u <= gap_right; u += gap_right+gap_left) {
                const int uh1 = lh1 + v, uw1 = lw1 + u;
                const int uh2 = uy+sl(v>0), uw2 = ux+sl(u>0);
                if (in(0, uh1, UH1) && in(0, uw1, UW1) && in(0, uh2, UH2) && in(0, uw2, UW2))
                    add = MAX(add, upper[uh1][uw1][uh2][uw2]);
        }}

        // grab local maxima
        float best = layer_best [2*idx];
        int  bestp = layer_bestp[2*idx];
        const int lx = bestp % LW2;
        const int ly = bestp / LW2;
        // printf("UH=%d,UW=%d: uy=%d,ux=%d --> best=%g at ly=%d,lx=%d\n", UH,UW, uy,ux, best, ly,lx);

        scalar_t* before = & lower[lh1][lw1][ly][lx];
        scalar_t after = best + add;
        TplAtomicMax_block( before, after );
    }
    __syncthreads();

    if (threadIdx.x == 0) delete[] _shared_addr;
}

void backward_agg_unpool_cuda( int level, const torch::Tensor upper, torch::Tensor lower, bool exclude_borders )
{
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1),
                 "inconsistent lower and upper shapes" );

    const int xb = exclude_borders; // local_argmax cannot reach the bottom and right borders
    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;  // 0, 1, 2, 4, ...
    const int gap_right = 1 << MAX(0, level-2);              // 1, 1, 2, 4, ...

    const int64_t MAX_THREADS = 1024;
    const int64_t THREADS_PER_LAYER = MIN(UH2*UW2, MAX_THREADS);

    // one block for each layer, one thread per local-max
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(upper.type(), "backward_agg_unpool_cuda", ([&] {
        backward_agg_unpool_cuda_kernel<scalar_t><<<dim3(LW1, LH1), THREADS_PER_LAYER>>>(
            UH1, UW1, UH2, UW2, LH2-xb, LW2-xb, gap_left, gap_right,
            upper.packed_accessor64<scalar_t,4>(),
            lower.packed_accessor64<scalar_t,4>());
    }));
    CHECK_KERNEL();
}
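/* Summary of backward_agg_unpool_cuda: for each layer (lh1,lw1), pass 1
   records the position and value of the local maximum of `lower` in each
   3x3 neighbourhood around the even positions; pass 2 max-pools the scores
   of the (up to four) reachable parents in `upper` and adds that value to
   the recorded maximum, writing the result back into `lower` at the argmax
   position with a block-level atomic max, i.e. `lower` is updated in place. */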
template <typename scalar_t>
__global__ void max_pool3d_cuda_kernel(
        const int BS, const int NC, const int IH, const int IW,
        const int OH, const int OW,
        const int ks, const int stride,
        const torch::PackedTensorAccessor64<scalar_t,4> tensor,
              torch::PackedTensorAccessor64<scalar_t,3> maxima,
              torch::PackedTensorAccessor64<int64_t,3> indices )
{
    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % OW; idx /= OW;
    const int y = idx % OH; idx /= OH;
    const int b = idx;
    if (b >= BS) return;

    float best = -inf;
    int64_t best_pos = 0;
    for (int64_t c = 0; c < NC; c++) {
        for (int j = stride*y; j < stride*y+ks; j++) {
            for (int i = stride*x; i < stride*x+ks; i++) {
                // assert( b < BS and c < NC and j < IH and i < IW );
                float cur = tensor[b][c][j][i];
                if (cur > best) { best = cur; best_pos = (c*IH + j)*IW + i; }
    }}}

    // assert( b < BS and y < OH and x < OW );
    maxima [b][y][x] = best;
    indices[b][y][x] = best_pos;
}

void max_pool3d_cuda( const torch::Tensor tensor, const int kernel_size, const int stride,
                      torch::Tensor maxima, torch::Tensor indices )
{
    CHECK_CUDA(tensor);
    TORCH_CHECK(tensor.dim() == 4, "tensor should be 4-dimensional: BxCxHxW");
    const int BS = tensor.size(0);
    const int NC = tensor.size(1);
    const int IH = tensor.size(2); // input height
    const int IW = tensor.size(3); // input width

    // output size
    TORCH_CHECK( maxima.sizes() == indices.sizes(), "maxima and indices should have the same shape" );
    TORCH_CHECK( BS == maxima.size(0), "bad batch size" );
    const int OH = maxima.size(1);
    const int OW = maxima.size(2);

    const int64_t THREADS_PER_LAYER = 512;
    const int64_t N_BLOCKS = (BS*OH*OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    // one thread per output pixel
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.type(), "max_pool3d_cuda", ([&] {
        max_pool3d_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_LAYER>>>(
            BS, NC, IH, IW, OH, OW, kernel_size, stride,
            tensor.packed_accessor64<scalar_t,4>(),
            maxima.packed_accessor64<scalar_t,3>(),
            indices.packed_accessor64<int64_t,3>());
    }));
}
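// Usage sketch for max_pool3d_cuda (illustrative only: the float32 input,
// kernel_size=3 and stride=2 below are hypothetical). Output sizes follow
// OH = (IH - ks)/stride + 1, which keeps the pooling window inside the
// input borders.
static void example_max_pool3d(const torch::Tensor& feats /* (B,C,H,W) float32, CUDA, contiguous */)
{
    const int ks = 3, stride = 2;
    const int64_t B = feats.size(0), H = feats.size(2), W = feats.size(3);
    const int64_t OH = (H - ks) / stride + 1, OW = (W - ks) / stride + 1;
    torch::Tensor maxima  = torch::empty({B, OH, OW}, feats.options());
    torch::Tensor indices = torch::empty({B, OH, OW}, feats.options().dtype(torch::kInt64));
    max_pool3d_cuda(feats, ks, stride, maxima, indices);
    // maxima[b,y,x] holds the max over channels and the ks x ks window;
    // indices[b,y,x] is the flat offset (c*H + j)*W + i of that maximum.
}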
__device__ inline float ptdot( const float* m, float x, float y ) {
    return x*m[0] + y*m[1] + m[2];
}

__device__ inline float sqr(float v) {
    return v*v;
}

__global__ void merge_corres_cuda_kernel(
        const int OH, const int OW, const int OZ, const int IH, const int IW,
        const float dmax2, int offset, const float* inv_rot, const int all_step,
        const torch::PackedTensorAccessor32<float,3> corres_a,
              torch::PackedTensorAccessor32<float,3> all_corres_a )
{
    // each thread takes care of one output
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = idx % OW; idx /= OW;
    const int j = idx;
    if (j >= OH) return;

    const float tol2 = 2*2; // squared tolerance
    auto all_cor = all_corres_a[j][i];

    // center of the bin in the reference frame
    float x = i*all_step + all_step/2;
    float y = j*all_step + all_step/2;

    // center of the bin on the rescaled+rotated image
    float xr = ptdot( inv_rot + 0, x, y );
    float yr = ptdot( inv_rot + 3, x, y );

    // iterate on the nearby bins
    int xb = (int)(0.5+ xr/4); // rescaled+rotated desc always has step 4
    int yb = (int)(0.5+ yr/4);

    float best = dmax2;
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
      #pragma unroll
      for (int _u = -1; _u <= 1; _u++) {
        const int v = yb+_v, u = xb+_u;
        if (!(in(0, v, IH) && in(0, u, IW))) continue;
        auto cor = corres_a[v][u];
        float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
        if (d < best) best = d;
    }}

    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
      #pragma unroll
      for (int _u = -1; _u <= 1; _u++) {
        const int v = yb+_v, u = xb+_u;
        if (!(in(0, v, IH) && in(0, u, IW))) continue;
        auto cor = corres_a[v][u];
        float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
        if (d <= tol2*best) { // spatially close
            // merge correspondence if score is better than actual
            if (cor[4] > all_cor[4])
                for (int k = 0; k < OZ; k++) all_cor[k] = cor[k];
        }
    }}
}

void merge_corres_cuda( const torch::Tensor corres, const int offset,
                        const torch::Tensor _inv_rot, const float dmax,
                        torch::Tensor all_corres, const int all_step )
{
    CHECK_CUDA( corres );
    CHECK_CUDA( all_corres );
    CHECK_CUDA( _inv_rot );
    TORCH_CHECK(_inv_rot.is_contiguous(), "inv_rot should be contiguous" );

    const int IH = corres.size(0);
    const int IW = corres.size(1);
    const int IZ = corres.size(2);
    const int OH = all_corres.size(0);
    const int OW = all_corres.size(1);
    const int OZ = all_corres.size(2);
    TORCH_CHECK( IZ == OZ, "corres and all_corres should have the same shape[2]" );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = (OH * OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    merge_corres_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
        OH, OW, OZ, IH, IW, dmax*dmax, offset,
        _inv_rot.data_ptr<float>(), all_step,
        corres.packed_accessor32<float,3>(),
        all_corres.packed_accessor32<float,3>());
    CHECK_KERNEL();
}
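/* merge_corres_cuda_kernel works in two passes over the 3x3 bins around the
   re-projected bin center: the first pass finds the smallest squared distance
   `best` between the bin center and the stored correspondence endpoints
   (capped at dmax^2); the second pass copies every nearby correspondence that
   lies within tol2*best and whose score (channel 4) beats the one currently
   stored in all_corres. */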
template <typename scalar_t>
__global__ void mask_correlations_radial_cuda_kernel(
        float radius, const float alpha,
        const torch::PackedTensorAccessor32<float,3> targets,
              torch::PackedTensorAccessor64<scalar_t,4> corr )
{
    #define H1 ((int)corr.size(0))
    #define W1 ((int)corr.size(1))
    #define H2 ((int)corr.size(2))
    #define W2 ((int)corr.size(3))

    // each block takes care of one layer corr[j,i,:,:]
    const int j = blockIdx.x / W1;
    const int i = blockIdx.x % W1;
    if (j >= H1) return;

    // read the target center
    const float cx = targets[j][i][0];
    const float cy = targets[j][i][1];
    if (cx != cx || cy != cy) return; // undefined center

    radius *= radius; // squared
    const float alpha_out = (alpha > 1 ? 1 : alpha);
    const float alpha_in  = (alpha < 1 ? 1 : alpha);

    for (int idx = threadIdx.x; idx < H2*W2; idx += blockDim.x) {
        const int v = idx / W2;
        const int u = idx % W2;

        // compute weighting
        float dis2 = sqr(u - cx) + sqr(v - cy);
        float mul = alpha_in;
        if (dis2 > radius)
            mul = 1 - alpha_out*(1 - radius / dis2);

        corr[j][i][v][u] *= mul;
    }
}

void mask_correlations_radial_cuda( torch::Tensor corr, const torch::Tensor targets,
                                    const float radius, const float alpha)
{
    CHECK_CUDA( corr );
    CHECK_CUDA( targets );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = H1*W1;
    #undef H1
    #undef W1
    #undef H2
    #undef W2

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(corr.type(), "mask_correlations_radial_cuda", ([&] {
        mask_correlations_radial_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_LAYER>>>(
            radius, alpha,
            targets.packed_accessor32<float,3>(),
            corr.packed_accessor64<scalar_t,4>());
    }));
    CHECK_KERNEL();
}
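/* Worked example of the radial weighting in mask_correlations_radial_cuda_kernel
   (hypothetical values alpha = 0.5 and radius = 4, i.e. radius^2 = 16):
   - alpha_in = 1 and alpha_out = 0.5;
   - correlations within squared distance dis2 <= 16 of the target center are
     left unchanged (mul = alpha_in = 1);
   - at dis2 = 64, mul = 1 - 0.5*(1 - 16/64) = 0.625;
   - as dis2 grows, mul tends to 1 - alpha_out = 0.5.
   For alpha > 1 the roles flip: cells inside the radius are multiplied by
   alpha and cells far outside decay as radius^2/dis2 towards 0. */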