Find3D / Pointcept /libs /pointgroup_ops /src /bfs_cluster_kernel.cu
ziqima's picture
initial commit
4893ce0
/*
Ball Query with BatchIdx
Written by Li Jiang
All Rights Reserved 2020.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 512
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
/* ================================== ballquery_batch_p ================================== */
__global__ void ballquery_batch_p_cuda_(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, int *cumsum) {
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (pt_idx >= n) return;
start_len += (pt_idx * 2);
int idx_temp[1000];
float radius2 = radius * radius;
float o_x = xyz[pt_idx * 3 + 0];
float o_y = xyz[pt_idx * 3 + 1];
float o_z = xyz[pt_idx * 3 + 2];
int batch_idx = batch_idxs[pt_idx];
int start = batch_offsets[batch_idx];
int end = batch_offsets[batch_idx + 1];
int cnt = 0;
for(int k = start; k < end; k++){
float x = xyz[k * 3 + 0];
float y = xyz[k * 3 + 1];
float z = xyz[k * 3 + 2];
float d2 = (o_x - x) * (o_x - x) + (o_y - y) * (o_y - y) + (o_z - z) * (o_z - z);
if(d2 < radius2){
if(cnt < 1000){
idx_temp[cnt] = k;
}
else{
break;
}
++cnt;
}
}
start_len[0] = atomicAdd(cumsum, cnt);
start_len[1] = cnt;
int thre = n * meanActive;
if(start_len[0] >= thre) return;
idx += start_len[0];
if(start_len[0] + cnt >= thre) cnt = thre - start_len[0];
for(int k = 0; k < cnt; k++){
idx[k] = idx_temp[k];
}
}
int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream) {
// param xyz: (n, 3)
// param batch_idxs: (n)
// param batch_offsets: (B + 1)
// output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n
// output start_len: (n, 2), int
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK);
int cumsum = 0;
int* p_cumsum;
cudaMalloc((void**)&p_cumsum, sizeof(int));
cudaMemcpy(p_cumsum, &cumsum, sizeof(int), cudaMemcpyHostToDevice);
ballquery_batch_p_cuda_<<<blocks, threads, 0, stream>>>(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, p_cumsum);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
cudaMemcpy(&cumsum, p_cumsum, sizeof(int), cudaMemcpyDeviceToHost);
return cumsum;
}