Spaces:
Running
on
Zero
Running
on
Zero
/* | |
Ball Query with BatchIdx | |
Written by Li Jiang | |
All Rights Reserved 2020. | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
#define TOTAL_THREADS 1024 | |
#define THREADS_PER_BLOCK 512 | |
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) | |
/* ================================== ballquery_batch_p ================================== */ | |
__global__ void ballquery_batch_p_cuda_(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, int *cumsum) { | |
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; | |
if (pt_idx >= n) return; | |
start_len += (pt_idx * 2); | |
int idx_temp[1000]; | |
float radius2 = radius * radius; | |
float o_x = xyz[pt_idx * 3 + 0]; | |
float o_y = xyz[pt_idx * 3 + 1]; | |
float o_z = xyz[pt_idx * 3 + 2]; | |
int batch_idx = batch_idxs[pt_idx]; | |
int start = batch_offsets[batch_idx]; | |
int end = batch_offsets[batch_idx + 1]; | |
int cnt = 0; | |
for(int k = start; k < end; k++){ | |
float x = xyz[k * 3 + 0]; | |
float y = xyz[k * 3 + 1]; | |
float z = xyz[k * 3 + 2]; | |
float d2 = (o_x - x) * (o_x - x) + (o_y - y) * (o_y - y) + (o_z - z) * (o_z - z); | |
if(d2 < radius2){ | |
if(cnt < 1000){ | |
idx_temp[cnt] = k; | |
} | |
else{ | |
break; | |
} | |
++cnt; | |
} | |
} | |
start_len[0] = atomicAdd(cumsum, cnt); | |
start_len[1] = cnt; | |
int thre = n * meanActive; | |
if(start_len[0] >= thre) return; | |
idx += start_len[0]; | |
if(start_len[0] + cnt >= thre) cnt = thre - start_len[0]; | |
for(int k = 0; k < cnt; k++){ | |
idx[k] = idx_temp[k]; | |
} | |
} | |
int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream) { | |
// param xyz: (n, 3) | |
// param batch_idxs: (n) | |
// param batch_offsets: (B + 1) | |
// output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n | |
// output start_len: (n, 2), int | |
cudaError_t err; | |
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK)); | |
dim3 threads(THREADS_PER_BLOCK); | |
int cumsum = 0; | |
int* p_cumsum; | |
cudaMalloc((void**)&p_cumsum, sizeof(int)); | |
cudaMemcpy(p_cumsum, &cumsum, sizeof(int), cudaMemcpyHostToDevice); | |
ballquery_batch_p_cuda_<<<blocks, threads, 0, stream>>>(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, p_cumsum); | |
err = cudaGetLastError(); | |
if (cudaSuccess != err) { | |
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); | |
exit(-1); | |
} | |
cudaMemcpy(&cumsum, p_cumsum, sizeof(int), cudaMemcpyDeviceToHost); | |
return cumsum; | |
} | |