WatsonTang98 committed
Commit e8ffc70
1 Parent(s): db12f5b

Upload 21 files

handcrafted_solution.py ADDED
@@ -0,0 +1,245 @@
# Description: This file contains the handcrafted solution for the task of wireframe reconstruction

import io
from PIL import Image as PImage
import numpy as np
from collections import defaultdict
import cv2
from typing import Tuple, List
from scipy.spatial.distance import cdist

from hoho.read_write_colmap import read_cameras_binary, read_images_binary, read_points3D_binary
from hoho.color_mappings import gestalt_color_mapping, ade20k_color_mapping


def empty_solution():
    '''Return a minimal valid solution, i.e. 2 vertices and 1 edge.'''
    return np.zeros((2, 3)), [(0, 1)]


def convert_entry_to_human_readable(entry):
    out = {}
    already_good = ['__key__', 'wf_vertices', 'wf_edges', 'edge_semantics', 'mesh_vertices', 'mesh_faces', 'face_semantics', 'K', 'R', 't']
    for k, v in entry.items():
        if k in already_good:
            out[k] = v
            continue
        if k == 'points3d':
            out[k] = read_points3D_binary(fid=io.BytesIO(v))
        if k == 'cameras':
            out[k] = read_cameras_binary(fid=io.BytesIO(v))
        if k == 'images':
            out[k] = read_images_binary(fid=io.BytesIO(v))
        if k in ['ade20k', 'gestalt']:
            out[k] = [PImage.open(io.BytesIO(x)).convert('RGB') for x in v]
        if k == 'depthcm':
            out[k] = [PImage.open(io.BytesIO(x)) for x in entry['depthcm']]
    return out


def get_vertices_and_edges_from_segmentation(gest_seg_np, edge_th=50.0):
    '''Get the vertices and edges from the gestalt segmentation mask of the house.'''
    vertices = []
    connections = []
    # Apex
    apex_color = np.array(gestalt_color_mapping['apex'])
    apex_mask = cv2.inRange(gest_seg_np, apex_color - 0.5, apex_color + 0.5)
    if apex_mask.sum() > 0:
        output = cv2.connectedComponentsWithStats(apex_mask, 8, cv2.CV_32S)
        (numLabels, labels, stats, centroids) = output
        stats, centroids = stats[1:], centroids[1:]

        for i in range(numLabels - 1):
            vert = {"xy": centroids[i], "type": "apex"}
            vertices.append(vert)

    eave_end_color = np.array(gestalt_color_mapping['eave_end_point'])
    eave_end_mask = cv2.inRange(gest_seg_np, eave_end_color - 0.5, eave_end_color + 0.5)
    if eave_end_mask.sum() > 0:
        output = cv2.connectedComponentsWithStats(eave_end_mask, 8, cv2.CV_32S)
        (numLabels, labels, stats, centroids) = output
        stats, centroids = stats[1:], centroids[1:]

        for i in range(numLabels - 1):
            vert = {"xy": centroids[i], "type": "eave_end_point"}
            vertices.append(vert)
    # Connectivity
    apex_pts = []
    apex_pts_idxs = []
    for j, v in enumerate(vertices):
        apex_pts.append(v['xy'])
        apex_pts_idxs.append(j)
    apex_pts = np.array(apex_pts)

    # Ridge connects two apex points
    for edge_class in ['eave', 'ridge', 'rake', 'valley']:
        edge_color = np.array(gestalt_color_mapping[edge_class])
        mask = cv2.morphologyEx(cv2.inRange(gest_seg_np,
                                            edge_color - 0.5,
                                            edge_color + 0.5),
                                cv2.MORPH_DILATE, np.ones((11, 11)))
        line_img = np.copy(gest_seg_np) * 0
        if mask.sum() > 0:
            output = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)
            (numLabels, labels, stats, centroids) = output
            stats, centroids = stats[1:], centroids[1:]
            edges = []
            for i in range(1, numLabels):
                y, x = np.where(labels == i)
                xleft_idx = np.argmin(x)
                x_left = x[xleft_idx]
                y_left = y[xleft_idx]
                xright_idx = np.argmax(x)
                x_right = x[xright_idx]
                y_right = y[xright_idx]
                edges.append((x_left, y_left, x_right, y_right))
                cv2.line(line_img, (x_left, y_left), (x_right, y_right), (255, 255, 255), 2)
            edges = np.array(edges)
            if (len(apex_pts) < 2) or (len(edges) < 1):
                continue
            pts_to_edges_dist = np.minimum(cdist(apex_pts, edges[:, :2]), cdist(apex_pts, edges[:, 2:]))
            connectivity_mask = pts_to_edges_dist <= edge_th
            edge_connects = connectivity_mask.sum(axis=0)
            for edge_idx, edgesum in enumerate(edge_connects):
                if edgesum >= 2:
                    connected_verts = np.where(connectivity_mask[:, edge_idx])[0]
                    for a_i, a in enumerate(connected_verts):
                        for b in connected_verts[a_i + 1:]:
                            connections.append((a, b))
    return vertices, connections


def get_uv_depth(vertices, depth):
    '''Get the depth of the vertices from the depth image.'''
    uv = []
    for v in vertices:
        uv.append(v['xy'])
    uv = np.array(uv)
    uv_int = uv.astype(np.int32)
    H, W = depth.shape[:2]
    uv_int[:, 0] = np.clip(uv_int[:, 0], 0, W - 1)
    uv_int[:, 1] = np.clip(uv_int[:, 1], 0, H - 1)
    vertex_depth = depth[(uv_int[:, 1], uv_int[:, 0])]
    return uv, vertex_depth


def merge_vertices_3d(vert_edge_per_image, th=0.1):
    '''Merge vertices that are close to each other in 3D space and are of the same type.'''
    all_3d_vertices = []
    connections_3d = []
    all_indexes = []
    cur_start = 0
    types = []
    for cimg_idx, (vertices, connections, vertices_3d) in vert_edge_per_image.items():
        types += [int(v['type'] == 'apex') for v in vertices]
        all_3d_vertices.append(vertices_3d)
        connections_3d += [(x + cur_start, y + cur_start) for (x, y) in connections]
        cur_start += len(vertices_3d)
    all_3d_vertices = np.concatenate(all_3d_vertices, axis=0)
    distmat = cdist(all_3d_vertices, all_3d_vertices)
    types = np.array(types).reshape(-1, 1)
    same_types = cdist(types, types)
    mask_to_merge = (distmat <= th) & (same_types == 0)
    new_vertices = []
    new_connections = []
    to_merge = sorted(list(set([tuple(a.nonzero()[0].tolist()) for a in mask_to_merge])))
    to_merge_final = defaultdict(list)
    for i in range(len(all_3d_vertices)):
        for j in to_merge:
            if i in j:
                to_merge_final[i] += j
    for k, v in to_merge_final.items():
        to_merge_final[k] = list(set(v))
    already_there = set()
    merged = []
    for k, v in to_merge_final.items():
        if k in already_there:
            continue
        merged.append(v)
        for vv in v:
            already_there.add(vv)
    old_idx_to_new = {}
    count = 0
    for idxs in merged:
        new_vertices.append(all_3d_vertices[idxs].mean(axis=0))
        for idx in idxs:
            old_idx_to_new[idx] = count
        count += 1
    new_vertices = np.array(new_vertices)
    for conn in connections_3d:
        new_con = sorted((old_idx_to_new[conn[0]], old_idx_to_new[conn[1]]))
        if new_con[0] == new_con[1]:
            continue
        if new_con not in new_connections:
            new_connections.append(new_con)
    return new_vertices, new_connections


def prune_not_connected(all_3d_vertices, connections_3d):
    '''Prune vertices that are not connected to any other vertex.'''
    connected = defaultdict(list)
    for c in connections_3d:
        connected[c[0]].append(c)
        connected[c[1]].append(c)
    new_indexes = {}
    new_verts = []
    connected_out = []
    for k, v in connected.items():
        vert = all_3d_vertices[k]
        if tuple(vert) not in new_verts:
            new_verts.append(tuple(vert))
            new_indexes[k] = len(new_verts) - 1
    for k, v in connected.items():
        for vv in v:
            connected_out.append((new_indexes[vv[0]], new_indexes[vv[1]]))
    connected_out = list(set(connected_out))

    return np.array(new_verts), connected_out


def predict(entry, visualize=False) -> Tuple[str, np.ndarray, List[Tuple[int, int]]]:
    good_entry = convert_entry_to_human_readable(entry)
    vert_edge_per_image = {}
    for i, (gest, depth, K, R, t) in enumerate(zip(good_entry['gestalt'],
                                                   good_entry['depthcm'],
                                                   good_entry['K'],
                                                   good_entry['R'],
                                                   good_entry['t'])):
        gest_seg = gest.resize(depth.size)
        gest_seg_np = np.array(gest_seg).astype(np.uint8)
        # Metric3D
        depth_np = np.array(depth) / 2.5  # 2.5 is the scale estimation coefficient
        vertices, connections = get_vertices_and_edges_from_segmentation(gest_seg_np, edge_th=5.)
        if (len(vertices) < 2) or (len(connections) < 1):
            print(f'Not enough vertices or connections in image {i}')
            vert_edge_per_image[i] = np.empty((0, 2)), [], np.empty((0, 3))
            continue
        uv, depth_vert = get_uv_depth(vertices, depth_np)
        # Normalize the uv to the camera intrinsics
        xy_local = np.ones((len(uv), 3))
        xy_local[:, 0] = (uv[:, 0] - K[0, 2]) / K[0, 0]
        xy_local[:, 1] = (uv[:, 1] - K[1, 2]) / K[1, 1]
        # Get the 3D vertices
        vertices_3d_local = depth_vert[..., None] * (xy_local / np.linalg.norm(xy_local, axis=1)[..., None])
        world_to_cam = np.eye(4)
        world_to_cam[:3, :3] = R
        world_to_cam[:3, 3] = t.reshape(-1)
        cam_to_world = np.linalg.inv(world_to_cam)
        vertices_3d = cv2.transform(cv2.convertPointsToHomogeneous(vertices_3d_local), cam_to_world)
        vertices_3d = cv2.convertPointsFromHomogeneous(vertices_3d).reshape(-1, 3)
        vert_edge_per_image[i] = vertices, connections, vertices_3d
    all_3d_vertices, connections_3d = merge_vertices_3d(vert_edge_per_image, 3.0)
    all_3d_vertices_clean, connections_3d_clean = prune_not_connected(all_3d_vertices, connections_3d)
    if (len(all_3d_vertices_clean) < 2) or (len(connections_3d_clean) < 1):
        print('Not enough vertices or connections in the 3D vertices')
        return (good_entry['__key__'], *empty_solution())
    if visualize:
        from hoho.viz3d import plot_estimate_and_gt
        plot_estimate_and_gt(all_3d_vertices_clean,
                             connections_3d_clean,
                             good_entry['wf_vertices'],
                             good_entry['wf_edges'])
    return good_entry['__key__'], all_3d_vertices_clean, connections_3d_clean
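For orientation, a minimal driver for the solution above might look as follows; the shape of `entries` (webdataset-style dicts as consumed by convert_entry_to_human_readable) and the helper name are assumptions for illustration, not part of this commit:

import numpy as np
from handcrafted_solution import predict

def run_handcrafted_baseline(entries):
    # entries: iterable of raw dataset samples; each one is decoded inside
    # predict() via convert_entry_to_human_readable(). Hypothetical driver.
    results = {}
    for entry in entries:
        key, vertices, connections = predict(entry, visualize=False)
        # vertices: (N, 3) array of wireframe vertices in world coordinates
        # connections: list of (i, j) index pairs into `vertices`
        results[key] = (np.asarray(vertices), connections)
    return results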
pc_util/.DS_Store ADDED
Binary file (6.15 kB).
 
pc_util/setup.py ADDED
@@ -0,0 +1,23 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='pc_util',
    version='1.0',
    ext_modules=[
        CUDAExtension('pc_util', [
            'src/pointnet2_api.cpp',
            'src/ball_query.cpp',
            'src/ball_query_gpu.cu',
            'src/group_points.cpp',
            'src/group_points_gpu.cu',
            'src/interpolate.cpp',
            'src/interpolate_gpu.cu',
            'src/sampling.cpp',
            'src/sampling_gpu.cu',
            'src/cluster.cpp',
            'src/cluster_gpu.cu',
        ], extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
    ],
    cmdclass={'build_ext': BuildExtension}
)
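With a CUDA toolkit matching the local PyTorch build, an extension declared this way is normally compiled with "python setup.py install" (or "pip install .") from the pc_util directory; this is the standard torch.utils.cpp_extension workflow rather than anything specific to this commit.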
pc_util/src/ball_query.cpp ADDED
@@ -0,0 +1,84 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ // #include <THC/THC.h>
4
+ #include <cuda.h>
5
+ #include <cuda_runtime_api.h>
6
+ #include "ball_query_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ #define CHECK_CUDA(x) do { \
15
+ if (!x.type().is_cuda()) { \
16
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
17
+ exit(-1); \
18
+ } \
19
+ } while (0)
20
+ #define CHECK_CONTIGUOUS(x) do { \
21
+ if (!x.is_contiguous()) { \
22
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
23
+ exit(-1); \
24
+ } \
25
+ } while (0)
26
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
27
+
28
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
29
+ at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
30
+ CHECK_INPUT(new_xyz_tensor);
31
+ CHECK_INPUT(xyz_tensor);
32
+ const float *new_xyz = new_xyz_tensor.data<float>();
33
+ const float *xyz = xyz_tensor.data<float>();
34
+ int *idx = idx_tensor.data<int>();
35
+
36
+ ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx);
37
+ return 1;
38
+ }
39
+
40
+
41
+ int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
42
+ at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor) {
43
+ CHECK_INPUT(point_tensor);
44
+ CHECK_INPUT(key_point_tensor);
45
+ const float *point = point_tensor.data<float>();
46
+ const float *key_point = key_point_tensor.data<float>();
47
+ int *idx = idx_tensor.data<int>();
48
+
49
+ ball_center_query_kernel_launcher_fast(b, n, m, radius, point, key_point, idx);
50
+ return 1;
51
+ }
52
+
53
+
54
+ int knn_query_wrapper_fast(int b, int n, int m, int nsample,
55
+ at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
56
+ CHECK_INPUT(new_xyz_tensor);
57
+ CHECK_INPUT(xyz_tensor);
58
+ const float *new_xyz = new_xyz_tensor.data<float>();
59
+ const float *xyz = xyz_tensor.data<float>();
60
+ float *dist2 = dist2_tensor.data<float>();
61
+ int *idx = idx_tensor.data<int>();
62
+
63
+ knn_query_kernel_launcher_fast(b, n, m, nsample, new_xyz, xyz, dist2, idx);
64
+ return 1;
65
+ }
66
+
67
+
68
+ int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
69
+ at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
70
+ at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor) {
71
+ CHECK_INPUT(new_xyz_tensor);
72
+ CHECK_INPUT(xyz_tensor);
73
+ CHECK_INPUT(new_xyz_batch_cnt_tensor);
74
+ CHECK_INPUT(xyz_batch_cnt_tensor);
75
+
76
+ const float *new_xyz = new_xyz_tensor.data<float>();
77
+ const float *xyz = xyz_tensor.data<float>();
78
+ const int *new_xyz_batch_cnt = new_xyz_batch_cnt_tensor.data<int>();
79
+ const int *xyz_batch_cnt = xyz_batch_cnt_tensor.data<int>();
80
+ int *idx = idx_tensor.data<int>();
81
+
82
+ ball_query_kernel_launcher_stack(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
83
+ return 1;
84
+ }
pc_util/src/ball_query_gpu.cu ADDED
@@ -0,0 +1,270 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+
6
+ #include "ball_query_gpu.h"
7
+ #include "cuda_utils.h"
8
+
9
+
10
+ __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
11
+ const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
12
+ // new_xyz: (B, M, 3)
13
+ // xyz: (B, N, 3)
14
+ // output:
15
+ // idx: (B, M, nsample)
16
+ int bs_idx = blockIdx.y;
17
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
18
+ if (bs_idx >= b || pt_idx >= m) return;
19
+
20
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
21
+ xyz += bs_idx * n * 3;
22
+ idx += bs_idx * m * nsample + pt_idx * nsample;
23
+
24
+ float radius2 = radius * radius;
25
+ float new_x = new_xyz[0];
26
+ float new_y = new_xyz[1];
27
+ float new_z = new_xyz[2];
28
+
29
+ int cnt = 0;
30
+ for (int k = 0; k < n; ++k) {
31
+ float x = xyz[k * 3 + 0];
32
+ float y = xyz[k * 3 + 1];
33
+ float z = xyz[k * 3 + 2];
34
+ float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
35
+ if (d2 < radius2){
36
+ if (cnt == 0){
37
+ for (int l = 0; l < nsample; ++l) {
38
+ idx[l] = k;
39
+ }
40
+ }
41
+ idx[cnt] = k;
42
+ ++cnt;
43
+ if (cnt >= nsample) break;
44
+ }
45
+ }
46
+ }
47
+
48
+
49
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \
50
+ const float *new_xyz, const float *xyz, int *idx) {
51
+ // new_xyz: (B, M, 3)
52
+ // xyz: (B, N, 3)
53
+ // output:
54
+ // idx: (B, M, nsample)
55
+
56
+ cudaError_t err;
57
+
58
+ dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
59
+ dim3 threads(THREADS_PER_BLOCK);
60
+
61
+ ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
62
+ // cudaDeviceSynchronize(); // for using printf in kernel function
63
+ err = cudaGetLastError();
64
+ if (cudaSuccess != err) {
65
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
66
+ exit(-1);
67
+ }
68
+ }
69
+
70
+
71
+ __global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius, \
72
+ const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) {
73
+ // key_point: (B, M, 3)
74
+ // point: (B, N, 3)
75
+ // output:
76
+ // idx: (B, N)
77
+ int bs_idx = blockIdx.y;
78
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
79
+ if (bs_idx >= b || pt_idx >= n) return;
80
+
81
+ point += bs_idx * n * 3 + pt_idx * 3;
82
+ key_point += bs_idx * m * 3;
83
+ idx += bs_idx * n + pt_idx;
84
+
85
+ float radius2 = radius * radius;
86
+ float point_x = point[0];
87
+ float point_y = point[1];
88
+ float point_z = point[2];
89
+
90
+ float bestd = 1e8;
91
+ for (int k = 0; k < m; ++k) {
92
+ float x = key_point[k * 3 + 0];
93
+ float y = key_point[k * 3 + 1];
94
+ float z = key_point[k * 3 + 2];
95
+ if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break;
96
+ float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z);
97
+ if (d2 < radius2 && d2 < bestd){
98
+ idx[0] = k;
99
+ bestd = d2;
100
+ }
101
+ }
102
+ }
103
+
104
+
105
+ void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius, \
106
+ const float *point, const float *key_point, int *idx) {
107
+ // point: (B, n, 3)
108
+ // key_point: (B, m, 3)
109
+ // output:
110
+ // idx: (B, n)
111
+
112
+ cudaError_t err;
113
+
114
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
115
+ dim3 threads(THREADS_PER_BLOCK);
116
+
117
+ ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx);
118
+ // cudaDeviceSynchronize(); // for using printf in kernel function
119
+ err = cudaGetLastError();
120
+ if (cudaSuccess != err) {
121
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
122
+ exit(-1);
123
+ }
124
+ }
125
+
126
+
127
+
128
+
129
+
130
+ __global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz,
131
+ const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) {
132
+
133
+ // new_xyz: (B, M, 3)
134
+ // xyz: (B, N, 3)
135
+ // output:
136
+ // dist2: (B, M, nsample)
137
+ // idx: (B, M, nsample)
138
+
139
+ int bs_idx = blockIdx.y;
140
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
141
+ if (bs_idx >= b || pt_idx >= m) return;
142
+
143
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
144
+ xyz += bs_idx * n * 3;
145
+ dist2 += bs_idx * m * nsample + pt_idx * nsample;
146
+ idx += bs_idx * m * nsample + pt_idx * nsample;
147
+
148
+ float nx = new_xyz[0];
149
+ float ny = new_xyz[1];
150
+ float nz = new_xyz[2];
151
+
152
+ for (int i = 0; i < n; ++i) {
153
+ float x = xyz[i * 3 + 0];
154
+ float y = xyz[i * 3 + 1];
155
+ float z = xyz[i * 3 + 2];
156
+ float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z);
157
+ if (d2 < dist2[nsample - 1]) {
158
+ dist2[nsample - 1] = d2;
159
+ idx[nsample - 1] = i;
160
+ for (int j = nsample - 2; j >= 0; j--) {
161
+ if (d2 < dist2[j]){
162
+ dist2[j + 1] = dist2[j];
163
+ dist2[j] = d2;
164
+ idx[j + 1] = idx[j];
165
+ idx[j] = i;
166
+ }
167
+ }
168
+ }
169
+ }
170
+ }
171
+
172
+
173
+ void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample, \
174
+ const float *new_xyz, const float *xyz, float *dist2, int *idx) {
175
+ cudaError_t err;
176
+
177
+ dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
178
+ dim3 threads(THREADS_PER_BLOCK);
179
+
180
+ knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx);
181
+ // cudaDeviceSynchronize(); // for using printf in kernel function
182
+ err = cudaGetLastError();
183
+ if (cudaSuccess != err) {
184
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
185
+ exit(-1);
186
+ }
187
+ }
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+ __global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample, \
197
+ const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
198
+ // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
199
+ // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
200
+ // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
201
+ // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
202
+ // output:
203
+ // idx: (M, nsample)
204
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
205
+ if (pt_idx >= M) return;
206
+
207
+ int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
208
+ for (int k = 1; k < B; k++){
209
+ if (pt_idx < pt_cnt) break;
210
+ pt_cnt += new_xyz_batch_cnt[k];
211
+ bs_idx = k;
212
+ }
213
+
214
+ int xyz_batch_start_idx = 0;
215
+ for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
216
+ // for (int k = 0; k < bs_idx; k++) new_xyz_batch_start_idx += new_xyz_batch_cnt[k];
217
+
218
+ new_xyz += pt_idx * 3;
219
+ xyz += xyz_batch_start_idx * 3;
220
+ idx += pt_idx * nsample;
221
+
222
+ float radius2 = radius * radius;
223
+ float new_x = new_xyz[0];
224
+ float new_y = new_xyz[1];
225
+ float new_z = new_xyz[2];
226
+ int n = xyz_batch_cnt[bs_idx];
227
+
228
+ int cnt = 0;
229
+ for (int k = 0; k < n; ++k) {
230
+ float x = xyz[k * 3 + 0];
231
+ float y = xyz[k * 3 + 1];
232
+ float z = xyz[k * 3 + 2];
233
+ float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
234
+ if (d2 < radius2){
235
+ if (cnt == 0){
236
+ for (int l = 0; l < nsample; ++l) {
237
+ idx[l] = k;
238
+ }
239
+ }
240
+ idx[cnt] = k;
241
+ ++cnt;
242
+ if (cnt >= nsample) break;
243
+ }
244
+ }
245
+ if (cnt == 0) idx[0] = -1;
246
+ }
247
+
248
+
249
+ void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
250
+ const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx){
251
+ // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
252
+ // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
253
+ // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
254
+ // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
255
+ // output:
256
+ // idx: (M, nsample)
257
+
258
+ cudaError_t err;
259
+
260
+ dim3 blocks(DIVUP(M, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
261
+ dim3 threads(THREADS_PER_BLOCK);
262
+
263
+ ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
264
+ // cudaDeviceSynchronize(); // for using printf in kernel function
265
+ err = cudaGetLastError();
266
+ if (cudaSuccess != err) {
267
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
268
+ exit(-1);
269
+ }
270
+ }
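As a sanity check for the fast kernel above, the same ball-query behaviour can be expressed in a few lines of NumPy. This reference helper is illustrative only (it is not part of the extension) and mirrors the padding rule in ball_query_kernel_fast, where the first neighbour found pre-fills all nsample slots before later hits overwrite them:

import numpy as np

def ball_query_reference(new_xyz, xyz, radius, nsample):
    # new_xyz: (B, M, 3) query centers, xyz: (B, N, 3) points.
    # Returns idx: (B, M, nsample) int32, like ball_query_kernel_fast:
    # indices of the first `nsample` points with squared distance < radius^2,
    # padded with the first hit. (The CUDA kernel leaves idx untouched when a
    # query has no neighbour at all; here such rows simply stay 0.)
    B, M, _ = new_xyz.shape
    idx = np.zeros((B, M, nsample), dtype=np.int32)
    r2 = radius * radius
    for b in range(B):
        for m in range(M):
            d2 = ((xyz[b] - new_xyz[b, m]) ** 2).sum(axis=1)
            hits = np.flatnonzero(d2 < r2)[:nsample]
            if hits.size:
                idx[b, m, :] = hits[0]
                idx[b, m, :hits.size] = hits
    return idx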
pc_util/src/ball_query_gpu.h ADDED
@@ -0,0 +1,38 @@
#ifndef _BALL_QUERY_GPU_H
#define _BALL_QUERY_GPU_H

#include <torch/serialize/tensor.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>

int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
    at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);

void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
    const float *new_xyz, const float *xyz, int *idx);

int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
    at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor);

void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius,
    const float *point, const float *key_point, int *idx);

int knn_query_wrapper_fast(int b, int n, int m, int nsample,
    at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);

void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample,
    const float *new_xyz, const float *xyz, float *dist2, int *idx);

int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
    at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
    at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor);

void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
    const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx);

#endif
pc_util/src/cluster.cpp ADDED
@@ -0,0 +1,50 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ // #include <THC/THC.h>
4
+ #include <cuda.h>
5
+ #include <cuda_runtime_api.h>
6
+ #include "cluster_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ #define CHECK_CUDA(x) do { \
15
+ if (!x.type().is_cuda()) { \
16
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
17
+ exit(-1); \
18
+ } \
19
+ } while (0)
20
+ #define CHECK_CONTIGUOUS(x) do { \
21
+ if (!x.is_contiguous()) { \
22
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
23
+ exit(-1); \
24
+ } \
25
+ } while (0)
26
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
27
+
28
+ int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
29
+ CHECK_INPUT(xyz_tensor);
30
+ const float *xyz = xyz_tensor.data<float>();
31
+ int *idx = idx_tensor.data<int>();
32
+
33
+ dbscan_kernel_launcher_fast(b, n, eps, min_pts, xyz, idx);
34
+ return 1;
35
+ }
36
+
37
+
38
+ int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
39
+ at::Tensor new_xyz_tensor, at::Tensor num_tensor) {
40
+ CHECK_INPUT(xyz_tensor);
41
+ CHECK_INPUT(idx_tensor);
42
+ const float *xyz = xyz_tensor.data<float>();
43
+ const int *idx = idx_tensor.data<int>();
44
+ float *new_xyz = new_xyz_tensor.data<float>();
45
+ int *num = num_tensor.data<int>();
46
+
47
+ cluster_pts_kernel_launcher_fast(b, n, m, xyz, idx, new_xyz, num);
48
+ return 1;
49
+ }
50
+
pc_util/src/cluster_gpu.cu ADDED
@@ -0,0 +1,192 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+
6
+ #include "cluster_gpu.h"
7
+ #include "cuda_utils.h"
8
+
9
+
10
+ __device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) {
11
+ float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2);
12
+ return sqrt(dis);
13
+ }
14
+ /*
15
+ __device__ void dfs (int i, int c, int n, int min_pts, const int* pts_cnt, const int* pts_adj, int* idx, int label) {
16
+ idx[i] = c;
17
+ if(pts_cnt[i] < min_pts) return;
18
+
19
+ for(int j=0;j<n;j++) {
20
+
21
+ int adj = pts_adj[i * n + j];
22
+ printf("%d %d %d\n", i * n, i * n + j, adj);
23
+ if (adj == -1) break;
24
+ if (idx[adj] == -1)
25
+ dfs(adj, c, n, min_pts, pts_cnt, pts_adj, idx, label);
26
+ }
27
+ }
28
+ */
29
+
30
+ __global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx,
31
+ int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) {
32
+ // xyz: (B, N, 3)
33
+ // output:
34
+ // idx: (B, N)
35
+ int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
36
+ if (bs_idx >= b) return;
37
+
38
+ xyz += bs_idx * n * 3;
39
+ idx += bs_idx * n;
40
+ pts_cnt += bs_idx * n;
41
+ pts_stack += bs_idx * n;
42
+ pts_adj += bs_idx * n * n;
43
+
44
+ for(int i=0;i<n;i++) {
45
+ pts_cnt[i] = 0;
46
+ for(int j=0;j<n;j++) {
47
+ pts_adj[i * n + j] = -1;
48
+ if(i==j) continue;
49
+ float x1 = xyz[i * 3 + 0];
50
+ float y1 = xyz[i * 3 + 1];
51
+ float z1 = xyz[i * 3 + 2];
52
+ float x2 = xyz[j * 3 + 0];
53
+ float y2 = xyz[j * 3 + 1];
54
+ float z2 = xyz[j * 3 + 2];
55
+
56
+ if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue;
57
+ if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) {
58
+ pts_adj[i * n + pts_cnt[i]] = j;
59
+ pts_cnt[i] += 1;
60
+ }
61
+
62
+ }
63
+ }
64
+
65
+ int cluster_idx = 0;
66
+
67
+ for(int i=0;i<n;i++) {
68
+ if(idx[i] != -1) continue;
69
+
70
+ if(pts_cnt[i] >= min_pts) {
71
+ for(int j=0;j<n;j++)
72
+ pts_stack[j] = -1;
73
+ pts_stack[0] = i;
74
+ int stack_idx = 0;
75
+ int stack_len = 1;
76
+ while (stack_idx < n && pts_stack[stack_idx] != -1)
77
+ {
78
+ int pts_idx = pts_stack[stack_idx];
79
+ idx[pts_idx] = cluster_idx;
80
+ if(pts_cnt[pts_idx] < min_pts){
81
+ stack_idx += 1;
82
+ continue;
83
+ }
84
+ for(int j=0;j<n;j++) {
85
+ int adj = pts_adj[pts_idx * n + j];
86
+ if (adj == -1) break;
87
+ if (idx[adj] == -1)
88
+ {
89
+ idx[adj] = -2;
90
+ pts_stack[stack_len++] = adj;
91
+ }
92
+ }
93
+ stack_idx += 1;
94
+ }
95
+ cluster_idx += 1;
96
+ }
97
+ }
98
+ }
99
+
100
+
101
+ void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) {
102
+ // xyz: (B, N, 3)
103
+ // output:
104
+ // idx: (B, N)
105
+
106
+ cudaError_t err;
107
+
108
+ dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
109
+ dim3 threads(THREADS_PER_BLOCK);
110
+
111
+ int* pts_cnt;
112
+ int* pts_stack;
113
+ int* pts_adj;
114
+
115
+ err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int));
116
+ if (cudaSuccess != err) {
117
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
118
+ exit(-1);
119
+ }
120
+
121
+ err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int));
122
+ if (cudaSuccess != err) {
123
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
124
+ exit(-1);
125
+ }
126
+
127
+ err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int));
128
+ if (cudaSuccess != err) {
129
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
130
+ exit(-1);
131
+ }
132
+
133
+ dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack);
134
+ // cudaDeviceSynchronize(); // for using printf in kernel function
135
+ cudaFree(pts_cnt);
136
+ cudaFree(pts_stack);
137
+ cudaFree(pts_adj);
138
+ err = cudaGetLastError();
139
+ if (cudaSuccess != err) {
140
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
141
+ exit(-1);
142
+ }
143
+ }
144
+
145
+
146
+
147
+ __global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx,
148
+ float *__restrict__ new_xyz, int *__restrict__ num) {
149
+ int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
150
+ if (bs_idx >= b ) return;
151
+
152
+ xyz += bs_idx * n * 3;
153
+ idx += bs_idx * n;
154
+ new_xyz += bs_idx * m * 3;
155
+ num += bs_idx * m;
156
+
157
+ for(int i=0;i<n;i++) {
158
+ if (idx[i] == -1) continue;
159
+ int c_idx = idx[i];
160
+ new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0];
161
+ new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1];
162
+ new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2];
163
+ num[c_idx] += 1;
164
+ }
165
+ for(int i=0;i<m;i++) {
166
+ if (num[i] == 0) break;
167
+ new_xyz[i * 3 + 0] /= num[i];
168
+ new_xyz[i * 3 + 1] /= num[i];
169
+ new_xyz[i * 3 + 2] /= num[i];
170
+ }
171
+
172
+ }
173
+
174
+
175
+
176
+
177
+ void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) {
178
+ cudaError_t err;
179
+
180
+ dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
181
+ dim3 threads(THREADS_PER_BLOCK);
182
+
183
+ cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num);
184
+ // cudaDeviceSynchronize(); // for using printf in kernel function
185
+ err = cudaGetLastError();
186
+ if (cudaSuccess != err) {
187
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
188
+ exit(-1);
189
+ }
190
+ }
191
+
192
+
pc_util/src/cluster_gpu.h ADDED
@@ -0,0 +1,34 @@
#ifndef _CLUSTER_GPU_H
#define _CLUSTER_GPU_H

#include <torch/serialize/tensor.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>

int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor);

void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx);

int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
    at::Tensor new_xyz_tensor, at::Tensor num_tensor);

void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num);

int dbscan_wrapper_stack(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor,
    at::Tensor idx_tensor);

void dbscan_kernel_launcher_stack(int b, int n, float eps, int min_pts,
    const float *xyz, const int *xyz_batch_cnt, int *idx);

int cluster_pts_wrapper_stack(int B, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor,
    at::Tensor new_xyz_tensor, at::Tensor cluster_cnt_tensor);

void cluster_pts_kernel_launcher_stack(int B, const float *xyz, const int *xyz_batch_cnt, int *idx,
    const float *new_xyz, const int *cluster_cnt);

#endif
pc_util/src/cuda_utils.h ADDED
@@ -0,0 +1,15 @@
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H

#include <cmath>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

inline int opt_n_threads(int work_size) {
    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);

    return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
#endif
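For reference, DIVUP(m, n) is integer ceiling division, so a launch over 1000 points with THREADS_PER_BLOCK = 256 uses DIVUP(1000, 256) = 4 blocks; opt_n_threads(work_size) rounds work_size down to a power of two and clamps the result to [1, TOTAL_THREADS], e.g. opt_n_threads(300) = 256.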
pc_util/src/group_points.cpp ADDED
@@ -0,0 +1,98 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime_api.h>
4
+ #include <vector>
5
+ // #include <THC/THC.h>
6
+ #include "group_points_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ #define CHECK_CUDA(x) do { \
15
+ if (!x.type().is_cuda()) { \
16
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
17
+ exit(-1); \
18
+ } \
19
+ } while (0)
20
+ #define CHECK_CONTIGUOUS(x) do { \
21
+ if (!x.is_contiguous()) { \
22
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
23
+ exit(-1); \
24
+ } \
25
+ } while (0)
26
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
27
+
28
+
29
+
30
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
31
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
32
+
33
+ float *grad_points = grad_points_tensor.data<float>();
34
+ const int *idx = idx_tensor.data<int>();
35
+ const float *grad_out = grad_out_tensor.data<float>();
36
+
37
+ group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points);
38
+ return 1;
39
+ }
40
+
41
+
42
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
43
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
44
+
45
+ const float *points = points_tensor.data<float>();
46
+ const int *idx = idx_tensor.data<int>();
47
+ float *out = out_tensor.data<float>();
48
+
49
+ group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
50
+ return 1;
51
+ }
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+ int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
60
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
61
+ at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor) {
62
+
63
+ CHECK_INPUT(grad_out_tensor);
64
+ CHECK_INPUT(idx_tensor);
65
+ CHECK_INPUT(idx_batch_cnt_tensor);
66
+ CHECK_INPUT(features_batch_cnt_tensor);
67
+ CHECK_INPUT(grad_features_tensor);
68
+
69
+ const float *grad_out = grad_out_tensor.data<float>();
70
+ const int *idx = idx_tensor.data<int>();
71
+ const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
72
+ const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
73
+ float *grad_features = grad_features_tensor.data<float>();
74
+
75
+ group_points_grad_kernel_launcher_stack(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
76
+ return 1;
77
+ }
78
+
79
+
80
+ int group_points_wrapper_stack(int B, int M, int C, int nsample,
81
+ at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
82
+ at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor) {
83
+
84
+ CHECK_INPUT(features_tensor);
85
+ CHECK_INPUT(features_batch_cnt_tensor);
86
+ CHECK_INPUT(idx_tensor);
87
+ CHECK_INPUT(idx_batch_cnt_tensor);
88
+ CHECK_INPUT(out_tensor);
89
+
90
+ const float *features = features_tensor.data<float>();
91
+ const int *idx = idx_tensor.data<int>();
92
+ const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
93
+ const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
94
+ float *out = out_tensor.data<float>();
95
+
96
+ group_points_kernel_launcher_stack(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
97
+ return 1;
98
+ }
pc_util/src/group_points_gpu.cu ADDED
@@ -0,0 +1,199 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+
4
+ #include "cuda_utils.h"
5
+ #include "group_points_gpu.h"
6
+
7
+
8
+ __global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
9
+ const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
10
+ // grad_out: (B, C, npoints, nsample)
11
+ // idx: (B, npoints, nsample)
12
+ // output:
13
+ // grad_points: (B, C, N)
14
+ int bs_idx = blockIdx.z;
15
+ int c_idx = blockIdx.y;
16
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
17
+ int pt_idx = index / nsample;
18
+ if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
19
+
20
+ int sample_idx = index % nsample;
21
+ grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
22
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
23
+
24
+ atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
25
+ }
26
+
27
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
28
+ const float *grad_out, const int *idx, float *grad_points) {
29
+ // grad_out: (B, C, npoints, nsample)
30
+ // idx: (B, npoints, nsample)
31
+ // output:
32
+ // grad_points: (B, C, N)
33
+ cudaError_t err;
34
+ dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
35
+ dim3 threads(THREADS_PER_BLOCK);
36
+
37
+ group_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
38
+
39
+ err = cudaGetLastError();
40
+ if (cudaSuccess != err) {
41
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
42
+ exit(-1);
43
+ }
44
+ }
45
+
46
+
47
+ __global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
48
+ const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
49
+ // points: (B, C, N)
50
+ // idx: (B, npoints, nsample)
51
+ // output:
52
+ // out: (B, C, npoints, nsample)
53
+ int bs_idx = blockIdx.z;
54
+ int c_idx = blockIdx.y;
55
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
56
+ int pt_idx = index / nsample;
57
+ if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
58
+
59
+ int sample_idx = index % nsample;
60
+
61
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
62
+ int in_idx = bs_idx * c * n + c_idx * n + idx[0];
63
+ int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
64
+
65
+ out[out_idx] = points[in_idx];
66
+ }
67
+
68
+
69
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
70
+ const float *points, const int *idx, float *out) {
71
+ // points: (B, C, N)
72
+ // idx: (B, npoints, nsample)
73
+ // output:
74
+ // out: (B, C, npoints, nsample)
75
+ cudaError_t err;
76
+ dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
77
+ dim3 threads(THREADS_PER_BLOCK);
78
+
79
+ group_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
80
+ // cudaDeviceSynchronize(); // for using printf in kernel function
81
+ err = cudaGetLastError();
82
+ if (cudaSuccess != err) {
83
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
84
+ exit(-1);
85
+ }
86
+ }
87
+
88
+
89
+ __global__ void group_points_grad_kernel_stack(int B, int M, int C, int N, int nsample,
90
+ const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
91
+ // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
92
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
93
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
94
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
95
+ // :return:
96
+ // grad_features: (N1 + N2 ..., C) gradient of the features
97
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
98
+ int sample_idx = index % nsample;
99
+ int C_idx = (index / nsample) % C;
100
+ int pt_idx = (index / nsample / C);
101
+
102
+ if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
103
+
104
+ int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
105
+ for (int k = 1; k < B; k++){
106
+ if (pt_idx < pt_cnt) break;
107
+ pt_cnt += idx_batch_cnt[k];
108
+ bs_idx = k;
109
+ }
110
+
111
+ int features_batch_start_idx = 0;
112
+ for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
113
+
114
+ grad_out += pt_idx * C * nsample + C_idx * nsample + sample_idx;
115
+ idx += pt_idx * nsample + sample_idx;
116
+ grad_features += (features_batch_start_idx + idx[0]) * C + C_idx;
117
+
118
+ atomicAdd(grad_features, grad_out[0]);
119
+ }
120
+
121
+ void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
122
+ const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
123
+ // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
124
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
125
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
126
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
127
+ // :return:
128
+ // grad_features: (N1 + N2 ..., C) gradient of the features
129
+
130
+ cudaError_t err;
131
+ // dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
132
+ dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
133
+ dim3 threads(THREADS_PER_BLOCK);
134
+
135
+ group_points_grad_kernel_stack<<<blocks, threads>>>(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
136
+
137
+ err = cudaGetLastError();
138
+ if (cudaSuccess != err) {
139
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
140
+ exit(-1);
141
+ }
142
+ }
143
+
144
+
145
+ __global__ void group_points_kernel_stack(int B, int M, int C, int nsample,
146
+ const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
147
+ // :param features: (N1 + N2 ..., C) tensor of features to group
148
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
149
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
150
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
151
+ // :return:
152
+ // output: (M1 + M2, C, nsample) tensor
153
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
154
+ int sample_idx = index % nsample;
155
+ int C_idx = (index / nsample) % C;
156
+ int pt_idx = (index / nsample / C);
157
+
158
+ if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
159
+
160
+ int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
161
+ for (int k = 1; k < B; k++){
162
+ if (pt_idx < pt_cnt) break;
163
+ pt_cnt += idx_batch_cnt[k];
164
+ bs_idx = k;
165
+ }
166
+
167
+ int features_batch_start_idx = 0;
168
+ for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
169
+ features += features_batch_start_idx * C;
170
+
171
+ idx += pt_idx * nsample + sample_idx;
172
+ int in_idx = idx[0] * C + C_idx;
173
+ int out_idx = pt_idx * C * nsample + C_idx * nsample + sample_idx;
174
+
175
+ out[out_idx] = features[in_idx];
176
+ }
177
+
178
+
179
+ void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
180
+ const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
181
+ // :param features: (N1 + N2 ..., C) tensor of features to group
182
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
183
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
184
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
185
+ // :return:
186
+ // output: (M1 + M2, C, nsample) tensor
187
+
188
+ cudaError_t err;
189
+ dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
190
+ dim3 threads(THREADS_PER_BLOCK);
191
+
192
+ group_points_kernel_stack<<<blocks, threads>>>(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
193
+ // cudaDeviceSynchronize(); // for using printf in kernel function
194
+ err = cudaGetLastError();
195
+ if (cudaSuccess != err) {
196
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
197
+ exit(-1);
198
+ }
199
+ }
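As the shape comments indicate, group_points_kernel_fast gathers features of shape (B, C, N) at indices of shape (B, npoints, nsample) into an output of shape (B, C, npoints, nsample), one CUDA thread per output element, while the grad kernel scatters gradients back to (B, C, N) with atomicAdd; the *_stack variants perform the same gather/scatter for variable-length point sets stored as concatenations and described by the idx_batch_cnt / features_batch_cnt arrays.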
pc_util/src/group_points_gpu.h ADDED
@@ -0,0 +1,36 @@
#ifndef _GROUP_POINTS_GPU_H
#define _GROUP_POINTS_GPU_H

#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>

int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);

void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *points, const int *idx, float *out);

int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *grad_out, const int *idx, float *grad_points);

int group_points_wrapper_stack(int B, int M, int C, int nsample,
    at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
    at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor);

void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
    const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out);

int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
    at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor);

void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
    const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features);

#endif
pc_util/src/interpolate.cpp ADDED
@@ -0,0 +1,148 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ // #include <THC/THC.h>
4
+ #include <math.h>
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <cuda.h>
8
+ #include <cuda_runtime_api.h>
9
+ #include "interpolate_gpu.h"
10
+
11
+ // extern THCState *state;
12
+
13
+ #include <ATen/cuda/CUDAContext.h>
14
+ #include <ATen/cuda/CUDAEvent.h>
15
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
16
+
17
+ #define CHECK_CUDA(x) do { \
18
+ if (!x.type().is_cuda()) { \
19
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
20
+ exit(-1); \
21
+ } \
22
+ } while (0)
23
+ #define CHECK_CONTIGUOUS(x) do { \
24
+ if (!x.is_contiguous()) { \
25
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
26
+ exit(-1); \
27
+ } \
28
+ } while (0)
29
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
30
+
31
+
32
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
33
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
34
+ const float *unknown = unknown_tensor.data<float>();
35
+ const float *known = known_tensor.data<float>();
36
+ float *dist2 = dist2_tensor.data<float>();
37
+ int *idx = idx_tensor.data<int>();
38
+
39
+ three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx);
40
+ }
41
+
42
+
43
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n,
44
+ at::Tensor points_tensor,
45
+ at::Tensor idx_tensor,
46
+ at::Tensor weight_tensor,
47
+ at::Tensor out_tensor) {
48
+
49
+ const float *points = points_tensor.data<float>();
50
+ const float *weight = weight_tensor.data<float>();
51
+ float *out = out_tensor.data<float>();
52
+ const int *idx = idx_tensor.data<int>();
53
+
54
+
55
+ three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out);
56
+ }
57
+
58
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
59
+ at::Tensor grad_out_tensor,
60
+ at::Tensor idx_tensor,
61
+ at::Tensor weight_tensor,
62
+ at::Tensor grad_points_tensor) {
63
+
64
+ const float *grad_out = grad_out_tensor.data<float>();
65
+ const float *weight = weight_tensor.data<float>();
66
+ float *grad_points = grad_points_tensor.data<float>();
67
+ const int *idx = idx_tensor.data<int>();
68
+
69
+ three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points);
70
+ }
71
+
72
+
73
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
74
+ at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
75
+ at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor){
76
+ // unknown: (N1 + N2 ..., 3)
77
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
78
+ // known: (M1 + M2 ..., 3)
79
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
80
+ // Return:
81
+ // dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
82
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
83
+ CHECK_INPUT(unknown_tensor);
84
+ CHECK_INPUT(unknown_batch_cnt_tensor);
85
+ CHECK_INPUT(known_tensor);
86
+ CHECK_INPUT(known_batch_cnt_tensor);
87
+ CHECK_INPUT(dist2_tensor);
88
+ CHECK_INPUT(idx_tensor);
89
+
90
+ int batch_size = unknown_batch_cnt_tensor.size(0);
91
+ int N = unknown_tensor.size(0);
92
+ int M = known_tensor.size(0);
93
+ const float *unknown = unknown_tensor.data<float>();
94
+ const int *unknown_batch_cnt = unknown_batch_cnt_tensor.data<int>();
95
+ const float *known = known_tensor.data<float>();
96
+ const int *known_batch_cnt = known_batch_cnt_tensor.data<int>();
97
+ float *dist2 = dist2_tensor.data<float>();
98
+ int *idx = idx_tensor.data<int>();
99
+
100
+ three_nn_kernel_launcher_stack(batch_size, N, M, unknown, unknown_batch_cnt, known, known_batch_cnt, dist2, idx);
101
+ }
102
+
103
+
104
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
105
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) {
106
+ // features_tensor: (M1 + M2 ..., C)
107
+ // idx_tensor: [N1 + N2 ..., 3]
108
+ // weight_tensor: [N1 + N2 ..., 3]
109
+ // Return:
110
+ // out_tensor: (N1 + N2 ..., C)
111
+ CHECK_INPUT(features_tensor);
112
+ CHECK_INPUT(idx_tensor);
113
+ CHECK_INPUT(weight_tensor);
114
+ CHECK_INPUT(out_tensor);
115
+
116
+ int N = out_tensor.size(0);
117
+ int channels = features_tensor.size(1);
118
+ const float *features = features_tensor.data<float>();
119
+ const float *weight = weight_tensor.data<float>();
120
+ const int *idx = idx_tensor.data<int>();
121
+ float *out = out_tensor.data<float>();
122
+
123
+ three_interpolate_kernel_launcher_stack(N, channels, features, idx, weight, out);
124
+ }
125
+
126
+
127
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
128
+ at::Tensor weight_tensor, at::Tensor grad_features_tensor) {
129
+ // grad_out_tensor: (N1 + N2 ..., C)
130
+ // idx_tensor: [N1 + N2 ..., 3]
131
+ // weight_tensor: [N1 + N2 ..., 3]
132
+ // Return:
133
+ // grad_features_tensor: (M1 + M2 ..., C)
134
+ CHECK_INPUT(grad_out_tensor);
135
+ CHECK_INPUT(idx_tensor);
136
+ CHECK_INPUT(weight_tensor);
137
+ CHECK_INPUT(grad_features_tensor);
138
+
139
+ int N = grad_out_tensor.size(0);
140
+ int channels = grad_out_tensor.size(1);
141
+ const float *grad_out = grad_out_tensor.data<float>();
142
+ const float *weight = weight_tensor.data<float>();
143
+ const int *idx = idx_tensor.data<int>();
144
+ float *grad_features = grad_features_tensor.data<float>();
145
+
146
+ // printf("N=%d, channels=%d\n", N, channels);
147
+ three_interpolate_grad_kernel_launcher_stack(N, channels, grad_out, idx, weight, grad_features);
148
+ }
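
For reference, below is a minimal NumPy sketch of what the stack-format three-NN path is expected to compute: a per-batch nearest-neighbour search over the concatenated point lists, returning squared distances and indices that are global offsets into the stacked `known` array (mirroring the `known_batch_start_idx` bookkeeping in the CUDA kernel). The function name and shapes are illustrative only and are not part of the extension. Downstream code typically converts `dist2` into normalized inverse-distance weights before calling the interpolation wrapper; that step lives outside these files.

import numpy as np

def three_nn_stack_reference(unknown, unknown_batch_cnt, known, known_batch_cnt):
    # unknown:           (N1 + N2 + ..., 3) stacked query points
    # unknown_batch_cnt: (batch_size,)      [N1, N2, ...]
    # known:             (M1 + M2 + ..., 3) stacked support points
    # known_batch_cnt:   (batch_size,)      [M1, M2, ...]
    # returns dist2 (squared distances) and idx (global indices into `known`), both (sum(Ni), 3)
    N = int(sum(unknown_batch_cnt))
    dist2 = np.zeros((N, 3), dtype=np.float32)
    idx = np.zeros((N, 3), dtype=np.int32)
    u_start, k_start = 0, 0
    for nu, nk in zip(unknown_batch_cnt, known_batch_cnt):
        q = unknown[u_start:u_start + nu]                      # (nu, 3)
        s = known[k_start:k_start + nk]                        # (nk, 3)
        d2 = ((q[:, None, :] - s[None, :, :]) ** 2).sum(-1)    # (nu, nk) squared distances
        nn = np.argsort(d2, axis=1)[:, :3]                     # three nearest supports per query
        dist2[u_start:u_start + nu] = np.take_along_axis(d2, nn, axis=1)
        idx[u_start:u_start + nu] = nn + k_start               # global index, as in the kernel
        u_start += nu
        k_start += nk
    return dist2, idx
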
pc_util/src/interpolate_gpu.cu ADDED
@@ -0,0 +1,343 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "cuda_utils.h"
6
+ #include "interpolate_gpu.h"
7
+
8
+
9
+ __global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
10
+ const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
11
+ // unknown: (B, N, 3)
12
+ // known: (B, M, 3)
13
+ // output:
14
+ // dist2: (B, N, 3)
15
+ // idx: (B, N, 3)
16
+
17
+ int bs_idx = blockIdx.y;
18
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
19
+ if (bs_idx >= b || pt_idx >= n) return;
20
+
21
+ unknown += bs_idx * n * 3 + pt_idx * 3;
22
+ known += bs_idx * m * 3;
23
+ dist2 += bs_idx * n * 3 + pt_idx * 3;
24
+ idx += bs_idx * n * 3 + pt_idx * 3;
25
+
26
+ float ux = unknown[0];
27
+ float uy = unknown[1];
28
+ float uz = unknown[2];
29
+
30
+ double best1 = 1e40, best2 = 1e40, best3 = 1e40;
31
+ int besti1 = 0, besti2 = 0, besti3 = 0;
32
+ for (int k = 0; k < m; ++k) {
33
+ float x = known[k * 3 + 0];
34
+ float y = known[k * 3 + 1];
35
+ float z = known[k * 3 + 2];
36
+ float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
37
+ if (d < best1) {
38
+ best3 = best2; besti3 = besti2;
39
+ best2 = best1; besti2 = besti1;
40
+ best1 = d; besti1 = k;
41
+ }
42
+ else if (d < best2) {
43
+ best3 = best2; besti3 = besti2;
44
+ best2 = d; besti2 = k;
45
+ }
46
+ else if (d < best3) {
47
+ best3 = d; besti3 = k;
48
+ }
49
+ }
50
+ dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
51
+ idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
52
+ }
53
+
54
+
55
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
56
+ const float *known, float *dist2, int *idx) {
57
+ // unknown: (B, N, 3)
58
+ // known: (B, M, 3)
59
+ // output:
60
+ // dist2: (B, N, 3)
61
+ // idx: (B, N, 3)
62
+
63
+ cudaError_t err;
64
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
65
+ dim3 threads(THREADS_PER_BLOCK);
66
+
67
+ three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
68
+
69
+ err = cudaGetLastError();
70
+ if (cudaSuccess != err) {
71
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
72
+ exit(-1);
73
+ }
74
+ }
75
+
76
+
77
+ __global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
78
+ const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
79
+ // points: (B, C, M)
80
+ // idx: (B, N, 3)
81
+ // weight: (B, N, 3)
82
+ // output:
83
+ // out: (B, C, N)
84
+
85
+ int bs_idx = blockIdx.z;
86
+ int c_idx = blockIdx.y;
87
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
88
+
89
+ if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
90
+
91
+ weight += bs_idx * n * 3 + pt_idx * 3;
92
+ points += bs_idx * c * m + c_idx * m;
93
+ idx += bs_idx * n * 3 + pt_idx * 3;
94
+ out += bs_idx * c * n + c_idx * n;
95
+
96
+ out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
97
+ }
98
+
99
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
100
+ const float *points, const int *idx, const float *weight, float *out) {
101
+ // points: (B, C, M)
102
+ // idx: (B, N, 3)
103
+ // weight: (B, N, 3)
104
+ // output:
105
+ // out: (B, C, N)
106
+
107
+ cudaError_t err;
108
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
109
+ dim3 threads(THREADS_PER_BLOCK);
110
+ three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
111
+
112
+ err = cudaGetLastError();
113
+ if (cudaSuccess != err) {
114
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
115
+ exit(-1);
116
+ }
117
+ }
118
+
119
+
120
+ __global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
121
+ const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
122
+ // grad_out: (B, C, N)
123
+ // weight: (B, N, 3)
124
+ // output:
125
+ // grad_points: (B, C, M)
126
+
127
+ int bs_idx = blockIdx.z;
128
+ int c_idx = blockIdx.y;
129
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
130
+
131
+ if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
132
+
133
+ grad_out += bs_idx * c * n + c_idx * n + pt_idx;
134
+ weight += bs_idx * n * 3 + pt_idx * 3;
135
+ grad_points += bs_idx * c * m + c_idx * m;
136
+ idx += bs_idx * n * 3 + pt_idx * 3;
137
+
138
+
139
+ atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
140
+ atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
141
+ atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
142
+ }
143
+
144
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
145
+ const int *idx, const float *weight, float *grad_points) {
146
+ // grad_out: (B, C, N)
147
+ // weight: (B, N, 3)
148
+ // output:
149
+ // grad_points: (B, C, M)
150
+
151
+ cudaError_t err;
152
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
153
+ dim3 threads(THREADS_PER_BLOCK);
154
+ three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);
155
+
156
+ err = cudaGetLastError();
157
+ if (cudaSuccess != err) {
158
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
159
+ exit(-1);
160
+ }
161
+ }
162
+
163
+
164
+ __global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
165
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
166
+ float *dist2, int *idx) {
167
+ // unknown: (N1 + N2 ..., 3)
168
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
169
+ // known: (M1 + M2 ..., 3)
170
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
171
+ // Return:
172
+ // dist2: (N1 + N2 ..., 3) squared L2 distance to the three nearest neighbors
173
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
174
+
175
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
176
+ if (pt_idx >= N) return;
177
+
178
+ int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
179
+ for (int k = 1; k < batch_size; k++){
180
+ if (pt_idx < pt_cnt) break;
181
+ pt_cnt += unknown_batch_cnt[k];
182
+ bs_idx = k;
183
+ }
184
+
185
+ int cur_num_known_points = known_batch_cnt[bs_idx];
186
+
187
+ int known_batch_start_idx = 0;
188
+ for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];
189
+
190
+ known += known_batch_start_idx * 3;
191
+ unknown += pt_idx * 3;
192
+ dist2 += pt_idx * 3;
193
+ idx += pt_idx * 3;
194
+
195
+ float ux = unknown[0];
196
+ float uy = unknown[1];
197
+ float uz = unknown[2];
198
+
199
+ double best1 = 1e40, best2 = 1e40, best3 = 1e40;
200
+ int besti1 = 0, besti2 = 0, besti3 = 0;
201
+ for (int k = 0; k < cur_num_known_points; ++k) {
202
+ float x = known[k * 3 + 0];
203
+ float y = known[k * 3 + 1];
204
+ float z = known[k * 3 + 2];
205
+ float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
206
+ if (d < best1) {
207
+ best3 = best2; besti3 = besti2;
208
+ best2 = best1; besti2 = besti1;
209
+ best1 = d; besti1 = k;
210
+ }
211
+ else if (d < best2) {
212
+ best3 = best2; besti3 = besti2;
213
+ best2 = d; besti2 = k;
214
+ }
215
+ else if (d < best3) {
216
+ best3 = d; besti3 = k;
217
+ }
218
+ }
219
+ dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
220
+ idx[0] = besti1 + known_batch_start_idx;
221
+ idx[1] = besti2 + known_batch_start_idx;
222
+ idx[2] = besti3 + known_batch_start_idx;
223
+ }
224
+
225
+
226
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
227
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
228
+ float *dist2, int *idx) {
229
+ // unknown: (N1 + N2 ..., 3)
230
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
231
+ // known: (M1 + M2 ..., 3)
232
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
233
+ // Return:
234
+ // dist2: (N1 + N2 ..., 3) squared L2 distance to the three nearest neighbors
235
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
236
+
237
+ cudaError_t err;
238
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
239
+ dim3 threads(THREADS_PER_BLOCK);
240
+
241
+ three_nn_kernel_stack<<<blocks, threads>>>(
242
+ batch_size, N, M, unknown, unknown_batch_cnt,
243
+ known, known_batch_cnt, dist2, idx
244
+ );
245
+
246
+ err = cudaGetLastError();
247
+ if (cudaSuccess != err) {
248
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
249
+ exit(-1);
250
+ }
251
+ }
252
+
253
+
254
+
255
+ __global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
256
+ const int *idx, const float *weight, float *out) {
257
+ // features: (M1 + M2 ..., C)
258
+ // idx: [N1 + N2 ..., 3]
259
+ // weight: [N1 + N2 ..., 3]
260
+ // Return:
261
+ // out: (N1 + N2 ..., C)
262
+
263
+ int c_idx = blockIdx.y;
264
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
265
+ if (pt_idx >= N || c_idx >= channels) return;
266
+
267
+ weight += pt_idx * 3;
268
+ idx += pt_idx * 3;
269
+ out += pt_idx * channels + c_idx;
270
+
271
+ out[0] = weight[0] * features[idx[0] * channels + c_idx] +
272
+ weight[1] * features[idx[1] * channels + c_idx] +
273
+ weight[2] * features[idx[2] * channels + c_idx];
274
+ }
275
+
276
+
277
+
278
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
279
+ const float *features, const int *idx, const float *weight, float *out) {
280
+ // features: (M1 + M2 ..., C)
281
+ // idx: [N1 + N2 ..., 3]
282
+ // weight: [N1 + N2 ..., 3]
283
+ // Return:
284
+ // out: (N1 + N2 ..., C)
285
+
286
+ cudaError_t err;
287
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
288
+ dim3 threads(THREADS_PER_BLOCK);
289
+ three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);
290
+
291
+ err = cudaGetLastError();
292
+ if (cudaSuccess != err) {
293
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
294
+ exit(-1);
295
+ }
296
+ }
297
+
298
+
299
+ __global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
300
+ const int *idx, const float *weight, float *grad_features) {
301
+ // grad_out_tensor: (N1 + N2 ..., C)
302
+ // idx_tensor: [N1 + N2 ..., 3]
303
+ // weight_tensor: [N1 + N2 ..., 3]
304
+ // Return:
305
+ // grad_features_tensor: (M1 + M2 ..., C)
306
+
307
+ int c_idx = blockIdx.y;
308
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
309
+ if (pt_idx >= N || c_idx >= channels) return;
310
+
311
+ grad_out += pt_idx * channels + c_idx;
312
+ weight += pt_idx * 3;
313
+ idx += pt_idx * 3;
314
+
315
+ // printf("pt_idx=%d, c_idx=%d, idx=(%d, %d, %d), grad_out=%f\n", pt_idx, c_idx, idx[0], idx[1], idx[2], grad_out[0]);
316
+
317
+ atomicAdd(grad_features + idx[0] * channels + c_idx, grad_out[0] * weight[0]);
318
+ atomicAdd(grad_features + idx[1] * channels + c_idx, grad_out[0] * weight[1]);
319
+ atomicAdd(grad_features + idx[2] * channels + c_idx, grad_out[0] * weight[2]);
320
+ }
321
+
322
+
323
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
324
+ const int *idx, const float *weight, float *grad_features) {
325
+ // grad_out_tensor: (N1 + N2 ..., C)
326
+ // idx_tensor: [N1 + N2 ..., 3]
327
+ // weight_tensor: [N1 + N2 ..., 3]
328
+ // Return:
329
+ // grad_features_tensor: (M1 + M2 ..., C)
330
+
331
+ cudaError_t err;
332
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels); // blockIdx.x(col), blockIdx.y(row)
333
+ dim3 threads(THREADS_PER_BLOCK);
334
+ three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
335
+ N, channels, grad_out, idx, weight, grad_features
336
+ );
337
+
338
+ err = cudaGetLastError();
339
+ if (cudaSuccess != err) {
340
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
341
+ exit(-1);
342
+ }
343
+ }
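
As a cross-check for the batched "fast" kernels above, here is a small NumPy sketch of the forward weighted sum and of the scatter-add that three_interpolate_grad_kernel_fast performs with atomicAdd. Shapes follow the kernel comments ((B, C, M) features, (B, N, 3) idx/weight); the helper names are made up for illustration.

import numpy as np

def three_interpolate_forward_ref(points, idx, weight):
    # points: (B, C, M), idx: (B, N, 3), weight: (B, N, 3) -> out: (B, C, N)
    B, C, M = points.shape
    N = idx.shape[1]
    out = np.zeros((B, C, N), dtype=points.dtype)
    for b in range(B):
        gathered = points[b][:, idx[b]]                # (C, N, 3): three neighbours per output point
        out[b] = (gathered * weight[b][None]).sum(-1)  # blend them with the given weights
    return out

def three_interpolate_backward_ref(grad_out, idx, weight, M):
    # grad_out: (B, C, N) -> grad_points: (B, C, M); the same scatter the atomicAdd loop performs
    B, C, N = grad_out.shape
    grad_points = np.zeros((B, C, M), dtype=grad_out.dtype)
    for b in range(B):
        for n in range(N):
            for k in range(3):
                grad_points[b, :, idx[b, n, k]] += grad_out[b, :, n] * weight[b, n, k]
    return grad_points
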
pc_util/src/interpolate_gpu.h ADDED
@@ -0,0 +1,61 @@
1
+ #ifndef _INTERPOLATE_GPU_H
2
+ #define _INTERPOLATE_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include<vector>
6
+ #include <cuda.h>
7
+ #include <cuda_runtime_api.h>
8
+
9
+
10
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
11
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12
+
13
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
14
+ const float *known, float *dist2, int *idx);
15
+
16
+
17
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
18
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
19
+
20
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
21
+ const float *points, const int *idx, const float *weight, float *out);
22
+
23
+
24
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
25
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
26
+
27
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
28
+ const int *idx, const float *weight, float *grad_points);
29
+
30
+
31
+
32
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
33
+ at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
34
+ at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
35
+
36
+
37
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
38
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
39
+
40
+
41
+
42
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
43
+ at::Tensor weight_tensor, at::Tensor grad_features_tensor);
44
+
45
+
46
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
47
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
48
+ float *dist2, int *idx);
49
+
50
+
51
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
52
+ const float *features, const int *idx, const float *weight, float *out);
53
+
54
+
55
+
56
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
57
+ const int *idx, const float *weight, float *grad_features);
58
+
59
+
60
+
61
+ #endif
pc_util/src/pointnet2_api.cpp ADDED
@@ -0,0 +1,41 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <torch/extension.h>
3
+
4
+ #include "ball_query_gpu.h"
5
+ #include "group_points_gpu.h"
6
+ #include "sampling_gpu.h"
7
+ #include "interpolate_gpu.h"
8
+ #include "cluster_gpu.h"
9
+
10
+
11
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
12
+ m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
13
+ m.def("ball_center_query_wrapper", &ball_center_query_wrapper_fast, "ball_center_query_wrapper_fast");
14
+ m.def("knn_query_wrapper", &knn_query_wrapper_fast, "knn_query_wrapper_fast");
15
+
16
+ m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
17
+ m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
18
+
19
+ m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
20
+ m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
21
+
22
+ m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
23
+
24
+ m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
25
+ m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
26
+ m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
27
+
28
+ m.def("dbscan_wrapper", &dbscan_wrapper_fast, "dbscan_wrapper_fast");
29
+ m.def("cluster_pts_wrapper", &cluster_pts_wrapper_fast, "cluster_pts_wrapper_fast");
30
+
31
+
32
+ m.def("ball_query_wrapper_stack", &ball_query_wrapper_stack, "ball_query_wrapper_stack");
33
+
34
+ m.def("group_points_wrapper_stack", &group_points_wrapper_stack, "group_points_wrapper_stack");
35
+ m.def("group_points_grad_wrapper_stack", &group_points_grad_wrapper_stack, "group_points_grad_wrapper_stack");
36
+
37
+ m.def("three_nn_wrapper_stack", &three_nn_wrapper_stack, "three_nn_wrapper_stack");
38
+ m.def("three_interpolate_wrapper_stack", &three_interpolate_wrapper_stack, "three_interpolate_wrapper_stack");
39
+ m.def("three_interpolate_grad_wrapper_stack", &three_interpolate_grad_wrapper_stack, "three_interpolate_grad_wrapper_stack");
40
+
41
+ }
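
A hedged sketch of how the bindings above could be driven from Python once the extension is built. The module name `pc_util` and the weight recipe are assumptions (the build script is not shown here); the wrapper names and argument orders follow interpolate_gpu.h and the m.def calls above.

import torch
import pc_util  # compiled TORCH_EXTENSION_NAME module; the module name is an assumption

B, N, M, C = 2, 1024, 256, 64
unknown = torch.rand(B, N, 3, device="cuda").contiguous()
known = torch.rand(B, M, 3, device="cuda").contiguous()
dist2 = torch.zeros(B, N, 3, device="cuda")
idx = torch.zeros(B, N, 3, dtype=torch.int32, device="cuda")

# three_nn_wrapper(b, n, m, unknown, known, dist2, idx) fills dist2/idx in place
pc_util.three_nn_wrapper(B, N, M, unknown, known, dist2, idx)

# turn squared distances into normalized inverse-distance weights (common PointNet++ recipe)
recip = 1.0 / (dist2 + 1e-8)
weight = (recip / recip.sum(dim=2, keepdim=True)).contiguous()

features = torch.rand(B, C, M, device="cuda").contiguous()
out = torch.zeros(B, C, N, device="cuda")
pc_util.three_interpolate_wrapper(B, C, M, N, features, idx, weight, out)
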
pc_util/src/sampling.cpp ADDED
@@ -0,0 +1,46 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <ATen/cuda/CUDAContext.h>
3
+ #include <vector>
4
+ // #include <THC/THC.h>
5
+
6
+ #include "sampling_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
15
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
16
+ const float *points = points_tensor.data<float>();
17
+ const int *idx = idx_tensor.data<int>();
18
+ float *out = out_tensor.data<float>();
19
+
20
+ gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out);
21
+ return 1;
22
+ }
23
+
24
+
25
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
26
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
27
+
28
+ const float *grad_out = grad_out_tensor.data<float>();
29
+ const int *idx = idx_tensor.data<int>();
30
+ float *grad_points = grad_points_tensor.data<float>();
31
+
32
+ gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points);
33
+ return 1;
34
+ }
35
+
36
+
37
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
38
+ at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
39
+
40
+ const float *points = points_tensor.data<float>();
41
+ float *temp = temp_tensor.data<float>();
42
+ int *idx = idx_tensor.data<int>();
43
+
44
+ furthest_point_sampling_kernel_launcher(b, c, n, m, w1, w2, points, temp, idx);
45
+ return 1;
46
+ }
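
For the gather wrappers above, a NumPy sketch of the forward gather and of the gradient scatter that gather_points_grad_kernel_fast implements with atomicAdd; the names are illustrative and not part of the extension.

import numpy as np

def gather_points_ref(points, idx):
    # points: (B, C, N), idx: (B, M) -> out: (B, C, M)
    return np.stack([points[b][:, idx[b]] for b in range(points.shape[0])], axis=0)

def gather_points_grad_ref(grad_out, idx, N):
    # grad_out: (B, C, M), idx: (B, M) -> grad_points: (B, C, N)
    B, C, M = grad_out.shape
    grad_points = np.zeros((B, C, N), dtype=grad_out.dtype)
    for b in range(B):
        for m in range(M):
            grad_points[b, :, idx[b, m]] += grad_out[b, :, m]
    return grad_points
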
pc_util/src/sampling_gpu.cu ADDED
@@ -0,0 +1,259 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+
4
+ #include "cuda_utils.h"
5
+ #include "sampling_gpu.h"
6
+
7
+
8
+ __global__ void gather_points_kernel_fast(int b, int c, int n, int m,
9
+ const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
10
+ // points: (B, C, N)
11
+ // idx: (B, M)
12
+ // output:
13
+ // out: (B, C, M)
14
+
15
+ int bs_idx = blockIdx.z;
16
+ int c_idx = blockIdx.y;
17
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
18
+ if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
19
+
20
+ out += bs_idx * c * m + c_idx * m + pt_idx;
21
+ idx += bs_idx * m + pt_idx;
22
+ points += bs_idx * c * n + c_idx * n;
23
+ out[0] = points[idx[0]];
24
+ }
25
+
26
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
27
+ const float *points, const int *idx, float *out) {
28
+ // points: (B, C, N)
29
+ // idx: (B, npoints)
30
+ // output:
31
+ // out: (B, C, npoints)
32
+
33
+ cudaError_t err;
34
+ dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
35
+ dim3 threads(THREADS_PER_BLOCK);
36
+
37
+ gather_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, points, idx, out);
38
+
39
+ err = cudaGetLastError();
40
+ if (cudaSuccess != err) {
41
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
42
+ exit(-1);
43
+ }
44
+ }
45
+
46
+ __global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
47
+ const int *__restrict__ idx, float *__restrict__ grad_points) {
48
+ // grad_out: (B, C, M)
49
+ // idx: (B, M)
50
+ // output:
51
+ // grad_points: (B, C, N)
52
+
53
+ int bs_idx = blockIdx.z;
54
+ int c_idx = blockIdx.y;
55
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
56
+ if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
57
+
58
+ grad_out += bs_idx * c * m + c_idx * m + pt_idx;
59
+ idx += bs_idx * m + pt_idx;
60
+ grad_points += bs_idx * c * n + c_idx * n;
61
+
62
+ atomicAdd(grad_points + idx[0], grad_out[0]);
63
+ }
64
+
65
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
66
+ const float *grad_out, const int *idx, float *grad_points) {
67
+ // grad_out: (B, C, npoints)
68
+ // idx: (B, npoints)
69
+ // output:
70
+ // grad_points: (B, C, N)
71
+
72
+ cudaError_t err;
73
+ dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
74
+ dim3 threads(THREADS_PER_BLOCK);
75
+
76
+ gather_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, grad_out, idx, grad_points);
77
+
78
+ err = cudaGetLastError();
79
+ if (cudaSuccess != err) {
80
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
81
+ exit(-1);
82
+ }
83
+ }
84
+
85
+
86
+ __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
87
+ const float v1 = dists[idx1], v2 = dists[idx2];
88
+ const int i1 = dists_i[idx1], i2 = dists_i[idx2];
89
+ dists[idx1] = max(v1, v2);
90
+ dists_i[idx1] = v2 > v1 ? i2 : i1;
91
+ }
92
+
93
+ template <unsigned int block_size>
94
+ __global__ void furthest_point_sampling_kernel(int b, int c, int n, int m, float w1, float w2,
95
+ const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
96
+ // dataset: (B, N, 3)
97
+ // tmp: (B, N)
98
+ // output:
99
+ // idx: (B, M)
100
+
101
+ if (m <= 0) return;
102
+ __shared__ float dists[block_size];
103
+ __shared__ int dists_i[block_size];
104
+
105
+ int batch_index = blockIdx.x;
106
+ dataset += batch_index * n * c;
107
+ temp += batch_index * n;
108
+ idxs += batch_index * m;
109
+
110
+ int tid = threadIdx.x;
111
+ const int stride = block_size;
112
+
113
+ int old = 0;
114
+ if (threadIdx.x == 0)
115
+ idxs[0] = old;
116
+
117
+ __syncthreads();
118
+ for (int j = 1; j < m; j++) {
119
+ int besti = 0;
120
+ float best = -1;
121
+ float x1 = dataset[old * c + 0];
122
+ float y1 = dataset[old * c + 1];
123
+ float z1 = dataset[old * c + 2];
124
+
125
+ for (int k = tid; k < n; k += stride) {
126
+ float x2, y2, z2;
127
+ x2 = dataset[k * c + 0];
128
+ y2 = dataset[k * c + 1];
129
+ z2 = dataset[k * c + 2];
130
+ // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
131
+ // if (mag <= 1e-3)
132
+ // continue;
133
+
134
+ float xyz_d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
135
+ float fea_d = 0;
136
+ for (int l = 3; l < c; l++) {
137
+ fea_d += (dataset[old * c + l] - dataset[k * c + l]) * (dataset[old * c + l] - dataset[k * c + l]);
138
+ }
139
+ float d = w1 * xyz_d + w2 * fea_d;
140
+ float d2 = min(d, temp[k]);
141
+ temp[k] = d2;
142
+ besti = d2 > best ? k : besti;
143
+ best = d2 > best ? d2 : best;
144
+ }
145
+ dists[tid] = best;
146
+ dists_i[tid] = besti;
147
+ __syncthreads();
148
+
149
+ if (block_size >= 1024) {
150
+ if (tid < 512) {
151
+ __update(dists, dists_i, tid, tid + 512);
152
+ }
153
+ __syncthreads();
154
+ }
155
+
156
+ if (block_size >= 512) {
157
+ if (tid < 256) {
158
+ __update(dists, dists_i, tid, tid + 256);
159
+ }
160
+ __syncthreads();
161
+ }
162
+ if (block_size >= 256) {
163
+ if (tid < 128) {
164
+ __update(dists, dists_i, tid, tid + 128);
165
+ }
166
+ __syncthreads();
167
+ }
168
+ if (block_size >= 128) {
169
+ if (tid < 64) {
170
+ __update(dists, dists_i, tid, tid + 64);
171
+ }
172
+ __syncthreads();
173
+ }
174
+ if (block_size >= 64) {
175
+ if (tid < 32) {
176
+ __update(dists, dists_i, tid, tid + 32);
177
+ }
178
+ __syncthreads();
179
+ }
180
+ if (block_size >= 32) {
181
+ if (tid < 16) {
182
+ __update(dists, dists_i, tid, tid + 16);
183
+ }
184
+ __syncthreads();
185
+ }
186
+ if (block_size >= 16) {
187
+ if (tid < 8) {
188
+ __update(dists, dists_i, tid, tid + 8);
189
+ }
190
+ __syncthreads();
191
+ }
192
+ if (block_size >= 8) {
193
+ if (tid < 4) {
194
+ __update(dists, dists_i, tid, tid + 4);
195
+ }
196
+ __syncthreads();
197
+ }
198
+ if (block_size >= 4) {
199
+ if (tid < 2) {
200
+ __update(dists, dists_i, tid, tid + 2);
201
+ }
202
+ __syncthreads();
203
+ }
204
+ if (block_size >= 2) {
205
+ if (tid < 1) {
206
+ __update(dists, dists_i, tid, tid + 1);
207
+ }
208
+ __syncthreads();
209
+ }
210
+
211
+ old = dists_i[0];
212
+ if (tid == 0)
213
+ idxs[j] = old;
214
+ }
215
+ }
216
+
217
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
218
+ const float *dataset, float *temp, int *idxs) {
219
+ // dataset: (B, N, 3)
220
+ // tmp: (B, N)
221
+ // output:
222
+ // idx: (B, M)
223
+
224
+ cudaError_t err;
225
+ unsigned int n_threads = opt_n_threads(n);
226
+
227
+ switch (n_threads) {
228
+ case 1024:
229
+ furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
230
+ case 512:
231
+ furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
232
+ case 256:
233
+ furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
234
+ case 128:
235
+ furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
236
+ case 64:
237
+ furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
238
+ case 32:
239
+ furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
240
+ case 16:
241
+ furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
242
+ case 8:
243
+ furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
244
+ case 4:
245
+ furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
246
+ case 2:
247
+ furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
248
+ case 1:
249
+ furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
250
+ default:
251
+ furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs);
252
+ }
253
+
254
+ err = cudaGetLastError();
255
+ if (cudaSuccess != err) {
256
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
257
+ exit(-1);
258
+ }
259
+ }
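
The sampling kernel above generalizes furthest point sampling: the selection distance is w1 * (squared xyz distance) + w2 * (squared distance over the remaining C-3 feature channels), and temp keeps each point's minimum distance to the already-selected set. A single-batch NumPy sketch of the same greedy loop, for illustration only (it assumes temp starts at a large value, as the Python caller normally arranges):

import numpy as np

def weighted_fps_ref(dataset, m, w1=1.0, w2=0.0):
    # dataset: (N, C) with xyz in the first 3 columns; returns m greedily sampled indices
    N = dataset.shape[0]
    temp = np.full(N, 1e10)          # minimum distance of each point to the selected set
    idxs = np.zeros(m, dtype=np.int64)
    old = 0                          # the kernel always seeds the sample with point 0
    for j in range(1, m):
        xyz_d = ((dataset[:, :3] - dataset[old, :3]) ** 2).sum(-1)
        fea_d = ((dataset[:, 3:] - dataset[old, 3:]) ** 2).sum(-1) if dataset.shape[1] > 3 else 0.0
        temp = np.minimum(temp, w1 * xyz_d + w2 * fea_d)
        old = int(temp.argmax())     # next sample: the point furthest from everything chosen so far
        idxs[j] = old
    return idxs
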
pc_util/src/sampling_gpu.h ADDED
@@ -0,0 +1,29 @@
1
+ #ifndef _SAMPLING_GPU_H
2
+ #define _SAMPLING_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include<vector>
7
+
8
+
9
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
10
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
11
+
12
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
13
+ const float *points, const int *idx, float *out);
14
+
15
+
16
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
17
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
18
+
19
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
20
+ const float *grad_out, const int *idx, float *grad_points);
21
+
22
+
23
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
24
+ at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
25
+
26
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
27
+ const float *dataset, float *temp, int *idxs);
28
+
29
+ #endif
script.py ADDED
@@ -0,0 +1,297 @@
1
+ ### This is example of the script that will be run in the test environment.
2
+ ### Some parts of the code are compulsory and you should NOT CHANGE THEM.
3
+ ### They are between '''---compulsory---''' comments.
4
+ ### You can change the rest of the code to define and test your solution.
5
+ ### However, you should not change the signature of the provided function.
6
+ ### The script would save "submission.parquet" file in the current directory.
7
+ ### The actual logic of the solution is implemented in the `handcrafted_solution.py` file.
8
+ ### The `handcrafted_solution.py` file is a placeholder for your solution.
9
+ ### You should implement the logic of your solution in that file.
10
+ ### You can use any additional files and subdirectories to organize your code.
11
+
12
+ '''---compulsory---'''
13
+ # import subprocess
14
+ # from pathlib import Path
15
+ # def install_package_from_local_file(package_name, folder='packages'):
16
+ # """
17
+ # Installs a package from a local .whl file or a directory containing .whl files using pip.
18
+
19
+ # Parameters:
20
+ # path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
21
+ # """
22
+ # try:
23
+ # pth = str(Path(folder) / package_name)
24
+ # subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
25
+ # "--no-index", # Do not use package index
26
+ # "--find-links", pth, # Look for packages in the specified directory or at the file
27
+ # package_name]) # Specify the package to install
28
+ # print(f"Package installed successfully from {pth}")
29
+ # except subprocess.CalledProcessError as e:
30
+ # print(f"Failed to install package from {pth}. Error: {e}")
31
+
32
+ # install_package_from_local_file('hoho')
33
+
34
+ import hoho; hoho.setup() # YOU MUST CALL hoho.setup() BEFORE ANYTHING ELSE
35
+ # import subprocess
36
+ # import importlib
37
+ # from pathlib import Path
38
+ # import subprocess
39
+
40
+
41
+ # ### The function below is useful for installing additional python wheels.
42
+ # def install_package_from_local_file(package_name, folder='packages'):
43
+ # """
44
+ # Installs a package from a local .whl file or a directory containing .whl files using pip.
45
+
46
+ # Parameters:
47
+ # path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
48
+ # """
49
+ # try:
50
+ # pth = str(Path(folder) / package_name)
51
+ # subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
52
+ # "--no-index", # Do not use package index
53
+ # "--find-links", pth, # Look for packages in the specified directory or at the file
54
+ # package_name]) # Specify the package to install
55
+ # print(f"Package installed successfully from {pth}")
56
+ # except subprocess.CalledProcessError as e:
57
+ # print(f"Failed to install package from {pth}. Error: {e}")
58
+
59
+
60
+ # pip download webdataset -d packages/webdataset --platform manylinux1_x86_64 --python-version 38 --only-binary=:all:
61
+ # install_package_from_local_file('webdataset')
62
+ # install_package_from_local_file('tqdm')
63
+
64
+ ### Here you can import any library or module you want.
65
+ ### The code below is used to read and parse the input dataset.
66
+ ### Please, do not modify it.
67
+
68
+ import webdataset as wds
69
+ from tqdm import tqdm
70
+ from typing import Dict
71
+ import pandas as pd
72
+ from transformers import AutoTokenizer
73
+ import os
74
+ import time
75
+ import io
76
+ from PIL import Image as PImage
77
+ import numpy as np
78
+
79
+ from hoho.read_write_colmap import read_cameras_binary, read_images_binary, read_points3D_binary
80
+ from hoho import proc, Sample
81
+
82
+ def convert_entry_to_human_readable(entry):
83
+ out = {}
84
+ already_good = ['__key__', 'wf_vertices', 'wf_edges', 'edge_semantics', 'mesh_vertices', 'mesh_faces', 'face_semantics', 'K', 'R', 't']
85
+ for k, v in entry.items():
86
+ if k in already_good:
87
+ out[k] = v
88
+ continue
89
+ if k == 'points3d':
90
+ out[k] = read_points3D_binary(fid=io.BytesIO(v))
91
+ if k == 'cameras':
92
+ out[k] = read_cameras_binary(fid=io.BytesIO(v))
93
+ if k == 'images':
94
+ out[k] = read_images_binary(fid=io.BytesIO(v))
95
+ if k in ['ade20k', 'gestalt']:
96
+ out[k] = [PImage.open(io.BytesIO(x)).convert('RGB') for x in v]
97
+ if k == 'depthcm':
98
+ out[k] = [PImage.open(io.BytesIO(x)) for x in entry['depthcm']]
99
+ return out
100
+
101
+ '''---end of compulsory---'''
102
+
103
+ ### The part below is used to define and test your solution.
104
+ import subprocess
105
+ import sys
106
+ import os
107
+
108
+ import numpy as np
109
+ os.environ['MKL_THREADING_LAYER'] = 'GNU'
110
+ os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
111
+
112
+ def uninstall_package(package_name):
113
+ """
114
+ Uninstalls a package using pip.
115
+
116
+ Parameters:
117
+ package_name (str): The name of the package to uninstall.
118
+ """
119
+ try:
120
+ subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", package_name])
121
+ print(f"Package {package_name} uninstalled successfully")
122
+ except subprocess.CalledProcessError as e:
123
+ print(f"Failed to uninstall package {package_name}. Error: {e}")
124
+
125
+ # def download_packages(packages, folder='packages/torch'):
126
+ # """
127
+ # Downloads packages as .whl files into the specified folder using pip.
128
+
129
+ # Parameters:
130
+ # packages (list): List of packages to download with versions.
131
+ # folder (str): The folder where the .whl files will be saved.
132
+ # """
133
+ # Path(folder).mkdir(parents=True, exist_ok=True)
134
+ # try:
135
+ # subprocess.check_call([sys.executable, "-m", "pip", "download",
136
+ # "--platform", "manylinux1_x86_64",
137
+ # "--python-version", "38",
138
+ # "--only-binary=:all:",
139
+ # "-d", folder] + packages)
140
+ # print(f"Packages downloaded successfully into {folder}")
141
+ # except subprocess.CalledProcessError as e:
142
+ # print(f"Failed to download packages. Error: {e}")
143
+
144
+ def download_packages(packages, folder):
145
+ # Create the directory if it doesn't exist
146
+ if not os.path.exists(folder):
147
+ os.makedirs(folder)
148
+
149
+ try:
150
+ subprocess.check_call([
151
+ 'pip', 'download',
152
+ '--dest', folder,
153
+ '-f', 'https://download.pytorch.org/whl/cu121'
154
+ ] + packages)
155
+ print(f"Packages downloaded successfully to {folder}")
156
+ except subprocess.CalledProcessError as e:
157
+ print(f"Failed to download packages. Error: {e}")
158
+
159
+ # Set CUDA environment variables
160
+ os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
161
+ os.environ['PATH'] = os.environ['CUDA_HOME'] + '/bin:' + os.environ['PATH']
162
+ os.environ['LD_LIBRARY_PATH'] = os.environ['CUDA_HOME'] + '/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')
163
+
164
+ def install_package_from_local_file(package_name, folder='packages'):
165
+ """
166
+ Installs a package from a local .whl file or a directory containing .whl files using pip.
167
+
168
+ Parameters:
169
+ package_name (str): The name of the package to install.
170
+ folder (str): The folder where the .whl files are located.
171
+ """
172
+ try:
173
+ pth = str(Path(folder) / package_name)
174
+ subprocess.check_call([sys.executable, "-m", "pip", "install",
175
+ "--no-index", # Do not use package index
176
+ "--find-links", pth, # Look for packages in the specified directory or at the file
177
+ package_name]) # Specify the package to install
178
+ print(f"Package installed successfully from {pth}")
179
+ except subprocess.CalledProcessError as e:
180
+ print(f"Failed to install package from {pth}. Error: {e}")
181
+
182
+ def install_which():
183
+ try:
184
+ # Attempt to install which if it's not available
185
+ subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'which'])
186
+ print("Which installed successfully.")
187
+ except subprocess.CalledProcessError as e:
188
+ print(f"An error occurred while installing which: {e}")
189
+ sys.exit(1)
190
+
191
+ def setup_environment():
192
+ # Uninstall torch if it is already installed
193
+ # packages_to_uninstall = ['torch', 'torchvision', 'torchaudio']
194
+ # for package in packages_to_uninstall:
195
+ # uninstall_package(package)
196
+ # Download required packages
197
+ # pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
198
+ # pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
199
+ # pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
200
+ # packages_to_download = ['torch==1.13.1', 'torchvision==0.14.1', 'torchaudio==0.13.1']
201
+ # packages_to_download = ['torch==2.1.0', 'torchvision==0.16.0', 'torchaudio==2.1.0']
202
+ # download_packages(packages_to_download, folder='packages/torch')
203
+
204
+ # Install ninja
205
+ # install_package_from_local_file('ninja', folder='packages')
206
+
207
+ # packages_to_download = ['torch==2.1.0', 'torchvision==0.16.0', 'torchaudio==2.1.0']
208
+ # download_folder = 'packages/torch'
209
+
210
+ # Download the packages
211
+ # download_packages(packages_to_download, download_folder)
212
+
213
+ # Install packages from local files
214
+ # install_package_from_local_file('torch', folder='packages')
215
+ # install_package_from_local_file('packages/torch/torchvision-0.16.0-cp38-cp38-manylinux1_x86_64.whl', folder='packages/torch')
216
+ # install_package_from_local_file('packages/torch/torchaudio-2.1.0-cp38-cp38-manylinux1_x86_64.whl', folder='packages/torch')
217
+ # install_package_from_local_file('scikit-learn', folder='packages')
218
+ # install_package_from_local_file('open3d', folder='packages')
219
+ # install_package_from_local_file('easydict', folder='packages')
220
+ # install_package_from_local_file('setuptools', folder='packages')
221
+ # install_package_from_local_file('ninja', folder='packages')
222
+ # download_packages(['scikit-learn'], folder='packages/scikit-learn')
223
+ # download_packages(['open3d'], folder='packages/open3d')
224
+ # download_packages(['easydict'], folder='packages/easydict')
225
+
226
+ # try:
227
+ # subprocess.check_call(['which', 'which'])
228
+ # except subprocess.CalledProcessError:
229
+ # install_which()
230
+
231
+ pc_util_path = os.path.join(os.getcwd(), 'pc_util')
232
+ if os.path.isdir(pc_util_path):
233
+ os.chdir(pc_util_path)
234
+ subprocess.check_call([sys.executable, "setup.py", "install"], cwd=pc_util_path)
235
+ os.chdir("..")
236
+
237
+ def setup_cuda_environment():
238
+ # cuda_home = '/usr/local/cuda'
239
+ # if not os.path.exists(cuda_home):
240
+ # raise EnvironmentError(f"CUDA_HOME directory {cuda_home} does not exist. Please install CUDA and set CUDA_HOME environment variable.")
241
+ # os.environ['CUDA_HOME'] = cuda_home
242
+ # os.environ['PATH'] = f"{cuda_home}/bin:{os.environ['PATH']}"
243
+ # os.environ['LD_LIBRARY_PATH'] = f"{cuda_home}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"
244
+
245
+ os.environ['PATH'] = '/usr/local/cuda/bin'
246
+ os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64'
247
+ os.environ['LIBRARY_PATH'] = '/usr/local/cuda/lib64'
248
+
249
+ # usr_local_contents = os.listdir('/usr/local')
250
+ # # print("Items under /usr/local:")
251
+ # for item in usr_local_contents:
252
+ # print(item)
253
+
254
+ from pathlib import Path
255
+ def save_submission(submission, path):
256
+ """
257
+ Saves the submission to a specified path.
258
+
259
+ Parameters:
260
+ submission (List[Dict[]]): The submission to save.
261
+ path (str): The path to save the submission to.
262
+ """
263
+ sub = pd.DataFrame(submission, columns=["__key__", "wf_vertices", "wf_edges"])
264
+ sub.to_parquet(path)
265
+ print(f"Submission saved to {path}")
266
+
267
+ if __name__ == "__main__":
268
+ # setup_cuda_environment()
269
+ setup_environment()
270
+
271
+ from handcrafted_solution import predict
272
+ print ("------------ Loading dataset------------ ")
273
+ params = hoho.get_params()
274
+ dataset = hoho.get_dataset(decode=None, split='all', dataset_type='webdataset')
275
+
276
+ print('------------ Now you can do your solution ---------------')
277
+ solution = []
278
+ from concurrent.futures import ProcessPoolExecutor
279
+ with ProcessPoolExecutor(max_workers=8) as pool:
280
+ results = []
281
+ for i, sample in enumerate(tqdm(dataset)):
282
+ results.append(pool.submit(predict, sample, visualize=False))
283
+
284
+ for i, result in enumerate(tqdm(results)):
285
+ key, pred_vertices, pred_edges = result.result()
286
+ solution.append({
287
+ '__key__': key,
288
+ 'wf_vertices': pred_vertices.tolist(),
289
+ 'wf_edges': pred_edges
290
+ })
291
+ if i % 100 == 0:
292
+ # incrementally save the results in case we run out of time
293
+ print(f"Processed {i} samples")
294
+ # save_submission(solution, Path(params['output_path']) / "submission.parquet")
295
+ print('------------ Saving results ---------------')
296
+ save_submission(solution, Path(params['output_path']) / "submission.parquet")
297
+ print("------------ Done ------------ ")