Sohaib36 committed on
Commit 6705a8b • 1 Parent(s): e5f4906

add: adding monoscene

Files changed (46)
  1. monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py +0 -97
  2. monoscene/.ipynb_checkpoints/config-checkpoint.py +0 -34
  3. monoscene/.ipynb_checkpoints/modules-checkpoint.py +0 -194
  4. monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py +0 -22
  5. monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py +0 -88
  6. monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py +0 -90
  7. monoscene/__init__.py +0 -0
  8. monoscene/app.py +0 -138
  9. monoscene/config.py +0 -26
  10. monoscene/config/monoscene.yaml +35 -0
  11. monoscene/data/NYU/collate.py +50 -0
  12. monoscene/data/NYU/nyu_dataset.py +133 -0
  13. monoscene/data/NYU/nyu_dm.py +78 -0
  14. monoscene/data/NYU/params.py +54 -0
  15. monoscene/data/NYU/preprocess.py +182 -0
  16. monoscene/data/kitti_360/collate.py +47 -0
  17. monoscene/data/kitti_360/kitti_360_dataset.py +125 -0
  18. monoscene/data/kitti_360/kitti_360_dm.py +32 -0
  19. monoscene/data/semantic_kitti/collate.py +61 -0
  20. monoscene/data/semantic_kitti/io_data.py +239 -0
  21. monoscene/data/semantic_kitti/kitti_dataset.py +200 -0
  22. monoscene/data/semantic_kitti/kitti_dm.py +91 -0
  23. monoscene/data/semantic_kitti/params.py +48 -0
  24. monoscene/data/semantic_kitti/preprocess.py +102 -0
  25. monoscene/data/semantic_kitti/semantic-kitti.yaml +213 -0
  26. monoscene/data/utils/fusion.py +507 -0
  27. monoscene/data/utils/helpers.py +185 -0
  28. monoscene/data/utils/torch_util.py +15 -0
  29. monoscene/loss/CRP_loss.py +24 -0
  30. monoscene/loss/sscMetrics.py +204 -0
  31. monoscene/loss/ssc_loss.py +99 -0
  32. monoscene/{CRP3D.py → models/CRP3D.py} +1 -1
  33. monoscene/{DDR.py → models/DDR.py} +0 -0
  34. monoscene/{flosp.py → models/flosp.py} +0 -0
  35. monoscene/{modules.py → models/modules.py} +1 -1
  36. monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py → models/monoscene.py} +174 -7
  37. monoscene/{unet2d.py → models/unet2d.py} +0 -0
  38. monoscene/{unet3d_kitti.py → models/unet3d_kitti.py} +3 -3
  39. monoscene/{unet3d_nyu.py → models/unet3d_nyu.py} +2 -2
  40. monoscene/monoscene.py +0 -125
  41. monoscene/monoscene_model.py +0 -21
  42. monoscene/scripts/eval_monoscene.py +71 -0
  43. monoscene/scripts/generate_output.py +127 -0
  44. monoscene/scripts/train_monoscene.py +173 -0
  45. monoscene/scripts/visualization/NYU_vis_pred.py +156 -0
  46. monoscene/scripts/visualization/kitti_vis_pred.py +201 -0
monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py DELETED
@@ -1,97 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from monoscene.modules import (
4
- Process,
5
- ASPP,
6
- )
7
-
8
-
9
- class CPMegaVoxels(nn.Module):
10
- def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
11
- super().__init__()
12
- self.size = size
13
- self.n_relations = n_relations
14
- print("n_relations", self.n_relations)
15
- self.flatten_size = size[0] * size[1] * size[2]
16
- self.feature = feature
17
- self.context_feature = feature * 2
18
- self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
19
- padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
20
-
21
- self.mega_context = nn.Sequential(
22
- nn.Conv3d(
23
- feature, self.context_feature, stride=2, padding=padding, kernel_size=3
24
- ),
25
- )
26
- self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
27
-
28
- self.context_prior_logits = nn.ModuleList(
29
- [
30
- nn.Sequential(
31
- nn.Conv3d(
32
- self.feature,
33
- self.flatten_context_size,
34
- padding=0,
35
- kernel_size=1,
36
- ),
37
- )
38
- for i in range(n_relations)
39
- ]
40
- )
41
- self.aspp = ASPP(feature, [1, 2, 3])
42
-
43
- self.resize = nn.Sequential(
44
- nn.Conv3d(
45
- self.context_feature * self.n_relations + feature,
46
- feature,
47
- kernel_size=1,
48
- padding=0,
49
- bias=False,
50
- ),
51
- Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
52
- )
53
-
54
- def forward(self, input):
55
- ret = {}
56
- bs = input.shape[0]
57
-
58
- x_agg = self.aspp(input)
59
-
60
- # get the mega context
61
- x_mega_context_raw = self.mega_context(x_agg)
62
- x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
63
- x_mega_context = x_mega_context.permute(0, 2, 1)
64
-
65
- # get context prior map
66
- x_context_prior_logits = []
67
- x_context_rels = []
68
- for rel in range(self.n_relations):
69
-
70
- # Compute the relation matrices
71
- x_context_prior_logit = self.context_prior_logits[rel](x_agg)
72
- x_context_prior_logit = x_context_prior_logit.reshape(
73
- bs, self.flatten_context_size, self.flatten_size
74
- )
75
- x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
76
-
77
- x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
78
- x_context_prior = torch.sigmoid(x_context_prior_logit)
79
-
80
- # Multiply the relation matrices with the mega context to gather context features
81
- x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
82
- x_context_rels.append(x_context_rel)
83
-
84
- x_context = torch.cat(x_context_rels, dim=2)
85
- x_context = x_context.permute(0, 2, 1)
86
- x_context = x_context.reshape(
87
- bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
88
- )
89
-
90
- x = torch.cat([input, x_context], dim=1)
91
- x = self.resize(x)
92
-
93
- x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
94
- ret["P_logits"] = x_context_prior_logits
95
- ret["x"] = x
96
-
97
- return ret
monoscene/.ipynb_checkpoints/config-checkpoint.py DELETED
@@ -1,34 +0,0 @@
1
- from transformers import PretrainedConfig
2
- from typing import List
3
-
4
-
5
- class MonoSceneConfig(PretrainedConfig):
6
-
7
- def __init__(
8
- self,
9
- block_type="bottleneck",
10
- layers: List[int] = [3, 4, 6, 3],
11
- num_classes: int = 1000,
12
- input_channels: int = 3,
13
- cardinality: int = 1,
14
- base_width: int = 64,
15
- stem_width: int = 64,
16
- stem_type: str = "",
17
- avg_down: bool = False,
18
- **kwargs,
19
- ):
20
- self.block_type = block_type
21
- self.layers = layers
22
- self.num_classes = num_classes
23
- self.input_channels = input_channels
24
- self.cardinality = cardinality
25
- self.base_width = base_width
26
- self.stem_width = stem_width
27
- self.stem_type = stem_type
28
- self.avg_down = avg_down
29
- super().__init__(**kwargs)
30
-
31
-
32
-
33
-
34
-
monoscene/.ipynb_checkpoints/modules-checkpoint.py DELETED
@@ -1,194 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from monoscene.DDR import Bottleneck3D
4
-
5
-
6
- class ASPP(nn.Module):
7
- """
8
- ASPP 3D
9
- Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
10
- """
11
-
12
- def __init__(self, planes, dilations_conv_list):
13
- super().__init__()
14
-
15
- # ASPP Block
16
- self.conv_list = dilations_conv_list
17
- self.conv1 = nn.ModuleList(
18
- [
19
- nn.Conv3d(
20
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
21
- )
22
- for dil in dilations_conv_list
23
- ]
24
- )
25
- self.bn1 = nn.ModuleList(
26
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
27
- )
28
- self.conv2 = nn.ModuleList(
29
- [
30
- nn.Conv3d(
31
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
32
- )
33
- for dil in dilations_conv_list
34
- ]
35
- )
36
- self.bn2 = nn.ModuleList(
37
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
38
- )
39
- self.relu = nn.ReLU()
40
-
41
- def forward(self, x_in):
42
-
43
- y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
44
- for i in range(1, len(self.conv_list)):
45
- y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
46
- x_in = self.relu(y + x_in) # modified
47
-
48
- return x_in
49
-
50
-
51
- class SegmentationHead(nn.Module):
52
- """
53
- 3D Segmentation heads to retrieve semantic segmentation at each scale.
54
- Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
55
- Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
56
- """
57
-
58
- def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
59
- super().__init__()
60
-
61
- # First convolution
62
- self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
63
-
64
- # ASPP Block
65
- self.conv_list = dilations_conv_list
66
- self.conv1 = nn.ModuleList(
67
- [
68
- nn.Conv3d(
69
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
70
- )
71
- for dil in dilations_conv_list
72
- ]
73
- )
74
- self.bn1 = nn.ModuleList(
75
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
76
- )
77
- self.conv2 = nn.ModuleList(
78
- [
79
- nn.Conv3d(
80
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
81
- )
82
- for dil in dilations_conv_list
83
- ]
84
- )
85
- self.bn2 = nn.ModuleList(
86
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
87
- )
88
- self.relu = nn.ReLU()
89
-
90
- self.conv_classes = nn.Conv3d(
91
- planes, nbr_classes, kernel_size=3, padding=1, stride=1
92
- )
93
-
94
- def forward(self, x_in):
95
-
96
- # Convolution to go from inplanes to planes features...
97
- x_in = self.relu(self.conv0(x_in))
98
-
99
- y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
100
- for i in range(1, len(self.conv_list)):
101
- y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
102
- x_in = self.relu(y + x_in) # modified
103
-
104
- x_in = self.conv_classes(x_in)
105
-
106
- return x_in
107
-
108
-
109
- class ProcessKitti(nn.Module):
110
- def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
111
- super(Process, self).__init__()
112
- self.main = nn.Sequential(
113
- *[
114
- Bottleneck3D(
115
- feature,
116
- feature // 4,
117
- bn_momentum=bn_momentum,
118
- norm_layer=norm_layer,
119
- dilation=[i, i, i],
120
- )
121
- for i in dilations
122
- ]
123
- )
124
-
125
- def forward(self, x):
126
- return self.main(x)
127
-
128
-
129
- class Process(nn.Module):
130
- def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
131
- super(Process, self).__init__()
132
- self.main = nn.Sequential(
133
- *[
134
- Bottleneck3D(
135
- feature,
136
- feature // 4,
137
- bn_momentum=bn_momentum,
138
- norm_layer=norm_layer,
139
- dilation=[i, i, i],
140
- )
141
- for i in dilations
142
- ]
143
- )
144
-
145
- def forward(self, x):
146
- return self.main(x)
147
-
148
-
149
- class Upsample(nn.Module):
150
- def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
151
- super(Upsample, self).__init__()
152
- self.main = nn.Sequential(
153
- nn.ConvTranspose3d(
154
- in_channels,
155
- out_channels,
156
- kernel_size=3,
157
- stride=2,
158
- padding=1,
159
- dilation=1,
160
- output_padding=1,
161
- ),
162
- norm_layer(out_channels, momentum=bn_momentum),
163
- nn.ReLU(),
164
- )
165
-
166
- def forward(self, x):
167
- return self.main(x)
168
-
169
-
170
- class Downsample(nn.Module):
171
- def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
172
- super(Downsample, self).__init__()
173
- self.main = Bottleneck3D(
174
- feature,
175
- feature // 4,
176
- bn_momentum=bn_momentum,
177
- expansion=expansion,
178
- stride=2,
179
- downsample=nn.Sequential(
180
- nn.AvgPool3d(kernel_size=2, stride=2),
181
- nn.Conv3d(
182
- feature,
183
- int(feature * expansion / 4),
184
- kernel_size=1,
185
- stride=1,
186
- bias=False,
187
- ),
188
- norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
189
- ),
190
- norm_layer=norm_layer,
191
- )
192
-
193
- def forward(self, x):
194
- return self.main(x)
monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py DELETED
@@ -1,22 +0,0 @@
1
- from transformers import PreTrainedModel
2
- from .config import MonoSceneConfig
3
- from monoscene.monoscene import MonoScene
4
-
5
-
6
-
7
- class MonoSceneModel(PreTrainedModel):
8
- config_class = ResnetConfig
9
-
10
- def __init__(self, config):
11
- super().__init__(config)
12
- self.model = MonoScene(
13
- dataset=config.dataset,
14
- n_classes=config.n_classes,
15
- feature=config.feature,
16
- project_scale=config.project_scale,
17
- full_scene_size=config.full_scene_size
18
- )
19
-
20
-
21
- def forward(self, tensor):
22
- return self.model.forward(tensor)
monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py DELETED
@@ -1,88 +0,0 @@
1
- # encoding: utf-8
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- from monoscene.modules import SegmentationHead
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import Process, Upsample, Downsample
8
-
9
-
10
- class UNet3D(nn.Module):
11
- def __init__(
12
- self,
13
- class_num,
14
- norm_layer,
15
- full_scene_size,
16
- feature,
17
- project_scale,
18
- context_prior=None,
19
- bn_momentum=0.1,
20
- ):
21
- super(UNet3D, self).__init__()
22
- self.business_layer = []
23
- self.project_scale = project_scale
24
- self.full_scene_size = full_scene_size
25
- self.feature = feature
26
-
27
- size_l1 = (
28
- int(self.full_scene_size[0] / project_scale),
29
- int(self.full_scene_size[1] / project_scale),
30
- int(self.full_scene_size[2] / project_scale),
31
- )
32
- size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
33
- size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
34
-
35
- dilations = [1, 2, 3]
36
- self.process_l1 = nn.Sequential(
37
- Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
38
- Downsample(self.feature, norm_layer, bn_momentum),
39
- )
40
- self.process_l2 = nn.Sequential(
41
- Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
42
- Downsample(self.feature * 2, norm_layer, bn_momentum),
43
- )
44
-
45
- self.up_13_l2 = Upsample(
46
- self.feature * 4, self.feature * 2, norm_layer, bn_momentum
47
- )
48
- self.up_12_l1 = Upsample(
49
- self.feature * 2, self.feature, norm_layer, bn_momentum
50
- )
51
- self.up_l1_lfull = Upsample(
52
- self.feature, self.feature // 2, norm_layer, bn_momentum
53
- )
54
-
55
- self.ssc_head = SegmentationHead(
56
- self.feature // 2, self.feature // 2, class_num, dilations
57
- )
58
-
59
- self.context_prior = context_prior
60
- if context_prior:
61
- self.CP_mega_voxels = CPMegaVoxels(
62
- self.feature * 4, size_l3, bn_momentum=bn_momentum
63
- )
64
-
65
- def forward(self, input_dict):
66
- res = {}
67
-
68
- x3d_l1 = input_dict["x3d"]
69
-
70
- x3d_l2 = self.process_l1(x3d_l1)
71
-
72
- x3d_l3 = self.process_l2(x3d_l2)
73
-
74
- if self.context_prior:
75
- ret = self.CP_mega_voxels(x3d_l3)
76
- x3d_l3 = ret["x"]
77
- for k in ret.keys():
78
- res[k] = ret[k]
79
-
80
- x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
81
- x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
82
- x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
83
-
84
- ssc_logit_full = self.ssc_head(x3d_up_lfull)
85
-
86
- res["ssc_logit"] = ssc_logit_full
87
-
88
- return res
monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py DELETED
@@ -1,90 +0,0 @@
1
- # encoding: utf-8
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- import numpy as np
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import (
8
- Process,
9
- Upsample,
10
- Downsample,
11
- SegmentationHead,
12
- ASPP,
13
- )
14
-
15
-
16
- class UNet3D(nn.Module):
17
- def __init__(
18
- self,
19
- class_num,
20
- norm_layer,
21
- feature,
22
- full_scene_size,
23
- n_relations=4,
24
- project_res=[],
25
- context_prior=True,
26
- bn_momentum=0.1,
27
- ):
28
- super(UNet3D, self).__init__()
29
- self.business_layer = []
30
- self.project_res = project_res
31
-
32
- self.feature_1_4 = feature
33
- self.feature_1_8 = feature * 2
34
- self.feature_1_16 = feature * 4
35
-
36
- self.feature_1_16_dec = self.feature_1_16
37
- self.feature_1_8_dec = self.feature_1_8
38
- self.feature_1_4_dec = self.feature_1_4
39
-
40
- self.process_1_4 = nn.Sequential(
41
- Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
42
- Downsample(self.feature_1_4, norm_layer, bn_momentum),
43
- )
44
- self.process_1_8 = nn.Sequential(
45
- Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
46
- Downsample(self.feature_1_8, norm_layer, bn_momentum),
47
- )
48
- self.up_1_16_1_8 = Upsample(
49
- self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
50
- )
51
- self.up_1_8_1_4 = Upsample(
52
- self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
53
- )
54
- self.ssc_head_1_4 = SegmentationHead(
55
- self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
56
- )
57
-
58
- self.context_prior = context_prior
59
- size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
60
-
61
- if context_prior:
62
- self.CP_mega_voxels = CPMegaVoxels(
63
- self.feature_1_16,
64
- size_1_16,
65
- n_relations=n_relations,
66
- bn_momentum=bn_momentum,
67
- )
68
-
69
- #
70
- def forward(self, input_dict):
71
- res = {}
72
-
73
- x3d_1_4 = input_dict["x3d"]
74
- x3d_1_8 = self.process_1_4(x3d_1_4)
75
- x3d_1_16 = self.process_1_8(x3d_1_8)
76
-
77
- if self.context_prior:
78
- ret = self.CP_mega_voxels(x3d_1_16)
79
- x3d_1_16 = ret["x"]
80
- for k in ret.keys():
81
- res[k] = ret[k]
82
-
83
- x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
84
- x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
85
-
86
- ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
87
-
88
- res["ssc_logit"] = ssc_logit_1_4
89
-
90
- return res
monoscene/__init__.py DELETED
File without changes
monoscene/app.py DELETED
@@ -1,138 +0,0 @@
1
- from pytorch_lightning import Trainer
2
- from monoscene.models.monoscene import MonoScene
3
- from monoscene.data.NYU.nyu_dm import NYUDataModule
4
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
- from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
6
- # import hydra
7
- from omegaconf import DictConfig
8
- import torch
9
- import numpy as np
10
- import os
11
- from hydra.utils import get_original_cwd
12
- import gradio as gr
13
- import numpy as np
14
- import plotly.express as px
15
- import pandas as pd
16
-
17
-
18
- # @hydra.main(config_name="../config/monoscene.yaml")
19
- def plot(input_img):
20
- torch.set_grad_enabled(False)
21
-
22
- # Setup dataloader
23
- # if config.dataset == "kitti" or config.dataset == "kitti_360":
24
- feature = 64
25
- project_scale = 2
26
- full_scene_size = (256, 256, 32)
27
-
28
- # if config.dataset == "kitti":
29
- # data_module = KittiDataModule(
30
- # root=config.kitti_root,
31
- # preprocess_root=config.kitti_preprocess_root,
32
- # frustum_size=config.frustum_size,
33
- # batch_size=int(config.batch_size / config.n_gpus),
34
- # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
35
- # )
36
- # data_module.setup()
37
- # data_loader = data_module.val_dataloader()
38
- # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
39
- # else:
40
- # data_module = Kitti360DataModule(
41
- # root=config.kitti_360_root,
42
- # sequences=[config.kitti_360_sequence],
43
- # n_scans=2000,
44
- # batch_size=1,
45
- # num_workers=3,
46
- # )
47
- # data_module.setup()
48
- # data_loader = data_module.dataloader()
49
-
50
- # elif config.dataset == "NYU":
51
- # project_scale = 1
52
- # feature = 200
53
- # full_scene_size = (60, 36, 60)
54
- # data_module = NYUDataModule(
55
- # root=config.NYU_root,
56
- # preprocess_root=config.NYU_preprocess_root,
57
- # n_relations=config.n_relations,
58
- # frustum_size=config.frustum_size,
59
- # batch_size=int(config.batch_size / config.n_gpus),
60
- # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
61
- # )
62
- # data_module.setup()
63
- # data_loader = data_module.val_dataloader()
64
- # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
65
- # else:
66
- # print("dataset not support")
67
-
68
- # Load pretrained models
69
- # if config.dataset == "NYU":
70
- # model_path = os.path.join(
71
- # get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
72
- # )
73
- # else:
74
- # model_path = os.path.join(
75
- # get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
76
- # )
77
- model_path = "trained_models/monoscene_kitti.ckpt"
78
-
79
- model = MonoScene.load_from_checkpoint(
80
- model_path,
81
- feature=feature,
82
- project_scale=project_scale,
83
- fp_loss=False,
84
- full_scene_size=full_scene_size,
85
- )
86
- model.cuda()
87
- model.eval()
88
-
89
- print(input_img.shape)
90
-
91
- x = np.arange(12).reshape(4, 3) / 12
92
- data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
93
- fig = px.scatter_3d(data, x="x", y="y", z="z")
94
- return fig
95
-
96
- demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
97
- demo.launch()
98
-
99
-
100
-
101
- # Save prediction and additional data
102
- # to draw the viewing frustum and remove scene outside the room for NYUv2
103
- # output_path = os.path.join(config.output_path, config.dataset)
104
- # with torch.no_grad():
105
- # for batch in tqdm(data_loader):
106
- # batch["img"] = batch["img"].cuda()
107
- # pred = model(batch)
108
- # y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
109
- # y_pred = np.argmax(y_pred, axis=1)
110
- # for i in range(config.batch_size):
111
- # out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
112
- # if "target" in batch:
113
- # out_dict["target"] = (
114
- # batch["target"][i].detach().cpu().numpy().astype(np.uint16)
115
- # )
116
-
117
- # if config.dataset == "NYU":
118
- # write_path = output_path
119
- # filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
120
- # out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
121
- # out_dict["vox_origin"] = (
122
- # batch["vox_origin"][i].detach().cpu().numpy()
123
- # )
124
- # else:
125
- # write_path = os.path.join(output_path, batch["sequence"][i])
126
- # filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
127
- # out_dict["fov_mask_1"] = (
128
- # batch["fov_mask_1"][i].detach().cpu().numpy()
129
- # )
130
- # out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
131
- # out_dict["T_velo_2_cam"] = (
132
- # batch["T_velo_2_cam"][i].detach().cpu().numpy()
133
- # )
134
-
135
- # os.makedirs(write_path, exist_ok=True)
136
- # with open(filepath, "wb") as handle:
137
- # pickle.dump(out_dict, handle)
138
- # print("wrote to", filepath)
monoscene/config.py DELETED
@@ -1,26 +0,0 @@
1
- from transformers import PretrainedConfig
2
- from typing import List
3
-
4
-
5
- class MonoSceneConfig(PretrainedConfig):
6
-
7
- def __init__(
8
- self,
9
- dataset="kitti",
10
- n_classes=20,
11
- feature=64,
12
- project_scale=2,
13
- full_scene_size=(256, 256, 32),
14
- **kwargs,
15
- ):
16
- self.dataset = dataset
17
- self.n_classes = n_classes
18
- self.feature = feature
19
- self.project_scale = project_scale
20
- self.full_scene_size = full_scene_size
21
- super().__init__(**kwargs)
22
-
23
-
24
-
25
-
26
-
monoscene/config/monoscene.yaml ADDED
@@ -0,0 +1,35 @@
1
+ #dataset: "NYU" # "kitti", "kitti_360"
2
+ dataset: "kitti_360"
3
+
4
+ n_relations: 4
5
+
6
+ enable_log: false
7
+ kitti_root: '/path/to/semantic_kitti'
8
+ kitti_preprocess_root: '/path/to/kitti/preprocess/folder'
9
+ kitti_logdir: '/path/to/semantic_kitti/logdir'
10
+
11
+ NYU_root: '/path/to/NYU/depthbin'
12
+ NYU_preprocess_root: '/path/to/NYU/preprocess/folder'
13
+ logdir: '/path/to/NYU/logdir'
14
+
15
+
16
+ fp_loss: true
17
+ frustum_size: 8
18
+ batch_size: 1
19
+ n_gpus: 1
20
+ num_workers_per_gpu: 3
21
+ exp_prefix: "exp"
22
+ run: 1
23
+ lr: 1e-4
24
+ weight_decay: 1e-4
25
+
26
+ context_prior: true
27
+
28
+ relation_loss: true
29
+ CE_ssc_loss: true
30
+ sem_scal_loss: true
31
+ geo_scal_loss: true
32
+
33
+ project_1_2: true
34
+ project_1_4: true
35
+ project_1_8: true
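For context, the Hydra-based scripts added in this commit (e.g. monoscene/data/NYU/preprocess.py below) read this file through @hydra.main. A minimal sketch of how the config values are consumed, mirroring the decorator used in those scripts; the relative path assumes the sketch lives alongside preprocess.py, and the printed fields are only examples:

import hydra
from omegaconf import DictConfig


# Sketch only: resolve monoscene.yaml the same way the repo's scripts do
# and read a few of the fields defined above.
@hydra.main(config_name="../../config/monoscene.yaml")
def show_config(config: DictConfig):
    print(config.dataset)                # "kitti_360" in this commit
    print(config.batch_size, config.lr)  # 1 and 1e-4
    print(config.kitti_root)             # '/path/to/semantic_kitti'


if __name__ == "__main__":
    show_config()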
monoscene/data/NYU/collate.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ targets = []
8
+ names = []
9
+ cam_poses = []
10
+
11
+ vox_origins = []
12
+ cam_ks = []
13
+
14
+ CP_mega_matrices = []
15
+
16
+ data["projected_pix_1"] = []
17
+ data["fov_mask_1"] = []
18
+ data["frustums_masks"] = []
19
+ data["frustums_class_dists"] = []
20
+
21
+ for idx, input_dict in enumerate(batch):
22
+ CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
23
+ for key in data:
24
+ if key in input_dict:
25
+ data[key].append(torch.from_numpy(input_dict[key]))
26
+
27
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
28
+ cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float())
29
+ vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double())
30
+
31
+ names.append(input_dict["name"])
32
+
33
+ img = input_dict["img"]
34
+ imgs.append(img)
35
+
36
+ target = torch.from_numpy(input_dict["target"])
37
+ targets.append(target)
38
+
39
+ ret_data = {
40
+ "CP_mega_matrices": CP_mega_matrices,
41
+ "cam_pose": torch.stack(cam_poses),
42
+ "cam_k": torch.stack(cam_ks),
43
+ "vox_origin": torch.stack(vox_origins),
44
+ "name": names,
45
+ "img": torch.stack(imgs),
46
+ "target": torch.stack(targets),
47
+ }
48
+ for key in data:
49
+ ret_data[key] = data[key]
50
+ return ret_data
monoscene/data/NYU/nyu_dataset.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+ from monoscene.data.utils.helpers import (
9
+ vox2pix,
10
+ compute_local_frustums,
11
+ compute_CP_mega_matrix,
12
+ )
13
+ import pickle
14
+ import torch.nn.functional as F
15
+
16
+
17
+ class NYUDataset(Dataset):
18
+ def __init__(
19
+ self,
20
+ split,
21
+ root,
22
+ preprocess_root,
23
+ n_relations=4,
24
+ color_jitter=None,
25
+ frustum_size=4,
26
+ fliplr=0.0,
27
+ ):
28
+ self.n_relations = n_relations
29
+ self.frustum_size = frustum_size
30
+ self.n_classes = 12
31
+ self.root = os.path.join(root, "NYU" + split)
32
+ self.preprocess_root = preprocess_root
33
+ self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split)
34
+ self.fliplr = fliplr
35
+
36
+ self.voxel_size = 0.08 # 0.08m
37
+ self.scene_size = (4.8, 4.8, 2.88) # (4.8m, 4.8m, 2.88m)
38
+ self.img_W = 640
39
+ self.img_H = 480
40
+ self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])
41
+
42
+ self.color_jitter = (
43
+ transforms.ColorJitter(*color_jitter) if color_jitter else None
44
+ )
45
+
46
+ self.scan_names = glob.glob(os.path.join(self.root, "*.bin"))
47
+
48
+ self.normalize_rgb = transforms.Compose(
49
+ [
50
+ transforms.ToTensor(),
51
+ transforms.Normalize(
52
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
53
+ ),
54
+ ]
55
+ )
56
+
57
+ def __getitem__(self, index):
58
+ file_path = self.scan_names[index]
59
+ filename = os.path.basename(file_path)
60
+ name = filename[:-4]
61
+
62
+ os.makedirs(self.base_dir, exist_ok=True)
63
+ filepath = os.path.join(self.base_dir, name + ".pkl")
64
+
65
+ with open(filepath, "rb") as handle:
66
+ data = pickle.load(handle)
67
+
68
+ cam_pose = data["cam_pose"]
69
+ T_world_2_cam = np.linalg.inv(cam_pose)
70
+ vox_origin = data["voxel_origin"]
71
+ data["cam_k"] = self.cam_k
72
+ target = data[
73
+ "target_1_4"
74
+ ] # Following SSC literature, the output resolution on NYUv2 is set to 1:4
75
+ data["target"] = target
76
+ target_1_4 = data["target_1_16"]
77
+
78
+ CP_mega_matrix = compute_CP_mega_matrix(
79
+ target_1_4, is_binary=self.n_relations == 2
80
+ )
81
+ data["CP_mega_matrix"] = CP_mega_matrix
82
+
83
+ # compute the 3D-2D mapping
84
+ projected_pix, fov_mask, pix_z = vox2pix(
85
+ T_world_2_cam,
86
+ self.cam_k,
87
+ vox_origin,
88
+ self.voxel_size,
89
+ self.img_W,
90
+ self.img_H,
91
+ self.scene_size,
92
+ )
93
+
94
+ data["projected_pix_1"] = projected_pix
95
+ data["fov_mask_1"] = fov_mask
96
+
97
+ # compute the masks, each indicates voxels inside a frustum
98
+ frustums_masks, frustums_class_dists = compute_local_frustums(
99
+ projected_pix,
100
+ pix_z,
101
+ target,
102
+ self.img_W,
103
+ self.img_H,
104
+ dataset="NYU",
105
+ n_classes=12,
106
+ size=self.frustum_size,
107
+ )
108
+ data["frustums_masks"] = frustums_masks
109
+ data["frustums_class_dists"] = frustums_class_dists
110
+
111
+ rgb_path = os.path.join(self.root, name + "_color.jpg")
112
+ img = Image.open(rgb_path).convert("RGB")
113
+
114
+ # Image augmentation
115
+ if self.color_jitter is not None:
116
+ img = self.color_jitter(img)
117
+
118
+ # PIL to numpy
119
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
120
+
121
+ # randomly fliplr the image
122
+ if np.random.rand() < self.fliplr:
123
+ img = np.ascontiguousarray(np.fliplr(img))
124
+ data["projected_pix_1"][:, 0] = (
125
+ img.shape[1] - 1 - data["projected_pix_1"][:, 0]
126
+ )
127
+
128
+ data["img"] = self.normalize_rgb(img) # (3, img_H, img_W)
129
+
130
+ return data
131
+
132
+ def __len__(self):
133
+ return len(self.scan_names)
monoscene/data/NYU/nyu_dm.py ADDED
@@ -0,0 +1,78 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.NYU.nyu_dataset import NYUDataset
3
+ from monoscene.data.NYU.collate import collate_fn
4
+ import pytorch_lightning as pl
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class NYUDataModule(pl.LightningDataModule):
9
+ def __init__(
10
+ self,
11
+ root,
12
+ preprocess_root,
13
+ n_relations=4,
14
+ batch_size=4,
15
+ frustum_size=4,
16
+ num_workers=6,
17
+ ):
18
+ super().__init__()
19
+ self.n_relations = n_relations
20
+ self.preprocess_root = preprocess_root
21
+ self.root = root
22
+ self.batch_size = batch_size
23
+ self.num_workers = num_workers
24
+ self.frustum_size = frustum_size
25
+
26
+ def setup(self, stage=None):
27
+ self.train_ds = NYUDataset(
28
+ split="train",
29
+ preprocess_root=self.preprocess_root,
30
+ n_relations=self.n_relations,
31
+ root=self.root,
32
+ fliplr=0.5,
33
+ frustum_size=self.frustum_size,
34
+ color_jitter=(0.4, 0.4, 0.4),
35
+ )
36
+ self.test_ds = NYUDataset(
37
+ split="test",
38
+ preprocess_root=self.preprocess_root,
39
+ n_relations=self.n_relations,
40
+ root=self.root,
41
+ frustum_size=self.frustum_size,
42
+ fliplr=0.0,
43
+ color_jitter=None,
44
+ )
45
+
46
+ def train_dataloader(self):
47
+ return DataLoader(
48
+ self.train_ds,
49
+ batch_size=self.batch_size,
50
+ drop_last=True,
51
+ num_workers=self.num_workers,
52
+ shuffle=True,
53
+ pin_memory=True,
54
+ worker_init_fn=worker_init_fn,
55
+ collate_fn=collate_fn,
56
+ )
57
+
58
+ def val_dataloader(self):
59
+ return DataLoader(
60
+ self.test_ds,
61
+ batch_size=self.batch_size,
62
+ num_workers=self.num_workers,
63
+ drop_last=False,
64
+ shuffle=False,
65
+ pin_memory=True,
66
+ collate_fn=collate_fn,
67
+ )
68
+
69
+ def test_dataloader(self):
70
+ return DataLoader(
71
+ self.test_ds,
72
+ batch_size=self.batch_size,
73
+ num_workers=self.num_workers,
74
+ drop_last=False,
75
+ shuffle=False,
76
+ pin_memory=True,
77
+ collate_fn=collate_fn,
78
+ )
monoscene/data/NYU/params.py ADDED
@@ -0,0 +1,54 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ NYU_class_names = [
5
+ "empty",
6
+ "ceiling",
7
+ "floor",
8
+ "wall",
9
+ "window",
10
+ "chair",
11
+ "bed",
12
+ "sofa",
13
+ "table",
14
+ "tvs",
15
+ "furn",
16
+ "objs",
17
+ ]
18
+ class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
19
+
20
+ class_freq_1_4 = np.array(
21
+ [
22
+ 43744234,
23
+ 80205,
24
+ 1070052,
25
+ 905632,
26
+ 116952,
27
+ 180994,
28
+ 436852,
29
+ 279714,
30
+ 254611,
31
+ 28247,
32
+ 1805949,
33
+ 850724,
34
+ ]
35
+ )
36
+ class_freq_1_8 = np.array(
37
+ [
38
+ 5176253,
39
+ 17277,
40
+ 220105,
41
+ 183849,
42
+ 21827,
43
+ 33520,
44
+ 67022,
45
+ 44248,
46
+ 46615,
47
+ 4419,
48
+ 290218,
49
+ 142573,
50
+ ]
51
+ )
52
+ class_freq_1_16 = np.array(
53
+ [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864]
54
+ )
monoscene/data/NYU/preprocess.py ADDED
@@ -0,0 +1,182 @@
1
+ import numpy as np
2
+ from tqdm import tqdm
3
+ import numpy.matlib
4
+ import os
5
+ import glob
6
+ import pickle
7
+ import hydra
8
+ from omegaconf import DictConfig
9
+
10
+
11
+ seg_class_map = [
12
+ 0,
13
+ 1,
14
+ 2,
15
+ 3,
16
+ 4,
17
+ 11,
18
+ 5,
19
+ 6,
20
+ 7,
21
+ 8,
22
+ 8,
23
+ 10,
24
+ 10,
25
+ 10,
26
+ 11,
27
+ 11,
28
+ 9,
29
+ 8,
30
+ 11,
31
+ 11,
32
+ 11,
33
+ 11,
34
+ 11,
35
+ 11,
36
+ 11,
37
+ 11,
38
+ 11,
39
+ 10,
40
+ 10,
41
+ 11,
42
+ 8,
43
+ 10,
44
+ 11,
45
+ 9,
46
+ 11,
47
+ 11,
48
+ 11,
49
+ ]
50
+
51
+
52
+ def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""):
53
+ r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels.
54
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172
55
+ In the data loader of pytorch, only single thread is allowed.
56
+ For multi-threads version and more details, see 'readRLE.py'.
57
+ output: seg_label: 3D numpy array, size 240 x 144 x 240
58
+ """
59
+ seg_label = np.zeros(
60
+ int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8
61
+ ) # segmentation label
62
+ vox_idx = 0
63
+ for idx in range(int(rle.shape[0] / 2)):
64
+ check_val = rle[idx * 2]
65
+ check_iter = rle[idx * 2 + 1]
66
+ if check_val >= 37 and check_val != 255: # 37 classes to 12 classes
67
+ print("RLE {} check_val: {}".format(rle_filename, check_val))
68
+ seg_label_val = (
69
+ seg_class_map[check_val] if check_val != 255 else 255
70
+ ) # 37 classes to 12 classes
71
+ seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat(
72
+ seg_label_val, 1, check_iter
73
+ )
74
+ vox_idx = vox_idx + check_iter
75
+ seg_label = seg_label.reshape(voxel_size) # 3D array, size 240 x 144 x 240
76
+ return seg_label
77
+
78
+
79
+ def _read_rle(rle_filename): # 0.0005s
80
+ """Read RLE compression data
81
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153
82
+ Return:
83
+ vox_origin,
84
+ cam_pose,
85
+ vox_rle, voxel label data from file
86
+ Shape:
87
+ vox_rle, (240, 144, 240)
88
+ """
89
+ fid = open(rle_filename, "rb")
90
+ vox_origin = np.fromfile(
91
+ fid, np.float32, 3
92
+ ).T # Read voxel origin in world coordinates
93
+ cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4)) # Read camera pose
94
+ vox_rle = (
95
+ np.fromfile(fid, np.uint32).reshape((-1, 1)).T
96
+ ) # Read voxel label data from file
97
+ vox_rle = np.squeeze(vox_rle) # 2d array: (1 x N), to 1d array: (N , )
98
+ fid.close()
99
+ return vox_origin, cam_pose, vox_rle
100
+
101
+
102
+ def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4):
103
+ r"""downsample the labeled data,
104
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262
105
+ Shape:
106
+ label, (240, 144, 240)
107
+ label_downscale, if downsample==4, then (60, 36, 60)
108
+ """
109
+ if downscale == 1:
110
+ return label
111
+ ds = downscale
112
+ small_size = (
113
+ voxel_size[0] // ds,
114
+ voxel_size[1] // ds,
115
+ voxel_size[2] // ds,
116
+ ) # small size
117
+ label_downscale = np.zeros(small_size, dtype=np.uint8)
118
+ empty_t = 0.95 * ds * ds * ds # threshold
119
+ s01 = small_size[0] * small_size[1]
120
+ label_i = np.zeros((ds, ds, ds), dtype=np.int32)
121
+
122
+ for i in range(small_size[0] * small_size[1] * small_size[2]):
123
+ z = int(i / s01)
124
+ y = int((i - z * s01) / small_size[0])
125
+ x = int(i - z * s01 - y * small_size[0])
126
+
127
+ label_i[:, :, :] = label[
128
+ x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds
129
+ ]
130
+ label_bin = label_i.flatten()
131
+
132
+ zero_count_0 = np.array(np.where(label_bin == 0)).size
133
+ zero_count_255 = np.array(np.where(label_bin == 255)).size
134
+
135
+ zero_count = zero_count_0 + zero_count_255
136
+ if zero_count > empty_t:
137
+ label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255
138
+ else:
139
+ label_i_s = label_bin[
140
+ np.where(np.logical_and(label_bin > 0, label_bin < 255))
141
+ ]
142
+ label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s))
143
+ return label_downscale
144
+
145
+
146
+ @hydra.main(config_name="../../config/monoscene.yaml")
147
+ def main(config: DictConfig):
148
+ scene_size = (240, 144, 240)
149
+ for split in ["train", "test"]:
150
+ root = os.path.join(config.NYU_root, "NYU" + split)
151
+ base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split)
152
+ os.makedirs(base_dir, exist_ok=True)
153
+
154
+ scans = glob.glob(os.path.join(root, "*.bin"))
155
+ for scan in tqdm(scans):
156
+ filename = os.path.basename(scan)
157
+ name = filename[:-4]
158
+ filepath = os.path.join(base_dir, name + ".pkl")
159
+ if os.path.exists(filepath):
160
+ continue
161
+
162
+ vox_origin, cam_pose, rle = _read_rle(scan)
163
+
164
+ target_1_1 = _rle2voxel(rle, scene_size, scan)
165
+ target_1_4 = _downsample_label(target_1_1, scene_size, 4)
166
+ target_1_16 = _downsample_label(target_1_1, scene_size, 16)
167
+
168
+ data = {
169
+ "cam_pose": cam_pose,
170
+ "voxel_origin": vox_origin,
171
+ "name": name,
172
+ "target_1_4": target_1_4,
173
+ "target_1_16": target_1_16,
174
+ }
175
+
176
+ with open(filepath, "wb") as handle:
177
+ pickle.dump(data, handle)
178
+ print("wrote to", filepath)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
monoscene/data/kitti_360/collate.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ frame_ids = []
8
+ img_paths = []
9
+ sequences = []
10
+
11
+ cam_ks = []
12
+ T_velo_2_cams = []
13
+
14
+ scale_3ds = batch[0]["scale_3ds"]
15
+ for scale_3d in scale_3ds:
16
+ data["projected_pix_{}".format(scale_3d)] = []
17
+ data["fov_mask_{}".format(scale_3d)] = []
18
+
19
+ for _, input_dict in enumerate(batch):
20
+ if "img_path" in input_dict:
21
+ img_paths.append(input_dict["img_path"])
22
+
23
+ for key in data:
24
+ data[key].append(torch.from_numpy(input_dict[key]))
25
+
26
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float())
27
+ T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
28
+
29
+ sequences.append(input_dict["sequence"])
30
+
31
+ img = input_dict["img"]
32
+ imgs.append(img)
33
+
34
+ frame_ids.append(input_dict["frame_id"])
35
+
36
+ ret_data = {
37
+ "sequence": sequences,
38
+ "frame_id": frame_ids,
39
+ "cam_k": cam_ks,
40
+ "T_velo_2_cam": T_velo_2_cams,
41
+ "img": torch.stack(imgs),
42
+ "img_path": img_paths,
43
+ }
44
+ for key in data:
45
+ ret_data[key] = data[key]
46
+
47
+ return ret_data
monoscene/data/kitti_360/kitti_360_dataset.py ADDED
@@ -0,0 +1,125 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from monoscene.data.utils.helpers import vox2pix
7
+ from PIL import Image
8
+ from torchvision import transforms
9
+
10
+
11
+ class Kitti360Dataset(Dataset):
12
+ def __init__(self, root, sequences, n_scans):
13
+ """
14
+ Paramters
15
+ --------
16
+ root: str
17
+ Path to KITTI-360 dataset i.e. contain sequences such as 2013_05_28_drive_0009_sync
18
+ sequence: str
19
+ KITTI-360 sequence e.g. 2013_05_28_drive_0009_sync
20
+ n_scans: int
21
+ Only use the first n_scans since KITTI-360 sequence is very long
22
+ """
23
+ self.root = root
24
+ self.img_H = 376
25
+ self.img_W = 1408
26
+ self.project_scale = 2
27
+ self.output_scale = 1
28
+ self.voxel_size = 0.2
29
+ self.vox_origin = np.array([0, -25.6, -2])
30
+ self.scene_size = (51.2, 51.2, 6.4)
31
+ self.T_velo_2_cam = self.get_velo2cam()
32
+ self.cam_k = self.get_cam_k()
33
+ self.scans = []
34
+ for sequence in sequences:
35
+ glob_path = os.path.join(
36
+ self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png"
37
+ )
38
+ for img_path in glob.glob(glob_path):
39
+ self.scans.append({"img_path": img_path, "sequence": sequence})
40
+ self.scans = self.scans[:n_scans]
41
+ self.normalize_rgb = transforms.Compose(
42
+ [
43
+ transforms.ToTensor(),
44
+ transforms.Normalize(
45
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
46
+ ),
47
+ ]
48
+ )
49
+
50
+ def __len__(self):
51
+ return len(self.scans)
52
+
53
+ def get_cam_k(self):
54
+ cam_k = np.array(
55
+ [
56
+ 552.554261,
57
+ 0.000000,
58
+ 682.049453,
59
+ 0.000000,
60
+ 0.000000,
61
+ 552.554261,
62
+ 238.769549,
63
+ 0.000000,
64
+ 0.000000,
65
+ 0.000000,
66
+ 1.000000,
67
+ 0.000000,
68
+ ]
69
+ ).reshape(3, 4)
70
+ return cam_k[:3, :3]
71
+
72
+ def get_velo2cam(self):
73
+ cam2velo = np.array(
74
+ [
75
+ 0.04307104361,
76
+ -0.08829286498,
77
+ 0.995162929,
78
+ 0.8043914418,
79
+ -0.999004371,
80
+ 0.007784614041,
81
+ 0.04392796942,
82
+ 0.2993489574,
83
+ -0.01162548558,
84
+ -0.9960641394,
85
+ -0.08786966659,
86
+ -0.1770225824,
87
+ ]
88
+ ).reshape(3, 4)
89
+ cam2velo = np.concatenate(
90
+ [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0
91
+ )
92
+ return np.linalg.inv(cam2velo)
93
+
94
+ def __getitem__(self, index):
95
+ data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k}
96
+ scan = self.scans[index]
97
+ img_path = scan["img_path"]
98
+ sequence = scan["sequence"]
99
+ filename = os.path.basename(img_path)
100
+ frame_id = os.path.splitext(filename)[0]
101
+ data["frame_id"] = frame_id
102
+ data["img_path"] = img_path
103
+ data["sequence"] = sequence
104
+
105
+ img = Image.open(img_path).convert("RGB")
106
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
107
+ img = self.normalize_rgb(img)
108
+ data["img"] = img
109
+
110
+ scale_3ds = [self.project_scale, self.output_scale]
111
+ data["scale_3ds"] = scale_3ds
112
+
113
+ for scale_3d in scale_3ds:
114
+ projected_pix, fov_mask, _ = vox2pix(
115
+ self.T_velo_2_cam,
116
+ self.cam_k,
117
+ self.vox_origin,
118
+ self.voxel_size * scale_3d,
119
+ self.img_W,
120
+ self.img_H,
121
+ self.scene_size,
122
+ )
123
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
124
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
125
+ return data
monoscene/data/kitti_360/kitti_360_dm.py ADDED
@@ -0,0 +1,32 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset
3
+ import pytorch_lightning as pl
4
+ from monoscene.data.kitti_360.collate import collate_fn
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class Kitti360DataModule(pl.LightningDataModule):
9
+ def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3):
10
+ super().__init__()
11
+ self.root = root
12
+ self.batch_size = batch_size
13
+ self.num_workers = num_workers
14
+ self.sequences = sequences
15
+ self.n_scans = n_scans
16
+
17
+ def setup(self, stage=None):
18
+ self.ds = Kitti360Dataset(
19
+ root=self.root, sequences=self.sequences, n_scans=self.n_scans
20
+ )
21
+
22
+ def dataloader(self):
23
+ return DataLoader(
24
+ self.ds,
25
+ batch_size=self.batch_size,
26
+ drop_last=False,
27
+ num_workers=self.num_workers,
28
+ shuffle=False,
29
+ pin_memory=True,
30
+ worker_init_fn=worker_init_fn,
31
+ collate_fn=collate_fn,
32
+ )
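For context, the commented-out code in the removed monoscene/app.py above builds this data module with n_scans=2000, batch_size=1 and num_workers=3. A minimal sketch along those lines; the root path is a placeholder and the sequence name is taken from the Kitti360Dataset docstring:

from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule

# Sketch only: placeholder path, values mirror the example in the removed app.py.
dm = Kitti360DataModule(
    root="/path/to/KITTI-360",
    sequences=["2013_05_28_drive_0009_sync"],
    n_scans=2000,
    batch_size=1,
    num_workers=3,
)
dm.setup()
batch = next(iter(dm.dataloader()))
print(batch["img"].shape)  # torch.Size([1, 3, 376, 1408]) for batch_size=1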
monoscene/data/semantic_kitti/collate.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ CP_mega_matrices = []
8
+ targets = []
9
+ frame_ids = []
10
+ sequences = []
11
+
12
+ cam_ks = []
13
+ T_velo_2_cams = []
14
+ frustums_masks = []
15
+ frustums_class_dists = []
16
+
17
+ scale_3ds = batch[0]["scale_3ds"]
18
+ for scale_3d in scale_3ds:
19
+ data["projected_pix_{}".format(scale_3d)] = []
20
+ data["fov_mask_{}".format(scale_3d)] = []
21
+
22
+ for idx, input_dict in enumerate(batch):
23
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
24
+ T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
25
+
26
+ if "frustums_masks" in input_dict:
27
+ frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"]))
28
+ frustums_class_dists.append(
29
+ torch.from_numpy(input_dict["frustums_class_dists"]).float()
30
+ )
31
+
32
+ for key in data:
33
+ data[key].append(torch.from_numpy(input_dict[key]))
34
+
35
+ img = input_dict["img"]
36
+ imgs.append(img)
37
+
38
+ frame_ids.append(input_dict["frame_id"])
39
+ sequences.append(input_dict["sequence"])
40
+
41
+
42
+ target = torch.from_numpy(input_dict["target"])
43
+ targets.append(target)
44
+ CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
45
+
46
+ ret_data = {
47
+ "frame_id": frame_ids,
48
+ "sequence": sequences,
49
+ "frustums_class_dists": frustums_class_dists,
50
+ "frustums_masks": frustums_masks,
51
+ "cam_k": cam_ks,
52
+ "T_velo_2_cam": T_velo_2_cams,
53
+ "img": torch.stack(imgs),
54
+ "CP_mega_matrices": CP_mega_matrices,
55
+ "target": torch.stack(targets)
56
+ }
57
+
58
+
59
+ for key in data:
60
+ ret_data[key] = data[key]
61
+ return ret_data
monoscene/data/semantic_kitti/io_data.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
3
+ """
4
+
5
+ import numpy as np
6
+ import yaml
7
+ import imageio
8
+
9
+
10
+ def unpack(compressed):
11
+ ''' given a bit encoded voxel grid, make a normal voxel grid out of it. '''
12
+ uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8)
13
+ uncompressed[::8] = compressed[:] >> 7 & 1
14
+ uncompressed[1::8] = compressed[:] >> 6 & 1
15
+ uncompressed[2::8] = compressed[:] >> 5 & 1
16
+ uncompressed[3::8] = compressed[:] >> 4 & 1
17
+ uncompressed[4::8] = compressed[:] >> 3 & 1
18
+ uncompressed[5::8] = compressed[:] >> 2 & 1
19
+ uncompressed[6::8] = compressed[:] >> 1 & 1
20
+ uncompressed[7::8] = compressed[:] & 1
21
+
22
+ return uncompressed
23
+
24
+
25
+ def img_normalize(img, mean, std):
26
+ img = img.astype(np.float32) / 255.0
27
+ img = img - mean
28
+ img = img / std
29
+
30
+ return img
31
+
32
+
33
+ def pack(array):
34
+ """ convert a boolean array into a bitwise array. """
35
+ array = array.reshape((-1))
36
+
37
+ #compressing bit flags.
38
+ # yapf: disable
39
+ compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8]
40
+ # yapf: enable
41
+
42
+ return np.array(compressed, dtype=np.uint8)
43
+
44
+
45
+ def get_grid_coords(dims, resolution):
46
+ '''
47
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
48
+ :return coords_grid: is the center coords of voxels in the grid
49
+ '''
50
+
51
+ # The sensor in centered in X (we go to dims/2 + 1 for the histogramdd)
52
+ g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1)
53
+ # The sensor is in Y=0 (we go to dims + 1 for the histogramdd)
54
+ g_yy = np.arange(0, dims[1] + 1)
55
+ # The sensor is in Z=1.73. I observed that the ground was to voxel levels above the grid bottom, so Z pose is at 10
56
+ # if bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22
57
+ # (we go to 22 + 1 for the histogramdd)
58
+ # ATTENTION.. Is 11 for old grids.. 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49)
59
+ sensor_pose = 10
60
+ g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
61
+
62
+ # Obtaining the grid with coords...
63
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
64
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
65
+ coords_grid = coords_grid.astype(np.float)
66
+
67
+ coords_grid = (coords_grid * resolution) + resolution/2
68
+
69
+ temp = np.copy(coords_grid)
70
+ temp[:, 0] = coords_grid[:, 1]
71
+ temp[:, 1] = coords_grid[:, 0]
72
+ coords_grid = np.copy(temp)
73
+
74
+ return coords_grid, g_xx, g_yy, g_zz
75
+
76
+
77
+ def _get_remap_lut(config_path):
78
+ '''
79
+ remap_lut to remap classes of semantic kitti for training...
80
+ :return:
81
+ '''
82
+
83
+ dataset_config = yaml.safe_load(open(config_path, 'r'))
84
+ # make lookup table for mapping
85
+ maxkey = max(dataset_config['learning_map'].keys())
86
+
87
+ # +100 hack making lut bigger just in case there are unknown labels
88
+ remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
89
+ remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
90
+
91
+ # in completion we have to distinguish empty and invalid voxels.
92
+ # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
93
+ remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid'
94
+ remap_lut[0] = 0 # only 'empty' stays 'empty'.
95
+
96
+ return remap_lut
97
+
98
+
99
+ def get_inv_map():
100
+ '''
101
+ remap_lut to remap classes of semantic kitti for training...
102
+ :return:
103
+ '''
104
+ config_path = "./semantic-kitti.yaml"
105
+ dataset_config = yaml.safe_load(open(config_path, 'r'))
106
+ # make lookup table for mapping
107
+
108
+ inv_map = np.zeros(20, dtype=np.int32)
109
+ inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values())
110
+
111
+ return inv_map
112
+
113
+ def _read_SemKITTI(path, dtype, do_unpack):
114
+ bin = np.fromfile(path, dtype=dtype) # Flattened array
115
+ if do_unpack:
116
+ bin = unpack(bin)
117
+ return bin
118
+
119
+
120
+ def _read_label_SemKITTI(path):
121
+ label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
122
+ return label
123
+
124
+
125
+ def _read_invalid_SemKITTI(path):
126
+ invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
127
+ return invalid
128
+
129
+
130
+ def _read_occluded_SemKITTI(path):
131
+ occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
132
+ return occluded
133
+
134
+
135
+ def _read_occupancy_SemKITTI(path):
136
+ occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32)
137
+ return occupancy
138
+
139
+
140
+ def _read_rgb_SemKITTI(path):
141
+ rgb = np.asarray(imageio.imread(path))
142
+ return rgb
143
+
144
+
145
+ def _read_pointcloud_SemKITTI(path):
146
+ 'Return pointcloud semantic kitti with remissions (x, y, z, intensity)'
147
+ pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
148
+ pointcloud = pointcloud.reshape((-1, 4))
149
+ return pointcloud
150
+
151
+
152
+ def _read_calib_SemKITTI(calib_path):
153
+ """
154
+ :param calib_path: Path to a calibration text file.
155
+ :return: dict with calibration matrices.
156
+ """
157
+ calib_all = {}
158
+ with open(calib_path, 'r') as f:
159
+ for line in f.readlines():
160
+ if line == '\n':
161
+ break
162
+ key, value = line.split(':', 1)
163
+ calib_all[key] = np.array([float(x) for x in value.split()])
164
+
165
+ # reshape matrices
166
+ calib_out = {}
167
+ calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera
168
+ calib_out['Tr'] = np.identity(4) # 4x4 matrix
169
+ calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
170
+ return calib_out
171
+
172
+
173
+ def get_remap_lut(path):
174
+ '''
175
+ remap_lut to remap classes of semantic kitti for training...
176
+ :return:
177
+ '''
178
+
179
+ dataset_config = yaml.safe_load(open(path, 'r'))
180
+
181
+ # make lookup table for mapping
182
+ maxkey = max(dataset_config['learning_map'].keys())
183
+
184
+ # +100 hack making lut bigger just in case there are unknown labels
185
+ remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
186
+ remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
187
+
188
+ # in completion we have to distinguish empty and invalid voxels.
189
+ # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
190
+ remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid'
191
+ remap_lut[0] = 0 # only 'empty' stays 'empty'.
192
+
193
+ return remap_lut
194
+
195
+
196
+ def data_augmentation_3Dflips(flip, data):
197
+ # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data
198
+ # into memory. This gives errors when trying to pass the array to torch sensors.. Solution seen in:
199
+ # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663
200
+ # Dims -> {XZY}
201
+ # Flipping around the X axis...
202
+ if np.isclose(flip, 1):
203
+ data = np.flip(data, axis=0).copy()
204
+
205
+ # Flipping around the Y axis...
206
+ if np.isclose(flip, 2):
207
+ data = np.flip(data, 2).copy()
208
+
209
+ # Flipping around the X and the Y axis...
210
+ if np.isclose(flip, 3):
211
+ data = np.flip(np.flip(data, axis=0), axis=2).copy()
212
+
213
+ return data
214
+
215
+
216
+ def get_cmap_semanticKITTI20():
217
+ colors = np.array([
218
+ # [0 , 0 , 0, 255],
219
+ [100, 150, 245, 255],
220
+ [100, 230, 245, 255],
221
+ [30, 60, 150, 255],
222
+ [80, 30, 180, 255],
223
+ [100, 80, 250, 255],
224
+ [255, 30, 30, 255],
225
+ [255, 40, 200, 255],
226
+ [150, 30, 90, 255],
227
+ [255, 0, 255, 255],
228
+ [255, 150, 255, 255],
229
+ [75, 0, 75, 255],
230
+ [175, 0, 75, 255],
231
+ [255, 200, 0, 255],
232
+ [255, 120, 50, 255],
233
+ [0, 175, 0, 255],
234
+ [135, 60, 0, 255],
235
+ [150, 240, 80, 255],
236
+ [255, 240, 150, 255],
237
+ [255, 0, 0, 255]]).astype(np.uint8)
238
+
239
+ return colors
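For context, pack and unpack above implement the bit-packed voxel encoding used by the SemanticKITTI .bin/.invalid/.occluded files that _read_SemKITTI loads. A minimal round-trip sketch, assuming the monoscene package from this commit is on the Python path:

import numpy as np

from monoscene.data.semantic_kitti.io_data import pack, unpack

# Sketch only: 8 binary voxels per byte, matching the SemanticKITTI voxel format.
occupancy = (np.random.rand(256 * 256 * 32) > 0.5).astype(np.uint8)
compressed = pack(occupancy)   # uint8 array, one byte per 8 voxels
restored = unpack(compressed)  # back to one 0/1 value per voxel
assert np.array_equal(occupancy, restored)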
monoscene/data/semantic_kitti/kitti_dataset.py ADDED
@@ -0,0 +1,200 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+ from monoscene.data.utils.helpers import (
9
+ vox2pix,
10
+ compute_local_frustums,
11
+ compute_CP_mega_matrix,
12
+ )
13
+
14
+
15
+ class KittiDataset(Dataset):
16
+ def __init__(
17
+ self,
18
+ split,
19
+ root,
20
+ preprocess_root,
21
+ project_scale=2,
22
+ frustum_size=4,
23
+ color_jitter=None,
24
+ fliplr=0.0,
25
+ ):
26
+ super().__init__()
27
+ self.root = root
28
+ self.label_root = os.path.join(preprocess_root, "labels")
29
+ self.n_classes = 20
30
+ splits = {
31
+ "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
32
+ "val": ["08"],
33
+ "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
34
+ }
35
+ self.split = split
36
+ self.sequences = splits[split]
37
+ self.frustum_size = frustum_size
38
+ self.project_scale = project_scale
39
+ self.output_scale = int(self.project_scale / 2)
40
+ self.scene_size = (51.2, 51.2, 6.4)
41
+ self.vox_origin = np.array([0, -25.6, -2])
42
+ self.fliplr = fliplr
43
+
44
+ self.voxel_size = 0.2 # 0.2m
45
+ self.img_W = 1220
46
+ self.img_H = 370
47
+
48
+ self.color_jitter = (
49
+ transforms.ColorJitter(*color_jitter) if color_jitter else None
50
+ )
51
+ self.scans = []
52
+ for sequence in self.sequences:
53
+ calib = self.read_calib(
54
+ os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
55
+ )
56
+ P = calib["P2"]
57
+ T_velo_2_cam = calib["Tr"]
58
+ proj_matrix = P @ T_velo_2_cam
59
+
60
+ glob_path = os.path.join(
61
+ self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
62
+ )
63
+ for voxel_path in glob.glob(glob_path):
64
+ self.scans.append(
65
+ {
66
+ "sequence": sequence,
67
+ "P": P,
68
+ "T_velo_2_cam": T_velo_2_cam,
69
+ "proj_matrix": proj_matrix,
70
+ "voxel_path": voxel_path,
71
+ }
72
+ )
73
+
74
+ self.normalize_rgb = transforms.Compose(
75
+ [
76
+ transforms.ToTensor(),
77
+ transforms.Normalize(
78
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
79
+ ),
80
+ ]
81
+ )
82
+
83
+ def __getitem__(self, index):
84
+ scan = self.scans[index]
85
+ voxel_path = scan["voxel_path"]
86
+ sequence = scan["sequence"]
87
+ P = scan["P"]
88
+ T_velo_2_cam = scan["T_velo_2_cam"]
89
+ proj_matrix = scan["proj_matrix"]
90
+
91
+ filename = os.path.basename(voxel_path)
92
+ frame_id = os.path.splitext(filename)[0]
93
+
94
+ rgb_path = os.path.join(
95
+ self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png"
96
+ )
97
+
98
+ data = {
99
+ "frame_id": frame_id,
100
+ "sequence": sequence,
101
+ "P": P,
102
+ "T_velo_2_cam": T_velo_2_cam,
103
+ "proj_matrix": proj_matrix,
104
+ }
105
+ scale_3ds = [self.output_scale, self.project_scale]
106
+ data["scale_3ds"] = scale_3ds
107
+ cam_k = P[0:3, 0:3]
108
+ data["cam_k"] = cam_k
109
+ for scale_3d in scale_3ds:
110
+
111
+ # compute the 3D-2D mapping
112
+ projected_pix, fov_mask, pix_z = vox2pix(
113
+ T_velo_2_cam,
114
+ cam_k,
115
+ self.vox_origin,
116
+ self.voxel_size * scale_3d,
117
+ self.img_W,
118
+ self.img_H,
119
+ self.scene_size,
120
+ )
121
+
122
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
123
+ data["pix_z_{}".format(scale_3d)] = pix_z
124
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
125
+
126
+ target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy")
127
+ target = np.load(target_1_path)
128
+ data["target"] = target
129
+ target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy")
130
+ target_1_8 = np.load(target_8_path)
131
+ CP_mega_matrix = compute_CP_mega_matrix(target_1_8)
132
+ data["CP_mega_matrix"] = CP_mega_matrix
133
+
134
+ # Compute the masks, each indicating the voxels of a local frustum
135
+ if self.split != "test":
136
+ projected_pix_output = data["projected_pix_{}".format(self.output_scale)]
137
+ pix_z_output = data[
138
+ "pix_z_{}".format(self.output_scale)
139
+ ]
140
+ frustums_masks, frustums_class_dists = compute_local_frustums(
141
+ projected_pix_output,
142
+ pix_z_output,
143
+ target,
144
+ self.img_W,
145
+ self.img_H,
146
+ dataset="kitti",
147
+ n_classes=20,
148
+ size=self.frustum_size,
149
+ )
150
+ else:
151
+ frustums_masks = None
152
+ frustums_class_dists = None
153
+ data["frustums_masks"] = frustums_masks
154
+ data["frustums_class_dists"] = frustums_class_dists
155
+
156
+ img = Image.open(rgb_path).convert("RGB")
157
+
158
+ # Image augmentation
159
+ if self.color_jitter is not None:
160
+ img = self.color_jitter(img)
161
+
162
+ # PIL to numpy
163
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
164
+ img = img[:370, :1220, :] # crop image
165
+
166
+ # Fliplr the image
167
+ if np.random.rand() < self.fliplr:
168
+ img = np.ascontiguousarray(np.fliplr(img))
169
+ for scale in scale_3ds:
170
+ key = "projected_pix_" + str(scale)
171
+ data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0]
172
+
173
+ data["img"] = self.normalize_rgb(img)
174
+ return data
175
+
176
+ def __len__(self):
177
+ return len(self.scans)
178
+
179
+ @staticmethod
180
+ def read_calib(calib_path):
181
+ """
182
+ Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
183
+ :param calib_path: Path to a calibration text file.
184
+ :return: dict with calibration matrices.
185
+ """
186
+ calib_all = {}
187
+ with open(calib_path, "r") as f:
188
+ for line in f.readlines():
189
+ if line == "\n":
190
+ break
191
+ key, value = line.split(":", 1)
192
+ calib_all[key] = np.array([float(x) for x in value.split()])
193
+
194
+ # reshape matrices
195
+ calib_out = {}
196
+ # 3x4 projection matrix for left camera
197
+ calib_out["P2"] = calib_all["P2"].reshape(3, 4)
198
+ calib_out["Tr"] = np.identity(4) # 4x4 matrix
199
+ calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
200
+ return calib_out
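A rough sketch of loading one validation sample, assuming the SemanticKITTI data sits under /data/semantic_kitti and the preprocessed labels (written by preprocess.py below) under /data/kitti_preprocess; both paths are hypothetical:

from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset

val_ds = KittiDataset(
    split="val",
    root="/data/semantic_kitti",
    preprocess_root="/data/kitti_preprocess",
    project_scale=2,
    frustum_size=4,
)
sample = val_ds[0]
print(sample["img"].shape)              # torch.Size([3, 370, 1220])
print(sample["target"].shape)           # (256, 256, 32) full-resolution voxel labels
print(sample["projected_pix_1"].shape)  # (N, 2) voxel centroids projected to pixels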
monoscene/data/semantic_kitti/kitti_dm.py ADDED
@@ -0,0 +1,91 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset
3
+ import pytorch_lightning as pl
4
+ from monoscene.data.semantic_kitti.collate import collate_fn
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class KittiDataModule(pl.LightningDataModule):
9
+ def __init__(
10
+ self,
11
+ root,
12
+ preprocess_root,
13
+ project_scale=2,
14
+ frustum_size=4,
15
+ batch_size=4,
16
+ num_workers=6,
17
+ ):
18
+ super().__init__()
19
+ self.root = root
20
+ self.preprocess_root = preprocess_root
21
+ self.project_scale = project_scale
22
+ self.batch_size = batch_size
23
+ self.num_workers = num_workers
24
+ self.frustum_size = frustum_size
25
+
26
+ def setup(self, stage=None):
27
+ self.train_ds = KittiDataset(
28
+ split="train",
29
+ root=self.root,
30
+ preprocess_root=self.preprocess_root,
31
+ project_scale=self.project_scale,
32
+ frustum_size=self.frustum_size,
33
+ fliplr=0.5,
34
+ color_jitter=(0.4, 0.4, 0.4),
35
+ )
36
+
37
+ self.val_ds = KittiDataset(
38
+ split="val",
39
+ root=self.root,
40
+ preprocess_root=self.preprocess_root,
41
+ project_scale=self.project_scale,
42
+ frustum_size=self.frustum_size,
43
+ fliplr=0,
44
+ color_jitter=None,
45
+ )
46
+
47
+ self.test_ds = KittiDataset(
48
+ split="test",
49
+ root=self.root,
50
+ preprocess_root=self.preprocess_root,
51
+ project_scale=self.project_scale,
52
+ frustum_size=self.frustum_size,
53
+ fliplr=0,
54
+ color_jitter=None,
55
+ )
56
+
57
+ def train_dataloader(self):
58
+ return DataLoader(
59
+ self.train_ds,
60
+ batch_size=self.batch_size,
61
+ drop_last=True,
62
+ num_workers=self.num_workers,
63
+ shuffle=True,
64
+ pin_memory=True,
65
+ worker_init_fn=worker_init_fn,
66
+ collate_fn=collate_fn,
67
+ )
68
+
69
+ def val_dataloader(self):
70
+ return DataLoader(
71
+ self.val_ds,
72
+ batch_size=self.batch_size,
73
+ drop_last=False,
74
+ num_workers=self.num_workers,
75
+ shuffle=False,
76
+ pin_memory=True,
77
+ worker_init_fn=worker_init_fn,
78
+ collate_fn=collate_fn,
79
+ )
80
+
81
+ def test_dataloader(self):
82
+ return DataLoader(
83
+ self.test_ds,
84
+ batch_size=self.batch_size,
85
+ drop_last=False,
86
+ num_workers=self.num_workers,
87
+ shuffle=False,
88
+ pin_memory=True,
89
+ worker_init_fn=worker_init_fn,
90
+ collate_fn=collate_fn,
91
+ )
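A sketch of wiring the datamodule manually (paths hypothetical; a pytorch_lightning Trainer could be handed the datamodule directly instead):

from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule

dm = KittiDataModule(
    root="/data/semantic_kitti",
    preprocess_root="/data/kitti_preprocess",
    batch_size=1,
    num_workers=4,
)
dm.setup()
batch = next(iter(dm.train_dataloader()))  # one collated training batch (see collate_fn)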
monoscene/data/semantic_kitti/params.py ADDED
@@ -0,0 +1,48 @@
1
+ import numpy as np
2
+
3
+ semantic_kitti_class_frequencies = np.array(
4
+ [
5
+ 5.41773033e09,
6
+ 1.57835390e07,
7
+ 1.25136000e05,
8
+ 1.18809000e05,
9
+ 6.46799000e05,
10
+ 8.21951000e05,
11
+ 2.62978000e05,
12
+ 2.83696000e05,
13
+ 2.04750000e05,
14
+ 6.16887030e07,
15
+ 4.50296100e06,
16
+ 4.48836500e07,
17
+ 2.26992300e06,
18
+ 5.68402180e07,
19
+ 1.57196520e07,
20
+ 1.58442623e08,
21
+ 2.06162300e06,
22
+ 3.69705220e07,
23
+ 1.15198800e06,
24
+ 3.34146000e05,
25
+ ]
26
+ )
27
+ kitti_class_names = [
28
+ "empty",
29
+ "car",
30
+ "bicycle",
31
+ "motorcycle",
32
+ "truck",
33
+ "other-vehicle",
34
+ "person",
35
+ "bicyclist",
36
+ "motorcyclist",
37
+ "road",
38
+ "parking",
39
+ "sidewalk",
40
+ "other-ground",
41
+ "building",
42
+ "fence",
43
+ "vegetation",
44
+ "trunk",
45
+ "terrain",
46
+ "pole",
47
+ "traffic-sign",
48
+ ]
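These raw voxel counts are typically turned into class weights for the cross-entropy loss; the inverse-log-frequency weighting below is one such choice, shown only as an illustration rather than the exact scheme used by the training script:

import numpy as np
import torch
from monoscene.data.semantic_kitti.params import (
    semantic_kitti_class_frequencies,
    kitti_class_names,
)

# Rare classes (e.g. bicycle) get larger weights than frequent ones (e.g. road)
class_weights = torch.from_numpy(
    1.0 / np.log(semantic_kitti_class_frequencies + 0.001)
).float()
for name, w in zip(kitti_class_names, class_weights):
    print(f"{name:>15s}: {w:.3f}")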
monoscene/data/semantic_kitti/preprocess.py ADDED
@@ -0,0 +1,102 @@
1
+ """
2
+ Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py
3
+ """
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ import numpy.matlib
7
+ import os
8
+ import glob
9
+ import hydra
10
+ from omegaconf import DictConfig
11
+ import monoscene.data.semantic_kitti.io_data as SemanticKittiIO
12
+ from hydra.utils import get_original_cwd
13
+ from monoscene.data.NYU.preprocess import _downsample_label
14
+
15
+
16
+ def majority_pooling(grid, k_size=2):
17
+ result = np.zeros(
18
+ (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
19
+ )
20
+ for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
21
+ for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
22
+ for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
23
+
24
+ sub_m = grid[
25
+ (xx * k_size) : (xx * k_size) + k_size,
26
+ (yy * k_size) : (yy * k_size) + k_size,
27
+ (zz * k_size) : (zz * k_size) + k_size,
28
+ ]
29
+ unique, counts = np.unique(sub_m, return_counts=True)
30
+ if True in ((unique != 0) & (unique != 255)):
31
+ # Remove counts with 0 and 255
32
+ counts = counts[((unique != 0) & (unique != 255))]
33
+ unique = unique[((unique != 0) & (unique != 255))]
34
+ else:
35
+ if True in (unique == 0):
36
+ counts = counts[(unique != 255)]
37
+ unique = unique[(unique != 255)]
38
+ value = unique[np.argmax(counts)]
39
+ result[xx, yy, zz] = value
40
+ return result
41
+
42
+
43
+ @hydra.main(config_name="../../config/monoscene.yaml")
44
+ def main(config: DictConfig):
45
+ scene_size = (256, 256, 32)
46
+ sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
47
+ remap_lut = SemanticKittiIO.get_remap_lut(
48
+ os.path.join(
49
+ get_original_cwd(),
50
+ "monoscene",
51
+ "data",
52
+ "semantic_kitti",
53
+ "semantic-kitti.yaml",
54
+ )
55
+ )
56
+
57
+ for sequence in sequences:
58
+ sequence_path = os.path.join(
59
+ config.kitti_root, "dataset", "sequences", sequence
60
+ )
61
+ label_paths = sorted(
62
+ glob.glob(os.path.join(sequence_path, "voxels", "*.label"))
63
+ )
64
+ invalid_paths = sorted(
65
+ glob.glob(os.path.join(sequence_path, "voxels", "*.invalid"))
66
+ )
67
+ out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence)
68
+ os.makedirs(out_dir, exist_ok=True)
69
+
70
+ downscaling = {"1_1": 1, "1_8": 8}
71
+
72
+ for i in tqdm(range(len(label_paths))):
73
+
74
+ frame_id, extension = os.path.splitext(os.path.basename(label_paths[i]))
75
+
76
+ LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i])
77
+ INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i])
78
+ LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
79
+ np.float32
80
+ ) # Remap 20 classes semanticKITTI SSC
81
+ LABEL[
82
+ np.isclose(INVALID, 1)
83
+ ] = 255 # Setting to unknown all voxels marked on invalid mask...
84
+ LABEL = LABEL.reshape([256, 256, 32])
85
+
86
+ for scale in downscaling:
87
+ filename = frame_id + "_" + scale + ".npy"
88
+ label_filename = os.path.join(out_dir, filename)
89
+ # If files have not been created...
90
+ if not os.path.exists(label_filename):
91
+ if scale == "1_8":
92
+ LABEL_ds = _downsample_label(
93
+ LABEL, (256, 256, 32), downscaling[scale]
94
+ )
95
+ else:
96
+ LABEL_ds = LABEL
97
+ np.save(label_filename, LABEL_ds)
98
+ print("wrote to", label_filename)
99
+
100
+
101
+ if __name__ == "__main__":
102
+ main()
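The behaviour of majority_pooling on a toy grid, assuming the package and its dependencies (hydra, omegaconf, tqdm) are installed so the module imports cleanly:

import numpy as np
from monoscene.data.semantic_kitti.preprocess import majority_pooling

grid = np.zeros((4, 4, 4), dtype=np.uint8)  # mostly empty (label 0)
grid[:2, :2, :2] = 9                        # a 2x2x2 "road" block
grid[2, 2, 2] = 13                          # one "building" voxel...
grid[3, 2, 2] = 255                         # ...next to an invalid voxel
pooled = majority_pooling(grid, k_size=2)
print(pooled.shape)     # (2, 2, 2)
print(pooled[0, 0, 0])  # 9.0  - majority label of the first block
print(pooled[1, 1, 1])  # 13.0 - invalid (255) voxels are dropped when a real class is present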
monoscene/data/semantic_kitti/semantic-kitti.yaml ADDED
@@ -0,0 +1,213 @@
1
+ # This file is covered by the LICENSE file in the root of this project.
2
+ nbr_classes: 20
3
+ grid_dims: [256, 32, 256] # (W, H, D)
4
+ labels:
5
+ 0 : "unlabeled"
6
+ 1 : "outlier"
7
+ 10: "car"
8
+ 11: "bicycle"
9
+ 13: "bus"
10
+ 15: "motorcycle"
11
+ 16: "on-rails"
12
+ 18: "truck"
13
+ 20: "other-vehicle"
14
+ 30: "person"
15
+ 31: "bicyclist"
16
+ 32: "motorcyclist"
17
+ 40: "road"
18
+ 44: "parking"
19
+ 48: "sidewalk"
20
+ 49: "other-ground"
21
+ 50: "building"
22
+ 51: "fence"
23
+ 52: "other-structure"
24
+ 60: "lane-marking"
25
+ 70: "vegetation"
26
+ 71: "trunk"
27
+ 72: "terrain"
28
+ 80: "pole"
29
+ 81: "traffic-sign"
30
+ 99: "other-object"
31
+ 252: "moving-car"
32
+ 253: "moving-bicyclist"
33
+ 254: "moving-person"
34
+ 255: "moving-motorcyclist"
35
+ 256: "moving-on-rails"
36
+ 257: "moving-bus"
37
+ 258: "moving-truck"
38
+ 259: "moving-other-vehicle"
39
+ color_map: # bgr
40
+ 0 : [0, 0, 0]
41
+ 1 : [0, 0, 255]
42
+ 10: [245, 150, 100]
43
+ 11: [245, 230, 100]
44
+ 13: [250, 80, 100]
45
+ 15: [150, 60, 30]
46
+ 16: [255, 0, 0]
47
+ 18: [180, 30, 80]
48
+ 20: [255, 0, 0]
49
+ 30: [30, 30, 255]
50
+ 31: [200, 40, 255]
51
+ 32: [90, 30, 150]
52
+ 40: [255, 0, 255]
53
+ 44: [255, 150, 255]
54
+ 48: [75, 0, 75]
55
+ 49: [75, 0, 175]
56
+ 50: [0, 200, 255]
57
+ 51: [50, 120, 255]
58
+ 52: [0, 150, 255]
59
+ 60: [170, 255, 150]
60
+ 70: [0, 175, 0]
61
+ 71: [0, 60, 135]
62
+ 72: [80, 240, 150]
63
+ 80: [150, 240, 255]
64
+ 81: [0, 0, 255]
65
+ 99: [255, 255, 50]
66
+ 252: [245, 150, 100]
67
+ 256: [255, 0, 0]
68
+ 253: [200, 40, 255]
69
+ 254: [30, 30, 255]
70
+ 255: [90, 30, 150]
71
+ 257: [250, 80, 100]
72
+ 258: [180, 30, 80]
73
+ 259: [255, 0, 0]
74
+ content: # as a ratio with the total number of points
75
+ 0: 0.018889854628292943
76
+ 1: 0.0002937197336781505
77
+ 10: 0.040818519255974316
78
+ 11: 0.00016609538710764618
79
+ 13: 2.7879693665067774e-05
80
+ 15: 0.00039838616015114444
81
+ 16: 0.0
82
+ 18: 0.0020633612104619787
83
+ 20: 0.0016218197275284021
84
+ 30: 0.00017698551338515307
85
+ 31: 1.1065903904919655e-08
86
+ 32: 5.532951952459828e-09
87
+ 40: 0.1987493871255525
88
+ 44: 0.014717169549888214
89
+ 48: 0.14392298360372
90
+ 49: 0.0039048553037472045
91
+ 50: 0.1326861944777486
92
+ 51: 0.0723592229456223
93
+ 52: 0.002395131480328884
94
+ 60: 4.7084144280367186e-05
95
+ 70: 0.26681502148037506
96
+ 71: 0.006035012012626033
97
+ 72: 0.07814222006271769
98
+ 80: 0.002855498193863172
99
+ 81: 0.0006155958086189918
100
+ 99: 0.009923127583046915
101
+ 252: 0.001789309418528068
102
+ 253: 0.00012709999297008662
103
+ 254: 0.00016059776092534436
104
+ 255: 3.745553104802113e-05
105
+ 256: 0.0
106
+ 257: 0.00011351574470342043
107
+ 258: 0.00010157861367183268
108
+ 259: 4.3840131989471124e-05
109
+ # classes that are indistinguishable from single scan or inconsistent in
110
+ # ground truth are mapped to their closest equivalent
111
+ learning_map:
112
+ 0 : 0 # "unlabeled"
113
+ 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped
114
+ 10: 1 # "car"
115
+ 11: 2 # "bicycle"
116
+ 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped
117
+ 15: 3 # "motorcycle"
118
+ 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped
119
+ 18: 4 # "truck"
120
+ 20: 5 # "other-vehicle"
121
+ 30: 6 # "person"
122
+ 31: 7 # "bicyclist"
123
+ 32: 8 # "motorcyclist"
124
+ 40: 9 # "road"
125
+ 44: 10 # "parking"
126
+ 48: 11 # "sidewalk"
127
+ 49: 12 # "other-ground"
128
+ 50: 13 # "building"
129
+ 51: 14 # "fence"
130
+ 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped
131
+ 60: 9 # "lane-marking" to "road" ---------------------------------mapped
132
+ 70: 15 # "vegetation"
133
+ 71: 16 # "trunk"
134
+ 72: 17 # "terrain"
135
+ 80: 18 # "pole"
136
+ 81: 19 # "traffic-sign"
137
+ 99: 0 # "other-object" to "unlabeled" ----------------------------mapped
138
+ 252: 1 # "moving-car" to "car" ------------------------------------mapped
139
+ 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped
140
+ 254: 6 # "moving-person" to "person" ------------------------------mapped
141
+ 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped
142
+ 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped
143
+ 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped
144
+ 258: 4 # "moving-truck" to "truck" --------------------------------mapped
145
+ 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped
146
+ learning_map_inv: # inverse of previous map
147
+ 0: 0 # "unlabeled", and others ignored
148
+ 1: 10 # "car"
149
+ 2: 11 # "bicycle"
150
+ 3: 15 # "motorcycle"
151
+ 4: 18 # "truck"
152
+ 5: 20 # "other-vehicle"
153
+ 6: 30 # "person"
154
+ 7: 31 # "bicyclist"
155
+ 8: 32 # "motorcyclist"
156
+ 9: 40 # "road"
157
+ 10: 44 # "parking"
158
+ 11: 48 # "sidewalk"
159
+ 12: 49 # "other-ground"
160
+ 13: 50 # "building"
161
+ 14: 51 # "fence"
162
+ 15: 70 # "vegetation"
163
+ 16: 71 # "trunk"
164
+ 17: 72 # "terrain"
165
+ 18: 80 # "pole"
166
+ 19: 81 # "traffic-sign"
167
+ learning_ignore: # Ignore classes
168
+ 0: True # "unlabeled", and others ignored
169
+ 1: False # "car"
170
+ 2: False # "bicycle"
171
+ 3: False # "motorcycle"
172
+ 4: False # "truck"
173
+ 5: False # "other-vehicle"
174
+ 6: False # "person"
175
+ 7: False # "bicyclist"
176
+ 8: False # "motorcyclist"
177
+ 9: False # "road"
178
+ 10: False # "parking"
179
+ 11: False # "sidewalk"
180
+ 12: False # "other-ground"
181
+ 13: False # "building"
182
+ 14: False # "fence"
183
+ 15: False # "vegetation"
184
+ 16: False # "trunk"
185
+ 17: False # "terrain"
186
+ 18: False # "pole"
187
+ 19: False # "traffic-sign"
188
+ split: # sequence numbers
189
+ train:
190
+ - 0
191
+ - 1
192
+ - 2
193
+ - 3
194
+ - 4
195
+ - 5
196
+ - 6
197
+ - 7
198
+ - 9
199
+ - 10
200
+ valid:
201
+ - 8
202
+ test:
203
+ - 11
204
+ - 12
205
+ - 13
206
+ - 14
207
+ - 15
208
+ - 16
209
+ - 17
210
+ - 18
211
+ - 19
212
+ - 20
213
+ - 21
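A short sketch of reading this file and mapping 20-class training ids back to raw SemanticKITTI ids via learning_map_inv (the file path is hypothetical):

import numpy as np
import yaml

cfg = yaml.safe_load(open("/path/to/semantic-kitti.yaml", "r"))
inv_map = np.zeros(cfg["nbr_classes"], dtype=np.uint16)
inv_map[list(cfg["learning_map_inv"].keys())] = list(cfg["learning_map_inv"].values())

pred = np.array([0, 1, 9, 13])  # empty, car, road, building (training ids)
print(inv_map[pred])            # [ 0 10 40 50]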
monoscene/data/utils/fusion.py ADDED
@@ -0,0 +1,507 @@
1
+ """
2
+ Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
3
+
4
+ @inproceedings{zeng20163dmatch,
5
+ title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
6
+ author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
7
+ booktitle={CVPR},
8
+ year={2017}
9
+ }
10
+ """
11
+
12
+ import numpy as np
13
+
14
+ from numba import njit, prange
15
+ from skimage import measure
16
+
17
+ FUSION_GPU_MODE = 0
18
+
19
+
20
+ class TSDFVolume:
21
+ """Volumetric TSDF Fusion of RGB-D Images."""
22
+
23
+ def __init__(self, vol_bnds, voxel_size, use_gpu=True):
24
+ """Constructor.
25
+
26
+ Args:
27
+ vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
28
+ xyz bounds (min/max) in meters.
29
+ voxel_size (float): The volume discretization in meters.
30
+ """
31
+ vol_bnds = np.asarray(vol_bnds)
32
+ assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
33
+
34
+ # Define voxel volume parameters
35
+ self._vol_bnds = vol_bnds
36
+ self._voxel_size = float(voxel_size)
37
+ self._trunc_margin = 5 * self._voxel_size # truncation on SDF
38
+ # self._trunc_margin = 10 # truncation on SDF
39
+ self._color_const = 256 * 256
40
+
41
+ # Adjust volume bounds and ensure C-order contiguous
42
+ self._vol_dim = (
43
+ np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
44
+ .copy(order="C")
45
+ .astype(int)
46
+ )
47
+ self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
48
+ self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
49
+
50
+ print(
51
+ "Voxel volume size: {} x {} x {} - # points: {:,}".format(
52
+ self._vol_dim[0],
53
+ self._vol_dim[1],
54
+ self._vol_dim[2],
55
+ self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
56
+ )
57
+ )
58
+
59
+ # Initialize pointers to voxel volume in CPU memory
60
+ self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
61
+ # for computing the cumulative moving average of observations per voxel
62
+ self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
63
+ self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
64
+
65
+ self.gpu_mode = use_gpu and FUSION_GPU_MODE
66
+
67
+ # Copy voxel volumes to GPU
68
+ if self.gpu_mode:
69
+ self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
70
+ cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
71
+ self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
72
+ cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
73
+ self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
74
+ cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
75
+
76
+ # Cuda kernel function (C++)
77
+ self._cuda_src_mod = SourceModule(
78
+ """
79
+ __global__ void integrate(float * tsdf_vol,
80
+ float * weight_vol,
81
+ float * color_vol,
82
+ float * vol_dim,
83
+ float * vol_origin,
84
+ float * cam_intr,
85
+ float * cam_pose,
86
+ float * other_params,
87
+ float * color_im,
88
+ float * depth_im) {
89
+ // Get voxel index
90
+ int gpu_loop_idx = (int) other_params[0];
91
+ int max_threads_per_block = blockDim.x;
92
+ int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
93
+ int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
94
+ int vol_dim_x = (int) vol_dim[0];
95
+ int vol_dim_y = (int) vol_dim[1];
96
+ int vol_dim_z = (int) vol_dim[2];
97
+ if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
98
+ return;
99
+ // Get voxel grid coordinates (note: be careful when casting)
100
+ float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
101
+ float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
102
+ float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
103
+ // Voxel grid coordinates to world coordinates
104
+ float voxel_size = other_params[1];
105
+ float pt_x = vol_origin[0]+voxel_x*voxel_size;
106
+ float pt_y = vol_origin[1]+voxel_y*voxel_size;
107
+ float pt_z = vol_origin[2]+voxel_z*voxel_size;
108
+ // World coordinates to camera coordinates
109
+ float tmp_pt_x = pt_x-cam_pose[0*4+3];
110
+ float tmp_pt_y = pt_y-cam_pose[1*4+3];
111
+ float tmp_pt_z = pt_z-cam_pose[2*4+3];
112
+ float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
113
+ float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
114
+ float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
115
+ // Camera coordinates to image pixels
116
+ int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
117
+ int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
118
+ // Skip if outside view frustum
119
+ int im_h = (int) other_params[2];
120
+ int im_w = (int) other_params[3];
121
+ if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
122
+ return;
123
+ // Skip invalid depth
124
+ float depth_value = depth_im[pixel_y*im_w+pixel_x];
125
+ if (depth_value == 0)
126
+ return;
127
+ // Integrate TSDF
128
+ float trunc_margin = other_params[4];
129
+ float depth_diff = depth_value-cam_pt_z;
130
+ if (depth_diff < -trunc_margin)
131
+ return;
132
+ float dist = fmin(1.0f,depth_diff/trunc_margin);
133
+ float w_old = weight_vol[voxel_idx];
134
+ float obs_weight = other_params[5];
135
+ float w_new = w_old + obs_weight;
136
+ weight_vol[voxel_idx] = w_new;
137
+ tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
138
+ // Integrate color
139
+ float old_color = color_vol[voxel_idx];
140
+ float old_b = floorf(old_color/(256*256));
141
+ float old_g = floorf((old_color-old_b*256*256)/256);
142
+ float old_r = old_color-old_b*256*256-old_g*256;
143
+ float new_color = color_im[pixel_y*im_w+pixel_x];
144
+ float new_b = floorf(new_color/(256*256));
145
+ float new_g = floorf((new_color-new_b*256*256)/256);
146
+ float new_r = new_color-new_b*256*256-new_g*256;
147
+ new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
148
+ new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
149
+ new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
150
+ color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
151
+ }"""
152
+ )
153
+
154
+ self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
155
+
156
+ # Determine block/grid size on GPU
157
+ gpu_dev = cuda.Device(0)
158
+ self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
159
+ n_blocks = int(
160
+ np.ceil(
161
+ float(np.prod(self._vol_dim))
162
+ / float(self._max_gpu_threads_per_block)
163
+ )
164
+ )
165
+ grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
166
+ grid_dim_y = min(
167
+ gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
168
+ )
169
+ grid_dim_z = min(
170
+ gpu_dev.MAX_GRID_DIM_Z,
171
+ int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
172
+ )
173
+ self._max_gpu_grid_dim = np.array(
174
+ [grid_dim_x, grid_dim_y, grid_dim_z]
175
+ ).astype(int)
176
+ self._n_gpu_loops = int(
177
+ np.ceil(
178
+ float(np.prod(self._vol_dim))
179
+ / float(
180
+ np.prod(self._max_gpu_grid_dim)
181
+ * self._max_gpu_threads_per_block
182
+ )
183
+ )
184
+ )
185
+
186
+ else:
187
+ # Get voxel grid coordinates
188
+ xv, yv, zv = np.meshgrid(
189
+ range(self._vol_dim[0]),
190
+ range(self._vol_dim[1]),
191
+ range(self._vol_dim[2]),
192
+ indexing="ij",
193
+ )
194
+ self.vox_coords = (
195
+ np.concatenate(
196
+ [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
197
+ )
198
+ .astype(int)
199
+ .T
200
+ )
201
+
202
+ @staticmethod
203
+ @njit(parallel=True)
204
+ def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
205
+ """Convert voxel grid coordinates to world coordinates."""
206
+ vol_origin = vol_origin.astype(np.float32)
207
+ vox_coords = vox_coords.astype(np.float32)
208
+ # print(np.min(vox_coords))
209
+ cam_pts = np.empty_like(vox_coords, dtype=np.float32)
210
+
211
+ for i in prange(vox_coords.shape[0]):
212
+ for j in range(3):
213
+ cam_pts[i, j] = (
214
+ vol_origin[j]
215
+ + (vox_size * vox_coords[i, j])
216
+ + vox_size * offsets[j]
217
+ )
218
+ return cam_pts
219
+
220
+ @staticmethod
221
+ @njit(parallel=True)
222
+ def cam2pix(cam_pts, intr):
223
+ """Convert camera coordinates to pixel coordinates."""
224
+ intr = intr.astype(np.float32)
225
+ fx, fy = intr[0, 0], intr[1, 1]
226
+ cx, cy = intr[0, 2], intr[1, 2]
227
+ pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
228
+ for i in prange(cam_pts.shape[0]):
229
+ pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
230
+ pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
231
+ return pix
232
+
233
+ @staticmethod
234
+ @njit(parallel=True)
235
+ def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
236
+ """Integrate the TSDF volume."""
237
+ tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
238
+ # print(tsdf_vol.shape)
239
+ w_new = np.empty_like(w_old, dtype=np.float32)
240
+ for i in prange(len(tsdf_vol)):
241
+ w_new[i] = w_old[i] + obs_weight
242
+ tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
243
+ return tsdf_vol_int, w_new
244
+
245
+ def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
246
+ """Integrate an RGB-D frame into the TSDF volume.
247
+
248
+ Args:
249
+ color_im (ndarray): An RGB image of shape (H, W, 3).
250
+ depth_im (ndarray): A depth image of shape (H, W).
251
+ cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
252
+ cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
253
+ obs_weight (float): The weight to assign to the current observation. A higher
254
+ value gives this observation more influence in the running weighted average.
255
+ """
256
+ im_h, im_w = depth_im.shape
257
+
258
+ # Fold RGB color image into a single channel image
259
+ color_im = color_im.astype(np.float32)
260
+ color_im = np.floor(
261
+ color_im[..., 2] * self._color_const
262
+ + color_im[..., 1] * 256
263
+ + color_im[..., 0]
264
+ )
265
+
266
+ if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel)
267
+ for gpu_loop_idx in range(self._n_gpu_loops):
268
+ self._cuda_integrate(
269
+ self._tsdf_vol_gpu,
270
+ self._weight_vol_gpu,
271
+ self._color_vol_gpu,
272
+ cuda.InOut(self._vol_dim.astype(np.float32)),
273
+ cuda.InOut(self._vol_origin.astype(np.float32)),
274
+ cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
275
+ cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
276
+ cuda.InOut(
277
+ np.asarray(
278
+ [
279
+ gpu_loop_idx,
280
+ self._voxel_size,
281
+ im_h,
282
+ im_w,
283
+ self._trunc_margin,
284
+ obs_weight,
285
+ ],
286
+ np.float32,
287
+ )
288
+ ),
289
+ cuda.InOut(color_im.reshape(-1).astype(np.float32)),
290
+ cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
291
+ block=(self._max_gpu_threads_per_block, 1, 1),
292
+ grid=(
293
+ int(self._max_gpu_grid_dim[0]),
294
+ int(self._max_gpu_grid_dim[1]),
295
+ int(self._max_gpu_grid_dim[2]),
296
+ ),
297
+ )
298
+ else: # CPU mode: integrate voxel volume (vectorized implementation)
299
+ # Convert voxel grid coordinates to pixel coordinates
300
+ cam_pts = self.vox2world(
301
+ self._vol_origin, self.vox_coords, self._voxel_size
302
+ )
303
+ cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
304
+ pix_z = cam_pts[:, 2]
305
+ pix = self.cam2pix(cam_pts, cam_intr)
306
+ pix_x, pix_y = pix[:, 0], pix[:, 1]
307
+
308
+ # Eliminate pixels outside view frustum
309
+ valid_pix = np.logical_and(
310
+ pix_x >= 0,
311
+ np.logical_and(
312
+ pix_x < im_w,
313
+ np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
314
+ ),
315
+ )
316
+ depth_val = np.zeros(pix_x.shape)
317
+ depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
318
+
319
+ # Integrate TSDF
320
+ depth_diff = depth_val - pix_z
321
+
322
+ valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
323
+ dist = depth_diff
324
+
325
+ valid_vox_x = self.vox_coords[valid_pts, 0]
326
+ valid_vox_y = self.vox_coords[valid_pts, 1]
327
+ valid_vox_z = self.vox_coords[valid_pts, 2]
328
+ w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
329
+ tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
330
+ valid_dist = dist[valid_pts]
331
+ tsdf_vol_new, w_new = self.integrate_tsdf(
332
+ tsdf_vals, valid_dist, w_old, obs_weight
333
+ )
334
+ self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
335
+ self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
336
+
337
+ # Integrate color
338
+ old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
339
+ old_b = np.floor(old_color / self._color_const)
340
+ old_g = np.floor((old_color - old_b * self._color_const) / 256)
341
+ old_r = old_color - old_b * self._color_const - old_g * 256
342
+ new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
343
+ new_b = np.floor(new_color / self._color_const)
344
+ new_g = np.floor((new_color - new_b * self._color_const) / 256)
345
+ new_r = new_color - new_b * self._color_const - new_g * 256
346
+ new_b = np.minimum(
347
+ 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
348
+ )
349
+ new_g = np.minimum(
350
+ 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
351
+ )
352
+ new_r = np.minimum(
353
+ 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
354
+ )
355
+ self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
356
+ new_b * self._color_const + new_g * 256 + new_r
357
+ )
358
+
359
+ def get_volume(self):
360
+ if self.gpu_mode:
361
+ cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
362
+ cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
363
+ return self._tsdf_vol_cpu, self._color_vol_cpu
364
+
365
+ def get_point_cloud(self):
366
+ """Extract a point cloud from the voxel volume."""
367
+ tsdf_vol, color_vol = self.get_volume()
368
+
369
+ # Marching cubes
370
+ verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
371
+ verts_ind = np.round(verts).astype(int)
372
+ verts = verts * self._voxel_size + self._vol_origin
373
+
374
+ # Get vertex colors
375
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
376
+ colors_b = np.floor(rgb_vals / self._color_const)
377
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
378
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
379
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
380
+ colors = colors.astype(np.uint8)
381
+
382
+ pc = np.hstack([verts, colors])
383
+ return pc
384
+
385
+ def get_mesh(self):
386
+ """Compute a mesh from the voxel volume using marching cubes."""
387
+ tsdf_vol, color_vol = self.get_volume()
388
+
389
+ # Marching cubes
390
+ verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
391
+ verts_ind = np.round(verts).astype(int)
392
+ verts = (
393
+ verts * self._voxel_size + self._vol_origin
394
+ ) # voxel grid coordinates to world coordinates
395
+
396
+ # Get vertex colors
397
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
398
+ colors_b = np.floor(rgb_vals / self._color_const)
399
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
400
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
401
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
402
+ colors = colors.astype(np.uint8)
403
+ return verts, faces, norms, colors
404
+
405
+
406
+ def rigid_transform(xyz, transform):
407
+ """Applies a rigid transform to an (N, 3) pointcloud."""
408
+ xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
409
+ xyz_t_h = np.dot(transform, xyz_h.T).T
410
+ return xyz_t_h[:, :3]
411
+
412
+
413
+ def get_view_frustum(depth_im, cam_intr, cam_pose):
414
+ """Get corners of 3D camera view frustum of depth image"""
415
+ im_h = depth_im.shape[0]
416
+ im_w = depth_im.shape[1]
417
+ max_depth = np.max(depth_im)
418
+ view_frust_pts = np.array(
419
+ [
420
+ (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
421
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
422
+ / cam_intr[0, 0],
423
+ (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
424
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
425
+ / cam_intr[1, 1],
426
+ np.array([0, max_depth, max_depth, max_depth, max_depth]),
427
+ ]
428
+ )
429
+ view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
430
+ return view_frust_pts
431
+
432
+
433
+ def meshwrite(filename, verts, faces, norms, colors):
434
+ """Save a 3D mesh to a polygon .ply file."""
435
+ # Write header
436
+ ply_file = open(filename, "w")
437
+ ply_file.write("ply\n")
438
+ ply_file.write("format ascii 1.0\n")
439
+ ply_file.write("element vertex %d\n" % (verts.shape[0]))
440
+ ply_file.write("property float x\n")
441
+ ply_file.write("property float y\n")
442
+ ply_file.write("property float z\n")
443
+ ply_file.write("property float nx\n")
444
+ ply_file.write("property float ny\n")
445
+ ply_file.write("property float nz\n")
446
+ ply_file.write("property uchar red\n")
447
+ ply_file.write("property uchar green\n")
448
+ ply_file.write("property uchar blue\n")
449
+ ply_file.write("element face %d\n" % (faces.shape[0]))
450
+ ply_file.write("property list uchar int vertex_index\n")
451
+ ply_file.write("end_header\n")
452
+
453
+ # Write vertex list
454
+ for i in range(verts.shape[0]):
455
+ ply_file.write(
456
+ "%f %f %f %f %f %f %d %d %d\n"
457
+ % (
458
+ verts[i, 0],
459
+ verts[i, 1],
460
+ verts[i, 2],
461
+ norms[i, 0],
462
+ norms[i, 1],
463
+ norms[i, 2],
464
+ colors[i, 0],
465
+ colors[i, 1],
466
+ colors[i, 2],
467
+ )
468
+ )
469
+
470
+ # Write face list
471
+ for i in range(faces.shape[0]):
472
+ ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
473
+
474
+ ply_file.close()
475
+
476
+
477
+ def pcwrite(filename, xyzrgb):
478
+ """Save a point cloud to a polygon .ply file."""
479
+ xyz = xyzrgb[:, :3]
480
+ rgb = xyzrgb[:, 3:].astype(np.uint8)
481
+
482
+ # Write header
483
+ ply_file = open(filename, "w")
484
+ ply_file.write("ply\n")
485
+ ply_file.write("format ascii 1.0\n")
486
+ ply_file.write("element vertex %d\n" % (xyz.shape[0]))
487
+ ply_file.write("property float x\n")
488
+ ply_file.write("property float y\n")
489
+ ply_file.write("property float z\n")
490
+ ply_file.write("property uchar red\n")
491
+ ply_file.write("property uchar green\n")
492
+ ply_file.write("property uchar blue\n")
493
+ ply_file.write("end_header\n")
494
+
495
+ # Write vertex list
496
+ for i in range(xyz.shape[0]):
497
+ ply_file.write(
498
+ "%f %f %f %d %d %d\n"
499
+ % (
500
+ xyz[i, 0],
501
+ xyz[i, 1],
502
+ xyz[i, 2],
503
+ rgb[i, 0],
504
+ rgb[i, 1],
505
+ rgb[i, 2],
506
+ )
507
+ )
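A minimal CPU-mode sketch (the shapes, intrinsics and the flat synthetic depth image are made up; numba and scikit-image must be installed):

import numpy as np
from monoscene.data.utils.fusion import TSDFVolume

# A 1m cube discretized at 5cm -> a 20x20x20 voxel volume
vol_bnds = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
tsdf = TSDFVolume(vol_bnds, voxel_size=0.05, use_gpu=False)

H, W = 64, 64
depth_im = np.full((H, W), 0.5, dtype=np.float32)  # a flat wall 0.5m in front of the camera
color_im = np.full((H, W, 3), 128, dtype=np.uint8)
cam_intr = np.array([[32.0, 0.0, 32.0], [0.0, 32.0, 32.0], [0.0, 0.0, 1.0]])
cam_pose = np.eye(4)                               # camera sits at the volume origin

tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)
tsdf_vol, color_vol = tsdf.get_volume()
print(tsdf_vol.shape)  # (20, 20, 20)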
monoscene/data/utils/helpers.py ADDED
@@ -0,0 +1,185 @@
1
+ import numpy as np
2
+ import monoscene.data.utils.fusion as fusion
3
+ import torch
4
+
5
+
6
+ def compute_CP_mega_matrix(target, is_binary=False):
7
+ """
8
+ Parameters
9
+ ---------
10
+ target: (H, W, D)
11
+ contains voxels semantic labels
12
+
13
+ is_binary: bool
14
+ if True, return binary voxels relations else return 4-way relations
15
+ """
16
+ label = target.reshape(-1)
17
+ label_row = label
18
+ N = label.shape[0]
19
+ super_voxel_size = [i//2 for i in target.shape]
20
+ if is_binary:
21
+ matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
22
+ else:
23
+ matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
24
+
25
+ for xx in range(super_voxel_size[0]):
26
+ for yy in range(super_voxel_size[1]):
27
+ for zz in range(super_voxel_size[2]):
28
+ col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz
29
+ label_col_megas = np.array([
30
+ target[xx * 2, yy * 2, zz * 2],
31
+ target[xx * 2 + 1, yy * 2, zz * 2],
32
+ target[xx * 2, yy * 2 + 1, zz * 2],
33
+ target[xx * 2, yy * 2, zz * 2 + 1],
34
+ target[xx * 2 + 1, yy * 2 + 1, zz * 2],
35
+ target[xx * 2 + 1, yy * 2, zz * 2 + 1],
36
+ target[xx * 2, yy * 2 + 1, zz * 2 + 1],
37
+ target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1],
38
+ ])
39
+ label_col_megas = label_col_megas[label_col_megas != 255]
40
+ for label_col_mega in label_col_megas:
41
+ label_col = np.ones(N) * label_col_mega
42
+ if not is_binary:
43
+ matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0 # non non same
44
+ matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0 # non non diff
45
+ matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0 # empty empty
46
+ matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0 # nonempty empty
47
+ else:
48
+ matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0 # diff
49
+ matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0 # same
50
+ return matrix
51
+
52
+
53
+ def vox2pix(cam_E, cam_k,
54
+ vox_origin, voxel_size,
55
+ img_W, img_H,
56
+ scene_size):
57
+ """
58
+ Compute the 2D projection of voxel centroids
59
+
60
+ Parameters:
61
+ ----------
62
+ cam_E: 4x4
63
+ camera pose in the case of the NYUv2 dataset,
64
+ or the transformation from camera to lidar coordinates in the case of SemKITTI
65
+ cam_k: 3x3
66
+ camera intrinsics
67
+ vox_origin: (3,)
68
+ world (NYU) / lidar (SemKITTI) coordinates of the voxel at index (0, 0, 0)
69
+ img_W: int
70
+ image width
71
+ img_H: int
72
+ image height
73
+ scene_size: (3,)
74
+ scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
75
+
76
+ Returns
77
+ -------
78
+ projected_pix: (N, 2)
79
+ Projected 2D positions of voxels
80
+ fov_mask: (N,)
81
+ Boolean mask indicating which voxels fall inside the image's FOV
82
+ pix_z: (N,)
83
+ Voxels' distance to the sensor in meters
84
+ """
85
+ # Compute the x, y, z bounds of the scene in meters
86
+ vol_bnds = np.zeros((3,2))
87
+ vol_bnds[:,0] = vox_origin
88
+ vol_bnds[:,1] = vox_origin + np.array(scene_size)
89
+
90
+ # Compute the voxel centroids in lidar coordinates
91
+ vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
92
+ xv, yv, zv = np.meshgrid(
93
+ range(vol_dim[0]),
94
+ range(vol_dim[1]),
95
+ range(vol_dim[2]),
96
+ indexing='ij'
97
+ )
98
+ vox_coords = np.concatenate([
99
+ xv.reshape(1,-1),
100
+ yv.reshape(1,-1),
101
+ zv.reshape(1,-1)
102
+ ], axis=0).astype(int).T
103
+
104
+ # Project voxel centroids from lidar coordinates to camera coordinates
105
+ cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
106
+ cam_pts = fusion.rigid_transform(cam_pts, cam_E)
107
+
108
+ # Project camera coordinates to pixel positions
109
+ projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
110
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
111
+
112
+ # Eliminate pixels outside view frustum
113
+ pix_z = cam_pts[:, 2]
114
+ fov_mask = np.logical_and(pix_x >= 0,
115
+ np.logical_and(pix_x < img_W,
116
+ np.logical_and(pix_y >= 0,
117
+ np.logical_and(pix_y < img_H,
118
+ pix_z > 0))))
119
+
120
+
121
+ return projected_pix, fov_mask, pix_z
122
+
123
+
124
+ def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, pix_z):
125
+ valid_pix = np.logical_and(pix_x >= min_x,
126
+ np.logical_and(pix_x < max_x,
127
+ np.logical_and(pix_y >= min_y,
128
+ np.logical_and(pix_y < max_y,
129
+ pix_z > 0))))
130
+ return valid_pix
131
+
132
+ def compute_local_frustums(projected_pix, pix_z, target, img_W, img_H, dataset, n_classes, size=4):
133
+ """
134
+ Compute the local frustums mask and their class frequencies
135
+
136
+ Parameters:
137
+ ----------
138
+ projected_pix: (N, 2)
139
+ 2D projected pix of all voxels
140
+ pix_z: (N,)
141
+ Distance of the camera sensor to voxels
142
+ target: (H, W, D)
143
+ Voxelized semantic labels
144
+ img_W: int
145
+ Image width
146
+ img_H: int
147
+ Image height
148
+ dataset: str
149
+ ="NYU" or "kitti" (for both SemKITTI and KITTI-360)
150
+ n_classes: int
151
+ Number of classes (12 for NYU and 20 for SemKITTI)
152
+ size: int
153
+ determines the number of local frustums, i.e. size * size
154
+
155
+ Returns
156
+ -------
157
+ frustums_masks: (n_frustums, N)
158
+ List of frustums_masks, each indicates the belonging voxels
159
+ frustums_class_dists: (n_frustums, n_classes)
160
+ Contains the class frequencies in each frustum
161
+ """
162
+ H, W, D = target.shape
163
+ ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)]
164
+ local_frustum_masks = []
165
+ local_frustum_class_dists = []
166
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
167
+ for y in ranges:
168
+ for x in ranges:
169
+ start_x = x[0] * img_W
170
+ end_x = x[1] * img_W
171
+ start_y = y[0] * img_H
172
+ end_y = y[1] * img_H
173
+ local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, pix_z)
174
+ if dataset == "NYU":
175
+ mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1])
176
+ elif dataset == "kitti":
177
+ mask = (target != 255) & local_frustum.reshape(H, W, D)
178
+
179
+ local_frustum_masks.append(mask)
180
+ classes, cnts = np.unique(target[mask], return_counts=True)
181
+ class_counts = np.zeros(n_classes)
182
+ class_counts[classes.astype(int)] = cnts
183
+ local_frustum_class_dists.append(class_counts)
184
+ frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists)
185
+ return frustums_masks, frustums_class_dists
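A sketch of calling vox2pix with SemanticKITTI-like parameters at the 1:2 scale; the identity extrinsics and the intrinsics below are placeholders for the values normally read from calib.txt:

import numpy as np
from monoscene.data.utils.helpers import vox2pix

T_velo_2_cam = np.eye(4)  # placeholder extrinsics
cam_k = np.array([[707.0, 0.0, 604.0], [0.0, 707.0, 180.0], [0.0, 0.0, 1.0]])
vox_origin = np.array([0.0, -25.6, -2.0])
projected_pix, fov_mask, pix_z = vox2pix(
    T_velo_2_cam, cam_k, vox_origin,
    voxel_size=0.4, img_W=1220, img_H=370,
    scene_size=(51.2, 51.2, 6.4),
)
print(projected_pix.shape)  # (128*128*16, 2) = (262144, 2)
print(int(fov_mask.sum()), "voxels project inside the image")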
monoscene/data/utils/torch_util.py ADDED
@@ -0,0 +1,15 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ def worker_init_fn(worker_id):
6
+ """The function is designed for pytorch multi-process dataloader.
7
+ Note that we use the pytorch random generator to generate a base_seed.
8
+ Please try to be consistent.
9
+
10
+ References:
11
+ https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed
12
+
13
+ """
14
+ base_seed = torch.IntTensor(1).random_().item()
15
+ np.random.seed(base_seed + worker_id)
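How this hooks into a DataLoader (toy dataset, purely illustrative):

import torch
from torch.utils.data import DataLoader, TensorDataset
from monoscene.data.utils.torch_util import worker_init_fn

ds = TensorDataset(torch.arange(8).float())
loader = DataLoader(ds, batch_size=2, num_workers=2, worker_init_fn=worker_init_fn)
for (batch,) in loader:
    print(batch)  # each worker seeds numpy from the torch base seed + its worker id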
monoscene/loss/CRP_loss.py ADDED
@@ -0,0 +1,24 @@
1
+ import torch
2
+
3
+
4
+ def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices):
5
+ logits = []
6
+ labels = []
7
+ bs, n_relations, _, _ = pred_logits.shape
8
+ for i in range(bs):
9
+ pred_logit = pred_logits[i, :, :, :].permute(
10
+ 0, 2, 1
11
+ ) # n_relations, N, n_mega_voxels
12
+ CP_mega_matrix = CP_mega_matrices[i] # n_relations, N, n_mega_voxels
13
+ logits.append(pred_logit.reshape(n_relations, -1))
14
+ labels.append(CP_mega_matrix.reshape(n_relations, -1))
15
+
16
+ logits = torch.cat(logits, dim=1).T # M, 4
17
+ labels = torch.cat(labels, dim=1).T # M, 4
18
+
19
+ cnt_neg = (labels == 0).sum(0)
20
+ cnt_pos = labels.sum(0)
21
+ pos_weight = cnt_neg / cnt_pos
22
+ criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
23
+ loss_bce = criterion(logits, labels.float())
24
+ return loss_bce
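A sketch with dummy shapes (one sample, 4 relations, 64 voxels, 8 mega voxels; the real shapes come from the CRP module):

import torch
from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss

bs, n_relations, N, n_mega = 1, 4, 64, 8
pred_logits = torch.randn(bs, n_relations, n_mega, N)               # raw relation logits
CP_mega_matrices = [torch.randint(0, 2, (n_relations, N, n_mega))]  # one relation matrix per sample
loss = compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices)
print(loss)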
monoscene/loss/sscMetrics.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py
3
+ """
4
+ import numpy as np
5
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
6
+
7
+
8
+ def get_iou(iou_sum, cnt_class):
9
+ _C = iou_sum.shape[0] # 12
10
+ iou = np.zeros(_C, dtype=np.float32) # iou for each class
11
+ for idx in range(_C):
12
+ iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0
13
+
14
+ mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:])
15
+ return iou, mean_iou
16
+
17
+
18
+ def get_accuracy(predict, target, weight=None): # 0.05s
19
+ _bs = predict.shape[0] # batch size
20
+ _C = predict.shape[1] # _C = 12
21
+ target = np.int32(target)
22
+ target = target.reshape(_bs, -1) # (_bs, 60*36*60) 129600
23
+ predict = predict.reshape(_bs, _C, -1) # (_bs, _C, 60*36*60)
24
+ predict = np.argmax(
25
+ predict, axis=1
26
+ ) # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60.
27
+
28
+ correct = predict == target # (_bs, 129600)
29
+ if weight: # 0.04s, add class weights
30
+ weight_k = np.ones(target.shape)
31
+ for i in range(_bs):
32
+ for n in range(target.shape[1]):
33
+ idx = 0 if target[i, n] == 255 else target[i, n]
34
+ weight_k[i, n] = weight[idx]
35
+ correct = correct * weight_k
36
+ acc = correct.sum() / correct.size
37
+ return acc
38
+
39
+
40
+ class SSCMetrics:
41
+ def __init__(self, n_classes):
42
+ self.n_classes = n_classes
43
+ self.reset()
44
+
45
+ def hist_info(self, n_cl, pred, gt):
46
+ assert pred.shape == gt.shape
47
+ k = (gt >= 0) & (gt < n_cl) # exclude 255
48
+ labeled = np.sum(k)
49
+ correct = np.sum((pred[k] == gt[k]))
50
+
51
+ return (
52
+ np.bincount(
53
+ n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2
54
+ ).reshape(n_cl, n_cl),
55
+ correct,
56
+ labeled,
57
+ )
58
+
59
+ @staticmethod
60
+ def compute_score(hist, correct, labeled):
61
+ iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
62
+ mean_IU = np.nanmean(iu)
63
+ mean_IU_no_back = np.nanmean(iu[1:])
64
+ freq = hist.sum(1) / hist.sum()
65
+ freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
66
+ mean_pixel_acc = correct / labeled if labeled != 0 else 0
67
+
68
+ return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
69
+
70
+ def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
71
+ self.count += 1
72
+ mask = y_true != 255
73
+ if nonempty is not None:
74
+ mask = mask & nonempty
75
+ if nonsurface is not None:
76
+ mask = mask & nonsurface
77
+ tp, fp, fn = self.get_score_completion(y_pred, y_true, mask)
78
+
79
+ self.completion_tp += tp
80
+ self.completion_fp += fp
81
+ self.completion_fn += fn
82
+
83
+ mask = y_true != 255
84
+ if nonempty is not None:
85
+ mask = mask & nonempty
86
+ tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
87
+ y_pred, y_true, mask
88
+ )
89
+ self.tps += tp_sum
90
+ self.fps += fp_sum
91
+ self.fns += fn_sum
92
+
93
+ def get_stats(self):
94
+ if self.completion_tp != 0:
95
+ precision = self.completion_tp / (self.completion_tp + self.completion_fp)
96
+ recall = self.completion_tp / (self.completion_tp + self.completion_fn)
97
+ iou = self.completion_tp / (
98
+ self.completion_tp + self.completion_fp + self.completion_fn
99
+ )
100
+ else:
101
+ precision, recall, iou = 0, 0, 0
102
+ iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
103
+ return {
104
+ "precision": precision,
105
+ "recall": recall,
106
+ "iou": iou,
107
+ "iou_ssc": iou_ssc,
108
+ "iou_ssc_mean": np.mean(iou_ssc[1:]),
109
+ }
110
+
111
+ def reset(self):
112
+
113
+ self.completion_tp = 0
114
+ self.completion_fp = 0
115
+ self.completion_fn = 0
116
+ self.tps = np.zeros(self.n_classes)
117
+ self.fps = np.zeros(self.n_classes)
118
+ self.fns = np.zeros(self.n_classes)
119
+
120
+ self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
121
+ self.labeled_ssc = 0
122
+ self.correct_ssc = 0
123
+
124
+ self.precision = 0
125
+ self.recall = 0
126
+ self.iou = 0
127
+ self.count = 1e-8
128
+ self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
129
+ self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
130
+
131
+ def get_score_completion(self, predict, target, nonempty=None):
132
+ predict = np.copy(predict)
133
+ target = np.copy(target)
134
+
135
+ """for scene completion, treat the task as two-classes problem, just empty or occupancy"""
136
+ _bs = predict.shape[0] # batch size
137
+ # ---- ignore
138
+ predict[target == 255] = 0
139
+ target[target == 255] = 0
140
+ # ---- flatten
141
+ target = target.reshape(_bs, -1) # (_bs, 129600)
142
+ predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600
143
+ # ---- treat all non-empty object class as one category, set them to label 1
144
+ b_pred = np.zeros(predict.shape)
145
+ b_true = np.zeros(target.shape)
146
+ b_pred[predict > 0] = 1
147
+ b_true[target > 0] = 1
148
+ p, r, iou = 0.0, 0.0, 0.0
149
+ tp_sum, fp_sum, fn_sum = 0, 0, 0
150
+ for idx in range(_bs):
151
+ y_true = b_true[idx, :] # GT
152
+ y_pred = b_pred[idx, :]
153
+ if nonempty is not None:
154
+ nonempty_idx = nonempty[idx, :].reshape(-1)
155
+ y_true = y_true[nonempty_idx == 1]
156
+ y_pred = y_pred[nonempty_idx == 1]
157
+
158
+ tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
159
+ fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
160
+ fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
161
+ tp_sum += tp
162
+ fp_sum += fp
163
+ fn_sum += fn
164
+ return tp_sum, fp_sum, fn_sum
165
+
166
+ def get_score_semantic_and_completion(self, predict, target, nonempty=None):
167
+ target = np.copy(target)
168
+ predict = np.copy(predict)
169
+ _bs = predict.shape[0] # batch size
170
+ _C = self.n_classes # _C = 12
171
+ # ---- ignore
172
+ predict[target == 255] = 0
173
+ target[target == 255] = 0
174
+ # ---- flatten
175
+ target = target.reshape(_bs, -1) # (_bs, 129600)
176
+ predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600
177
+
178
+ cnt_class = np.zeros(_C, dtype=np.int32) # count for each class
179
+ iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class
180
+ tp_sum = np.zeros(_C, dtype=np.int32) # tp
181
+ fp_sum = np.zeros(_C, dtype=np.int32) # fp
182
+ fn_sum = np.zeros(_C, dtype=np.int32) # fn
183
+
184
+ for idx in range(_bs):
185
+ y_true = target[idx, :] # GT
186
+ y_pred = predict[idx, :]
187
+ if nonempty is not None:
188
+ nonempty_idx = nonempty[idx, :].reshape(-1)
189
+ y_pred = y_pred[
190
+ np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
191
+ ]
192
+ y_true = y_true[
193
+ np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
194
+ ]
195
+ for j in range(_C): # for each class
196
+ tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
197
+ fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
198
+ fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
199
+
200
+ tp_sum[j] += tp
201
+ fp_sum[j] += fp
202
+ fn_sum[j] += fn
203
+
204
+ return tp_sum, fp_sum, fn_sum
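A quick self-check of the metrics on random labels (scikit-learn must be installed for the module import; the shapes are arbitrary):

import numpy as np
from monoscene.loss.sscMetrics import SSCMetrics

metric = SSCMetrics(n_classes=20)
y_true = np.random.randint(0, 20, size=(1, 32, 32, 4))
y_pred = y_true.copy()
y_pred[:, :8] = 0  # corrupt part of the prediction
metric.add_batch(y_pred, y_true)
stats = metric.get_stats()
print(stats["precision"], stats["recall"], stats["iou"], stats["iou_ssc_mean"])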
monoscene/loss/ssc_loss.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ def KL_sep(p, target):
7
+ """
8
+ KL divergence on nonzeros classes
9
+ """
10
+ nonzeros = target != 0
11
+ nonzero_p = p[nonzeros]
12
+ kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
13
+ return kl_term
14
+
15
+
16
+ def geo_scal_loss(pred, ssc_target):
17
+
18
+ # Get softmax probabilities
19
+ pred = F.softmax(pred, dim=1)
20
+
21
+ # Compute empty and nonempty probabilities
22
+ empty_probs = pred[:, 0, :, :, :]
23
+ nonempty_probs = 1 - empty_probs
24
+
25
+ # Remove unknown voxels
26
+ mask = ssc_target != 255
27
+ nonempty_target = ssc_target != 0
28
+ nonempty_target = nonempty_target[mask].float()
29
+ nonempty_probs = nonempty_probs[mask]
30
+ empty_probs = empty_probs[mask]
31
+
32
+ intersection = (nonempty_target * nonempty_probs).sum()
33
+ precision = intersection / nonempty_probs.sum()
34
+ recall = intersection / nonempty_target.sum()
35
+ spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum()
36
+ return (
37
+ F.binary_cross_entropy(precision, torch.ones_like(precision))
38
+ + F.binary_cross_entropy(recall, torch.ones_like(recall))
39
+ + F.binary_cross_entropy(spec, torch.ones_like(spec))
40
+ )
41
+
42
+
43
+ def sem_scal_loss(pred, ssc_target):
44
+ # Get softmax probabilities
45
+ pred = F.softmax(pred, dim=1)
46
+ loss = 0
47
+ count = 0
48
+ mask = ssc_target != 255
49
+ n_classes = pred.shape[1]
50
+ for i in range(0, n_classes):
51
+
52
+ # Get probability of class i
53
+ p = pred[:, i, :, :, :]
54
+
55
+ # Remove unknown voxels
56
+ target_ori = ssc_target
57
+ p = p[mask]
58
+ target = ssc_target[mask]
59
+
60
+ completion_target = torch.ones_like(target)
61
+ completion_target[target != i] = 0
62
+ completion_target_ori = torch.ones_like(target_ori).float()
63
+ completion_target_ori[target_ori != i] = 0
64
+ if torch.sum(completion_target) > 0:
65
+ count += 1.0
66
+ nominator = torch.sum(p * completion_target)
67
+ loss_class = 0
68
+ if torch.sum(p) > 0:
69
+ precision = nominator / (torch.sum(p))
70
+ loss_precision = F.binary_cross_entropy(
71
+ precision, torch.ones_like(precision)
72
+ )
73
+ loss_class += loss_precision
74
+ if torch.sum(completion_target) > 0:
75
+ recall = nominator / (torch.sum(completion_target))
76
+ loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
77
+ loss_class += loss_recall
78
+ if torch.sum(1 - completion_target) > 0:
79
+ specificity = torch.sum((1 - p) * (1 - completion_target)) / (
80
+ torch.sum(1 - completion_target)
81
+ )
82
+ loss_specificity = F.binary_cross_entropy(
83
+ specificity, torch.ones_like(specificity)
84
+ )
85
+ loss_class += loss_specificity
86
+ loss += loss_class
87
+ return loss / count
88
+
89
+
90
+ def CE_ssc_loss(pred, target, class_weights):
91
+ """
92
+ :param: prediction: the predicted tensor, must be [BS, C, H, W, D]
93
+ """
94
+ criterion = nn.CrossEntropyLoss(
95
+ weight=class_weights, ignore_index=255, reduction="mean"
96
+ )
97
+ loss = criterion(pred, target.long())
98
+
99
+ return loss
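The three losses above can be smoke-tested on dummy tensors. A minimal sketch, assuming NYU-like shapes (12 classes, 60x36x60 voxels) and the import path introduced by this commit:

import torch
from monoscene.loss.ssc_loss import CE_ssc_loss, sem_scal_loss, geo_scal_loss

bs, n_classes = 1, 12
pred = torch.randn(bs, n_classes, 60, 36, 60)            # logits [BS, C, H, W, D]
target = torch.randint(0, n_classes, (bs, 60, 36, 60))   # labels; 255 would mark unknown voxels
class_weights = torch.ones(n_classes)

loss = (CE_ssc_loss(pred, target, class_weights)
        + sem_scal_loss(pred, target)
        + geo_scal_loss(pred, target))
print(loss.item())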
monoscene/{CRP3D.py β†’ models/CRP3D.py} RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
  import torch.nn as nn
3
- from monoscene.modules import (
4
  Process,
5
  ASPP,
6
  )
 
1
  import torch
2
  import torch.nn as nn
3
+ from monoscene.models.modules import (
4
  Process,
5
  ASPP,
6
  )
monoscene/{DDR.py β†’ models/DDR.py} RENAMED
File without changes
monoscene/{flosp.py β†’ models/flosp.py} RENAMED
File without changes
monoscene/{modules.py β†’ models/modules.py} RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
  import torch.nn as nn
3
- from monoscene.DDR import Bottleneck3D
4
 
5
 
6
  class ASPP(nn.Module):
 
1
  import torch
2
  import torch.nn as nn
3
+ from monoscene.models.DDR import Bottleneck3D
4
 
5
 
6
  class ASPP(nn.Module):
monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py β†’ models/monoscene.py} RENAMED
@@ -1,19 +1,25 @@
1
  import pytorch_lightning as pl
2
  import torch
3
  import torch.nn as nn
4
- from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
5
- from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
6
- from monoscene.flosp import FLoSP
 
 
 
7
  import numpy as np
8
  import torch.nn.functional as F
9
- from monoscene.unet2d import UNet2D
 
10
 
11
 
12
  class MonoScene(pl.LightningModule):
13
  def __init__(
14
  self,
15
  n_classes,
 
16
  feature,
 
17
  project_scale,
18
  full_scene_size,
19
  dataset,
@@ -36,11 +42,13 @@ class MonoScene(pl.LightningModule):
36
  self.dataset = dataset
37
  self.context_prior = context_prior
38
  self.frustum_size = frustum_size
 
39
  self.relation_loss = relation_loss
40
  self.CE_ssc_loss = CE_ssc_loss
41
  self.sem_scal_loss = sem_scal_loss
42
  self.geo_scal_loss = geo_scal_loss
43
  self.project_scale = project_scale
 
44
  self.lr = lr
45
  self.weight_decay = weight_decay
46
 
@@ -73,6 +81,13 @@ class MonoScene(pl.LightningModule):
73
  )
74
  self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
75
 
 
 
 
 
 
 
 
76
  def forward(self, batch):
77
 
78
  img = batch["img"]
@@ -111,13 +126,165 @@ class MonoScene(pl.LightningModule):
111
  "x3d": torch.stack(x3ds),
112
  }
113
 
114
- out_dict = self.net_3d_decoder(input_dict)
115
 
 
 
 
 
 
 
116
  ssc_pred = out_dict["ssc_logit"]
117
-
118
  y_pred = ssc_pred.detach().cpu().numpy()
119
  y_pred = np.argmax(y_pred, axis=1)
120
 
121
- return y_pred
122
 
 
 
123
1
  import pytorch_lightning as pl
2
  import torch
3
  import torch.nn as nn
4
+ from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU
5
+ from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti
6
+ from monoscene.loss.sscMetrics import SSCMetrics
7
+ from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss
8
+ from monoscene.models.flosp import FLoSP
9
+ from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss
10
  import numpy as np
11
  import torch.nn.functional as F
12
+ from monoscene.models.unet2d import UNet2D
13
+ from torch.optim.lr_scheduler import MultiStepLR
14
 
15
 
16
  class MonoScene(pl.LightningModule):
17
  def __init__(
18
  self,
19
  n_classes,
20
+ class_names,
21
  feature,
22
+ class_weights,
23
  project_scale,
24
  full_scene_size,
25
  dataset,
 
42
  self.dataset = dataset
43
  self.context_prior = context_prior
44
  self.frustum_size = frustum_size
45
+ self.class_names = class_names
46
  self.relation_loss = relation_loss
47
  self.CE_ssc_loss = CE_ssc_loss
48
  self.sem_scal_loss = sem_scal_loss
49
  self.geo_scal_loss = geo_scal_loss
50
  self.project_scale = project_scale
51
+ self.class_weights = class_weights
52
  self.lr = lr
53
  self.weight_decay = weight_decay
54
 
 
81
  )
82
  self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
83
 
84
+ # log hyperparameters
85
+ self.save_hyperparameters()
86
+
87
+ self.train_metrics = SSCMetrics(self.n_classes)
88
+ self.val_metrics = SSCMetrics(self.n_classes)
89
+ self.test_metrics = SSCMetrics(self.n_classes)
90
+
91
  def forward(self, batch):
92
 
93
  img = batch["img"]
 
126
  "x3d": torch.stack(x3ds),
127
  }
128
 
129
+ out = self.net_3d_decoder(input_dict)
130
 
131
+ return out
132
+
133
+ def step(self, batch, step_type, metric):
134
+ bs = len(batch["img"])
135
+ loss = 0
136
+ out_dict = self(batch)
137
  ssc_pred = out_dict["ssc_logit"]
138
+ target = batch["target"]
139
+
140
+ if self.context_prior:
141
+ P_logits = out_dict["P_logits"]
142
+ CP_mega_matrices = batch["CP_mega_matrices"]
143
+
144
+ if self.relation_loss:
145
+ loss_rel_ce = compute_super_CP_multilabel_loss(
146
+ P_logits, CP_mega_matrices
147
+ )
148
+ loss += loss_rel_ce
149
+ self.log(
150
+ step_type + "/loss_relation_ce_super",
151
+ loss_rel_ce.detach(),
152
+ on_epoch=True,
153
+ sync_dist=True,
154
+ )
155
+
156
+ class_weight = self.class_weights.type_as(batch["img"])
157
+ if self.CE_ssc_loss:
158
+ loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight)
159
+ loss += loss_ssc
160
+ self.log(
161
+ step_type + "/loss_ssc",
162
+ loss_ssc.detach(),
163
+ on_epoch=True,
164
+ sync_dist=True,
165
+ )
166
+
167
+ if self.sem_scal_loss:
168
+ loss_sem_scal = sem_scal_loss(ssc_pred, target)
169
+ loss += loss_sem_scal
170
+ self.log(
171
+ step_type + "/loss_sem_scal",
172
+ loss_sem_scal.detach(),
173
+ on_epoch=True,
174
+ sync_dist=True,
175
+ )
176
+
177
+ if self.geo_scal_loss:
178
+ loss_geo_scal = geo_scal_loss(ssc_pred, target)
179
+ loss += loss_geo_scal
180
+ self.log(
181
+ step_type + "/loss_geo_scal",
182
+ loss_geo_scal.detach(),
183
+ on_epoch=True,
184
+ sync_dist=True,
185
+ )
186
+
187
+ if self.fp_loss and step_type != "test":
188
+ frustums_masks = torch.stack(batch["frustums_masks"])
189
+ frustums_class_dists = torch.stack(
190
+ batch["frustums_class_dists"]
191
+ ).float() # (bs, n_frustums, n_classes)
192
+ n_frustums = frustums_class_dists.shape[1]
193
+
194
+ pred_prob = F.softmax(ssc_pred, dim=1)
195
+ batch_cnt = frustums_class_dists.sum(0) # (n_frustums, n_classes)
196
+
197
+ frustum_loss = 0
198
+ frustum_nonempty = 0
199
+ for frus in range(n_frustums):
200
+ frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float()
201
+ prob = frustum_mask * pred_prob # bs, n_classes, H, W, D
202
+ prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2)
203
+ prob = prob.reshape(self.n_classes, -1)
204
+ cum_prob = prob.sum(dim=1) # n_classes
205
+
206
+ total_cnt = torch.sum(batch_cnt[frus])
207
+ total_prob = prob.sum()
208
+ if total_prob > 0 and total_cnt > 0:
209
+ frustum_target_proportion = batch_cnt[frus] / total_cnt
210
+ cum_prob = cum_prob / total_prob # n_classes
211
+ frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion)
212
+ frustum_loss += frustum_loss_i
213
+ frustum_nonempty += 1
214
+ frustum_loss = frustum_loss / frustum_nonempty
215
+ loss += frustum_loss
216
+ self.log(
217
+ step_type + "/loss_frustums",
218
+ frustum_loss.detach(),
219
+ on_epoch=True,
220
+ sync_dist=True,
221
+ )
222
+
223
+ y_true = target.cpu().numpy()
224
  y_pred = ssc_pred.detach().cpu().numpy()
225
  y_pred = np.argmax(y_pred, axis=1)
226
+ metric.add_batch(y_pred, y_true)
227
+
228
+ self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True)
229
+
230
+ return loss
231
+
232
+ def training_step(self, batch, batch_idx):
233
+ return self.step(batch, "train", self.train_metrics)
234
+
235
+ def validation_step(self, batch, batch_idx):
236
+ self.step(batch, "val", self.val_metrics)
237
+
238
+ def validation_epoch_end(self, outputs):
239
+ metric_list = [("train", self.train_metrics), ("val", self.val_metrics)]
240
 
241
+ for prefix, metric in metric_list:
242
+ stats = metric.get_stats()
243
+ for i, class_name in enumerate(self.class_names):
244
+ self.log(
245
+ "{}_SemIoU/{}".format(prefix, class_name),
246
+ stats["iou_ssc"][i],
247
+ sync_dist=True,
248
+ )
249
+ self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True)
250
+ self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True)
251
+ self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True)
252
+ self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True)
253
+ metric.reset()
254
 
255
+ def test_step(self, batch, batch_idx):
256
+ self.step(batch, "test", self.test_metrics)
257
 
258
+ def test_epoch_end(self, outputs):
259
+ classes = self.class_names
260
+ metric_list = [("test", self.test_metrics)]
261
+ for prefix, metric in metric_list:
262
+ print("{}======".format(prefix))
263
+ stats = metric.get_stats()
264
+ print(
265
+ "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format(
266
+ stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100
267
+ )
268
+ )
269
+ print("class IoU: {}, ".format(classes))
270
+ print(
271
+ " ".join(["{:.4f}, "] * len(classes)).format(
272
+ *(stats["iou_ssc"] * 100).tolist()
273
+ )
274
+ )
275
+ print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100))
276
+ metric.reset()
277
+
278
+ def configure_optimizers(self):
279
+ if self.dataset == "NYU":
280
+ optimizer = torch.optim.AdamW(
281
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
282
+ )
283
+ scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
284
+ return [optimizer], [scheduler]
285
+ elif self.dataset == "kitti":
286
+ optimizer = torch.optim.AdamW(
287
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
288
+ )
289
+ scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
290
+ return [optimizer], [scheduler]
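The frustum proportion term in step() normalises the per-frustum predicted class mass and compares it to the ground-truth class distribution with KL_sep. A minimal numeric sketch with made-up values:

import torch
from monoscene.loss.ssc_loss import KL_sep

cum_prob = torch.tensor([4.0, 3.0, 1.0])                   # summed softmax mass per class in one frustum
frustum_target_proportion = torch.tensor([0.5, 0.4, 0.1])  # ground-truth class distribution
cum_prob = cum_prob / cum_prob.sum()                        # -> [0.500, 0.375, 0.125]
print(KL_sep(cum_prob, frustum_target_proportion))          # small positive divergence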
monoscene/{unet2d.py β†’ models/unet2d.py} RENAMED
File without changes
monoscene/{unet3d_kitti.py β†’ models/unet3d_kitti.py} RENAMED
@@ -2,9 +2,9 @@
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
- from monoscene.modules import SegmentationHead
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import Process, Upsample, Downsample
8
 
9
 
10
  class UNet3D(nn.Module):
 
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
+ from monoscene.models.modules import SegmentationHead
6
+ from monoscene.models.CRP3D import CPMegaVoxels
7
+ from monoscene.models.modules import Process, Upsample, Downsample
8
 
9
 
10
  class UNet3D(nn.Module):
monoscene/{unet3d_nyu.py β†’ models/unet3d_nyu.py} RENAMED
@@ -3,8 +3,8 @@ import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
  import numpy as np
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import (
8
  Process,
9
  Upsample,
10
  Downsample,
 
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
  import numpy as np
6
+ from monoscene.models.CRP3D import CPMegaVoxels
7
+ from monoscene.models.modules import (
8
  Process,
9
  Upsample,
10
  Downsample,
monoscene/monoscene.py DELETED
@@ -1,125 +0,0 @@
1
- import pytorch_lightning as pl
2
- import torch
3
- import torch.nn as nn
4
- from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
5
- from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
6
- from monoscene.flosp import FLoSP
7
- import numpy as np
8
- import torch.nn.functional as F
9
- from monoscene.unet2d import UNet2D
10
-
11
-
12
- class MonoScene(pl.LightningModule):
13
- def __init__(
14
- self,
15
- n_classes,
16
- feature,
17
- project_scale,
18
- full_scene_size,
19
- dataset,
20
- project_res=["1", "2", "4", "8"],
21
- n_relations=4,
22
- context_prior=True,
23
- fp_loss=True,
24
- frustum_size=4,
25
- relation_loss=False,
26
- CE_ssc_loss=True,
27
- geo_scal_loss=True,
28
- sem_scal_loss=True,
29
- lr=1e-4,
30
- weight_decay=1e-4,
31
- ):
32
- super().__init__()
33
-
34
- self.project_res = project_res
35
- self.fp_loss = fp_loss
36
- self.dataset = dataset
37
- self.context_prior = context_prior
38
- self.frustum_size = frustum_size
39
- self.relation_loss = relation_loss
40
- self.CE_ssc_loss = CE_ssc_loss
41
- self.sem_scal_loss = sem_scal_loss
42
- self.geo_scal_loss = geo_scal_loss
43
- self.project_scale = project_scale
44
- self.lr = lr
45
- self.weight_decay = weight_decay
46
-
47
- self.projects = {}
48
- self.scale_2ds = [1, 2, 4, 8] # 2D scales
49
- for scale_2d in self.scale_2ds:
50
- self.projects[str(scale_2d)] = FLoSP(
51
- full_scene_size, project_scale=self.project_scale, dataset=self.dataset
52
- )
53
- self.projects = nn.ModuleDict(self.projects)
54
-
55
- self.n_classes = n_classes
56
- if self.dataset == "NYU":
57
- self.net_3d_decoder = UNet3DNYU(
58
- self.n_classes,
59
- nn.BatchNorm3d,
60
- n_relations=n_relations,
61
- feature=feature,
62
- full_scene_size=full_scene_size,
63
- context_prior=context_prior,
64
- )
65
- elif self.dataset == "kitti":
66
- self.net_3d_decoder = UNet3DKitti(
67
- self.n_classes,
68
- nn.BatchNorm3d,
69
- project_scale=project_scale,
70
- feature=feature,
71
- full_scene_size=full_scene_size,
72
- context_prior=context_prior,
73
- )
74
- self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
75
-
76
- def forward(self, batch):
77
-
78
- img = batch["img"]
79
- bs = len(img)
80
-
81
- out = {}
82
-
83
- x_rgb = self.net_rgb(img)
84
-
85
- x3ds = []
86
- for i in range(bs):
87
- x3d = None
88
- for scale_2d in self.project_res:
89
-
90
- # project features at each 2D scale to target 3D scale
91
- scale_2d = int(scale_2d)
92
- projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
93
- fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()
94
-
95
- # Sum all the 3D features
96
- if x3d is None:
97
- x3d = self.projects[str(scale_2d)](
98
- x_rgb["1_" + str(scale_2d)][i],
99
- # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
100
- projected_pix // scale_2d,
101
- fov_mask,
102
- )
103
- else:
104
- x3d += self.projects[str(scale_2d)](
105
- x_rgb["1_" + str(scale_2d)][i],
106
- # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
107
- projected_pix // scale_2d,
108
- fov_mask,
109
- )
110
- x3ds.append(x3d)
111
-
112
- input_dict = {
113
- "x3d": torch.stack(x3ds),
114
- }
115
-
116
- out_dict = self.net_3d_decoder(input_dict)
117
-
118
- ssc_pred = out_dict["ssc_logit"]
119
-
120
- y_pred = ssc_pred.detach().cpu().numpy()
121
- y_pred = np.argmax(y_pred, axis=1)
122
-
123
- return y_pred
124
-
125
-
monoscene/monoscene_model.py DELETED
@@ -1,21 +0,0 @@
1
- from transformers import PreTrainedModel
2
- from .config import MonoSceneConfig
3
- from monoscene.monoscene import MonoScene
4
-
5
-
6
- class MonoSceneModel(PreTrainedModel):
7
- config_class = MonoSceneConfig
8
-
9
- def __init__(self, config):
10
- super().__init__(config)
11
- self.model = MonoScene(
12
- dataset=config.dataset,
13
- n_classes=config.n_classes,
14
- feature=config.feature,
15
- project_scale=config.project_scale,
16
- full_scene_size=config.full_scene_size
17
- )
18
-
19
-
20
- def forward(self, tensor):
21
- return self.model.forward(tensor)
monoscene/scripts/eval_monoscene.py ADDED
@@ -0,0 +1,71 @@
1
+ from pytorch_lightning import Trainer
2
+ from monoscene.models.monoscene import MonoScene
3
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
4
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
+ import hydra
6
+ from omegaconf import DictConfig
7
+ import torch
8
+ import os
9
+ from hydra.utils import get_original_cwd
10
+
11
+
12
+ @hydra.main(config_name="../config/monoscene.yaml")
13
+ def main(config: DictConfig):
14
+ torch.set_grad_enabled(False)
15
+ if config.dataset == "kitti":
16
+ config.batch_size = 1
17
+ n_classes = 20
18
+ feature = 64
19
+ project_scale = 2
20
+ full_scene_size = (256, 256, 32)
21
+ data_module = KittiDataModule(
22
+ root=config.kitti_root,
23
+ preprocess_root=config.kitti_preprocess_root,
24
+ frustum_size=config.frustum_size,
25
+ batch_size=int(config.batch_size / config.n_gpus),
26
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
27
+ )
28
+
29
+ elif config.dataset == "NYU":
30
+ config.batch_size = 2
31
+ project_scale = 1
32
+ n_classes = 12
33
+ feature = 200
34
+ full_scene_size = (60, 36, 60)
35
+ data_module = NYUDataModule(
36
+ root=config.NYU_root,
37
+ preprocess_root=config.NYU_preprocess_root,
38
+ n_relations=config.n_relations,
39
+ frustum_size=config.frustum_size,
40
+ batch_size=int(config.batch_size / config.n_gpus),
41
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
42
+ )
43
+
44
+ trainer = Trainer(
45
+ sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp"
46
+ )
47
+
48
+ if config.dataset == "NYU":
49
+ model_path = os.path.join(
50
+ get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
51
+ )
52
+ else:
53
+ model_path = os.path.join(
54
+ get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
55
+ )
56
+
57
+ model = MonoScene.load_from_checkpoint(
58
+ model_path,
59
+ feature=feature,
60
+ project_scale=project_scale,
61
+ fp_loss=config.fp_loss,
62
+ full_scene_size=full_scene_size,
63
+ )
64
+ model.eval()
65
+ data_module.setup()
66
+ val_dataloader = data_module.val_dataloader()
67
+ trainer.test(model, test_dataloaders=val_dataloader)
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
monoscene/scripts/generate_output.py ADDED
@@ -0,0 +1,127 @@
1
+ from pytorch_lightning import Trainer
2
+ from monoscene.models.monoscene import MonoScene
3
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
4
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
+ from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
6
+ import hydra
7
+ from omegaconf import DictConfig
8
+ import torch
9
+ import numpy as np
10
+ import os
11
+ from hydra.utils import get_original_cwd
12
+ from tqdm import tqdm
13
+ import pickle
14
+
15
+
16
+ @hydra.main(config_name="../config/monoscene.yaml")
17
+ def main(config: DictConfig):
18
+ torch.set_grad_enabled(False)
19
+
20
+ # Setup dataloader
21
+ if config.dataset == "kitti" or config.dataset == "kitti_360":
22
+ feature = 64
23
+ project_scale = 2
24
+ full_scene_size = (256, 256, 32)
25
+
26
+ if config.dataset == "kitti":
27
+ data_module = KittiDataModule(
28
+ root=config.kitti_root,
29
+ preprocess_root=config.kitti_preprocess_root,
30
+ frustum_size=config.frustum_size,
31
+ batch_size=int(config.batch_size / config.n_gpus),
32
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
33
+ )
34
+ data_module.setup()
35
+ data_loader = data_module.val_dataloader()
36
+ # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
37
+ else:
38
+ data_module = Kitti360DataModule(
39
+ root=config.kitti_360_root,
40
+ sequences=[config.kitti_360_sequence],
41
+ n_scans=2000,
42
+ batch_size=1,
43
+ num_workers=3,
44
+ )
45
+ data_module.setup()
46
+ data_loader = data_module.dataloader()
47
+
48
+ elif config.dataset == "NYU":
49
+ project_scale = 1
50
+ feature = 200
51
+ full_scene_size = (60, 36, 60)
52
+ data_module = NYUDataModule(
53
+ root=config.NYU_root,
54
+ preprocess_root=config.NYU_preprocess_root,
55
+ n_relations=config.n_relations,
56
+ frustum_size=config.frustum_size,
57
+ batch_size=int(config.batch_size / config.n_gpus),
58
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
59
+ )
60
+ data_module.setup()
61
+ data_loader = data_module.val_dataloader()
62
+ # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
63
+ else:
64
+ print("dataset not support")
65
+
66
+ # Load pretrained models
67
+ if config.dataset == "NYU":
68
+ model_path = os.path.join(
69
+ get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
70
+ )
71
+ else:
72
+ model_path = os.path.join(
73
+ get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
74
+ )
75
+
76
+ model = MonoScene.load_from_checkpoint(
77
+ model_path,
78
+ feature=feature,
79
+ project_scale=project_scale,
80
+ fp_loss=config.fp_loss,
81
+ full_scene_size=full_scene_size,
82
+ )
83
+ model.cuda()
84
+ model.eval()
85
+
86
+ # Save prediction and additional data
87
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
88
+ output_path = os.path.join(config.output_path, config.dataset)
89
+ with torch.no_grad():
90
+ for batch in tqdm(data_loader):
91
+ batch["img"] = batch["img"].cuda()
92
+ pred = model(batch)
93
+ y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
94
+ y_pred = np.argmax(y_pred, axis=1)
95
+ for i in range(config.batch_size):
96
+ out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
97
+ if "target" in batch:
98
+ out_dict["target"] = (
99
+ batch["target"][i].detach().cpu().numpy().astype(np.uint16)
100
+ )
101
+
102
+ if config.dataset == "NYU":
103
+ write_path = output_path
104
+ filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
105
+ out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
106
+ out_dict["vox_origin"] = (
107
+ batch["vox_origin"][i].detach().cpu().numpy()
108
+ )
109
+ else:
110
+ write_path = os.path.join(output_path, batch["sequence"][i])
111
+ filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
112
+ out_dict["fov_mask_1"] = (
113
+ batch["fov_mask_1"][i].detach().cpu().numpy()
114
+ )
115
+ out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
116
+ out_dict["T_velo_2_cam"] = (
117
+ batch["T_velo_2_cam"][i].detach().cpu().numpy()
118
+ )
119
+
120
+ os.makedirs(write_path, exist_ok=True)
121
+ with open(filepath, "wb") as handle:
122
+ pickle.dump(out_dict, handle)
123
+ print("wrote to", filepath)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
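generate_output.py writes one pickle per frame containing the prediction plus the metadata consumed by the visualization scripts below. A minimal sketch of reading one back (the file path and name are hypothetical):

import pickle
import numpy as np

with open("output/NYU/NYU0001_0000.pkl", "rb") as handle:   # hypothetical output path
    b = pickle.load(handle)

y_pred = b["y_pred"]                      # (60, 36, 60) uint16 class ids for NYU
print(y_pred.shape, np.unique(y_pred))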
monoscene/scripts/train_monoscene.py ADDED
@@ -0,0 +1,173 @@
1
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
2
+ from monoscene.data.semantic_kitti.params import (
3
+ semantic_kitti_class_frequencies,
4
+ kitti_class_names,
5
+ )
6
+ from monoscene.data.NYU.params import (
7
+ class_weights as NYU_class_weights,
8
+ NYU_class_names,
9
+ )
10
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
11
+ from torch.utils.data.dataloader import DataLoader
12
+ from monoscene.models.monoscene import MonoScene
13
+ from pytorch_lightning import Trainer
14
+ from pytorch_lightning.loggers import TensorBoardLogger
15
+ from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
16
+ import os
17
+ import hydra
18
+ from omegaconf import DictConfig
19
+ import numpy as np
20
+ import torch
21
+
22
+ hydra.output_subdir = None
23
+
24
+
25
+ @hydra.main(config_name="../config/monoscene.yaml")
26
+ def main(config: DictConfig):
27
+ exp_name = config.exp_prefix
28
+ exp_name += "_{}_{}".format(config.dataset, config.run)
29
+ exp_name += "_FrusSize_{}".format(config.frustum_size)
30
+ exp_name += "_nRelations{}".format(config.n_relations)
31
+ exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr)
32
+
33
+ if config.CE_ssc_loss:
34
+ exp_name += "_CEssc"
35
+ if config.geo_scal_loss:
36
+ exp_name += "_geoScalLoss"
37
+ if config.sem_scal_loss:
38
+ exp_name += "_semScalLoss"
39
+ if config.fp_loss:
40
+ exp_name += "_fpLoss"
41
+
42
+ if config.relation_loss:
43
+ exp_name += "_CERel"
44
+ if config.context_prior:
45
+ exp_name += "_3DCRP"
46
+
47
+ # Setup dataloaders
48
+ if config.dataset == "kitti":
49
+ class_names = kitti_class_names
50
+ max_epochs = 30
51
+ logdir = config.kitti_logdir
52
+ full_scene_size = (256, 256, 32)
53
+ project_scale = 2
54
+ feature = 64
55
+ n_classes = 20
56
+ class_weights = torch.from_numpy(
57
+ 1 / np.log(semantic_kitti_class_frequencies + 0.001)
58
+ )
59
+ data_module = KittiDataModule(
60
+ root=config.kitti_root,
61
+ preprocess_root=config.kitti_preprocess_root,
62
+ frustum_size=config.frustum_size,
63
+ project_scale=project_scale,
64
+ batch_size=int(config.batch_size / config.n_gpus),
65
+ num_workers=int(config.num_workers_per_gpu),
66
+ )
67
+
68
+ elif config.dataset == "NYU":
69
+ class_names = NYU_class_names
70
+ max_epochs = 30
71
+ logdir = config.logdir
72
+ full_scene_size = (60, 36, 60)
73
+ project_scale = 1
74
+ feature = 200
75
+ n_classes = 12
76
+ class_weights = NYU_class_weights
77
+ data_module = NYUDataModule(
78
+ root=config.NYU_root,
79
+ preprocess_root=config.NYU_preprocess_root,
80
+ n_relations=config.n_relations,
81
+ frustum_size=config.frustum_size,
82
+ batch_size=int(config.batch_size / config.n_gpus),
83
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
84
+ )
85
+
86
+ project_res = ["1"]
87
+ if config.project_1_2:
88
+ exp_name += "_Proj_2"
89
+ project_res.append("2")
90
+ if config.project_1_4:
91
+ exp_name += "_4"
92
+ project_res.append("4")
93
+ if config.project_1_8:
94
+ exp_name += "_8"
95
+ project_res.append("8")
96
+
97
+ print(exp_name)
98
+
99
+ # Initialize MonoScene model
100
+ model = MonoScene(
101
+ dataset=config.dataset,
102
+ frustum_size=config.frustum_size,
103
+ project_scale=project_scale,
104
+ n_relations=config.n_relations,
105
+ fp_loss=config.fp_loss,
106
+ feature=feature,
107
+ full_scene_size=full_scene_size,
108
+ project_res=project_res,
109
+ n_classes=n_classes,
110
+ class_names=class_names,
111
+ context_prior=config.context_prior,
112
+ relation_loss=config.relation_loss,
113
+ CE_ssc_loss=config.CE_ssc_loss,
114
+ sem_scal_loss=config.sem_scal_loss,
115
+ geo_scal_loss=config.geo_scal_loss,
116
+ lr=config.lr,
117
+ weight_decay=config.weight_decay,
118
+ class_weights=class_weights,
119
+ )
120
+
121
+ if config.enable_log:
122
+ logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="")
123
+ lr_monitor = LearningRateMonitor(logging_interval="step")
124
+ checkpoint_callbacks = [
125
+ ModelCheckpoint(
126
+ save_last=True,
127
+ monitor="val/mIoU",
128
+ save_top_k=1,
129
+ mode="max",
130
+ filename="{epoch:03d}-{val/mIoU:.5f}",
131
+ ),
132
+ lr_monitor,
133
+ ]
134
+ else:
135
+ logger = False
136
+ checkpoint_callbacks = False
137
+
138
+ model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt")
139
+ if os.path.isfile(model_path):
140
+ # Continue training from last.ckpt
141
+ trainer = Trainer(
142
+ callbacks=checkpoint_callbacks,
143
+ resume_from_checkpoint=model_path,
144
+ sync_batchnorm=True,
145
+ deterministic=False,
146
+ max_epochs=max_epochs,
147
+ gpus=config.n_gpus,
148
+ logger=logger,
149
+ check_val_every_n_epoch=1,
150
+ log_every_n_steps=10,
151
+ flush_logs_every_n_steps=100,
152
+ accelerator="ddp",
153
+ )
154
+ else:
155
+ # Train from scratch
156
+ trainer = Trainer(
157
+ callbacks=checkpoint_callbacks,
158
+ sync_batchnorm=True,
159
+ deterministic=False,
160
+ max_epochs=max_epochs,
161
+ gpus=config.n_gpus,
162
+ logger=logger,
163
+ check_val_every_n_epoch=1,
164
+ log_every_n_steps=10,
165
+ flush_logs_every_n_steps=100,
166
+ accelerator="ddp",
167
+ )
168
+
169
+ trainer.fit(model, data_module)
170
+
171
+
172
+ if __name__ == "__main__":
173
+ main()
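The kitti branch above derives class weights as 1 / log(frequency + 0.001). A small numeric illustration (the frequencies here are made up; the real values come from semantic_kitti_class_frequencies):

import numpy as np
import torch

freq = np.array([5.0e9, 1.0e7, 3.0e5])                 # hypothetical voxel counts per class
class_weights = torch.from_numpy(1 / np.log(freq + 0.001))
print(class_weights)                                    # rarer classes receive larger weights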
monoscene/scripts/visualization/NYU_vis_pred.py ADDED
@@ -0,0 +1,156 @@
1
+ import pickle
2
+ import os
3
+ from omegaconf import DictConfig
4
+ import numpy as np
5
+ import hydra
6
+ from mayavi import mlab
7
+
8
+
9
+ def get_grid_coords(dims, resolution):
10
+ """
11
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
12
+ :return coords_grid: is the center coords of voxels in the grid
13
+ """
14
+
15
+ g_xx = np.arange(0, dims[0] + 1)
16
+ g_yy = np.arange(0, dims[1] + 1)
17
+
18
+ g_zz = np.arange(0, dims[2] + 1)
19
+
20
+ # Obtaining the grid with coords...
21
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
22
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
23
+ coords_grid = coords_grid.astype(np.float64)  # np.float is removed in recent NumPy
24
+
25
+ coords_grid = (coords_grid * resolution) + resolution / 2
26
+
27
+ temp = np.copy(coords_grid)
28
+ temp[:, 0] = coords_grid[:, 1]
29
+ temp[:, 1] = coords_grid[:, 0]
30
+ coords_grid = np.copy(temp)
31
+
32
+ return coords_grid
33
+
34
+
35
+ def draw(
36
+ voxels,
37
+ cam_pose,
38
+ vox_origin,
39
+ voxel_size=0.08,
40
+ d=0.75, # 0.75m - determine the size of the mesh representing the camera
41
+ ):
42
+ # Compute the coordinates of the mesh representing camera
43
+ y = d * 480 / (2 * 518.8579)
44
+ x = d * 640 / (2 * 518.8579)
45
+ tri_points = np.array(
46
+ [
47
+ [0, 0, 0],
48
+ [x, y, d],
49
+ [-x, y, d],
50
+ [-x, -y, d],
51
+ [x, -y, d],
52
+ ]
53
+ )
54
+ tri_points = np.hstack([tri_points, np.ones((5, 1))])
55
+
56
+ tri_points = (cam_pose @ tri_points.T).T
57
+ x = tri_points[:, 0] - vox_origin[0]
58
+ y = tri_points[:, 1] - vox_origin[1]
59
+ z = tri_points[:, 2] - vox_origin[2]
60
+ triangles = [
61
+ (0, 1, 2),
62
+ (0, 1, 4),
63
+ (0, 3, 4),
64
+ (0, 2, 3),
65
+ ]
66
+
67
+ # Compute the voxels coordinates
68
+ grid_coords = get_grid_coords(
69
+ [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size
70
+ )
71
+
72
+ # Attach the predicted class to every voxel
73
+ grid_coords = np.vstack(
74
+ (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1))
75
+ ).T
76
+
77
+ # Remove empty and unknown voxels
78
+ occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)]
79
+ figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))
80
+
81
+ # Draw the camera
82
+ mlab.triangular_mesh(
83
+ x,
84
+ y,
85
+ z,
86
+ triangles,
87
+ representation="wireframe",
88
+ color=(0, 0, 0),
89
+ line_width=5,
90
+ )
91
+
92
+ # Draw occupied voxels
93
+ plt_plot = mlab.points3d(
94
+ occupied_voxels[:, 0],
95
+ occupied_voxels[:, 1],
96
+ occupied_voxels[:, 2],
97
+ occupied_voxels[:, 3],
98
+ colormap="viridis",
99
+ scale_factor=voxel_size - 0.1 * voxel_size,
100
+ mode="cube",
101
+ opacity=1.0,
102
+ vmin=0,
103
+ vmax=12,
104
+ )
105
+
106
+ colors = np.array(
107
+ [
108
+ [22, 191, 206, 255],
109
+ [214, 38, 40, 255],
110
+ [43, 160, 43, 255],
111
+ [158, 216, 229, 255],
112
+ [114, 158, 206, 255],
113
+ [204, 204, 91, 255],
114
+ [255, 186, 119, 255],
115
+ [147, 102, 188, 255],
116
+ [30, 119, 181, 255],
117
+ [188, 188, 33, 255],
118
+ [255, 127, 12, 255],
119
+ [196, 175, 214, 255],
120
+ [153, 153, 153, 255],
121
+ ]
122
+ )
123
+
124
+ plt_plot.glyph.scale_mode = "scale_by_vector"
125
+
126
+ plt_plot.module_manager.scalar_lut_manager.lut.table = colors
127
+
128
+ mlab.show()
129
+
130
+
131
+ @hydra.main(config_path=None)
132
+ def main(config: DictConfig):
133
+ scan = config.file
134
+
135
+ with open(scan, "rb") as handle:
136
+ b = pickle.load(handle)
137
+
138
+ cam_pose = b["cam_pose"]
139
+ vox_origin = b["vox_origin"]
140
+ gt_scene = b["target"]
141
+ pred_scene = b["y_pred"]
142
+ scan = os.path.basename(scan)[:12]
143
+
144
+ pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room
145
+
146
+ draw(
147
+ pred_scene,
148
+ cam_pose,
149
+ vox_origin,
150
+ voxel_size=0.08,
151
+ d=0.75,
152
+ )
153
+
154
+
155
+ if __name__ == "__main__":
156
+ main()
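get_grid_coords above places each voxel centre at index * resolution + resolution / 2 (with the x/y axes swapped). A quick numeric check with two indices at the NYU voxel size of 0.08 m:

import numpy as np

res = 0.08
idx = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 0.0]])
print(idx * res + res / 2)   # [[0.04 0.04 0.04] [0.12 0.12 0.04]]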
monoscene/scripts/visualization/kitti_vis_pred.py ADDED
@@ -0,0 +1,201 @@
1
+ # from operator import gt
2
+ import pickle
3
+ import numpy as np
4
+ from omegaconf import DictConfig
5
+ import hydra
6
+ from mayavi import mlab
7
+
8
+
9
+ def get_grid_coords(dims, resolution):
10
+ """
11
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
12
+ :return coords_grid: is the center coords of voxels in the grid
13
+ """
14
+
15
+ g_xx = np.arange(0, dims[0] + 1)
16
+ g_yy = np.arange(0, dims[1] + 1)
17
+ sensor_pose = 10
18
+ g_zz = np.arange(0, dims[2] + 1)
19
+
20
+ # Obtaining the grid with coords...
21
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
22
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
23
+ coords_grid = coords_grid.astype(np.float64)  # np.float is removed in recent NumPy
24
+
25
+ coords_grid = (coords_grid * resolution) + resolution / 2
26
+
27
+ temp = np.copy(coords_grid)
28
+ temp[:, 0] = coords_grid[:, 1]
29
+ temp[:, 1] = coords_grid[:, 0]
30
+ coords_grid = np.copy(temp)
31
+
32
+ return coords_grid
33
+
34
+
35
+ def draw(
36
+ voxels,
37
+ T_velo_2_cam,
38
+ vox_origin,
39
+ fov_mask,
40
+ img_size,
41
+ f,
42
+ voxel_size=0.2,
43
+ d=7, # 7m - determine the size of the mesh representing the camera
44
+ ):
45
+ # Compute the coordinates of the mesh representing camera
46
+ x = d * img_size[0] / (2 * f)
47
+ y = d * img_size[1] / (2 * f)
48
+ tri_points = np.array(
49
+ [
50
+ [0, 0, 0],
51
+ [x, y, d],
52
+ [-x, y, d],
53
+ [-x, -y, d],
54
+ [x, -y, d],
55
+ ]
56
+ )
57
+ tri_points = np.hstack([tri_points, np.ones((5, 1))])
58
+ tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T
59
+ x = tri_points[:, 0] - vox_origin[0]
60
+ y = tri_points[:, 1] - vox_origin[1]
61
+ z = tri_points[:, 2] - vox_origin[2]
62
+ triangles = [
63
+ (0, 1, 2),
64
+ (0, 1, 4),
65
+ (0, 3, 4),
66
+ (0, 2, 3),
67
+ ]
68
+
69
+ # Compute the voxels coordinates
70
+ grid_coords = get_grid_coords(
71
+ [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
72
+ )
73
+
74
+ # Attach the predicted class to every voxel
75
+ grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
76
+
77
+ # Get the voxels inside FOV
78
+ fov_grid_coords = grid_coords[fov_mask, :]
79
+
80
+ # Get the voxels outside FOV
81
+ outfov_grid_coords = grid_coords[~fov_mask, :]
82
+
83
+ # Remove empty and unknown voxels
84
+ fov_voxels = fov_grid_coords[
85
+ (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
86
+ ]
87
+ outfov_voxels = outfov_grid_coords[
88
+ (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
89
+ ]
90
+
91
+ figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
92
+
93
+ # Draw the camera
94
+ mlab.triangular_mesh(
95
+ x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5
96
+ )
97
+
98
+ # Draw occupied inside FOV voxels
99
+ plt_plot_fov = mlab.points3d(
100
+ fov_voxels[:, 0],
101
+ fov_voxels[:, 1],
102
+ fov_voxels[:, 2],
103
+ fov_voxels[:, 3],
104
+ colormap="viridis",
105
+ scale_factor=voxel_size - 0.05 * voxel_size,
106
+ mode="cube",
107
+ opacity=1.0,
108
+ vmin=1,
109
+ vmax=19,
110
+ )
111
+
112
+ # Draw occupied outside FOV voxels
113
+ plt_plot_outfov = mlab.points3d(
114
+ outfov_voxels[:, 0],
115
+ outfov_voxels[:, 1],
116
+ outfov_voxels[:, 2],
117
+ outfov_voxels[:, 3],
118
+ colormap="viridis",
119
+ scale_factor=voxel_size - 0.05 * voxel_size,
120
+ mode="cube",
121
+ opacity=1.0,
122
+ vmin=1,
123
+ vmax=19,
124
+ )
125
+
126
+ colors = np.array(
127
+ [
128
+ [100, 150, 245, 255],
129
+ [100, 230, 245, 255],
130
+ [30, 60, 150, 255],
131
+ [80, 30, 180, 255],
132
+ [100, 80, 250, 255],
133
+ [255, 30, 30, 255],
134
+ [255, 40, 200, 255],
135
+ [150, 30, 90, 255],
136
+ [255, 0, 255, 255],
137
+ [255, 150, 255, 255],
138
+ [75, 0, 75, 255],
139
+ [175, 0, 75, 255],
140
+ [255, 200, 0, 255],
141
+ [255, 120, 50, 255],
142
+ [0, 175, 0, 255],
143
+ [135, 60, 0, 255],
144
+ [150, 240, 80, 255],
145
+ [255, 240, 150, 255],
146
+ [255, 0, 0, 255],
147
+ ]
148
+ ).astype(np.uint8)
149
+
150
+ plt_plot_fov.glyph.scale_mode = "scale_by_vector"
151
+ plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
152
+
153
+ plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
154
+
155
+ outfov_colors = colors
156
+ outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
157
+ plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
158
+
159
+ mlab.show()
160
+
161
+
162
+ @hydra.main(config_path=None)
163
+ def main(config: DictConfig):
164
+ scan = config.file
165
+ with open(scan, "rb") as handle:
166
+ b = pickle.load(handle)
167
+
168
+ fov_mask_1 = b["fov_mask_1"]
169
+ T_velo_2_cam = b["T_velo_2_cam"]
170
+ vox_origin = np.array([0, -25.6, -2])
171
+
172
+ y_pred = b["y_pred"]
173
+
174
+ if config.dataset == "kitti_360":
175
+ # Visualize KITTI-360
176
+ draw(
177
+ y_pred,
178
+ T_velo_2_cam,
179
+ vox_origin,
180
+ fov_mask_1,
181
+ voxel_size=0.2,
182
+ f=552.55426,
183
+ img_size=(1408, 376),
184
+ d=7,
185
+ )
186
+ else:
187
+ # Visualize Semantic KITTI
188
+ draw(
189
+ y_pred,
190
+ T_velo_2_cam,
191
+ vox_origin,
192
+ fov_mask_1,
193
+ img_size=(1220, 370),
194
+ f=707.0912,
195
+ voxel_size=0.2,
196
+ d=7,
197
+ )
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
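The camera wireframe in draw() spans x = d * W / (2 f) and y = d * H / (2 f) at depth d. A quick check with the Semantic KITTI values used above (d = 7 m, f = 707.0912, image 1220 x 370):

d, f = 7.0, 707.0912
w, h = 1220, 370
print(d * w / (2 * f), d * h / (2 * f))   # approx. 6.04 m and 1.83 m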