Sohaib36 committed on
Commit f9c9c61 • 1 Parent(s): b38a916

add: adding monoscene

Files changed (39)
  1. monoscene/{models/CRP3D.py → CRP3D.py} +1 -1
  2. monoscene/{models/DDR.py → DDR.py} +0 -0
  3. monoscene/__init__.py +0 -0
  4. monoscene/app.py +138 -0
  5. monoscene/config.py +26 -0
  6. monoscene/config/monoscene.yaml +0 -35
  7. monoscene/data/NYU/collate.py +0 -50
  8. monoscene/data/NYU/nyu_dataset.py +0 -133
  9. monoscene/data/NYU/nyu_dm.py +0 -78
  10. monoscene/data/NYU/params.py +0 -54
  11. monoscene/data/NYU/preprocess.py +0 -182
  12. monoscene/data/kitti_360/collate.py +0 -47
  13. monoscene/data/kitti_360/kitti_360_dataset.py +0 -125
  14. monoscene/data/kitti_360/kitti_360_dm.py +0 -32
  15. monoscene/data/semantic_kitti/collate.py +0 -61
  16. monoscene/data/semantic_kitti/io_data.py +0 -239
  17. monoscene/data/semantic_kitti/kitti_dataset.py +0 -200
  18. monoscene/data/semantic_kitti/kitti_dm.py +0 -91
  19. monoscene/data/semantic_kitti/params.py +0 -48
  20. monoscene/data/semantic_kitti/preprocess.py +0 -102
  21. monoscene/data/semantic_kitti/semantic-kitti.yaml +0 -213
  22. monoscene/data/utils/fusion.py +0 -507
  23. monoscene/data/utils/helpers.py +0 -185
  24. monoscene/data/utils/torch_util.py +0 -15
  25. monoscene/{models/flosp.py → flosp.py} +0 -0
  26. monoscene/loss/CRP_loss.py +0 -24
  27. monoscene/loss/sscMetrics.py +0 -204
  28. monoscene/loss/ssc_loss.py +0 -99
  29. monoscene/{models/modules.py → modules.py} +1 -1
  30. monoscene/{models/monoscene.py → monoscene.py} +12 -177
  31. monoscene/monoscene_model.py +21 -0
  32. monoscene/scripts/eval_monoscene.py +0 -71
  33. monoscene/scripts/generate_output.py +0 -127
  34. monoscene/scripts/train_monoscene.py +0 -173
  35. monoscene/scripts/visualization/NYU_vis_pred.py +0 -156
  36. monoscene/scripts/visualization/kitti_vis_pred.py +0 -201
  37. monoscene/{models/unet2d.py → unet2d.py} +0 -0
  38. monoscene/{models/unet3d_kitti.py → unet3d_kitti.py} +3 -3
  39. monoscene/{models/unet3d_nyu.py → unet3d_nyu.py} +2 -2
monoscene/{models/CRP3D.py → CRP3D.py} RENAMED
@@ -1,6 +1,6 @@
  import torch
  import torch.nn as nn
- from monoscene.models.modules import (
+ from monoscene.modules import (
      Process,
      ASPP,
  )
monoscene/{models/DDR.py → DDR.py} RENAMED
File without changes
monoscene/__init__.py ADDED
File without changes
monoscene/app.py ADDED
@@ -0,0 +1,138 @@
+ from pytorch_lightning import Trainer
+ from monoscene.models.monoscene import MonoScene
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
+ from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
+ # import hydra
+ from omegaconf import DictConfig
+ import torch
+ import numpy as np
+ import os
+ from hydra.utils import get_original_cwd
+ import gradio as gr
+ import numpy as np
+ import plotly.express as px
+ import pandas as pd
+
+
+ # @hydra.main(config_name="../config/monoscene.yaml")
+ def plot(input_img):
+     torch.set_grad_enabled(False)
+
+     # Setup dataloader
+     # if config.dataset == "kitti" or config.dataset == "kitti_360":
+     feature = 64
+     project_scale = 2
+     full_scene_size = (256, 256, 32)
+
+     # if config.dataset == "kitti":
+     #     data_module = KittiDataModule(
+     #         root=config.kitti_root,
+     #         preprocess_root=config.kitti_preprocess_root,
+     #         frustum_size=config.frustum_size,
+     #         batch_size=int(config.batch_size / config.n_gpus),
+     #         num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.val_dataloader()
+     #     # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+     # else:
+     #     data_module = Kitti360DataModule(
+     #         root=config.kitti_360_root,
+     #         sequences=[config.kitti_360_sequence],
+     #         n_scans=2000,
+     #         batch_size=1,
+     #         num_workers=3,
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.dataloader()
+
+     # elif config.dataset == "NYU":
+     #     project_scale = 1
+     #     feature = 200
+     #     full_scene_size = (60, 36, 60)
+     #     data_module = NYUDataModule(
+     #         root=config.NYU_root,
+     #         preprocess_root=config.NYU_preprocess_root,
+     #         n_relations=config.n_relations,
+     #         frustum_size=config.frustum_size,
+     #         batch_size=int(config.batch_size / config.n_gpus),
+     #         num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.val_dataloader()
+     #     # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+     # else:
+     #     print("dataset not supported")
+
+     # Load pretrained models
+     # if config.dataset == "NYU":
+     #     model_path = os.path.join(
+     #         get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
+     #     )
+     # else:
+     #     model_path = os.path.join(
+     #         get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
+     #     )
+     model_path = "trained_models/monoscene_kitti.ckpt"
+
+     model = MonoScene.load_from_checkpoint(
+         model_path,
+         feature=feature,
+         project_scale=project_scale,
+         fp_loss=False,
+         full_scene_size=full_scene_size,
+     )
+     model.cuda()
+     model.eval()
+
+     print(input_img.shape)
+
+     x = np.arange(12).reshape(4, 3) / 12
+     data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
+     fig = px.scatter_3d(data, x="x", y="y", z="z")
+     return fig
+
+ demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
+ demo.launch()
+
+
+
+ # Save prediction and additional data
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
+ # output_path = os.path.join(config.output_path, config.dataset)
+ # with torch.no_grad():
+ #     for batch in tqdm(data_loader):
+ #         batch["img"] = batch["img"].cuda()
+ #         pred = model(batch)
+ #         y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
+ #         y_pred = np.argmax(y_pred, axis=1)
+ #         for i in range(config.batch_size):
+ #             out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
+ #             if "target" in batch:
+ #                 out_dict["target"] = (
+ #                     batch["target"][i].detach().cpu().numpy().astype(np.uint16)
+ #                 )
+
+ #             if config.dataset == "NYU":
+ #                 write_path = output_path
+ #                 filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
+ #                 out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
+ #                 out_dict["vox_origin"] = (
+ #                     batch["vox_origin"][i].detach().cpu().numpy()
+ #                 )
+ #             else:
+ #                 write_path = os.path.join(output_path, batch["sequence"][i])
+ #                 filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
+ #                 out_dict["fov_mask_1"] = (
+ #                     batch["fov_mask_1"][i].detach().cpu().numpy()
+ #                 )
+ #                 out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
+ #                 out_dict["T_velo_2_cam"] = (
+ #                     batch["T_velo_2_cam"][i].detach().cpu().numpy()
+ #                 )
+
+ #             os.makedirs(write_path, exist_ok=True)
+ #             with open(filepath, "wb") as handle:
+ #                 pickle.dump(out_dict, handle)
+ #             print("wrote to", filepath)
monoscene/config.py ADDED
@@ -0,0 +1,26 @@
+ from transformers import PretrainedConfig
+ from typing import List
+
+
+ class MonoSceneConfig(PretrainedConfig):
+
+     def __init__(
+         self,
+         dataset="kitti",
+         n_classes=20,
+         feature=64,
+         project_scale=2,
+         full_scene_size=(256, 256, 32),
+         **kwargs,
+     ):
+         self.dataset = dataset
+         self.n_classes = n_classes
+         self.feature = feature
+         self.project_scale = project_scale
+         self.full_scene_size = full_scene_size
+         super().__init__(**kwargs)
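Because MonoSceneConfig subclasses PretrainedConfig, it inherits JSON (de)serialization from transformers. A minimal usage sketch (the directory name is hypothetical, not part of this commit):

from monoscene.config import MonoSceneConfig

config = MonoSceneConfig(dataset="kitti", n_classes=20)
config.save_pretrained("monoscene-kitti")          # writes config.json
reloaded = MonoSceneConfig.from_pretrained("monoscene-kitti")
assert reloaded.full_scene_size == [256, 256, 32]  # tuples round-trip as JSON lists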
monoscene/config/monoscene.yaml DELETED
@@ -1,35 +0,0 @@
- #dataset: "NYU" # "kitti", "kitti_360"
- dataset: "kitti_360"
-
- n_relations: 4
-
- enable_log: false
- kitti_root: '/path/to/semantic_kitti'
- kitti_preprocess_root: '/path/to/kitti/preprocess/folder'
- kitti_logdir: '/path/to/semantic_kitti/logdir'
-
- NYU_root: '/path/to/NYU/depthbin'
- NYU_preprocess_root: '/path/to/NYU/preprocess/folder'
- logdir: '/path/to/NYU/logdir'
-
-
- fp_loss: true
- frustum_size: 8
- batch_size: 1
- n_gpus: 1
- num_workers_per_gpu: 3
- exp_prefix: "exp"
- run: 1
- lr: 1e-4
- weight_decay: 1e-4
-
- context_prior: true
-
- relation_loss: true
- CE_ssc_loss: true
- sem_scal_loss: true
- geo_scal_loss: true
-
- project_1_2: true
- project_1_4: true
- project_1_8: true
monoscene/data/NYU/collate.py DELETED
@@ -1,50 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     targets = []
-     names = []
-     cam_poses = []
-
-     vox_origins = []
-     cam_ks = []
-
-     CP_mega_matrices = []
-
-     data["projected_pix_1"] = []
-     data["fov_mask_1"] = []
-     data["frustums_masks"] = []
-     data["frustums_class_dists"] = []
-
-     for idx, input_dict in enumerate(batch):
-         CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
-         for key in data:
-             if key in input_dict:
-                 data[key].append(torch.from_numpy(input_dict[key]))
-
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
-         cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float())
-         vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double())
-
-         names.append(input_dict["name"])
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         target = torch.from_numpy(input_dict["target"])
-         targets.append(target)
-
-     ret_data = {
-         "CP_mega_matrices": CP_mega_matrices,
-         "cam_pose": torch.stack(cam_poses),
-         "cam_k": torch.stack(cam_ks),
-         "vox_origin": torch.stack(vox_origins),
-         "name": names,
-         "img": torch.stack(imgs),
-         "target": torch.stack(targets),
-     }
-     for key in data:
-         ret_data[key] = data[key]
-     return ret_data
monoscene/data/NYU/nyu_dataset.py DELETED
@@ -1,133 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from monoscene.data.utils.helpers import (
-     vox2pix,
-     compute_local_frustums,
-     compute_CP_mega_matrix,
- )
- import pickle
- import torch.nn.functional as F
-
-
- class NYUDataset(Dataset):
-     def __init__(
-         self,
-         split,
-         root,
-         preprocess_root,
-         n_relations=4,
-         color_jitter=None,
-         frustum_size=4,
-         fliplr=0.0,
-     ):
-         self.n_relations = n_relations
-         self.frustum_size = frustum_size
-         self.n_classes = 12
-         self.root = os.path.join(root, "NYU" + split)
-         self.preprocess_root = preprocess_root
-         self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split)
-         self.fliplr = fliplr
-
-         self.voxel_size = 0.08  # 0.08m
-         self.scene_size = (4.8, 4.8, 2.88)  # (4.8m, 4.8m, 2.88m)
-         self.img_W = 640
-         self.img_H = 480
-         self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])
-
-         self.color_jitter = (
-             transforms.ColorJitter(*color_jitter) if color_jitter else None
-         )
-
-         self.scan_names = glob.glob(os.path.join(self.root, "*.bin"))
-
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __getitem__(self, index):
-         file_path = self.scan_names[index]
-         filename = os.path.basename(file_path)
-         name = filename[:-4]
-
-         os.makedirs(self.base_dir, exist_ok=True)
-         filepath = os.path.join(self.base_dir, name + ".pkl")
-
-         with open(filepath, "rb") as handle:
-             data = pickle.load(handle)
-
-         cam_pose = data["cam_pose"]
-         T_world_2_cam = np.linalg.inv(cam_pose)
-         vox_origin = data["voxel_origin"]
-         data["cam_k"] = self.cam_k
-         target = data[
-             "target_1_4"
-         ]  # Following SSC literature, the output resolution on NYUv2 is set to 1:4
-         data["target"] = target
-         target_1_4 = data["target_1_16"]
-
-         CP_mega_matrix = compute_CP_mega_matrix(
-             target_1_4, is_binary=self.n_relations == 2
-         )
-         data["CP_mega_matrix"] = CP_mega_matrix
-
-         # compute the 3D-2D mapping
-         projected_pix, fov_mask, pix_z = vox2pix(
-             T_world_2_cam,
-             self.cam_k,
-             vox_origin,
-             self.voxel_size,
-             self.img_W,
-             self.img_H,
-             self.scene_size,
-         )
-
-         data["projected_pix_1"] = projected_pix
-         data["fov_mask_1"] = fov_mask
-
-         # compute the masks, each indicates voxels inside a frustum
-         frustums_masks, frustums_class_dists = compute_local_frustums(
-             projected_pix,
-             pix_z,
-             target,
-             self.img_W,
-             self.img_H,
-             dataset="NYU",
-             n_classes=12,
-             size=self.frustum_size,
-         )
-         data["frustums_masks"] = frustums_masks
-         data["frustums_class_dists"] = frustums_class_dists
-
-         rgb_path = os.path.join(self.root, name + "_color.jpg")
-         img = Image.open(rgb_path).convert("RGB")
-
-         # Image augmentation
-         if self.color_jitter is not None:
-             img = self.color_jitter(img)
-
-         # PIL to numpy
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-
-         # randomly fliplr the image
-         if np.random.rand() < self.fliplr:
-             img = np.ascontiguousarray(np.fliplr(img))
-             data["projected_pix_1"][:, 0] = (
-                 img.shape[1] - 1 - data["projected_pix_1"][:, 0]
-             )
-
-         data["img"] = self.normalize_rgb(img)  # (3, img_H, img_W)
-
-         return data
-
-     def __len__(self):
-         return len(self.scan_names)
monoscene/data/NYU/nyu_dm.py DELETED
@@ -1,78 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.NYU.nyu_dataset import NYUDataset
- from monoscene.data.NYU.collate import collate_fn
- import pytorch_lightning as pl
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class NYUDataModule(pl.LightningDataModule):
-     def __init__(
-         self,
-         root,
-         preprocess_root,
-         n_relations=4,
-         batch_size=4,
-         frustum_size=4,
-         num_workers=6,
-     ):
-         super().__init__()
-         self.n_relations = n_relations
-         self.preprocess_root = preprocess_root
-         self.root = root
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.frustum_size = frustum_size
-
-     def setup(self, stage=None):
-         self.train_ds = NYUDataset(
-             split="train",
-             preprocess_root=self.preprocess_root,
-             n_relations=self.n_relations,
-             root=self.root,
-             fliplr=0.5,
-             frustum_size=self.frustum_size,
-             color_jitter=(0.4, 0.4, 0.4),
-         )
-         self.test_ds = NYUDataset(
-             split="test",
-             preprocess_root=self.preprocess_root,
-             n_relations=self.n_relations,
-             root=self.root,
-             frustum_size=self.frustum_size,
-             fliplr=0.0,
-             color_jitter=None,
-         )
-
-     def train_dataloader(self):
-         return DataLoader(
-             self.train_ds,
-             batch_size=self.batch_size,
-             drop_last=True,
-             num_workers=self.num_workers,
-             shuffle=True,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def val_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             num_workers=self.num_workers,
-             drop_last=False,
-             shuffle=False,
-             pin_memory=True,
-             collate_fn=collate_fn,
-         )
-
-     def test_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             num_workers=self.num_workers,
-             drop_last=False,
-             shuffle=False,
-             pin_memory=True,
-             collate_fn=collate_fn,
-         )
monoscene/data/NYU/params.py DELETED
@@ -1,54 +0,0 @@
- import torch
- import numpy as np
-
- NYU_class_names = [
-     "empty",
-     "ceiling",
-     "floor",
-     "wall",
-     "window",
-     "chair",
-     "bed",
-     "sofa",
-     "table",
-     "tvs",
-     "furn",
-     "objs",
- ]
- class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
-
- class_freq_1_4 = np.array(
-     [
-         43744234,
-         80205,
-         1070052,
-         905632,
-         116952,
-         180994,
-         436852,
-         279714,
-         254611,
-         28247,
-         1805949,
-         850724,
-     ]
- )
- class_freq_1_8 = np.array(
-     [
-         5176253,
-         17277,
-         220105,
-         183849,
-         21827,
-         33520,
-         67022,
-         44248,
-         46615,
-         4419,
-         290218,
-         142573,
-     ]
- )
- class_freq_1_16 = np.array(
-     [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864]
- )
monoscene/data/NYU/preprocess.py DELETED
@@ -1,182 +0,0 @@
- import numpy as np
- from tqdm import tqdm
- import numpy.matlib
- import os
- import glob
- import pickle
- import hydra
- from omegaconf import DictConfig
-
-
- seg_class_map = [
-     0, 1, 2, 3, 4, 11, 5, 6, 7, 8,
-     8, 10, 10, 10, 11, 11, 9, 8, 11, 11,
-     11, 11, 11, 11, 11, 11, 11, 10, 10, 11,
-     8, 10, 11, 9, 11, 11, 11,
- ]
-
-
- def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""):
-     r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels.
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172
-     In the data loader of pytorch, only single thread is allowed.
-     For multi-threads version and more details, see 'readRLE.py'.
-     output: seg_label: 3D numpy array, size 240 x 144 x 240
-     """
-     seg_label = np.zeros(
-         int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8
-     )  # segmentation label
-     vox_idx = 0
-     for idx in range(int(rle.shape[0] / 2)):
-         check_val = rle[idx * 2]
-         check_iter = rle[idx * 2 + 1]
-         if check_val >= 37 and check_val != 255:  # 37 classes to 12 classes
-             print("RLE {} check_val: {}".format(rle_filename, check_val))
-         seg_label_val = (
-             seg_class_map[check_val] if check_val != 255 else 255
-         )  # 37 classes to 12 classes
-         seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat(
-             seg_label_val, 1, check_iter
-         )
-         vox_idx = vox_idx + check_iter
-     seg_label = seg_label.reshape(voxel_size)  # 3D array, size 240 x 144 x 240
-     return seg_label
-
-
- def _read_rle(rle_filename):  # 0.0005s
-     """Read RLE compression data
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153
-     Return:
-         vox_origin,
-         cam_pose,
-         vox_rle, voxel label data from file
-     Shape:
-         vox_rle, (240, 144, 240)
-     """
-     fid = open(rle_filename, "rb")
-     vox_origin = np.fromfile(
-         fid, np.float32, 3
-     ).T  # Read voxel origin in world coordinates
-     cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4))  # Read camera pose
-     vox_rle = (
-         np.fromfile(fid, np.uint32).reshape((-1, 1)).T
-     )  # Read voxel label data from file
-     vox_rle = np.squeeze(vox_rle)  # 2d array: (1 x N), to 1d array: (N , )
-     fid.close()
-     return vox_origin, cam_pose, vox_rle
-
-
- def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4):
-     r"""downsample the labeled data,
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262
-     Shape:
-         label, (240, 144, 240)
-         label_downscale, if downsample==4, then (60, 36, 60)
-     """
-     if downscale == 1:
-         return label
-     ds = downscale
-     small_size = (
-         voxel_size[0] // ds,
-         voxel_size[1] // ds,
-         voxel_size[2] // ds,
-     )  # small size
-     label_downscale = np.zeros(small_size, dtype=np.uint8)
-     empty_t = 0.95 * ds * ds * ds  # threshold
-     s01 = small_size[0] * small_size[1]
-     label_i = np.zeros((ds, ds, ds), dtype=np.int32)
-
-     for i in range(small_size[0] * small_size[1] * small_size[2]):
-         z = int(i / s01)
-         y = int((i - z * s01) / small_size[0])
-         x = int(i - z * s01 - y * small_size[0])
-
-         label_i[:, :, :] = label[
-             x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds
-         ]
-         label_bin = label_i.flatten()
-
-         zero_count_0 = np.array(np.where(label_bin == 0)).size
-         zero_count_255 = np.array(np.where(label_bin == 255)).size
-
-         zero_count = zero_count_0 + zero_count_255
-         if zero_count > empty_t:
-             label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255
-         else:
-             label_i_s = label_bin[
-                 np.where(np.logical_and(label_bin > 0, label_bin < 255))
-             ]
-             label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s))
-     return label_downscale
-
-
- @hydra.main(config_name="../../config/monoscene.yaml")
- def main(config: DictConfig):
-     scene_size = (240, 144, 240)
-     for split in ["train", "test"]:
-         root = os.path.join(config.NYU_root, "NYU" + split)
-         base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split)
-         os.makedirs(base_dir, exist_ok=True)
-
-         scans = glob.glob(os.path.join(root, "*.bin"))
-         for scan in tqdm(scans):
-             filename = os.path.basename(scan)
-             name = filename[:-4]
-             filepath = os.path.join(base_dir, name + ".pkl")
-             if os.path.exists(filepath):
-                 continue
-
-             vox_origin, cam_pose, rle = _read_rle(scan)
-
-             target_1_1 = _rle2voxel(rle, scene_size, scan)
-             target_1_4 = _downsample_label(target_1_1, scene_size, 4)
-             target_1_16 = _downsample_label(target_1_1, scene_size, 16)
-
-             data = {
-                 "cam_pose": cam_pose,
-                 "voxel_origin": vox_origin,
-                 "name": name,
-                 "target_1_4": target_1_4,
-                 "target_1_16": target_1_16,
-             }
-
-             with open(filepath, "wb") as handle:
-                 pickle.dump(data, handle)
-             print("wrote to", filepath)
-
-
- if __name__ == "__main__":
-     main()
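For reference, the shapes in the deleted pipeline above chain together as follows; a sanity-check sketch (the .bin path is hypothetical, everything else follows the docstrings):

vox_origin, cam_pose, rle = _read_rle("NYUtrain/NYU0001_0000.bin")  # hypothetical scan
target_1_1 = _rle2voxel(rle, (240, 144, 240))            # full-resolution labels
target_1_4 = _downsample_label(target_1_1, (240, 144, 240), 4)
assert target_1_1.shape == (240, 144, 240)
assert target_1_4.shape == (60, 36, 60)                  # one majority label per 4x4x4 block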
monoscene/data/kitti_360/collate.py DELETED
@@ -1,47 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     frame_ids = []
-     img_paths = []
-     sequences = []
-
-     cam_ks = []
-     T_velo_2_cams = []
-
-     scale_3ds = batch[0]["scale_3ds"]
-     for scale_3d in scale_3ds:
-         data["projected_pix_{}".format(scale_3d)] = []
-         data["fov_mask_{}".format(scale_3d)] = []
-
-     for _, input_dict in enumerate(batch):
-         if "img_path" in input_dict:
-             img_paths.append(input_dict["img_path"])
-
-         for key in data:
-             data[key].append(torch.from_numpy(input_dict[key]))
-
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float())
-         T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
-
-         sequences.append(input_dict["sequence"])
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         frame_ids.append(input_dict["frame_id"])
-
-     ret_data = {
-         "sequence": sequences,
-         "frame_id": frame_ids,
-         "cam_k": cam_ks,
-         "T_velo_2_cam": T_velo_2_cams,
-         "img": torch.stack(imgs),
-         "img_path": img_paths,
-     }
-     for key in data:
-         ret_data[key] = data[key]
-
-     return ret_data
monoscene/data/kitti_360/kitti_360_dataset.py DELETED
@@ -1,125 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from monoscene.data.utils.helpers import vox2pix
- from PIL import Image
- from torchvision import transforms
-
-
- class Kitti360Dataset(Dataset):
-     def __init__(self, root, sequences, n_scans):
-         """
-         Parameters
-         ----------
-         root: str
-             Path to the KITTI-360 dataset, i.e. the folder containing sequences such as 2013_05_28_drive_0009_sync
-         sequences: list of str
-             KITTI-360 sequences, e.g. 2013_05_28_drive_0009_sync
-         n_scans: int
-             Only use the first n_scans, since a KITTI-360 sequence is very long
-         """
-         self.root = root
-         self.img_H = 376
-         self.img_W = 1408
-         self.project_scale = 2
-         self.output_scale = 1
-         self.voxel_size = 0.2
-         self.vox_origin = np.array([0, -25.6, -2])
-         self.scene_size = (51.2, 51.2, 6.4)
-         self.T_velo_2_cam = self.get_velo2cam()
-         self.cam_k = self.get_cam_k()
-         self.scans = []
-         for sequence in sequences:
-             glob_path = os.path.join(
-                 self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png"
-             )
-             for img_path in glob.glob(glob_path):
-                 self.scans.append({"img_path": img_path, "sequence": sequence})
-         self.scans = self.scans[:n_scans]
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __len__(self):
-         return len(self.scans)
-
-     def get_cam_k(self):
-         cam_k = np.array(
-             [
-                 552.554261, 0.000000, 682.049453, 0.000000,
-                 0.000000, 552.554261, 238.769549, 0.000000,
-                 0.000000, 0.000000, 1.000000, 0.000000,
-             ]
-         ).reshape(3, 4)
-         return cam_k[:3, :3]
-
-     def get_velo2cam(self):
-         cam2velo = np.array(
-             [
-                 0.04307104361, -0.08829286498, 0.995162929, 0.8043914418,
-                 -0.999004371, 0.007784614041, 0.04392796942, 0.2993489574,
-                 -0.01162548558, -0.9960641394, -0.08786966659, -0.1770225824,
-             ]
-         ).reshape(3, 4)
-         cam2velo = np.concatenate(
-             [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0
-         )
-         return np.linalg.inv(cam2velo)
-
-     def __getitem__(self, index):
-         data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k}
-         scan = self.scans[index]
-         img_path = scan["img_path"]
-         sequence = scan["sequence"]
-         filename = os.path.basename(img_path)
-         frame_id = os.path.splitext(filename)[0]
-         data["frame_id"] = frame_id
-         data["img_path"] = img_path
-         data["sequence"] = sequence
-
-         img = Image.open(img_path).convert("RGB")
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-         img = self.normalize_rgb(img)
-         data["img"] = img
-
-         scale_3ds = [self.project_scale, self.output_scale]
-         data["scale_3ds"] = scale_3ds
-
-         for scale_3d in scale_3ds:
-             projected_pix, fov_mask, _ = vox2pix(
-                 self.T_velo_2_cam,
-                 self.cam_k,
-                 self.vox_origin,
-                 self.voxel_size * scale_3d,
-                 self.img_W,
-                 self.img_H,
-                 self.scene_size,
-             )
-             data["projected_pix_{}".format(scale_3d)] = projected_pix
-             data["fov_mask_{}".format(scale_3d)] = fov_mask
-         return data
monoscene/data/kitti_360/kitti_360_dm.py DELETED
@@ -1,32 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset
- import pytorch_lightning as pl
- from monoscene.data.kitti_360.collate import collate_fn
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class Kitti360DataModule(pl.LightningDataModule):
-     def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3):
-         super().__init__()
-         self.root = root
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.sequences = sequences
-         self.n_scans = n_scans
-
-     def setup(self, stage=None):
-         self.ds = Kitti360Dataset(
-             root=self.root, sequences=self.sequences, n_scans=self.n_scans
-         )
-
-     def dataloader(self):
-         return DataLoader(
-             self.ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
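Unlike the other data modules, this one exposes a single dataloader() with no train/val/test split, since KITTI-360 is used for inference only. Usage mirroring the commented-out block in app.py above (root path hypothetical):

data_module = Kitti360DataModule(
    root="/path/to/KITTI-360",                   # hypothetical dataset location
    sequences=["2013_05_28_drive_0009_sync"],
    n_scans=2000,
    batch_size=1,
    num_workers=3,
)
data_module.setup()
data_loader = data_module.dataloader()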
monoscene/data/semantic_kitti/collate.py DELETED
@@ -1,61 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     CP_mega_matrices = []
-     targets = []
-     frame_ids = []
-     sequences = []
-
-     cam_ks = []
-     T_velo_2_cams = []
-     frustums_masks = []
-     frustums_class_dists = []
-
-     scale_3ds = batch[0]["scale_3ds"]
-     for scale_3d in scale_3ds:
-         data["projected_pix_{}".format(scale_3d)] = []
-         data["fov_mask_{}".format(scale_3d)] = []
-
-     for idx, input_dict in enumerate(batch):
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
-         T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
-
-         if "frustums_masks" in input_dict:
-             frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"]))
-             frustums_class_dists.append(
-                 torch.from_numpy(input_dict["frustums_class_dists"]).float()
-             )
-
-         for key in data:
-             data[key].append(torch.from_numpy(input_dict[key]))
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         frame_ids.append(input_dict["frame_id"])
-         sequences.append(input_dict["sequence"])
-
-         target = torch.from_numpy(input_dict["target"])
-         targets.append(target)
-         CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
-
-     ret_data = {
-         "frame_id": frame_ids,
-         "sequence": sequences,
-         "frustums_class_dists": frustums_class_dists,
-         "frustums_masks": frustums_masks,
-         "cam_k": cam_ks,
-         "T_velo_2_cam": T_velo_2_cams,
-         "img": torch.stack(imgs),
-         "CP_mega_matrices": CP_mega_matrices,
-         "target": torch.stack(targets)
-     }
-
-     for key in data:
-         ret_data[key] = data[key]
-     return ret_data
monoscene/data/semantic_kitti/io_data.py DELETED
@@ -1,239 +0,0 @@
- """
- Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
- """
-
- import numpy as np
- import yaml
- import imageio
-
-
- def unpack(compressed):
-     '''given a bit-encoded voxel grid, make a normal voxel grid out of it.'''
-     uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8)
-     uncompressed[::8] = compressed[:] >> 7 & 1
-     uncompressed[1::8] = compressed[:] >> 6 & 1
-     uncompressed[2::8] = compressed[:] >> 5 & 1
-     uncompressed[3::8] = compressed[:] >> 4 & 1
-     uncompressed[4::8] = compressed[:] >> 3 & 1
-     uncompressed[5::8] = compressed[:] >> 2 & 1
-     uncompressed[6::8] = compressed[:] >> 1 & 1
-     uncompressed[7::8] = compressed[:] & 1
-
-     return uncompressed
-
-
- def img_normalize(img, mean, std):
-     img = img.astype(np.float32) / 255.0
-     img = img - mean
-     img = img / std
-
-     return img
-
-
- def pack(array):
-     """convert a boolean array into a bitwise array."""
-     array = array.reshape((-1))
-
-     # compressing bit flags.
-     # yapf: disable
-     compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8]
-     # yapf: enable
-
-     return np.array(compressed, dtype=np.uint8)
-
-
- def get_grid_coords(dims, resolution):
-     '''
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: the center coords of voxels in the grid
-     '''
-
-     # The sensor is centered in X (we go to dims/2 + 1 for the histogramdd)
-     g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1)
-     # The sensor is at Y=0 (we go to dims + 1 for the histogramdd)
-     g_yy = np.arange(0, dims[1] + 1)
-     # The sensor is at Z=1.73. I observed that the ground was two voxel levels above the grid bottom, so the Z pose is at 10
-     # if the bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22
-     # (we go to 22 + 1 for the histogramdd)
-     # ATTENTION.. It is 11 for old grids, 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49)
-     sensor_pose = 10
-     g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution/2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid, g_xx, g_yy, g_zz
-
-
- def _get_remap_lut(config_path):
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-
-     dataset_config = yaml.safe_load(open(config_path, 'r'))
-     # make lookup table for mapping
-     maxkey = max(dataset_config['learning_map'].keys())
-
-     # +100 hack making lut bigger just in case there are unknown labels
-     remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
-     remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
-
-     # in completion we have to distinguish empty and invalid voxels.
-     # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
-     remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
-     remap_lut[0] = 0  # only 'empty' stays 'empty'.
-
-     return remap_lut
-
-
- def get_inv_map():
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-     config_path = "./semantic-kitti.yaml"
-     dataset_config = yaml.safe_load(open(config_path, 'r'))
-     # make lookup table for mapping
-
-     inv_map = np.zeros(20, dtype=np.int32)
-     inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values())
-
-     return inv_map
-
-
- def _read_SemKITTI(path, dtype, do_unpack):
-     bin = np.fromfile(path, dtype=dtype)  # Flattened array
-     if do_unpack:
-         bin = unpack(bin)
-     return bin
-
-
- def _read_label_SemKITTI(path):
-     label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
-     return label
-
-
- def _read_invalid_SemKITTI(path):
-     invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
-     return invalid
-
-
- def _read_occluded_SemKITTI(path):
-     occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
-     return occluded
-
-
- def _read_occupancy_SemKITTI(path):
-     occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32)
-     return occupancy
-
-
- def _read_rgb_SemKITTI(path):
-     rgb = np.asarray(imageio.imread(path))
-     return rgb
-
-
- def _read_pointcloud_SemKITTI(path):
-     'Return pointcloud semantic kitti with remissions (x, y, z, intensity)'
-     pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
-     pointcloud = pointcloud.reshape((-1, 4))
-     return pointcloud
-
-
- def _read_calib_SemKITTI(calib_path):
-     """
-     :param calib_path: Path to a calibration text file.
-     :return: dict with calibration matrices.
-     """
-     calib_all = {}
-     with open(calib_path, 'r') as f:
-         for line in f.readlines():
-             if line == '\n':
-                 break
-             key, value = line.split(':', 1)
-             calib_all[key] = np.array([float(x) for x in value.split()])
-
-     # reshape matrices
-     calib_out = {}
-     calib_out['P2'] = calib_all['P2'].reshape(3, 4)  # 3x4 projection matrix for left camera
-     calib_out['Tr'] = np.identity(4)  # 4x4 matrix
-     calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
-     return calib_out
-
-
- def get_remap_lut(path):
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-
-     dataset_config = yaml.safe_load(open(path, 'r'))
-
-     # make lookup table for mapping
-     maxkey = max(dataset_config['learning_map'].keys())
-
-     # +100 hack making lut bigger just in case there are unknown labels
-     remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
-     remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
-
-     # in completion we have to distinguish empty and invalid voxels.
-     # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
-     remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
-     remap_lut[0] = 0  # only 'empty' stays 'empty'.
-
-     return remap_lut
-
-
- def data_augmentation_3Dflips(flip, data):
-     # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data
-     # in memory. This gives errors when trying to pass the array to torch tensors. Solution seen in:
-     # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663
-     # Dims -> {XZY}
-     # Flipping around the X axis...
-     if np.isclose(flip, 1):
-         data = np.flip(data, axis=0).copy()
-
-     # Flipping around the Y axis...
-     if np.isclose(flip, 2):
-         data = np.flip(data, 2).copy()
-
-     # Flipping around the X and the Y axis...
-     if np.isclose(flip, 3):
-         data = np.flip(np.flip(data, axis=0), axis=2).copy()
-
-     return data
-
-
- def get_cmap_semanticKITTI20():
-     colors = np.array([
-         # [0, 0, 0, 255],
-         [100, 150, 245, 255],
-         [100, 230, 245, 255],
-         [30, 60, 150, 255],
-         [80, 30, 180, 255],
-         [100, 80, 250, 255],
-         [255, 30, 30, 255],
-         [255, 40, 200, 255],
-         [150, 30, 90, 255],
-         [255, 0, 255, 255],
-         [255, 150, 255, 255],
-         [75, 0, 75, 255],
-         [175, 0, 75, 255],
-         [255, 200, 0, 255],
-         [255, 120, 50, 255],
-         [0, 175, 0, 255],
-         [135, 60, 0, 255],
-         [150, 240, 80, 255],
-         [255, 240, 150, 255],
-         [255, 0, 0, 255]]).astype(np.uint8)
-
-     return colors
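The pack/unpack pair above is exactly inverse for 0/1 data: unpack reads each byte's bits MSB-first, and pack writes them back. A quick round-trip check (illustrative only, not part of the commit):

import numpy as np

bits = np.random.randint(0, 2, size=256 * 256 * 32, dtype=np.uint8)  # one bit per voxel
compressed = pack(bits)                        # 8x smaller bitwise array
assert compressed.nbytes == bits.size // 8
assert np.array_equal(unpack(compressed), bits)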
monoscene/data/semantic_kitti/kitti_dataset.py DELETED
@@ -1,200 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from monoscene.data.utils.helpers import (
-     vox2pix,
-     compute_local_frustums,
-     compute_CP_mega_matrix,
- )
-
-
- class KittiDataset(Dataset):
-     def __init__(
-         self,
-         split,
-         root,
-         preprocess_root,
-         project_scale=2,
-         frustum_size=4,
-         color_jitter=None,
-         fliplr=0.0,
-     ):
-         super().__init__()
-         self.root = root
-         self.label_root = os.path.join(preprocess_root, "labels")
-         self.n_classes = 20
-         splits = {
-             "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
-             "val": ["08"],
-             "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
-         }
-         self.split = split
-         self.sequences = splits[split]
-         self.frustum_size = frustum_size
-         self.project_scale = project_scale
-         self.output_scale = int(self.project_scale / 2)
-         self.scene_size = (51.2, 51.2, 6.4)
-         self.vox_origin = np.array([0, -25.6, -2])
-         self.fliplr = fliplr
-
-         self.voxel_size = 0.2  # 0.2m
-         self.img_W = 1220
-         self.img_H = 370
-
-         self.color_jitter = (
-             transforms.ColorJitter(*color_jitter) if color_jitter else None
-         )
-         self.scans = []
-         for sequence in self.sequences:
-             calib = self.read_calib(
-                 os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
-             )
-             P = calib["P2"]
-             T_velo_2_cam = calib["Tr"]
-             proj_matrix = P @ T_velo_2_cam
-
-             glob_path = os.path.join(
-                 self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
-             )
-             for voxel_path in glob.glob(glob_path):
-                 self.scans.append(
-                     {
-                         "sequence": sequence,
-                         "P": P,
-                         "T_velo_2_cam": T_velo_2_cam,
-                         "proj_matrix": proj_matrix,
-                         "voxel_path": voxel_path,
-                     }
-                 )
-
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __getitem__(self, index):
-         scan = self.scans[index]
-         voxel_path = scan["voxel_path"]
-         sequence = scan["sequence"]
-         P = scan["P"]
-         T_velo_2_cam = scan["T_velo_2_cam"]
-         proj_matrix = scan["proj_matrix"]
-
-         filename = os.path.basename(voxel_path)
-         frame_id = os.path.splitext(filename)[0]
-
-         rgb_path = os.path.join(
-             self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png"
-         )
-
-         data = {
-             "frame_id": frame_id,
-             "sequence": sequence,
-             "P": P,
-             "T_velo_2_cam": T_velo_2_cam,
-             "proj_matrix": proj_matrix,
-         }
-         scale_3ds = [self.output_scale, self.project_scale]
-         data["scale_3ds"] = scale_3ds
-         cam_k = P[0:3, 0:3]
-         data["cam_k"] = cam_k
-         for scale_3d in scale_3ds:
-
-             # compute the 3D-2D mapping
-             projected_pix, fov_mask, pix_z = vox2pix(
-                 T_velo_2_cam,
-                 cam_k,
-                 self.vox_origin,
-                 self.voxel_size * scale_3d,
-                 self.img_W,
-                 self.img_H,
-                 self.scene_size,
-             )
-
-             data["projected_pix_{}".format(scale_3d)] = projected_pix
-             data["pix_z_{}".format(scale_3d)] = pix_z
-             data["fov_mask_{}".format(scale_3d)] = fov_mask
-
-         target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy")
-         target = np.load(target_1_path)
-         data["target"] = target
-         target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy")
-         target_1_8 = np.load(target_8_path)
-         CP_mega_matrix = compute_CP_mega_matrix(target_1_8)
-         data["CP_mega_matrix"] = CP_mega_matrix
-
-         # Compute the masks, each indicates the voxels of a local frustum
-         if self.split != "test":
-             projected_pix_output = data["projected_pix_{}".format(self.output_scale)]
-             pix_z_output = data[
-                 "pix_z_{}".format(self.output_scale)
-             ]
-             frustums_masks, frustums_class_dists = compute_local_frustums(
-                 projected_pix_output,
-                 pix_z_output,
-                 target,
-                 self.img_W,
-                 self.img_H,
-                 dataset="kitti",
-                 n_classes=20,
-                 size=self.frustum_size,
-             )
-         else:
-             frustums_masks = None
-             frustums_class_dists = None
-         data["frustums_masks"] = frustums_masks
-         data["frustums_class_dists"] = frustums_class_dists
-
-         img = Image.open(rgb_path).convert("RGB")
-
-         # Image augmentation
-         if self.color_jitter is not None:
-             img = self.color_jitter(img)
-
-         # PIL to numpy
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-         img = img[:370, :1220, :]  # crop image
-
-         # Fliplr the image
-         if np.random.rand() < self.fliplr:
-             img = np.ascontiguousarray(np.fliplr(img))
-             for scale in scale_3ds:
-                 key = "projected_pix_" + str(scale)
-                 data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0]
-
-         data["img"] = self.normalize_rgb(img)
-         return data
-
-     def __len__(self):
-         return len(self.scans)
-
-     @staticmethod
-     def read_calib(calib_path):
-         """
-         Modified from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
-         :param calib_path: Path to a calibration text file.
-         :return: dict with calibration matrices.
-         """
-         calib_all = {}
-         with open(calib_path, "r") as f:
-             for line in f.readlines():
-                 if line == "\n":
-                     break
-                 key, value = line.split(":", 1)
-                 calib_all[key] = np.array([float(x) for x in value.split()])
-
-         # reshape matrices
-         calib_out = {}
-         # 3x4 projection matrix for left camera
-         calib_out["P2"] = calib_all["P2"].reshape(3, 4)
-         calib_out["Tr"] = np.identity(4)  # 4x4 matrix
-         calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
-         return calib_out
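read_calib returns exactly the two matrices that the constructor multiplies into proj_matrix. A sketch of the projection they encode; the calib path and the sample LiDAR point are made up, illustrative only:

import numpy as np

calib = KittiDataset.read_calib("dataset/sequences/00/calib.txt")  # hypothetical path
proj_matrix = calib["P2"] @ calib["Tr"]        # same 3x4 product as in __init__

pt_velo = np.array([10.0, 2.0, -1.0, 1.0])     # homogeneous point in the LiDAR frame
u, v, w = proj_matrix @ pt_velo
print(u / w, v / w)                            # pixel coordinates after the perspective divide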
monoscene/data/semantic_kitti/kitti_dm.py DELETED
@@ -1,91 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset
- import pytorch_lightning as pl
- from monoscene.data.semantic_kitti.collate import collate_fn
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class KittiDataModule(pl.LightningDataModule):
-     def __init__(
-         self,
-         root,
-         preprocess_root,
-         project_scale=2,
-         frustum_size=4,
-         batch_size=4,
-         num_workers=6,
-     ):
-         super().__init__()
-         self.root = root
-         self.preprocess_root = preprocess_root
-         self.project_scale = project_scale
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.frustum_size = frustum_size
-
-     def setup(self, stage=None):
-         self.train_ds = KittiDataset(
-             split="train",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0.5,
-             color_jitter=(0.4, 0.4, 0.4),
-         )
-
-         self.val_ds = KittiDataset(
-             split="val",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0,
-             color_jitter=None,
-         )
-
-         self.test_ds = KittiDataset(
-             split="test",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0,
-             color_jitter=None,
-         )
-
-     def train_dataloader(self):
-         return DataLoader(
-             self.train_ds,
-             batch_size=self.batch_size,
-             drop_last=True,
-             num_workers=self.num_workers,
-             shuffle=True,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def val_dataloader(self):
-         return DataLoader(
-             self.val_ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def test_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
monoscene/data/semantic_kitti/params.py DELETED
@@ -1,48 +0,0 @@
- import numpy as np
-
- semantic_kitti_class_frequencies = np.array(
-     [
-         5.41773033e09,
-         1.57835390e07,
-         1.25136000e05,
-         1.18809000e05,
-         6.46799000e05,
-         8.21951000e05,
-         2.62978000e05,
-         2.83696000e05,
-         2.04750000e05,
-         6.16887030e07,
-         4.50296100e06,
-         4.48836500e07,
-         2.26992300e06,
-         5.68402180e07,
-         1.57196520e07,
-         1.58442623e08,
-         2.06162300e06,
-         3.69705220e07,
-         1.15198800e06,
-         3.34146000e05,
-     ]
- )
- kitti_class_names = [
-     "empty",
-     "car",
-     "bicycle",
-     "motorcycle",
-     "truck",
-     "other-vehicle",
-     "person",
-     "bicyclist",
-     "motorcyclist",
-     "road",
-     "parking",
-     "sidewalk",
-     "other-ground",
-     "building",
-     "fence",
-     "vegetation",
-     "trunk",
-     "terrain",
-     "pole",
-     "traffic-sign",
- ]
monoscene/data/semantic_kitti/preprocess.py DELETED
@@ -1,102 +0,0 @@
- """
- Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py
- """
- import numpy as np
- from tqdm import tqdm
- import numpy.matlib
- import os
- import glob
- import hydra
- from omegaconf import DictConfig
- import monoscene.data.semantic_kitti.io_data as SemanticKittiIO
- from hydra.utils import get_original_cwd
- from monoscene.data.NYU.preprocess import _downsample_label
-
-
- def majority_pooling(grid, k_size=2):
-     result = np.zeros(
-         (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
-     )
-     for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
-         for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
-             for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
-
-                 sub_m = grid[
-                     (xx * k_size) : (xx * k_size) + k_size,
-                     (yy * k_size) : (yy * k_size) + k_size,
-                     (zz * k_size) : (zz * k_size) + k_size,
-                 ]
-                 unique, counts = np.unique(sub_m, return_counts=True)
-                 if True in ((unique != 0) & (unique != 255)):
-                     # Remove counts with 0 and 255
-                     counts = counts[((unique != 0) & (unique != 255))]
-                     unique = unique[((unique != 0) & (unique != 255))]
-                 else:
-                     if True in (unique == 0):
-                         counts = counts[(unique != 255)]
-                         unique = unique[(unique != 255)]
-                 value = unique[np.argmax(counts)]
-                 result[xx, yy, zz] = value
-     return result
-
-
- @hydra.main(config_name="../../config/monoscene.yaml")
- def main(config: DictConfig):
-     scene_size = (256, 256, 32)
-     sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
-     remap_lut = SemanticKittiIO.get_remap_lut(
-         os.path.join(
-             get_original_cwd(),
-             "monoscene",
-             "data",
-             "semantic_kitti",
-             "semantic-kitti.yaml",
-         )
-     )
-
-     for sequence in sequences:
-         sequence_path = os.path.join(
-             config.kitti_root, "dataset", "sequences", sequence
-         )
-         label_paths = sorted(
-             glob.glob(os.path.join(sequence_path, "voxels", "*.label"))
-         )
-         invalid_paths = sorted(
-             glob.glob(os.path.join(sequence_path, "voxels", "*.invalid"))
-         )
-         out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence)
-         os.makedirs(out_dir, exist_ok=True)
-
-         downscaling = {"1_1": 1, "1_8": 8}
-
-         for i in tqdm(range(len(label_paths))):
-
-             frame_id, extension = os.path.splitext(os.path.basename(label_paths[i]))
-
-             LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i])
-             INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i])
-             LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
-                 np.float32
-             )  # Remap to the 20 semanticKITTI SSC classes
-             LABEL[
-                 np.isclose(INVALID, 1)
-             ] = 255  # Set all voxels marked in the invalid mask to unknown
-             LABEL = LABEL.reshape([256, 256, 32])
-
-             for scale in downscaling:
-                 filename = frame_id + "_" + scale + ".npy"
-                 label_filename = os.path.join(out_dir, filename)
-                 # If files have not been created...
-                 if not os.path.exists(label_filename):
-                     if scale == "1_8":
-                         LABEL_ds = _downsample_label(
-                             LABEL, (256, 256, 32), downscaling[scale]
-                         )
-                     else:
-                         LABEL_ds = LABEL
-                     np.save(label_filename, LABEL_ds)
-                     print("wrote to", label_filename)
-
-
- if __name__ == "__main__":
-     main()
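majority_pooling above keeps, per k x k x k block, the most frequent label that is neither 0 (empty) nor 255 (invalid) whenever one exists. A toy check (illustrative only, not part of the commit):

import numpy as np

grid = np.array([[[1, 1], [2, 255]],
                 [[1, 0], [2, 2]]], dtype=np.uint8)
print(majority_pooling(grid, k_size=2))  # [[[1.]]]: labels 1 and 2 tie, np.argmax keeps the first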
monoscene/data/semantic_kitti/semantic-kitti.yaml DELETED
@@ -1,213 +0,0 @@
- # This file is covered by the LICENSE file in the root of this project.
- nbr_classes: 20
- grid_dims: [256, 32, 256] # (W, H, D)
- labels:
-   0 : "unlabeled"
-   1 : "outlier"
-   10: "car"
-   11: "bicycle"
-   13: "bus"
-   15: "motorcycle"
-   16: "on-rails"
-   18: "truck"
-   20: "other-vehicle"
-   30: "person"
-   31: "bicyclist"
-   32: "motorcyclist"
-   40: "road"
-   44: "parking"
-   48: "sidewalk"
-   49: "other-ground"
-   50: "building"
-   51: "fence"
-   52: "other-structure"
-   60: "lane-marking"
-   70: "vegetation"
-   71: "trunk"
-   72: "terrain"
-   80: "pole"
-   81: "traffic-sign"
-   99: "other-object"
-   252: "moving-car"
-   253: "moving-bicyclist"
-   254: "moving-person"
-   255: "moving-motorcyclist"
-   256: "moving-on-rails"
-   257: "moving-bus"
-   258: "moving-truck"
-   259: "moving-other-vehicle"
- color_map: # bgr
-   0 : [0, 0, 0]
-   1 : [0, 0, 255]
-   10: [245, 150, 100]
-   11: [245, 230, 100]
-   13: [250, 80, 100]
-   15: [150, 60, 30]
-   16: [255, 0, 0]
-   18: [180, 30, 80]
-   20: [255, 0, 0]
-   30: [30, 30, 255]
-   31: [200, 40, 255]
-   32: [90, 30, 150]
-   40: [255, 0, 255]
-   44: [255, 150, 255]
-   48: [75, 0, 75]
-   49: [75, 0, 175]
-   50: [0, 200, 255]
-   51: [50, 120, 255]
-   52: [0, 150, 255]
-   60: [170, 255, 150]
-   70: [0, 175, 0]
-   71: [0, 60, 135]
-   72: [80, 240, 150]
-   80: [150, 240, 255]
-   81: [0, 0, 255]
-   99: [255, 255, 50]
-   252: [245, 150, 100]
-   256: [255, 0, 0]
-   253: [200, 40, 255]
-   254: [30, 30, 255]
-   255: [90, 30, 150]
-   257: [250, 80, 100]
-   258: [180, 30, 80]
-   259: [255, 0, 0]
- content: # as a ratio with the total number of points
-   0: 0.018889854628292943
-   1: 0.0002937197336781505
-   10: 0.040818519255974316
-   11: 0.00016609538710764618
-   13: 2.7879693665067774e-05
-   15: 0.00039838616015114444
-   16: 0.0
-   18: 0.0020633612104619787
-   20: 0.0016218197275284021
-   30: 0.00017698551338515307
-   31: 1.1065903904919655e-08
-   32: 5.532951952459828e-09
-   40: 0.1987493871255525
-   44: 0.014717169549888214
-   48: 0.14392298360372
-   49: 0.0039048553037472045
-   50: 0.1326861944777486
-   51: 0.0723592229456223
-   52: 0.002395131480328884
-   60: 4.7084144280367186e-05
-   70: 0.26681502148037506
-   71: 0.006035012012626033
-   72: 0.07814222006271769
-   80: 0.002855498193863172
-   81: 0.0006155958086189918
-   99: 0.009923127583046915
-   252: 0.001789309418528068
-   253: 0.00012709999297008662
-   254: 0.00016059776092534436
-   255: 3.745553104802113e-05
-   256: 0.0
-   257: 0.00011351574470342043
-   258: 0.00010157861367183268
-   259: 4.3840131989471124e-05
- # classes that are indistinguishable from single scan or inconsistent in
- # ground truth are mapped to their closest equivalent
- learning_map:
-   0 : 0    # "unlabeled"
-   1 : 0    # "outlier" mapped to "unlabeled" --------------------------mapped
-   10: 1    # "car"
-   11: 2    # "bicycle"
-   13: 5    # "bus" mapped to "other-vehicle" --------------------------mapped
-   15: 3    # "motorcycle"
-   16: 5    # "on-rails" mapped to "other-vehicle" ---------------------mapped
-   18: 4    # "truck"
-   20: 5    # "other-vehicle"
-   30: 6    # "person"
-   31: 7    # "bicyclist"
-   32: 8    # "motorcyclist"
-   40: 9    # "road"
-   44: 10   # "parking"
-   48: 11   # "sidewalk"
-   49: 12   # "other-ground"
-   50: 13   # "building"
-   51: 14   # "fence"
-   52: 0    # "other-structure" mapped to "unlabeled" ------------------mapped
-   60: 9    # "lane-marking" to "road" ---------------------------------mapped
-   70: 15   # "vegetation"
-   71: 16   # "trunk"
-   72: 17   # "terrain"
-   80: 18   # "pole"
-   81: 19   # "traffic-sign"
-   99: 0    # "other-object" to "unlabeled" ----------------------------mapped
-   252: 1   # "moving-car" to "car" ------------------------------------mapped
-   253: 7   # "moving-bicyclist" to "bicyclist" ------------------------mapped
-   254: 6   # "moving-person" to "person" ------------------------------mapped
-   255: 8   # "moving-motorcyclist" to "motorcyclist" ------------------mapped
-   256: 5   # "moving-on-rails" mapped to "other-vehicle" --------------mapped
-   257: 5   # "moving-bus" mapped to "other-vehicle" -------------------mapped
-   258: 4   # "moving-truck" to "truck" --------------------------------mapped
-   259: 5   # "moving-other"-vehicle to "other-vehicle" ----------------mapped
- learning_map_inv: # inverse of previous map
-   0: 0    # "unlabeled", and others ignored
-   1: 10   # "car"
-   2: 11   # "bicycle"
-   3: 15   # "motorcycle"
-   4: 18   # "truck"
-   5: 20   # "other-vehicle"
-   6: 30   # "person"
-   7: 31   # "bicyclist"
-   8: 32   # "motorcyclist"
-   9: 40   # "road"
-   10: 44  # "parking"
-   11: 48  # "sidewalk"
-   12: 49  # "other-ground"
-   13: 50  # "building"
-   14: 51  # "fence"
-   15: 70  # "vegetation"
-   16: 71  # "trunk"
-   17: 72  # "terrain"
-   18: 80  # "pole"
-   19: 81  # "traffic-sign"
- learning_ignore: # Ignore classes
-   0: True   # "unlabeled", and others ignored
-   1: False  # "car"
-   2: False  # "bicycle"
-   3: False  # "motorcycle"
-   4: False  # "truck"
-   5: False  # "other-vehicle"
-   6: False  # "person"
-   7: False  # "bicyclist"
-   8: False  # "motorcyclist"
-   9: False  # "road"
-   10: False # "parking"
-   11: False # "sidewalk"
-   12: False # "other-ground"
-   13: False # "building"
-   14: False # "fence"
-   15: False # "vegetation"
-   16: False # "trunk"
-   17: False # "terrain"
-   18: False # "pole"
-   19: False # "traffic-sign"
- split: # sequence numbers
-   train:
-     - 0
-     - 1
-     - 2
-     - 3
-     - 4
-     - 5
-     - 6
-     - 7
-     - 9
-     - 10
-   valid:
-     - 8
-   test:
-     - 11
-     - 12
-     - 13
-     - 14
-     - 15
-     - 16
-     - 17
-     - 18
-     - 19
-     - 20
-     - 21
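For context, configs like this one are usually consumed by building a lookup table over the raw label ids — the `remap_lut` indexed in the deleted preprocess.py above. A hedged sketch (the repo's own helper lives in the deleted io_data.py, which is not shown in this diff, so treat the details as assumptions):

import numpy as np
import yaml

# Assumed path; point this at wherever semantic-kitti.yaml lives.
with open("semantic-kitti.yaml") as f:
    cfg = yaml.safe_load(f)

# Size the LUT past the largest raw id so raw labels can index it directly.
maxkey = max(cfg["learning_map"].keys())
remap_lut = np.zeros(maxkey + 100, dtype=np.int32)
remap_lut[list(cfg["learning_map"].keys())] = list(cfg["learning_map"].values())

# Then, as in the deleted preprocess.py:
# LABEL = remap_lut[LABEL.astype(np.uint16)]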
monoscene/data/utils/fusion.py DELETED
@@ -1,507 +0,0 @@
- """
- Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
-
- @inproceedings{zeng20163dmatch,
-     title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
-     author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
-     booktitle={CVPR},
-     year={2017}
- }
- """
-
- import numpy as np
-
- from numba import njit, prange
- from skimage import measure
-
- FUSION_GPU_MODE = 0
-
-
- class TSDFVolume:
-     """Volumetric TSDF Fusion of RGB-D Images."""
-
-     def __init__(self, vol_bnds, voxel_size, use_gpu=True):
-         """Constructor.
-
-         Args:
-             vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
-                 xyz bounds (min/max) in meters.
-             voxel_size (float): The volume discretization in meters.
-         """
-         vol_bnds = np.asarray(vol_bnds)
-         assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
-
-         # Define voxel volume parameters
-         self._vol_bnds = vol_bnds
-         self._voxel_size = float(voxel_size)
-         self._trunc_margin = 5 * self._voxel_size  # truncation on SDF
-         # self._trunc_margin = 10  # truncation on SDF
-         self._color_const = 256 * 256
-
-         # Adjust volume bounds and ensure C-order contiguous
-         self._vol_dim = (
-             np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
-             .copy(order="C")
-             .astype(int)
-         )
-         self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
-         self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
-
-         print(
-             "Voxel volume size: {} x {} x {} - # points: {:,}".format(
-                 self._vol_dim[0],
-                 self._vol_dim[1],
-                 self._vol_dim[2],
-                 self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
-             )
-         )
-
-         # Initialize pointers to voxel volume in CPU memory
-         self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-         # for computing the cumulative moving average of observations per voxel
-         self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-         self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-
-         self.gpu_mode = use_gpu and FUSION_GPU_MODE
-
-         # Copy voxel volumes to GPU
-         if self.gpu_mode:
-             self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
-             self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
-             self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
-
-             # Cuda kernel function (C++)
-             self._cuda_src_mod = SourceModule(
-                 """
-                 __global__ void integrate(float * tsdf_vol,
-                                           float * weight_vol,
-                                           float * color_vol,
-                                           float * vol_dim,
-                                           float * vol_origin,
-                                           float * cam_intr,
-                                           float * cam_pose,
-                                           float * other_params,
-                                           float * color_im,
-                                           float * depth_im) {
-                     // Get voxel index
-                     int gpu_loop_idx = (int) other_params[0];
-                     int max_threads_per_block = blockDim.x;
-                     int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
-                     int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
-                     int vol_dim_x = (int) vol_dim[0];
-                     int vol_dim_y = (int) vol_dim[1];
-                     int vol_dim_z = (int) vol_dim[2];
-                     if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
-                         return;
-                     // Get voxel grid coordinates (note: be careful when casting)
-                     float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
-                     float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
-                     float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
-                     // Voxel grid coordinates to world coordinates
-                     float voxel_size = other_params[1];
-                     float pt_x = vol_origin[0]+voxel_x*voxel_size;
-                     float pt_y = vol_origin[1]+voxel_y*voxel_size;
-                     float pt_z = vol_origin[2]+voxel_z*voxel_size;
-                     // World coordinates to camera coordinates
-                     float tmp_pt_x = pt_x-cam_pose[0*4+3];
-                     float tmp_pt_y = pt_y-cam_pose[1*4+3];
-                     float tmp_pt_z = pt_z-cam_pose[2*4+3];
-                     float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
-                     float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
-                     float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
-                     // Camera coordinates to image pixels
-                     int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
-                     int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
-                     // Skip if outside view frustum
-                     int im_h = (int) other_params[2];
-                     int im_w = (int) other_params[3];
-                     if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
-                         return;
-                     // Skip invalid depth
-                     float depth_value = depth_im[pixel_y*im_w+pixel_x];
-                     if (depth_value == 0)
-                         return;
-                     // Integrate TSDF
-                     float trunc_margin = other_params[4];
-                     float depth_diff = depth_value-cam_pt_z;
-                     if (depth_diff < -trunc_margin)
-                         return;
-                     float dist = fmin(1.0f,depth_diff/trunc_margin);
-                     float w_old = weight_vol[voxel_idx];
-                     float obs_weight = other_params[5];
-                     float w_new = w_old + obs_weight;
-                     weight_vol[voxel_idx] = w_new;
-                     tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
-                     // Integrate color
-                     float old_color = color_vol[voxel_idx];
-                     float old_b = floorf(old_color/(256*256));
-                     float old_g = floorf((old_color-old_b*256*256)/256);
-                     float old_r = old_color-old_b*256*256-old_g*256;
-                     float new_color = color_im[pixel_y*im_w+pixel_x];
-                     float new_b = floorf(new_color/(256*256));
-                     float new_g = floorf((new_color-new_b*256*256)/256);
-                     float new_r = new_color-new_b*256*256-new_g*256;
-                     new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
-                     new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
-                     new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
-                     color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
-                 }"""
-             )
-
-             self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
-
-             # Determine block/grid size on GPU
-             gpu_dev = cuda.Device(0)
-             self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
-             n_blocks = int(
-                 np.ceil(
-                     float(np.prod(self._vol_dim))
-                     / float(self._max_gpu_threads_per_block)
-                 )
-             )
-             grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
-             grid_dim_y = min(
-                 gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
-             )
-             grid_dim_z = min(
-                 gpu_dev.MAX_GRID_DIM_Z,
-                 int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
-             )
-             self._max_gpu_grid_dim = np.array(
-                 [grid_dim_x, grid_dim_y, grid_dim_z]
-             ).astype(int)
-             self._n_gpu_loops = int(
-                 np.ceil(
-                     float(np.prod(self._vol_dim))
-                     / float(
-                         np.prod(self._max_gpu_grid_dim)
-                         * self._max_gpu_threads_per_block
-                     )
-                 )
-             )
-
-         else:
-             # Get voxel grid coordinates
-             xv, yv, zv = np.meshgrid(
-                 range(self._vol_dim[0]),
-                 range(self._vol_dim[1]),
-                 range(self._vol_dim[2]),
-                 indexing="ij",
-             )
-             self.vox_coords = (
-                 np.concatenate(
-                     [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
-                 )
-                 .astype(int)
-                 .T
-             )
-
-     @staticmethod
-     @njit(parallel=True)
-     def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
-         """Convert voxel grid coordinates to world coordinates."""
-         vol_origin = vol_origin.astype(np.float32)
-         vox_coords = vox_coords.astype(np.float32)
-         # print(np.min(vox_coords))
-         cam_pts = np.empty_like(vox_coords, dtype=np.float32)
-
-         for i in prange(vox_coords.shape[0]):
-             for j in range(3):
-                 cam_pts[i, j] = (
-                     vol_origin[j]
-                     + (vox_size * vox_coords[i, j])
-                     + vox_size * offsets[j]
-                 )
-         return cam_pts
-
-     @staticmethod
-     @njit(parallel=True)
-     def cam2pix(cam_pts, intr):
-         """Convert camera coordinates to pixel coordinates."""
-         intr = intr.astype(np.float32)
-         fx, fy = intr[0, 0], intr[1, 1]
-         cx, cy = intr[0, 2], intr[1, 2]
-         pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
-         for i in prange(cam_pts.shape[0]):
-             pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
-             pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
-         return pix
-
-     @staticmethod
-     @njit(parallel=True)
-     def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
-         """Integrate the TSDF volume."""
-         tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
-         # print(tsdf_vol.shape)
-         w_new = np.empty_like(w_old, dtype=np.float32)
-         for i in prange(len(tsdf_vol)):
-             w_new[i] = w_old[i] + obs_weight
-             tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
-         return tsdf_vol_int, w_new
-
-     def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
-         """Integrate an RGB-D frame into the TSDF volume.
-
-         Args:
-             color_im (ndarray): An RGB image of shape (H, W, 3).
-             depth_im (ndarray): A depth image of shape (H, W).
-             cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
-             cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
-             obs_weight (float): The weight to assign for the current observation. A higher
-                 value
-         """
-         im_h, im_w = depth_im.shape
-
-         # Fold RGB color image into a single channel image
-         color_im = color_im.astype(np.float32)
-         color_im = np.floor(
-             color_im[..., 2] * self._color_const
-             + color_im[..., 1] * 256
-             + color_im[..., 0]
-         )
-
-         if self.gpu_mode:  # GPU mode: integrate voxel volume (calls CUDA kernel)
-             for gpu_loop_idx in range(self._n_gpu_loops):
-                 self._cuda_integrate(
-                     self._tsdf_vol_gpu,
-                     self._weight_vol_gpu,
-                     self._color_vol_gpu,
-                     cuda.InOut(self._vol_dim.astype(np.float32)),
-                     cuda.InOut(self._vol_origin.astype(np.float32)),
-                     cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
-                     cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
-                     cuda.InOut(
-                         np.asarray(
-                             [
-                                 gpu_loop_idx,
-                                 self._voxel_size,
-                                 im_h,
-                                 im_w,
-                                 self._trunc_margin,
-                                 obs_weight,
-                             ],
-                             np.float32,
-                         )
-                     ),
-                     cuda.InOut(color_im.reshape(-1).astype(np.float32)),
-                     cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
-                     block=(self._max_gpu_threads_per_block, 1, 1),
-                     grid=(
-                         int(self._max_gpu_grid_dim[0]),
-                         int(self._max_gpu_grid_dim[1]),
-                         int(self._max_gpu_grid_dim[2]),
-                     ),
-                 )
-         else:  # CPU mode: integrate voxel volume (vectorized implementation)
-             # Convert voxel grid coordinates to pixel coordinates
-             cam_pts = self.vox2world(
-                 self._vol_origin, self.vox_coords, self._voxel_size
-             )
-             cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
-             pix_z = cam_pts[:, 2]
-             pix = self.cam2pix(cam_pts, cam_intr)
-             pix_x, pix_y = pix[:, 0], pix[:, 1]
-
-             # Eliminate pixels outside view frustum
-             valid_pix = np.logical_and(
-                 pix_x >= 0,
-                 np.logical_and(
-                     pix_x < im_w,
-                     np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
-                 ),
-             )
-             depth_val = np.zeros(pix_x.shape)
-             depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
-
-             # Integrate TSDF
-             depth_diff = depth_val - pix_z
-
-             valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
-             dist = depth_diff
-
-             valid_vox_x = self.vox_coords[valid_pts, 0]
-             valid_vox_y = self.vox_coords[valid_pts, 1]
-             valid_vox_z = self.vox_coords[valid_pts, 2]
-             w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             valid_dist = dist[valid_pts]
-             tsdf_vol_new, w_new = self.integrate_tsdf(
-                 tsdf_vals, valid_dist, w_old, obs_weight
-             )
-             self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
-             self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
-
-             # Integrate color
-             old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             old_b = np.floor(old_color / self._color_const)
-             old_g = np.floor((old_color - old_b * self._color_const) / 256)
-             old_r = old_color - old_b * self._color_const - old_g * 256
-             new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
-             new_b = np.floor(new_color / self._color_const)
-             new_g = np.floor((new_color - new_b * self._color_const) / 256)
-             new_r = new_color - new_b * self._color_const - new_g * 256
-             new_b = np.minimum(
-                 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
-             )
-             new_g = np.minimum(
-                 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
-             )
-             new_r = np.minimum(
-                 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
-             )
-             self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
-                 new_b * self._color_const + new_g * 256 + new_r
-             )
-
-     def get_volume(self):
-         if self.gpu_mode:
-             cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
-             cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
-         return self._tsdf_vol_cpu, self._color_vol_cpu
-
-     def get_point_cloud(self):
-         """Extract a point cloud from the voxel volume."""
-         tsdf_vol, color_vol = self.get_volume()
-
-         # Marching cubes
-         verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
-         verts_ind = np.round(verts).astype(int)
-         verts = verts * self._voxel_size + self._vol_origin
-
-         # Get vertex colors
-         rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
-         colors_b = np.floor(rgb_vals / self._color_const)
-         colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
-         colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
-         colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
-         colors = colors.astype(np.uint8)
-
-         pc = np.hstack([verts, colors])
-         return pc
-
-     def get_mesh(self):
-         """Compute a mesh from the voxel volume using marching cubes."""
-         tsdf_vol, color_vol = self.get_volume()
-
-         # Marching cubes
-         verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
-         verts_ind = np.round(verts).astype(int)
-         verts = (
-             verts * self._voxel_size + self._vol_origin
-         )  # voxel grid coordinates to world coordinates
-
-         # Get vertex colors
-         rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
-         colors_b = np.floor(rgb_vals / self._color_const)
-         colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
-         colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
-         colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
-         colors = colors.astype(np.uint8)
-         return verts, faces, norms, colors
-
-
- def rigid_transform(xyz, transform):
-     """Applies a rigid transform to an (N, 3) pointcloud."""
-     xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
-     xyz_t_h = np.dot(transform, xyz_h.T).T
-     return xyz_t_h[:, :3]
-
-
- def get_view_frustum(depth_im, cam_intr, cam_pose):
-     """Get corners of 3D camera view frustum of depth image"""
-     im_h = depth_im.shape[0]
-     im_w = depth_im.shape[1]
-     max_depth = np.max(depth_im)
-     view_frust_pts = np.array(
-         [
-             (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
-             * np.array([0, max_depth, max_depth, max_depth, max_depth])
-             / cam_intr[0, 0],
-             (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
-             * np.array([0, max_depth, max_depth, max_depth, max_depth])
-             / cam_intr[1, 1],
-             np.array([0, max_depth, max_depth, max_depth, max_depth]),
-         ]
-     )
-     view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
-     return view_frust_pts
-
-
- def meshwrite(filename, verts, faces, norms, colors):
-     """Save a 3D mesh to a polygon .ply file."""
-     # Write header
-     ply_file = open(filename, "w")
-     ply_file.write("ply\n")
-     ply_file.write("format ascii 1.0\n")
-     ply_file.write("element vertex %d\n" % (verts.shape[0]))
-     ply_file.write("property float x\n")
-     ply_file.write("property float y\n")
-     ply_file.write("property float z\n")
-     ply_file.write("property float nx\n")
-     ply_file.write("property float ny\n")
-     ply_file.write("property float nz\n")
-     ply_file.write("property uchar red\n")
-     ply_file.write("property uchar green\n")
-     ply_file.write("property uchar blue\n")
-     ply_file.write("element face %d\n" % (faces.shape[0]))
-     ply_file.write("property list uchar int vertex_index\n")
-     ply_file.write("end_header\n")
-
-     # Write vertex list
-     for i in range(verts.shape[0]):
-         ply_file.write(
-             "%f %f %f %f %f %f %d %d %d\n"
-             % (
-                 verts[i, 0],
-                 verts[i, 1],
-                 verts[i, 2],
-                 norms[i, 0],
-                 norms[i, 1],
-                 norms[i, 2],
-                 colors[i, 0],
-                 colors[i, 1],
-                 colors[i, 2],
-             )
-         )
-
-     # Write face list
-     for i in range(faces.shape[0]):
-         ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
-
-     ply_file.close()
-
-
- def pcwrite(filename, xyzrgb):
-     """Save a point cloud to a polygon .ply file."""
-     xyz = xyzrgb[:, :3]
-     rgb = xyzrgb[:, 3:].astype(np.uint8)
-
-     # Write header
-     ply_file = open(filename, "w")
-     ply_file.write("ply\n")
-     ply_file.write("format ascii 1.0\n")
-     ply_file.write("element vertex %d\n" % (xyz.shape[0]))
-     ply_file.write("property float x\n")
-     ply_file.write("property float y\n")
-     ply_file.write("property float z\n")
-     ply_file.write("property uchar red\n")
-     ply_file.write("property uchar green\n")
-     ply_file.write("property uchar blue\n")
-     ply_file.write("end_header\n")
-
-     # Write vertex list
-     for i in range(xyz.shape[0]):
-         ply_file.write(
-             "%f %f %f %d %d %d\n"
-             % (
-                 xyz[i, 0],
-                 xyz[i, 1],
-                 xyz[i, 2],
-                 rgb[i, 0],
-                 rgb[i, 1],
-                 rgb[i, 2],
-             )
-         )
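The deleted module is driven roughly as follows. A usage sketch in CPU mode, where `frames` and the intrinsics file are placeholders, not artifacts of this repo:

import numpy as np
# import fusion  # i.e. the module deleted above

cam_intr = np.loadtxt("camera-intrinsics.txt")  # assumed 3x3 intrinsics file
vol_bnds = np.array([[0.0, 4.8], [0.0, 4.8], [0.0, 2.88]])  # NYU-sized scene

tsdf = fusion.TSDFVolume(vol_bnds, voxel_size=0.02, use_gpu=False)
for color_im, depth_im, cam_pose in frames:  # placeholder RGB-D sequence
    tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)

verts, faces, norms, colors = tsdf.get_mesh()
fusion.meshwrite("mesh.ply", verts, faces, norms, colors)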
monoscene/data/utils/helpers.py DELETED
@@ -1,185 +0,0 @@
- import numpy as np
- import monoscene.data.utils.fusion as fusion
- import torch
-
-
- def compute_CP_mega_matrix(target, is_binary=False):
-     """
-     Parameters
-     ---------
-     target: (H, W, D)
-         contains voxels semantic labels
-
-     is_binary: bool
-         if True, return binary voxels relations else return 4-way relations
-     """
-     label = target.reshape(-1)
-     label_row = label
-     N = label.shape[0]
-     super_voxel_size = [i//2 for i in target.shape]
-     if is_binary:
-         matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
-     else:
-         matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
-
-     for xx in range(super_voxel_size[0]):
-         for yy in range(super_voxel_size[1]):
-             for zz in range(super_voxel_size[2]):
-                 col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz
-                 label_col_megas = np.array([
-                     target[xx * 2, yy * 2, zz * 2],
-                     target[xx * 2 + 1, yy * 2, zz * 2],
-                     target[xx * 2, yy * 2 + 1, zz * 2],
-                     target[xx * 2, yy * 2, zz * 2 + 1],
-                     target[xx * 2 + 1, yy * 2 + 1, zz * 2],
-                     target[xx * 2 + 1, yy * 2, zz * 2 + 1],
-                     target[xx * 2, yy * 2 + 1, zz * 2 + 1],
-                     target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1],
-                 ])
-                 label_col_megas = label_col_megas[label_col_megas != 255]
-                 for label_col_mega in label_col_megas:
-                     label_col = np.ones(N) * label_col_mega
-                     if not is_binary:
-                         matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0  # non non same
-                         matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0  # non non diff
-                         matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0  # empty empty
-                         matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0  # nonempty empty
-                     else:
-                         matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0  # diff
-                         matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0  # same
-     return matrix
-
-
- def vox2pix(cam_E, cam_k,
-             vox_origin, voxel_size,
-             img_W, img_H,
-             scene_size):
-     """
-     compute the 2D projection of voxels centroids
-
-     Parameters:
-     ----------
-     cam_E: 4x4
-         =camera pose in case of NYUv2 dataset
-         =Transformation from camera to lidar coordinate in case of SemKITTI
-     cam_k: 3x3
-         camera intrinsics
-     vox_origin: (3,)
-         world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0)
-     img_W: int
-         image width
-     img_H: int
-         image height
-     scene_size: (3,)
-         scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
-
-     Returns
-     -------
-     projected_pix: (N, 2)
-         Projected 2D positions of voxels
-     fov_mask: (N,)
-         Voxels mask indice voxels inside image's FOV
-     pix_z: (N,)
-         Voxels'distance to the sensor in meter
-     """
-     # Compute the x, y, z bounding of the scene in meter
-     vol_bnds = np.zeros((3,2))
-     vol_bnds[:,0] = vox_origin
-     vol_bnds[:,1] = vox_origin + np.array(scene_size)
-
-     # Compute the voxels centroids in lidar cooridnates
-     vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
-     xv, yv, zv = np.meshgrid(
-         range(vol_dim[0]),
-         range(vol_dim[1]),
-         range(vol_dim[2]),
-         indexing='ij'
-     )
-     vox_coords = np.concatenate([
-         xv.reshape(1,-1),
-         yv.reshape(1,-1),
-         zv.reshape(1,-1)
-     ], axis=0).astype(int).T
-
-     # Project voxels'centroid from lidar coordinates to camera coordinates
-     cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
-     cam_pts = fusion.rigid_transform(cam_pts, cam_E)
-
-     # Project camera coordinates to pixel positions
-     projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
-     pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
-
-     # Eliminate pixels outside view frustum
-     pix_z = cam_pts[:, 2]
-     fov_mask = np.logical_and(pix_x >= 0,
-                 np.logical_and(pix_x < img_W,
-                 np.logical_and(pix_y >= 0,
-                 np.logical_and(pix_y < img_H,
-                 pix_z > 0))))
-
-
-     return projected_pix, fov_mask, pix_z
-
-
- def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, pix_z):
-     valid_pix = np.logical_and(pix_x >= min_x,
-                 np.logical_and(pix_x < max_x,
-                 np.logical_and(pix_y >= min_y,
-                 np.logical_and(pix_y < max_y,
-                 pix_z > 0))))
-     return valid_pix
-
- def compute_local_frustums(projected_pix, pix_z, target, img_W, img_H, dataset, n_classes, size=4):
-     """
-     Compute the local frustums mask and their class frequencies
-
-     Parameters:
-     ----------
-     projected_pix: (N, 2)
-         2D projected pix of all voxels
-     pix_z: (N,)
-         Distance of the camera sensor to voxels
-     target: (H, W, D)
-         Voxelized sematic labels
-     img_W: int
-         Image width
-     img_H: int
-         Image height
-     dataset: str
-         ="NYU" or "kitti" (for both SemKITTI and KITTI-360)
-     n_classes: int
-         Number of classes (12 for NYU and 20 for SemKITTI)
-     size: int
-         determine the number of local frustums i.e. size * size
-
-     Returns
-     -------
-     frustums_masks: (n_frustums, N)
-         List of frustums_masks, each indicates the belonging voxels
-     frustums_class_dists: (n_frustums, n_classes)
-         Contains the class frequencies in each frustum
-     """
-     H, W, D = target.shape
-     ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)]
-     local_frustum_masks = []
-     local_frustum_class_dists = []
-     pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
-     for y in ranges:
-         for x in ranges:
-             start_x = x[0] * img_W
-             end_x = x[1] * img_W
-             start_y = y[0] * img_H
-             end_y = y[1] * img_H
-             local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, pix_z)
-             if dataset == "NYU":
-                 mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1])
-             elif dataset == "kitti":
-                 mask = (target != 255) & local_frustum.reshape(H, W, D)
-
-             local_frustum_masks.append(mask)
-             classes, cnts = np.unique(target[mask], return_counts=True)
-             class_counts = np.zeros(n_classes)
-             class_counts[classes.astype(int)] = cnts
-             local_frustum_class_dists.append(class_counts)
-     frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists)
-     return frustums_masks, frustums_class_dists
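A sketch of a `vox2pix` call at SemanticKITTI scale, using the sizes quoted in the docstring above; the intrinsics and extrinsics here are placeholder values, not calibration shipped with this repo:

import numpy as np

cam_E = np.eye(4, dtype=np.float32)  # placeholder camera->lidar transform
cam_k = np.array([[718.856, 0.0, 607.19],   # placeholder KITTI-like intrinsics
                  [0.0, 718.856, 185.21],
                  [0.0, 0.0, 1.0]])
vox_origin = np.array([0.0, -25.6, -2.0])   # assumed lidar-frame grid origin

projected_pix, fov_mask, pix_z = vox2pix(
    cam_E, cam_k, vox_origin,
    voxel_size=0.2, img_W=1220, img_H=370,
    scene_size=(51.2, 51.2, 6.4),
)
# projected_pix[fov_mask] are the voxel centroids that land inside the image.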
monoscene/data/utils/torch_util.py DELETED
@@ -1,15 +0,0 @@
- import numpy as np
- import torch
-
-
- def worker_init_fn(worker_id):
-     """The function is designed for pytorch multi-process dataloader.
-     Note that we use the pytorch random generator to generate a base_seed.
-     Please try to be consistent.
-
-     References:
-         https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed
-
-     """
-     base_seed = torch.IntTensor(1).random_().item()
-     np.random.seed(base_seed + worker_id)
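For reference, this helper is meant to be passed straight to a DataLoader so each worker process reseeds NumPy independently (sketch; `dataset` is a placeholder Dataset instance):

from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,                        # placeholder Dataset
    batch_size=4,
    num_workers=3,
    worker_init_fn=worker_init_fn,  # each worker seeds with base_seed + worker_id
)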
monoscene/{models/flosp.py β†’ flosp.py} RENAMED
File without changes
monoscene/loss/CRP_loss.py DELETED
@@ -1,24 +0,0 @@
- import torch
-
-
- def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices):
-     logits = []
-     labels = []
-     bs, n_relations, _, _ = pred_logits.shape
-     for i in range(bs):
-         pred_logit = pred_logits[i, :, :, :].permute(
-             0, 2, 1
-         )  # n_relations, N, n_mega_voxels
-         CP_mega_matrix = CP_mega_matrices[i]  # n_relations, N, n_mega_voxels
-         logits.append(pred_logit.reshape(n_relations, -1))
-         labels.append(CP_mega_matrix.reshape(n_relations, -1))
-
-     logits = torch.cat(logits, dim=1).T  # M, 4
-     labels = torch.cat(labels, dim=1).T  # M, 4
-
-     cnt_neg = (labels == 0).sum(0)
-     cnt_pos = labels.sum(0)
-     pos_weight = cnt_neg / cnt_pos
-     criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
-     loss_bce = criterion(logits, labels.float())
-     return loss_bce
monoscene/loss/sscMetrics.py DELETED
@@ -1,204 +0,0 @@
- """
- Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py
- """
- import numpy as np
- from sklearn.metrics import accuracy_score, precision_recall_fscore_support
-
-
- def get_iou(iou_sum, cnt_class):
-     _C = iou_sum.shape[0]  # 12
-     iou = np.zeros(_C, dtype=np.float32)  # iou for each class
-     for idx in range(_C):
-         iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0
-
-     mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:])
-     return iou, mean_iou
-
-
- def get_accuracy(predict, target, weight=None):  # 0.05s
-     _bs = predict.shape[0]  # batch size
-     _C = predict.shape[1]  # _C = 12
-     target = np.int32(target)
-     target = target.reshape(_bs, -1)  # (_bs, 60*36*60) 129600
-     predict = predict.reshape(_bs, _C, -1)  # (_bs, _C, 60*36*60)
-     predict = np.argmax(
-         predict, axis=1
-     )  # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60.
-
-     correct = predict == target  # (_bs, 129600)
-     if weight:  # 0.04s, add class weights
-         weight_k = np.ones(target.shape)
-         for i in range(_bs):
-             for n in range(target.shape[1]):
-                 idx = 0 if target[i, n] == 255 else target[i, n]
-                 weight_k[i, n] = weight[idx]
-         correct = correct * weight_k
-     acc = correct.sum() / correct.size
-     return acc
-
-
- class SSCMetrics:
-     def __init__(self, n_classes):
-         self.n_classes = n_classes
-         self.reset()
-
-     def hist_info(self, n_cl, pred, gt):
-         assert pred.shape == gt.shape
-         k = (gt >= 0) & (gt < n_cl)  # exclude 255
-         labeled = np.sum(k)
-         correct = np.sum((pred[k] == gt[k]))
-
-         return (
-             np.bincount(
-                 n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2
-             ).reshape(n_cl, n_cl),
-             correct,
-             labeled,
-         )
-
-     @staticmethod
-     def compute_score(hist, correct, labeled):
-         iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
-         mean_IU = np.nanmean(iu)
-         mean_IU_no_back = np.nanmean(iu[1:])
-         freq = hist.sum(1) / hist.sum()
-         freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
-         mean_pixel_acc = correct / labeled if labeled != 0 else 0
-
-         return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
-
-     def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
-         self.count += 1
-         mask = y_true != 255
-         if nonempty is not None:
-             mask = mask & nonempty
-         if nonsurface is not None:
-             mask = mask & nonsurface
-         tp, fp, fn = self.get_score_completion(y_pred, y_true, mask)
-
-         self.completion_tp += tp
-         self.completion_fp += fp
-         self.completion_fn += fn
-
-         mask = y_true != 255
-         if nonempty is not None:
-             mask = mask & nonempty
-         tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
-             y_pred, y_true, mask
-         )
-         self.tps += tp_sum
-         self.fps += fp_sum
-         self.fns += fn_sum
-
-     def get_stats(self):
-         if self.completion_tp != 0:
-             precision = self.completion_tp / (self.completion_tp + self.completion_fp)
-             recall = self.completion_tp / (self.completion_tp + self.completion_fn)
-             iou = self.completion_tp / (
-                 self.completion_tp + self.completion_fp + self.completion_fn
-             )
-         else:
-             precision, recall, iou = 0, 0, 0
-         iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
-         return {
-             "precision": precision,
-             "recall": recall,
-             "iou": iou,
-             "iou_ssc": iou_ssc,
-             "iou_ssc_mean": np.mean(iou_ssc[1:]),
-         }
-
-     def reset(self):
-
-         self.completion_tp = 0
-         self.completion_fp = 0
-         self.completion_fn = 0
-         self.tps = np.zeros(self.n_classes)
-         self.fps = np.zeros(self.n_classes)
-         self.fns = np.zeros(self.n_classes)
-
-         self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
-         self.labeled_ssc = 0
-         self.correct_ssc = 0
-
-         self.precision = 0
-         self.recall = 0
-         self.iou = 0
-         self.count = 1e-8
-         self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
-         self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
-
-     def get_score_completion(self, predict, target, nonempty=None):
-         predict = np.copy(predict)
-         target = np.copy(target)
-
-         """for scene completion, treat the task as two-classes problem, just empty or occupancy"""
-         _bs = predict.shape[0]  # batch size
-         # ---- ignore
-         predict[target == 255] = 0
-         target[target == 255] = 0
-         # ---- flatten
-         target = target.reshape(_bs, -1)  # (_bs, 129600)
-         predict = predict.reshape(_bs, -1)  # (_bs, _C, 129600), 60*36*60=129600
-         # ---- treat all non-empty object class as one category, set them to label 1
-         b_pred = np.zeros(predict.shape)
-         b_true = np.zeros(target.shape)
-         b_pred[predict > 0] = 1
-         b_true[target > 0] = 1
-         p, r, iou = 0.0, 0.0, 0.0
-         tp_sum, fp_sum, fn_sum = 0, 0, 0
-         for idx in range(_bs):
-             y_true = b_true[idx, :]  # GT
-             y_pred = b_pred[idx, :]
-             if nonempty is not None:
-                 nonempty_idx = nonempty[idx, :].reshape(-1)
-                 y_true = y_true[nonempty_idx == 1]
-                 y_pred = y_pred[nonempty_idx == 1]
-
-             tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
-             fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
-             fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
-             tp_sum += tp
-             fp_sum += fp
-             fn_sum += fn
-         return tp_sum, fp_sum, fn_sum
-
-     def get_score_semantic_and_completion(self, predict, target, nonempty=None):
-         target = np.copy(target)
-         predict = np.copy(predict)
-         _bs = predict.shape[0]  # batch size
-         _C = self.n_classes  # _C = 12
-         # ---- ignore
-         predict[target == 255] = 0
-         target[target == 255] = 0
-         # ---- flatten
-         target = target.reshape(_bs, -1)  # (_bs, 129600)
-         predict = predict.reshape(_bs, -1)  # (_bs, 129600), 60*36*60=129600
-
-         cnt_class = np.zeros(_C, dtype=np.int32)  # count for each class
-         iou_sum = np.zeros(_C, dtype=np.float32)  # sum of iou for each class
-         tp_sum = np.zeros(_C, dtype=np.int32)  # tp
-         fp_sum = np.zeros(_C, dtype=np.int32)  # fp
-         fn_sum = np.zeros(_C, dtype=np.int32)  # fn
-
-         for idx in range(_bs):
-             y_true = target[idx, :]  # GT
-             y_pred = predict[idx, :]
-             if nonempty is not None:
-                 nonempty_idx = nonempty[idx, :].reshape(-1)
-                 y_pred = y_pred[
-                     np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
-                 ]
-                 y_true = y_true[
-                     np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
-                 ]
-             for j in range(_C):  # for each class
-                 tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
-                 fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
-                 fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
-
-                 tp_sum[j] += tp
-                 fp_sum[j] += fp
-                 fn_sum[j] += fn
-
-         return tp_sum, fp_sum, fn_sum
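A sketch of the evaluation loop this class supported, mirroring the `step()` that the monoscene.py diff below removes. `model` and `val_loader` are placeholders, and the `"ssc_logit"` key assumes the pre-commit `forward()` that returned a dict of logits:

import numpy as np

metric = SSCMetrics(n_classes=20)
for batch in val_loader:                    # placeholder dataloader
    out_dict = model(batch)                 # pre-commit forward() (assumption)
    y_pred = np.argmax(out_dict["ssc_logit"].detach().cpu().numpy(), axis=1)
    y_true = batch["target"].cpu().numpy()
    metric.add_batch(y_pred, y_true)

stats = metric.get_stats()                  # precision / recall / IoU / per-class IoU
print(stats["iou"], stats["iou_ssc_mean"])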
monoscene/loss/ssc_loss.py DELETED
@@ -1,99 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- def KL_sep(p, target):
-     """
-     KL divergence on nonzeros classes
-     """
-     nonzeros = target != 0
-     nonzero_p = p[nonzeros]
-     kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
-     return kl_term
-
-
- def geo_scal_loss(pred, ssc_target):
-
-     # Get softmax probabilities
-     pred = F.softmax(pred, dim=1)
-
-     # Compute empty and nonempty probabilities
-     empty_probs = pred[:, 0, :, :, :]
-     nonempty_probs = 1 - empty_probs
-
-     # Remove unknown voxels
-     mask = ssc_target != 255
-     nonempty_target = ssc_target != 0
-     nonempty_target = nonempty_target[mask].float()
-     nonempty_probs = nonempty_probs[mask]
-     empty_probs = empty_probs[mask]
-
-     intersection = (nonempty_target * nonempty_probs).sum()
-     precision = intersection / nonempty_probs.sum()
-     recall = intersection / nonempty_target.sum()
-     spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum()
-     return (
-         F.binary_cross_entropy(precision, torch.ones_like(precision))
-         + F.binary_cross_entropy(recall, torch.ones_like(recall))
-         + F.binary_cross_entropy(spec, torch.ones_like(spec))
-     )
-
-
- def sem_scal_loss(pred, ssc_target):
-     # Get softmax probabilities
-     pred = F.softmax(pred, dim=1)
-     loss = 0
-     count = 0
-     mask = ssc_target != 255
-     n_classes = pred.shape[1]
-     for i in range(0, n_classes):
-
-         # Get probability of class i
-         p = pred[:, i, :, :, :]
-
-         # Remove unknown voxels
-         target_ori = ssc_target
-         p = p[mask]
-         target = ssc_target[mask]
-
-         completion_target = torch.ones_like(target)
-         completion_target[target != i] = 0
-         completion_target_ori = torch.ones_like(target_ori).float()
-         completion_target_ori[target_ori != i] = 0
-         if torch.sum(completion_target) > 0:
-             count += 1.0
-             nominator = torch.sum(p * completion_target)
-             loss_class = 0
-             if torch.sum(p) > 0:
-                 precision = nominator / (torch.sum(p))
-                 loss_precision = F.binary_cross_entropy(
-                     precision, torch.ones_like(precision)
-                 )
-                 loss_class += loss_precision
-             if torch.sum(completion_target) > 0:
-                 recall = nominator / (torch.sum(completion_target))
-                 loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
-                 loss_class += loss_recall
-             if torch.sum(1 - completion_target) > 0:
-                 specificity = torch.sum((1 - p) * (1 - completion_target)) / (
-                     torch.sum(1 - completion_target)
-                 )
-                 loss_specificity = F.binary_cross_entropy(
-                     specificity, torch.ones_like(specificity)
-                 )
-                 loss_class += loss_specificity
-             loss += loss_class
-     return loss / count
-
-
- def CE_ssc_loss(pred, target, class_weights):
-     """
-     :param: prediction: the predicted tensor, must be [BS, C, H, W, D]
-     """
-     criterion = nn.CrossEntropyLoss(
-         weight=class_weights, ignore_index=255, reduction="mean"
-     )
-     loss = criterion(pred, target.long())
-
-     return loss
monoscene/{models/modules.py β†’ modules.py} RENAMED
@@ -1,6 +1,6 @@
  import torch
  import torch.nn as nn
- from monoscene.models.DDR import Bottleneck3D
+ from monoscene.DDR import Bottleneck3D


  class ASPP(nn.Module):
monoscene/{models/monoscene.py β†’ monoscene.py} RENAMED
@@ -1,32 +1,26 @@
  import pytorch_lightning as pl
  import torch
  import torch.nn as nn
- from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU
- from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti
- from monoscene.loss.sscMetrics import SSCMetrics
- from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss
- from monoscene.models.flosp import FLoSP
- from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss
+ from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+ from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+ from monoscene.flosp import FLoSP
  import numpy as np
  import torch.nn.functional as F
- from monoscene.models.unet2d import UNet2D
- from torch.optim.lr_scheduler import MultiStepLR
+ from monoscene.unet2d import UNet2D


  class MonoScene(pl.LightningModule):
      def __init__(
          self,
          n_classes,
-         class_names,
          feature,
-         class_weights,
          project_scale,
          full_scene_size,
          dataset,
+         project_res=["1", "2", "4", "8"],
          n_relations=4,
          context_prior=True,
          fp_loss=True,
-         project_res=[],
          frustum_size=4,
          relation_loss=False,
          CE_ssc_loss=True,
@@ -42,13 +36,11 @@ class MonoScene(pl.LightningModule):
          self.dataset = dataset
          self.context_prior = context_prior
          self.frustum_size = frustum_size
-         self.class_names = class_names
          self.relation_loss = relation_loss
          self.CE_ssc_loss = CE_ssc_loss
          self.sem_scal_loss = sem_scal_loss
          self.geo_scal_loss = geo_scal_loss
          self.project_scale = project_scale
-         self.class_weights = class_weights
          self.lr = lr
          self.weight_decay = weight_decay

@@ -81,13 +73,6 @@ class MonoScene(pl.LightningModule):
          )
          self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)

-         # log hyperparameters
-         self.save_hyperparameters()
-
-         self.train_metrics = SSCMetrics(self.n_classes)
-         self.val_metrics = SSCMetrics(self.n_classes)
-         self.test_metrics = SSCMetrics(self.n_classes)
-
      def forward(self, batch):

          img = batch["img"]
@@ -104,19 +89,21 @@ class MonoScene(pl.LightningModule):

              # project features at each 2D scale to target 3D scale
              scale_2d = int(scale_2d)
-             projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda()
-             fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda()
+             projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
+             fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()

              # Sum all the 3D features
              if x3d is None:
                  x3d = self.projects[str(scale_2d)](
                      x_rgb["1_" + str(scale_2d)][i],
+                     # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
                      projected_pix // scale_2d,
                      fov_mask,
                  )
              else:
                  x3d += self.projects[str(scale_2d)](
                      x_rgb["1_" + str(scale_2d)][i],
+                     # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
                      projected_pix // scale_2d,
                      fov_mask,
                  )
@@ -126,165 +113,13 @@ class MonoScene(pl.LightningModule):
              "x3d": torch.stack(x3ds),
          }

-         out = self.net_3d_decoder(input_dict)
+         out_dict = self.net_3d_decoder(input_dict)

-         return out
-
-     def step(self, batch, step_type, metric):
-         bs = len(batch["img"])
-         loss = 0
-         out_dict = self(batch)
          ssc_pred = out_dict["ssc_logit"]
-         target = batch["target"]
-
-         if self.context_prior:
-             P_logits = out_dict["P_logits"]
-             CP_mega_matrices = batch["CP_mega_matrices"]
-
-             if self.relation_loss:
-                 loss_rel_ce = compute_super_CP_multilabel_loss(
-                     P_logits, CP_mega_matrices
-                 )
-                 loss += loss_rel_ce
-                 self.log(
-                     step_type + "/loss_relation_ce_super",
-                     loss_rel_ce.detach(),
-                     on_epoch=True,
-                     sync_dist=True,
-                 )
-
-         class_weight = self.class_weights.type_as(batch["img"])
-         if self.CE_ssc_loss:
-             loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight)
-             loss += loss_ssc
-             self.log(
-                 step_type + "/loss_ssc",
-                 loss_ssc.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.sem_scal_loss:
-             loss_sem_scal = sem_scal_loss(ssc_pred, target)
-             loss += loss_sem_scal
-             self.log(
-                 step_type + "/loss_sem_scal",
-                 loss_sem_scal.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.geo_scal_loss:
-             loss_geo_scal = geo_scal_loss(ssc_pred, target)
-             loss += loss_geo_scal
-             self.log(
-                 step_type + "/loss_geo_scal",
-                 loss_geo_scal.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.fp_loss and step_type != "test":
-             frustums_masks = torch.stack(batch["frustums_masks"])
-             frustums_class_dists = torch.stack(
-                 batch["frustums_class_dists"]
-             ).float()  # (bs, n_frustums, n_classes)
-             n_frustums = frustums_class_dists.shape[1]
-
-             pred_prob = F.softmax(ssc_pred, dim=1)
-             batch_cnt = frustums_class_dists.sum(0)  # (n_frustums, n_classes)
-
-             frustum_loss = 0
-             frustum_nonempty = 0
-             for frus in range(n_frustums):
-                 frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float()
-                 prob = frustum_mask * pred_prob  # bs, n_classes, H, W, D
-                 prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2)
-                 prob = prob.reshape(self.n_classes, -1)
-                 cum_prob = prob.sum(dim=1)  # n_classes
-
-                 total_cnt = torch.sum(batch_cnt[frus])
-                 total_prob = prob.sum()
-                 if total_prob > 0 and total_cnt > 0:
-                     frustum_target_proportion = batch_cnt[frus] / total_cnt
-                     cum_prob = cum_prob / total_prob  # n_classes
-                     frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion)
-                     frustum_loss += frustum_loss_i
-                     frustum_nonempty += 1
-             frustum_loss = frustum_loss / frustum_nonempty
-             loss += frustum_loss
-             self.log(
-                 step_type + "/loss_frustums",
-                 frustum_loss.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         y_true = target.cpu().numpy()
+
          y_pred = ssc_pred.detach().cpu().numpy()
          y_pred = np.argmax(y_pred, axis=1)
-         metric.add_batch(y_pred, y_true)
-
-         self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True)
-
-         return loss
-
-     def training_step(self, batch, batch_idx):
-         return self.step(batch, "train", self.train_metrics)
-
-     def validation_step(self, batch, batch_idx):
-         self.step(batch, "val", self.val_metrics)
-
-     def validation_epoch_end(self, outputs):
-         metric_list = [("train", self.train_metrics), ("val", self.val_metrics)]

-         for prefix, metric in metric_list:
-             stats = metric.get_stats()
-             for i, class_name in enumerate(self.class_names):
-                 self.log(
-                     "{}_SemIoU/{}".format(prefix, class_name),
-                     stats["iou_ssc"][i],
-                     sync_dist=True,
-                 )
-             self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True)
-             self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True)
-             self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True)
-             self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True)
-             metric.reset()
+         return y_pred

-     def test_step(self, batch, batch_idx):
-         self.step(batch, "test", self.test_metrics)
-
-     def test_epoch_end(self, outputs):
-         classes = self.class_names
-         metric_list = [("test", self.test_metrics)]
-         for prefix, metric in metric_list:
-             print("{}======".format(prefix))
-             stats = metric.get_stats()
-             print(
-                 "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format(
-                     stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100
-                 )
-             )
-             print("class IoU: {}, ".format(classes))
-             print(
-                 " ".join(["{:.4f}, "] * len(classes)).format(
-                     *(stats["iou_ssc"] * 100).tolist()
-                 )
-             )
-             print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100))
-             metric.reset()
-
-     def configure_optimizers(self):
-         if self.dataset == "NYU":
-             optimizer = torch.optim.AdamW(
-                 self.parameters(), lr=self.lr, weight_decay=self.weight_decay
-             )
-             scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
-             return [optimizer], [scheduler]
-         elif self.dataset == "kitti":
-             optimizer = torch.optim.AdamW(
-                 self.parameters(), lr=self.lr, weight_decay=self.weight_decay
-             )
-             scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
-             return [optimizer], [scheduler]
monoscene/monoscene_model.py ADDED
@@ -0,0 +1,21 @@
+ from transformers import PreTrainedModel
+ from .config import MonoSceneConfig
+ from monoscene.monoscene import MonoScene
+
+
+ class MonoSceneModel(PreTrainedModel):
+     config_class = MonoSceneConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = MonoScene(
+             dataset=config.dataset,
+             n_classes=config.n_classes,
+             feature=config.feature,
+             project_scale=config.project_scale,
+             full_scene_size=config.full_scene_size
+         )
+
+
+     def forward(self, tensor):
+         return self.model.forward(tensor)
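With the wrapper registered against `MonoSceneConfig`, the standard transformers loading pattern should apply. A sketch: the constructor keywords mirror the config fields read above, but the exact `MonoSceneConfig` signature lives in the added config.py (not shown in this hunk) and the Hub repo id is hypothetical:

from monoscene.config import MonoSceneConfig
from monoscene.monoscene_model import MonoSceneModel

# Field names assumed from the wrapper above; verify against config.py.
config = MonoSceneConfig(dataset="kitti", n_classes=20, feature=64,
                         project_scale=2, full_scene_size=(256, 256, 32))
model = MonoSceneModel(config)

# Or, once weights are pushed to the Hub (hypothetical repo id):
# model = MonoSceneModel.from_pretrained("user/monoscene_kitti")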
monoscene/scripts/eval_monoscene.py DELETED
@@ -1,71 +0,0 @@
- from pytorch_lightning import Trainer
- from monoscene.models.monoscene import MonoScene
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- import hydra
- from omegaconf import DictConfig
- import torch
- import os
- from hydra.utils import get_original_cwd
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     torch.set_grad_enabled(False)
-     if config.dataset == "kitti":
-         config.batch_size = 1
-         n_classes = 20
-         feature = 64
-         project_scale = 2
-         full_scene_size = (256, 256, 32)
-         data_module = KittiDataModule(
-             root=config.kitti_root,
-             preprocess_root=config.kitti_preprocess_root,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     elif config.dataset == "NYU":
-         config.batch_size = 2
-         project_scale = 1
-         n_classes = 12
-         feature = 200
-         full_scene_size = (60, 36, 60)
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     trainer = Trainer(
-         sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp"
-     )
-
-     if config.dataset == "NYU":
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
-         )
-     else:
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
-         )
-
-     model = MonoScene.load_from_checkpoint(
-         model_path,
-         feature=feature,
-         project_scale=project_scale,
-         fp_loss=config.fp_loss,
-         full_scene_size=full_scene_size,
-     )
-     model.eval()
-     data_module.setup()
-     val_dataloader = data_module.val_dataloader()
-     trainer.test(model, test_dataloaders=val_dataloader)
-
-
- if __name__ == "__main__":
-     main()
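One Lightning detail worth noting in the deleted evaluation script: keyword arguments passed to `load_from_checkpoint` override the hyperparameters stored in the checkpoint, so only the dataset-dependent ones had to be re-supplied. A condensed sketch with the NYUv2 values from above (checkpoint path assumed relative to the working directory):

```python
from monoscene.models.monoscene import MonoScene  # pre-rename import path

# kwargs override the hparams saved in the checkpoint (Lightning semantics)
model = MonoScene.load_from_checkpoint(
    "trained_models/monoscene_nyu.ckpt",
    feature=200,
    project_scale=1,
    full_scene_size=(60, 36, 60),
)
model.eval()  # disable dropout, freeze batch-norm statistics
```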
monoscene/scripts/generate_output.py DELETED
@@ -1,127 +0,0 @@
- from pytorch_lightning import Trainer
- from monoscene.models.monoscene import MonoScene
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
- import hydra
- from omegaconf import DictConfig
- import torch
- import numpy as np
- import os
- from hydra.utils import get_original_cwd
- from tqdm import tqdm
- import pickle
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     torch.set_grad_enabled(False)
-
-     # Setup dataloader
-     if config.dataset == "kitti" or config.dataset == "kitti_360":
-         feature = 64
-         project_scale = 2
-         full_scene_size = (256, 256, 32)
-
-         if config.dataset == "kitti":
-             data_module = KittiDataModule(
-                 root=config.kitti_root,
-                 preprocess_root=config.kitti_preprocess_root,
-                 frustum_size=config.frustum_size,
-                 batch_size=int(config.batch_size / config.n_gpus),
-                 num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-             )
-             data_module.setup()
-             data_loader = data_module.val_dataloader()
-             # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
-         else:
-             data_module = Kitti360DataModule(
-                 root=config.kitti_360_root,
-                 sequences=[config.kitti_360_sequence],
-                 n_scans=2000,
-                 batch_size=1,
-                 num_workers=3,
-             )
-             data_module.setup()
-             data_loader = data_module.dataloader()
-
-     elif config.dataset == "NYU":
-         project_scale = 1
-         feature = 200
-         full_scene_size = (60, 36, 60)
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-         data_module.setup()
-         data_loader = data_module.val_dataloader()
-         # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
-     else:
-         print("dataset not support")
-
-     # Load pretrained models
-     if config.dataset == "NYU":
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
-         )
-     else:
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
-         )
-
-     model = MonoScene.load_from_checkpoint(
-         model_path,
-         feature=feature,
-         project_scale=project_scale,
-         fp_loss=config.fp_loss,
-         full_scene_size=full_scene_size,
-     )
-     model.cuda()
-     model.eval()
-
-     # Save prediction and additional data
-     # to draw the viewing frustum and remove scene outside the room for NYUv2
-     output_path = os.path.join(config.output_path, config.dataset)
-     with torch.no_grad():
-         for batch in tqdm(data_loader):
-             batch["img"] = batch["img"].cuda()
-             pred = model(batch)
-             y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
-             y_pred = np.argmax(y_pred, axis=1)
-             for i in range(config.batch_size):
-                 out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
-                 if "target" in batch:
-                     out_dict["target"] = (
-                         batch["target"][i].detach().cpu().numpy().astype(np.uint16)
-                     )
-
-                 if config.dataset == "NYU":
-                     write_path = output_path
-                     filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
-                     out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
-                     out_dict["vox_origin"] = (
-                         batch["vox_origin"][i].detach().cpu().numpy()
-                     )
-                 else:
-                     write_path = os.path.join(output_path, batch["sequence"][i])
-                     filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
-                     out_dict["fov_mask_1"] = (
-                         batch["fov_mask_1"][i].detach().cpu().numpy()
-                     )
-                     out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
-                     out_dict["T_velo_2_cam"] = (
-                         batch["T_velo_2_cam"][i].detach().cpu().numpy()
-                     )
-
-                 os.makedirs(write_path, exist_ok=True)
-                 with open(filepath, "wb") as handle:
-                     pickle.dump(out_dict, handle)
-                 print("wrote to", filepath)
-
-
- if __name__ == "__main__":
-     main()
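The inference core of the deleted script is a softmax over the class dimension followed by an argmax. Since softmax is monotonic per voxel, the argmax of the probabilities equals the argmax of the raw logits, so the softmax only matters if the probabilities themselves are kept. A toy-shaped sketch:

```python
import numpy as np
import torch

logits = torch.randn(1, 20, 8, 8, 4)                 # (batch, classes, X, Y, Z)
y_prob = torch.softmax(logits, dim=1).cpu().numpy()  # per-voxel class distribution
y_pred = np.argmax(y_prob, axis=1)                   # (1, 8, 8, 4) label volume
assert (y_pred == logits.argmax(dim=1).numpy()).all()  # softmax preserves the argmax
```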
monoscene/scripts/train_monoscene.py DELETED
@@ -1,173 +0,0 @@
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- from monoscene.data.semantic_kitti.params import (
-     semantic_kitti_class_frequencies,
-     kitti_class_names,
- )
- from monoscene.data.NYU.params import (
-     class_weights as NYU_class_weights,
-     NYU_class_names,
- )
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from torch.utils.data.dataloader import DataLoader
- from monoscene.models.monoscene import MonoScene
- from pytorch_lightning import Trainer
- from pytorch_lightning.loggers import TensorBoardLogger
- from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
- import os
- import hydra
- from omegaconf import DictConfig
- import numpy as np
- import torch
-
- hydra.output_subdir = None
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     exp_name = config.exp_prefix
-     exp_name += "_{}_{}".format(config.dataset, config.run)
-     exp_name += "_FrusSize_{}".format(config.frustum_size)
-     exp_name += "_nRelations{}".format(config.n_relations)
-     exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr)
-
-     if config.CE_ssc_loss:
-         exp_name += "_CEssc"
-     if config.geo_scal_loss:
-         exp_name += "_geoScalLoss"
-     if config.sem_scal_loss:
-         exp_name += "_semScalLoss"
-     if config.fp_loss:
-         exp_name += "_fpLoss"
-
-     if config.relation_loss:
-         exp_name += "_CERel"
-     if config.context_prior:
-         exp_name += "_3DCRP"
-
-     # Setup dataloaders
-     if config.dataset == "kitti":
-         class_names = kitti_class_names
-         max_epochs = 30
-         logdir = config.kitti_logdir
-         full_scene_size = (256, 256, 32)
-         project_scale = 2
-         feature = 64
-         n_classes = 20
-         class_weights = torch.from_numpy(
-             1 / np.log(semantic_kitti_class_frequencies + 0.001)
-         )
-         data_module = KittiDataModule(
-             root=config.kitti_root,
-             preprocess_root=config.kitti_preprocess_root,
-             frustum_size=config.frustum_size,
-             project_scale=project_scale,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu),
-         )
-
-     elif config.dataset == "NYU":
-         class_names = NYU_class_names
-         max_epochs = 30
-         logdir = config.logdir
-         full_scene_size = (60, 36, 60)
-         project_scale = 1
-         feature = 200
-         n_classes = 12
-         class_weights = NYU_class_weights
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     project_res = ["1"]
-     if config.project_1_2:
-         exp_name += "_Proj_2"
-         project_res.append("2")
-     if config.project_1_4:
-         exp_name += "_4"
-         project_res.append("4")
-     if config.project_1_8:
-         exp_name += "_8"
-         project_res.append("8")
-
-     print(exp_name)
-
-     # Initialize MonoScene model
-     model = MonoScene(
-         dataset=config.dataset,
-         frustum_size=config.frustum_size,
-         project_scale=project_scale,
-         n_relations=config.n_relations,
-         fp_loss=config.fp_loss,
-         feature=feature,
-         full_scene_size=full_scene_size,
-         project_res=project_res,
-         n_classes=n_classes,
-         class_names=class_names,
-         context_prior=config.context_prior,
-         relation_loss=config.relation_loss,
-         CE_ssc_loss=config.CE_ssc_loss,
-         sem_scal_loss=config.sem_scal_loss,
-         geo_scal_loss=config.geo_scal_loss,
-         lr=config.lr,
-         weight_decay=config.weight_decay,
-         class_weights=class_weights,
-     )
-
-     if config.enable_log:
-         logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="")
-         lr_monitor = LearningRateMonitor(logging_interval="step")
-         checkpoint_callbacks = [
-             ModelCheckpoint(
-                 save_last=True,
-                 monitor="val/mIoU",
-                 save_top_k=1,
-                 mode="max",
-                 filename="{epoch:03d}-{val/mIoU:.5f}",
-             ),
-             lr_monitor,
-         ]
-     else:
-         logger = False
-         checkpoint_callbacks = False
-
-     model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt")
-     if os.path.isfile(model_path):
-         # Continue training from last.ckpt
-         trainer = Trainer(
-             callbacks=checkpoint_callbacks,
-             resume_from_checkpoint=model_path,
-             sync_batchnorm=True,
-             deterministic=False,
-             max_epochs=max_epochs,
-             gpus=config.n_gpus,
-             logger=logger,
-             check_val_every_n_epoch=1,
-             log_every_n_steps=10,
-             flush_logs_every_n_steps=100,
-             accelerator="ddp",
-         )
-     else:
-         # Train from scratch
-         trainer = Trainer(
-             callbacks=checkpoint_callbacks,
-             sync_batchnorm=True,
-             deterministic=False,
-             max_epochs=max_epochs,
-             gpus=config.n_gpus,
-             logger=logger,
-             check_val_every_n_epoch=1,
-             log_every_n_steps=10,
-             flush_logs_every_n_steps=100,
-             accelerator="ddp",
-         )
-
-     trainer.fit(model, data_module)
-
-
- if __name__ == "__main__":
-     main()
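The class weighting in the deleted training script is worth spelling out: weights are the reciprocal log of the smoothed per-class voxel frequencies, so rare classes are up-weighted, but only logarithmically. A sketch with made-up counts (the real frequencies live in `monoscene/data/semantic_kitti/params.py`):

```python
import numpy as np
import torch

frequencies = np.array([5.4e9, 3.2e7, 1.1e5])  # made-up voxel counts per class
class_weights = torch.from_numpy(1 / np.log(frequencies + 0.001))
# -> approx. [0.045, 0.058, 0.086]: the rarest class weighs ~2x the most common one
```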
monoscene/scripts/visualization/NYU_vis_pred.py DELETED
@@ -1,156 +0,0 @@
- import pickle
- import os
- from omegaconf import DictConfig
- import numpy as np
- import hydra
- from mayavi import mlab
-
-
- def get_grid_coords(dims, resolution):
-     """
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: is the center coords of voxels in the grid
-     """
-
-     g_xx = np.arange(0, dims[0] + 1)
-     g_yy = np.arange(0, dims[1] + 1)
-
-     g_zz = np.arange(0, dims[2] + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution / 2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid
-
-
- def draw(
-     voxels,
-     cam_pose,
-     vox_origin,
-     voxel_size=0.08,
-     d=0.75, # 0.75m - determine the size of the mesh representing the camera
- ):
-     # Compute the coordinates of the mesh representing camera
-     y = d * 480 / (2 * 518.8579)
-     x = d * 640 / (2 * 518.8579)
-     tri_points = np.array(
-         [
-             [0, 0, 0],
-             [x, y, d],
-             [-x, y, d],
-             [-x, -y, d],
-             [x, -y, d],
-         ]
-     )
-     tri_points = np.hstack([tri_points, np.ones((5, 1))])
-
-     tri_points = (cam_pose @ tri_points.T).T
-     x = tri_points[:, 0] - vox_origin[0]
-     y = tri_points[:, 1] - vox_origin[1]
-     z = tri_points[:, 2] - vox_origin[2]
-     triangles = [
-         (0, 1, 2),
-         (0, 1, 4),
-         (0, 3, 4),
-         (0, 2, 3),
-     ]
-
-     # Compute the voxels coordinates
-     grid_coords = get_grid_coords(
-         [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size
-     )
-
-     # Attach the predicted class to every voxel
-     grid_coords = np.vstack(
-         (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1))
-     ).T
-
-     # Remove empty and unknown voxels
-     occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)]
-     figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))
-
-     # Draw the camera
-     mlab.triangular_mesh(
-         x,
-         y,
-         z,
-         triangles,
-         representation="wireframe",
-         color=(0, 0, 0),
-         line_width=5,
-     )
-
-     # Draw occupied voxels
-     plt_plot = mlab.points3d(
-         occupied_voxels[:, 0],
-         occupied_voxels[:, 1],
-         occupied_voxels[:, 2],
-         occupied_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.1 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=0,
-         vmax=12,
-     )
-
-     colors = np.array(
-         [
-             [22, 191, 206, 255],
-             [214, 38, 40, 255],
-             [43, 160, 43, 255],
-             [158, 216, 229, 255],
-             [114, 158, 206, 255],
-             [204, 204, 91, 255],
-             [255, 186, 119, 255],
-             [147, 102, 188, 255],
-             [30, 119, 181, 255],
-             [188, 188, 33, 255],
-             [255, 127, 12, 255],
-             [196, 175, 214, 255],
-             [153, 153, 153, 255],
-         ]
-     )
-
-     plt_plot.glyph.scale_mode = "scale_by_vector"
-
-     plt_plot.module_manager.scalar_lut_manager.lut.table = colors
-
-     mlab.show()
-
-
- @hydra.main(config_path=None)
- def main(config: DictConfig):
-     scan = config.file
-
-     with open(scan, "rb") as handle:
-         b = pickle.load(handle)
-
-     cam_pose = b["cam_pose"]
-     vox_origin = b["vox_origin"]
-     gt_scene = b["target"]
-     pred_scene = b["y_pred"]
-     scan = os.path.basename(scan)[:12]
-
-     pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room
-
-     draw(
-         pred_scene,
-         cam_pose,
-         vox_origin,
-         voxel_size=0.08,
-         d=0.75,
-     )
-
-
- if __name__ == "__main__":
-     main()
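`get_grid_coords` above converts integer voxel indices into metric voxel-center coordinates: scale by the resolution, offset by half a voxel, then swap the x/y columns for display. Note that it calls `np.float`, an alias removed in NumPy >= 1.24, so the deleted script would need plain `float` (or `np.float64`) on current NumPy. A condensed sketch of the same computation:

```python
import numpy as np

dims, res = (2, 2, 1), 0.08  # tiny grid, 8 cm voxels
xx, yy, zz = np.meshgrid(np.arange(dims[0]), np.arange(dims[1]), np.arange(dims[2]))
centers = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=1).astype(float)
centers = centers * res + res / 2        # index 0 -> 0.04 m, index 1 -> 0.12 m
centers[:, [0, 1]] = centers[:, [1, 0]]  # swap x and y, as the original does
```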
monoscene/scripts/visualization/kitti_vis_pred.py DELETED
@@ -1,201 +0,0 @@
- # from operator import gt
- import pickle
- import numpy as np
- from omegaconf import DictConfig
- import hydra
- from mayavi import mlab
-
-
- def get_grid_coords(dims, resolution):
-     """
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: is the center coords of voxels in the grid
-     """
-
-     g_xx = np.arange(0, dims[0] + 1)
-     g_yy = np.arange(0, dims[1] + 1)
-     sensor_pose = 10
-     g_zz = np.arange(0, dims[2] + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution / 2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid
-
-
- def draw(
-     voxels,
-     T_velo_2_cam,
-     vox_origin,
-     fov_mask,
-     img_size,
-     f,
-     voxel_size=0.2,
-     d=7, # 7m - determine the size of the mesh representing the camera
- ):
-     # Compute the coordinates of the mesh representing camera
-     x = d * img_size[0] / (2 * f)
-     y = d * img_size[1] / (2 * f)
-     tri_points = np.array(
-         [
-             [0, 0, 0],
-             [x, y, d],
-             [-x, y, d],
-             [-x, -y, d],
-             [x, -y, d],
-         ]
-     )
-     tri_points = np.hstack([tri_points, np.ones((5, 1))])
-     tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T
-     x = tri_points[:, 0] - vox_origin[0]
-     y = tri_points[:, 1] - vox_origin[1]
-     z = tri_points[:, 2] - vox_origin[2]
-     triangles = [
-         (0, 1, 2),
-         (0, 1, 4),
-         (0, 3, 4),
-         (0, 2, 3),
-     ]
-
-     # Compute the voxels coordinates
-     grid_coords = get_grid_coords(
-         [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
-     )
-
-     # Attach the predicted class to every voxel
-     grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
-
-     # Get the voxels inside FOV
-     fov_grid_coords = grid_coords[fov_mask, :]
-
-     # Get the voxels outside FOV
-     outfov_grid_coords = grid_coords[~fov_mask, :]
-
-     # Remove empty and unknown voxels
-     fov_voxels = fov_grid_coords[
-         (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
-     ]
-     outfov_voxels = outfov_grid_coords[
-         (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
-     ]
-
-     figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
-
-     # Draw the camera
-     mlab.triangular_mesh(
-         x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5
-     )
-
-     # Draw occupied inside FOV voxels
-     plt_plot_fov = mlab.points3d(
-         fov_voxels[:, 0],
-         fov_voxels[:, 1],
-         fov_voxels[:, 2],
-         fov_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.05 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=1,
-         vmax=19,
-     )
-
-     # Draw occupied outside FOV voxels
-     plt_plot_outfov = mlab.points3d(
-         outfov_voxels[:, 0],
-         outfov_voxels[:, 1],
-         outfov_voxels[:, 2],
-         outfov_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.05 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=1,
-         vmax=19,
-     )
-
-     colors = np.array(
-         [
-             [100, 150, 245, 255],
-             [100, 230, 245, 255],
-             [30, 60, 150, 255],
-             [80, 30, 180, 255],
-             [100, 80, 250, 255],
-             [255, 30, 30, 255],
-             [255, 40, 200, 255],
-             [150, 30, 90, 255],
-             [255, 0, 255, 255],
-             [255, 150, 255, 255],
-             [75, 0, 75, 255],
-             [175, 0, 75, 255],
-             [255, 200, 0, 255],
-             [255, 120, 50, 255],
-             [0, 175, 0, 255],
-             [135, 60, 0, 255],
-             [150, 240, 80, 255],
-             [255, 240, 150, 255],
-             [255, 0, 0, 255],
-         ]
-     ).astype(np.uint8)
-
-     plt_plot_fov.glyph.scale_mode = "scale_by_vector"
-     plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
-
-     plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
-
-     outfov_colors = colors
-     outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
-     plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
-
-     mlab.show()
-
-
- @hydra.main(config_path=None)
- def main(config: DictConfig):
-     scan = config.file
-     with open(scan, "rb") as handle:
-         b = pickle.load(handle)
-
-     fov_mask_1 = b["fov_mask_1"]
-     T_velo_2_cam = b["T_velo_2_cam"]
-     vox_origin = np.array([0, -25.6, -2])
-
-     y_pred = b["y_pred"]
-
-     if config.dataset == "kitti_360":
-         # Visualize KITTI-360
-         draw(
-             y_pred,
-             T_velo_2_cam,
-             vox_origin,
-             fov_mask_1,
-             voxel_size=0.2,
-             f=552.55426,
-             img_size=(1408, 376),
-             d=7,
-         )
-     else:
-         # Visualize Semantic KITTI
-         draw(
-             y_pred,
-             T_velo_2_cam,
-             vox_origin,
-             fov_mask_1,
-             img_size=(1220, 370),
-             f=707.0912,
-             voxel_size=0.2,
-             d=7,
-         )
-
-
- if __name__ == "__main__":
-     main()
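The camera wireframe in `draw` comes straight from the pinhole model: the image plane at depth d spans d*W/f by d*H/f, so the half-extents used for the frustum corners are d*W/(2f) and d*H/(2f). Plugging in the Semantic KITTI numbers from the script:

```python
f = 707.0912      # focal length in pixels (Semantic KITTI value from the script)
W, H = 1220, 370  # image width and height in pixels
d = 7.0           # depth of the frustum face in metres

half_w = d * W / (2 * f)  # ~6.04 m
half_h = d * H / (2 * f)  # ~1.83 m
print(half_w, half_h)
```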
monoscene/{models/unet2d.py → unet2d.py} RENAMED
File without changes
monoscene/{models/unet3d_kitti.py → unet3d_kitti.py} RENAMED
@@ -2,9 +2,9 @@
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from monoscene.models.modules import SegmentationHead
- from monoscene.models.CRP3D import CPMegaVoxels
- from monoscene.models.modules import Process, Upsample, Downsample
+ from monoscene.modules import SegmentationHead
+ from monoscene.CRP3D import CPMegaVoxels
+ from monoscene.modules import Process, Upsample, Downsample


  class UNet3D(nn.Module):
monoscene/{models/unet3d_nyu.py → unet3d_nyu.py} RENAMED
@@ -3,8 +3,8 @@ import torch
  import torch.nn as nn
  import torch.nn.functional as F
  import numpy as np
- from monoscene.models.CRP3D import CPMegaVoxels
- from monoscene.models.modules import (
+ from monoscene.CRP3D import CPMegaVoxels
+ from monoscene.modules import (
      Process,
      Upsample,
      Downsample,
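With the `models/` subpackage flattened away, everything imports directly from `monoscene.*`. A quick import smoke test for the new layout (module and class names taken from the renamed files above; assumes the repo root is on `PYTHONPATH`):

```python
# These names all appear in the renamed files in this commit
from monoscene.modules import Process, Upsample, Downsample
from monoscene.CRP3D import CPMegaVoxels
from monoscene.unet3d_kitti import UNet3D
```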