Sohaib36 committed on
Commit f9c9c61 • 1 Parent(s): b38a916

add: adding monoscene

Files changed (39)
  1. monoscene/{models/CRP3D.py → CRP3D.py} +1 -1
  2. monoscene/{models/DDR.py → DDR.py} +0 -0
  3. monoscene/__init__.py +0 -0
  4. monoscene/app.py +138 -0
  5. monoscene/config.py +26 -0
  6. monoscene/config/monoscene.yaml +0 -35
  7. monoscene/data/NYU/collate.py +0 -50
  8. monoscene/data/NYU/nyu_dataset.py +0 -133
  9. monoscene/data/NYU/nyu_dm.py +0 -78
  10. monoscene/data/NYU/params.py +0 -54
  11. monoscene/data/NYU/preprocess.py +0 -182
  12. monoscene/data/kitti_360/collate.py +0 -47
  13. monoscene/data/kitti_360/kitti_360_dataset.py +0 -125
  14. monoscene/data/kitti_360/kitti_360_dm.py +0 -32
  15. monoscene/data/semantic_kitti/collate.py +0 -61
  16. monoscene/data/semantic_kitti/io_data.py +0 -239
  17. monoscene/data/semantic_kitti/kitti_dataset.py +0 -200
  18. monoscene/data/semantic_kitti/kitti_dm.py +0 -91
  19. monoscene/data/semantic_kitti/params.py +0 -48
  20. monoscene/data/semantic_kitti/preprocess.py +0 -102
  21. monoscene/data/semantic_kitti/semantic-kitti.yaml +0 -213
  22. monoscene/data/utils/fusion.py +0 -507
  23. monoscene/data/utils/helpers.py +0 -185
  24. monoscene/data/utils/torch_util.py +0 -15
  25. monoscene/{models/flosp.py → flosp.py} +0 -0
  26. monoscene/loss/CRP_loss.py +0 -24
  27. monoscene/loss/sscMetrics.py +0 -204
  28. monoscene/loss/ssc_loss.py +0 -99
  29. monoscene/{models/modules.py → modules.py} +1 -1
  30. monoscene/{models/monoscene.py → monoscene.py} +12 -177
  31. monoscene/monoscene_model.py +21 -0
  32. monoscene/scripts/eval_monoscene.py +0 -71
  33. monoscene/scripts/generate_output.py +0 -127
  34. monoscene/scripts/train_monoscene.py +0 -173
  35. monoscene/scripts/visualization/NYU_vis_pred.py +0 -156
  36. monoscene/scripts/visualization/kitti_vis_pred.py +0 -201
  37. monoscene/{models/unet2d.py → unet2d.py} +0 -0
  38. monoscene/{models/unet3d_kitti.py → unet3d_kitti.py} +3 -3
  39. monoscene/{models/unet3d_nyu.py → unet3d_nyu.py} +2 -2
monoscene/{models/CRP3D.py → CRP3D.py} RENAMED
@@ -1,6 +1,6 @@
  import torch
  import torch.nn as nn
- from monoscene.models.modules import (
+ from monoscene.modules import (
      Process,
      ASPP,
  )
monoscene/{models/DDR.py → DDR.py} RENAMED
File without changes
monoscene/__init__.py ADDED
File without changes
monoscene/app.py ADDED
@@ -0,0 +1,138 @@
+ from pytorch_lightning import Trainer
+ from monoscene.models.monoscene import MonoScene
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
+ from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
+ # import hydra
+ from omegaconf import DictConfig
+ import torch
+ import numpy as np
+ import os
+ from hydra.utils import get_original_cwd
+ import gradio as gr
+ import numpy as np
+ import plotly.express as px
+ import pandas as pd
+
+
+ # @hydra.main(config_name="../config/monoscene.yaml")
+ def plot(input_img):
+     torch.set_grad_enabled(False)
+
+     # Setup dataloader
+     # if config.dataset == "kitti" or config.dataset == "kitti_360":
+     feature = 64
+     project_scale = 2
+     full_scene_size = (256, 256, 32)
+
+     # if config.dataset == "kitti":
+     #     data_module = KittiDataModule(
+     #         root=config.kitti_root,
+     #         preprocess_root=config.kitti_preprocess_root,
+     #         frustum_size=config.frustum_size,
+     #         batch_size=int(config.batch_size / config.n_gpus),
+     #         num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.val_dataloader()
+     #     # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+     # else:
+     #     data_module = Kitti360DataModule(
+     #         root=config.kitti_360_root,
+     #         sequences=[config.kitti_360_sequence],
+     #         n_scans=2000,
+     #         batch_size=1,
+     #         num_workers=3,
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.dataloader()
+
+     # elif config.dataset == "NYU":
+     #     project_scale = 1
+     #     feature = 200
+     #     full_scene_size = (60, 36, 60)
+     #     data_module = NYUDataModule(
+     #         root=config.NYU_root,
+     #         preprocess_root=config.NYU_preprocess_root,
+     #         n_relations=config.n_relations,
+     #         frustum_size=config.frustum_size,
+     #         batch_size=int(config.batch_size / config.n_gpus),
+     #         num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+     #     )
+     #     data_module.setup()
+     #     data_loader = data_module.val_dataloader()
+     #     # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+     # else:
+     #     print("dataset not supported")
+
+     # Load pretrained models
+     # if config.dataset == "NYU":
+     #     model_path = os.path.join(
+     #         get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
+     #     )
+     # else:
+     #     model_path = os.path.join(
+     #         get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
+     #     )
+     model_path = "trained_models/monoscene_kitti.ckpt"
+
+     model = MonoScene.load_from_checkpoint(
+         model_path,
+         feature=feature,
+         project_scale=project_scale,
+         fp_loss=False,
+         full_scene_size=full_scene_size,
+     )
+     model.cuda()
+     model.eval()
+
+     print(input_img.shape)
+
+     x = np.arange(12).reshape(4, 3) / 12
+     data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
+     fig = px.scatter_3d(data, x="x", y="y", z="z")
+     return fig
+
+ demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
+ demo.launch()
+
+
+
+ # Save prediction and additional data
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
+ # output_path = os.path.join(config.output_path, config.dataset)
+ # with torch.no_grad():
+ #     for batch in tqdm(data_loader):
+ #         batch["img"] = batch["img"].cuda()
+ #         pred = model(batch)
+ #         y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
+ #         y_pred = np.argmax(y_pred, axis=1)
+ #         for i in range(config.batch_size):
+ #             out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
+ #             if "target" in batch:
+ #                 out_dict["target"] = (
+ #                     batch["target"][i].detach().cpu().numpy().astype(np.uint16)
+ #                 )
+
+ #             if config.dataset == "NYU":
+ #                 write_path = output_path
+ #                 filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
+ #                 out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
+ #                 out_dict["vox_origin"] = (
+ #                     batch["vox_origin"][i].detach().cpu().numpy()
+ #                 )
+ #             else:
+ #                 write_path = os.path.join(output_path, batch["sequence"][i])
+ #                 filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
+ #                 out_dict["fov_mask_1"] = (
+ #                     batch["fov_mask_1"][i].detach().cpu().numpy()
+ #                 )
+ #                 out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
+ #                 out_dict["T_velo_2_cam"] = (
+ #                     batch["T_velo_2_cam"][i].detach().cpu().numpy()
+ #                 )
+
+ #             os.makedirs(write_path, exist_ok=True)
+ #             with open(filepath, "wb") as handle:
+ #                 pickle.dump(out_dict, handle)
+ #             print("wrote to", filepath)
monoscene/config.py ADDED
@@ -0,0 +1,26 @@
+ from transformers import PretrainedConfig
+ from typing import List
+
+
+ class MonoSceneConfig(PretrainedConfig):
+
+     def __init__(
+         self,
+         dataset="kitti",
+         n_classes=20,
+         feature=64,
+         project_scale=2,
+         full_scene_size=(256, 256, 32),
+         **kwargs,
+     ):
+         self.dataset = dataset
+         self.n_classes = n_classes
+         self.feature = feature
+         self.project_scale = project_scale
+         self.full_scene_size = full_scene_size
+         super().__init__(**kwargs)
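Because MonoSceneConfig subclasses PretrainedConfig, it inherits JSON (de)serialization from transformers. A minimal usage sketch (the directory name is hypothetical, not part of this commit):

from monoscene.config import MonoSceneConfig

config = MonoSceneConfig(dataset="kitti", n_classes=20)
config.save_pretrained("monoscene-kitti")          # writes config.json
reloaded = MonoSceneConfig.from_pretrained("monoscene-kitti")
assert reloaded.full_scene_size == [256, 256, 32]  # tuples round-trip as JSON lists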
monoscene/config/monoscene.yaml DELETED
@@ -1,35 +0,0 @@
- #dataset: "NYU" # "kitti", "kitti_360"
- dataset: "kitti_360"
-
- n_relations: 4
-
- enable_log: false
- kitti_root: '/path/to/semantic_kitti'
- kitti_preprocess_root: '/path/to/kitti/preprocess/folder'
- kitti_logdir: '/path/to/semantic_kitti/logdir'
-
- NYU_root: '/path/to/NYU/depthbin'
- NYU_preprocess_root: '/path/to/NYU/preprocess/folder'
- logdir: '/path/to/NYU/logdir'
-
-
- fp_loss: true
- frustum_size: 8
- batch_size: 1
- n_gpus: 1
- num_workers_per_gpu: 3
- exp_prefix: "exp"
- run: 1
- lr: 1e-4
- weight_decay: 1e-4
-
- context_prior: true
-
- relation_loss: true
- CE_ssc_loss: true
- sem_scal_loss: true
- geo_scal_loss: true
-
- project_1_2: true
- project_1_4: true
- project_1_8: true
monoscene/data/NYU/collate.py DELETED
@@ -1,50 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     targets = []
-     names = []
-     cam_poses = []
-
-     vox_origins = []
-     cam_ks = []
-
-     CP_mega_matrices = []
-
-     data["projected_pix_1"] = []
-     data["fov_mask_1"] = []
-     data["frustums_masks"] = []
-     data["frustums_class_dists"] = []
-
-     for idx, input_dict in enumerate(batch):
-         CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
-         for key in data:
-             if key in input_dict:
-                 data[key].append(torch.from_numpy(input_dict[key]))
-
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
-         cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float())
-         vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double())
-
-         names.append(input_dict["name"])
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         target = torch.from_numpy(input_dict["target"])
-         targets.append(target)
-
-     ret_data = {
-         "CP_mega_matrices": CP_mega_matrices,
-         "cam_pose": torch.stack(cam_poses),
-         "cam_k": torch.stack(cam_ks),
-         "vox_origin": torch.stack(vox_origins),
-         "name": names,
-         "img": torch.stack(imgs),
-         "target": torch.stack(targets),
-     }
-     for key in data:
-         ret_data[key] = data[key]
-     return ret_data
monoscene/data/NYU/nyu_dataset.py DELETED
@@ -1,133 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from monoscene.data.utils.helpers import (
-     vox2pix,
-     compute_local_frustums,
-     compute_CP_mega_matrix,
- )
- import pickle
- import torch.nn.functional as F
-
-
- class NYUDataset(Dataset):
-     def __init__(
-         self,
-         split,
-         root,
-         preprocess_root,
-         n_relations=4,
-         color_jitter=None,
-         frustum_size=4,
-         fliplr=0.0,
-     ):
-         self.n_relations = n_relations
-         self.frustum_size = frustum_size
-         self.n_classes = 12
-         self.root = os.path.join(root, "NYU" + split)
-         self.preprocess_root = preprocess_root
-         self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split)
-         self.fliplr = fliplr
-
-         self.voxel_size = 0.08  # 0.08m
-         self.scene_size = (4.8, 4.8, 2.88)  # (4.8m, 4.8m, 2.88m)
-         self.img_W = 640
-         self.img_H = 480
-         self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])
-
-         self.color_jitter = (
-             transforms.ColorJitter(*color_jitter) if color_jitter else None
-         )
-
-         self.scan_names = glob.glob(os.path.join(self.root, "*.bin"))
-
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __getitem__(self, index):
-         file_path = self.scan_names[index]
-         filename = os.path.basename(file_path)
-         name = filename[:-4]
-
-         os.makedirs(self.base_dir, exist_ok=True)
-         filepath = os.path.join(self.base_dir, name + ".pkl")
-
-         with open(filepath, "rb") as handle:
-             data = pickle.load(handle)
-
-         cam_pose = data["cam_pose"]
-         T_world_2_cam = np.linalg.inv(cam_pose)
-         vox_origin = data["voxel_origin"]
-         data["cam_k"] = self.cam_k
-         target = data[
-             "target_1_4"
-         ]  # Following SSC literature, the output resolution on NYUv2 is set to 1:4
-         data["target"] = target
-         target_1_4 = data["target_1_16"]
-
-         CP_mega_matrix = compute_CP_mega_matrix(
-             target_1_4, is_binary=self.n_relations == 2
-         )
-         data["CP_mega_matrix"] = CP_mega_matrix
-
-         # compute the 3D-2D mapping
-         projected_pix, fov_mask, pix_z = vox2pix(
-             T_world_2_cam,
-             self.cam_k,
-             vox_origin,
-             self.voxel_size,
-             self.img_W,
-             self.img_H,
-             self.scene_size,
-         )
-
-         data["projected_pix_1"] = projected_pix
-         data["fov_mask_1"] = fov_mask
-
-         # compute the masks, each indicates voxels inside a frustum
-         frustums_masks, frustums_class_dists = compute_local_frustums(
-             projected_pix,
-             pix_z,
-             target,
-             self.img_W,
-             self.img_H,
-             dataset="NYU",
-             n_classes=12,
-             size=self.frustum_size,
-         )
-         data["frustums_masks"] = frustums_masks
-         data["frustums_class_dists"] = frustums_class_dists
-
-         rgb_path = os.path.join(self.root, name + "_color.jpg")
-         img = Image.open(rgb_path).convert("RGB")
-
-         # Image augmentation
-         if self.color_jitter is not None:
-             img = self.color_jitter(img)
-
-         # PIL to numpy
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-
-         # randomly fliplr the image
-         if np.random.rand() < self.fliplr:
-             img = np.ascontiguousarray(np.fliplr(img))
-             data["projected_pix_1"][:, 0] = (
-                 img.shape[1] - 1 - data["projected_pix_1"][:, 0]
-             )
-
-         data["img"] = self.normalize_rgb(img)  # (3, img_H, img_W)
-
-         return data
-
-     def __len__(self):
-         return len(self.scan_names)
monoscene/data/NYU/nyu_dm.py DELETED
@@ -1,78 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.NYU.nyu_dataset import NYUDataset
- from monoscene.data.NYU.collate import collate_fn
- import pytorch_lightning as pl
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class NYUDataModule(pl.LightningDataModule):
-     def __init__(
-         self,
-         root,
-         preprocess_root,
-         n_relations=4,
-         batch_size=4,
-         frustum_size=4,
-         num_workers=6,
-     ):
-         super().__init__()
-         self.n_relations = n_relations
-         self.preprocess_root = preprocess_root
-         self.root = root
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.frustum_size = frustum_size
-
-     def setup(self, stage=None):
-         self.train_ds = NYUDataset(
-             split="train",
-             preprocess_root=self.preprocess_root,
-             n_relations=self.n_relations,
-             root=self.root,
-             fliplr=0.5,
-             frustum_size=self.frustum_size,
-             color_jitter=(0.4, 0.4, 0.4),
-         )
-         self.test_ds = NYUDataset(
-             split="test",
-             preprocess_root=self.preprocess_root,
-             n_relations=self.n_relations,
-             root=self.root,
-             frustum_size=self.frustum_size,
-             fliplr=0.0,
-             color_jitter=None,
-         )
-
-     def train_dataloader(self):
-         return DataLoader(
-             self.train_ds,
-             batch_size=self.batch_size,
-             drop_last=True,
-             num_workers=self.num_workers,
-             shuffle=True,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def val_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             num_workers=self.num_workers,
-             drop_last=False,
-             shuffle=False,
-             pin_memory=True,
-             collate_fn=collate_fn,
-         )
-
-     def test_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             num_workers=self.num_workers,
-             drop_last=False,
-             shuffle=False,
-             pin_memory=True,
-             collate_fn=collate_fn,
-         )
monoscene/data/NYU/params.py DELETED
@@ -1,54 +0,0 @@
- import torch
- import numpy as np
-
- NYU_class_names = [
-     "empty",
-     "ceiling",
-     "floor",
-     "wall",
-     "window",
-     "chair",
-     "bed",
-     "sofa",
-     "table",
-     "tvs",
-     "furn",
-     "objs",
- ]
- class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
-
- class_freq_1_4 = np.array(
-     [
-         43744234,
-         80205,
-         1070052,
-         905632,
-         116952,
-         180994,
-         436852,
-         279714,
-         254611,
-         28247,
-         1805949,
-         850724,
-     ]
- )
- class_freq_1_8 = np.array(
-     [
-         5176253,
-         17277,
-         220105,
-         183849,
-         21827,
-         33520,
-         67022,
-         44248,
-         46615,
-         4419,
-         290218,
-         142573,
-     ]
- )
- class_freq_1_16 = np.array(
-     [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864]
- )
monoscene/data/NYU/preprocess.py DELETED
@@ -1,182 +0,0 @@
- import numpy as np
- from tqdm import tqdm
- import numpy.matlib
- import os
- import glob
- import pickle
- import hydra
- from omegaconf import DictConfig
-
-
- seg_class_map = [
-     0, 1, 2, 3, 4, 11, 5, 6, 7, 8,
-     8, 10, 10, 10, 11, 11, 9, 8, 11, 11,
-     11, 11, 11, 11, 11, 11, 11, 10, 10, 11,
-     8, 10, 11, 9, 11, 11, 11,
- ]
-
-
- def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""):
-     r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels.
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172
-     In the data loader of pytorch, only single thread is allowed.
-     For multi-threads version and more details, see 'readRLE.py'.
-     output: seg_label: 3D numpy array, size 240 x 144 x 240
-     """
-     seg_label = np.zeros(
-         int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8
-     )  # segmentation label
-     vox_idx = 0
-     for idx in range(int(rle.shape[0] / 2)):
-         check_val = rle[idx * 2]
-         check_iter = rle[idx * 2 + 1]
-         if check_val >= 37 and check_val != 255:  # 37 classes to 12 classes
-             print("RLE {} check_val: {}".format(rle_filename, check_val))
-         seg_label_val = (
-             seg_class_map[check_val] if check_val != 255 else 255
-         )  # 37 classes to 12 classes
-         seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat(
-             seg_label_val, 1, check_iter
-         )
-         vox_idx = vox_idx + check_iter
-     seg_label = seg_label.reshape(voxel_size)  # 3D array, size 240 x 144 x 240
-     return seg_label
-
-
- def _read_rle(rle_filename):  # 0.0005s
-     """Read RLE compression data
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153
-     Return:
-         vox_origin,
-         cam_pose,
-         vox_rle, voxel label data from file
-     Shape:
-         vox_rle, (240, 144, 240)
-     """
-     fid = open(rle_filename, "rb")
-     vox_origin = np.fromfile(
-         fid, np.float32, 3
-     ).T  # Read voxel origin in world coordinates
-     cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4))  # Read camera pose
-     vox_rle = (
-         np.fromfile(fid, np.uint32).reshape((-1, 1)).T
-     )  # Read voxel label data from file
-     vox_rle = np.squeeze(vox_rle)  # 2d array: (1 x N), to 1d array: (N , )
-     fid.close()
-     return vox_origin, cam_pose, vox_rle
-
-
- def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4):
-     r"""downsample the labeled data,
-     code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262
-     Shape:
-         label, (240, 144, 240)
-         label_downscale, if downsample==4, then (60, 36, 60)
-     """
-     if downscale == 1:
-         return label
-     ds = downscale
-     small_size = (
-         voxel_size[0] // ds,
-         voxel_size[1] // ds,
-         voxel_size[2] // ds,
-     )  # small size
-     label_downscale = np.zeros(small_size, dtype=np.uint8)
-     empty_t = 0.95 * ds * ds * ds  # threshold
-     s01 = small_size[0] * small_size[1]
-     label_i = np.zeros((ds, ds, ds), dtype=np.int32)
-
-     for i in range(small_size[0] * small_size[1] * small_size[2]):
-         z = int(i / s01)
-         y = int((i - z * s01) / small_size[0])
-         x = int(i - z * s01 - y * small_size[0])
-
-         label_i[:, :, :] = label[
-             x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds
-         ]
-         label_bin = label_i.flatten()
-
-         zero_count_0 = np.array(np.where(label_bin == 0)).size
-         zero_count_255 = np.array(np.where(label_bin == 255)).size
-
-         zero_count = zero_count_0 + zero_count_255
-         if zero_count > empty_t:
-             label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255
-         else:
-             label_i_s = label_bin[
-                 np.where(np.logical_and(label_bin > 0, label_bin < 255))
-             ]
-             label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s))
-     return label_downscale
-
-
- @hydra.main(config_name="../../config/monoscene.yaml")
- def main(config: DictConfig):
-     scene_size = (240, 144, 240)
-     for split in ["train", "test"]:
-         root = os.path.join(config.NYU_root, "NYU" + split)
-         base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split)
-         os.makedirs(base_dir, exist_ok=True)
-
-         scans = glob.glob(os.path.join(root, "*.bin"))
-         for scan in tqdm(scans):
-             filename = os.path.basename(scan)
-             name = filename[:-4]
-             filepath = os.path.join(base_dir, name + ".pkl")
-             if os.path.exists(filepath):
-                 continue
-
-             vox_origin, cam_pose, rle = _read_rle(scan)
-
-             target_1_1 = _rle2voxel(rle, scene_size, scan)
-             target_1_4 = _downsample_label(target_1_1, scene_size, 4)
-             target_1_16 = _downsample_label(target_1_1, scene_size, 16)
-
-             data = {
-                 "cam_pose": cam_pose,
-                 "voxel_origin": vox_origin,
-                 "name": name,
-                 "target_1_4": target_1_4,
-                 "target_1_16": target_1_16,
-             }
-
-             with open(filepath, "wb") as handle:
-                 pickle.dump(data, handle)
-             print("wrote to", filepath)
-
-
- if __name__ == "__main__":
-     main()
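For reference, the shapes in the deleted pipeline above chain together as follows; a sanity-check sketch (the .bin path is hypothetical, everything else follows the docstrings):

vox_origin, cam_pose, rle = _read_rle("NYUtrain/NYU0001_0000.bin")  # hypothetical scan
target_1_1 = _rle2voxel(rle, (240, 144, 240))            # full-resolution labels
target_1_4 = _downsample_label(target_1_1, (240, 144, 240), 4)
assert target_1_1.shape == (240, 144, 240)
assert target_1_4.shape == (60, 36, 60)                  # one majority label per 4x4x4 block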
monoscene/data/kitti_360/collate.py DELETED
@@ -1,47 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     frame_ids = []
-     img_paths = []
-     sequences = []
-
-     cam_ks = []
-     T_velo_2_cams = []
-
-     scale_3ds = batch[0]["scale_3ds"]
-     for scale_3d in scale_3ds:
-         data["projected_pix_{}".format(scale_3d)] = []
-         data["fov_mask_{}".format(scale_3d)] = []
-
-     for _, input_dict in enumerate(batch):
-         if "img_path" in input_dict:
-             img_paths.append(input_dict["img_path"])
-
-         for key in data:
-             data[key].append(torch.from_numpy(input_dict[key]))
-
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float())
-         T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
-
-         sequences.append(input_dict["sequence"])
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         frame_ids.append(input_dict["frame_id"])
-
-     ret_data = {
-         "sequence": sequences,
-         "frame_id": frame_ids,
-         "cam_k": cam_ks,
-         "T_velo_2_cam": T_velo_2_cams,
-         "img": torch.stack(imgs),
-         "img_path": img_paths,
-     }
-     for key in data:
-         ret_data[key] = data[key]
-
-     return ret_data
monoscene/data/kitti_360/kitti_360_dataset.py DELETED
@@ -1,125 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from monoscene.data.utils.helpers import vox2pix
- from PIL import Image
- from torchvision import transforms
-
-
- class Kitti360Dataset(Dataset):
-     def __init__(self, root, sequences, n_scans):
-         """
-         Parameters
-         ----------
-         root: str
-             Path to the KITTI-360 dataset, i.e. the folder containing sequences such as 2013_05_28_drive_0009_sync
-         sequences: list of str
-             KITTI-360 sequences, e.g. 2013_05_28_drive_0009_sync
-         n_scans: int
-             Only use the first n_scans, since a KITTI-360 sequence is very long
-         """
-         self.root = root
-         self.img_H = 376
-         self.img_W = 1408
-         self.project_scale = 2
-         self.output_scale = 1
-         self.voxel_size = 0.2
-         self.vox_origin = np.array([0, -25.6, -2])
-         self.scene_size = (51.2, 51.2, 6.4)
-         self.T_velo_2_cam = self.get_velo2cam()
-         self.cam_k = self.get_cam_k()
-         self.scans = []
-         for sequence in sequences:
-             glob_path = os.path.join(
-                 self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png"
-             )
-             for img_path in glob.glob(glob_path):
-                 self.scans.append({"img_path": img_path, "sequence": sequence})
-         self.scans = self.scans[:n_scans]
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __len__(self):
-         return len(self.scans)
-
-     def get_cam_k(self):
-         cam_k = np.array(
-             [
-                 552.554261, 0.000000, 682.049453, 0.000000,
-                 0.000000, 552.554261, 238.769549, 0.000000,
-                 0.000000, 0.000000, 1.000000, 0.000000,
-             ]
-         ).reshape(3, 4)
-         return cam_k[:3, :3]
-
-     def get_velo2cam(self):
-         cam2velo = np.array(
-             [
-                 0.04307104361, -0.08829286498, 0.995162929, 0.8043914418,
-                 -0.999004371, 0.007784614041, 0.04392796942, 0.2993489574,
-                 -0.01162548558, -0.9960641394, -0.08786966659, -0.1770225824,
-             ]
-         ).reshape(3, 4)
-         cam2velo = np.concatenate(
-             [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0
-         )
-         return np.linalg.inv(cam2velo)
-
-     def __getitem__(self, index):
-         data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k}
-         scan = self.scans[index]
-         img_path = scan["img_path"]
-         sequence = scan["sequence"]
-         filename = os.path.basename(img_path)
-         frame_id = os.path.splitext(filename)[0]
-         data["frame_id"] = frame_id
-         data["img_path"] = img_path
-         data["sequence"] = sequence
-
-         img = Image.open(img_path).convert("RGB")
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-         img = self.normalize_rgb(img)
-         data["img"] = img
-
-         scale_3ds = [self.project_scale, self.output_scale]
-         data["scale_3ds"] = scale_3ds
-
-         for scale_3d in scale_3ds:
-             projected_pix, fov_mask, _ = vox2pix(
-                 self.T_velo_2_cam,
-                 self.cam_k,
-                 self.vox_origin,
-                 self.voxel_size * scale_3d,
-                 self.img_W,
-                 self.img_H,
-                 self.scene_size,
-             )
-             data["projected_pix_{}".format(scale_3d)] = projected_pix
-             data["fov_mask_{}".format(scale_3d)] = fov_mask
-         return data
monoscene/data/kitti_360/kitti_360_dm.py DELETED
@@ -1,32 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset
- import pytorch_lightning as pl
- from monoscene.data.kitti_360.collate import collate_fn
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class Kitti360DataModule(pl.LightningDataModule):
-     def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3):
-         super().__init__()
-         self.root = root
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.sequences = sequences
-         self.n_scans = n_scans
-
-     def setup(self, stage=None):
-         self.ds = Kitti360Dataset(
-             root=self.root, sequences=self.sequences, n_scans=self.n_scans
-         )
-
-     def dataloader(self):
-         return DataLoader(
-             self.ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
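Unlike the other data modules, this one exposes a single dataloader() with no train/val/test split, since KITTI-360 is used for inference only. Usage mirroring the commented-out block in app.py above (root path hypothetical):

data_module = Kitti360DataModule(
    root="/path/to/KITTI-360",                   # hypothetical dataset location
    sequences=["2013_05_28_drive_0009_sync"],
    n_scans=2000,
    batch_size=1,
    num_workers=3,
)
data_module.setup()
data_loader = data_module.dataloader()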
monoscene/data/semantic_kitti/collate.py DELETED
@@ -1,61 +0,0 @@
- import torch
-
-
- def collate_fn(batch):
-     data = {}
-     imgs = []
-     CP_mega_matrices = []
-     targets = []
-     frame_ids = []
-     sequences = []
-
-     cam_ks = []
-     T_velo_2_cams = []
-     frustums_masks = []
-     frustums_class_dists = []
-
-     scale_3ds = batch[0]["scale_3ds"]
-     for scale_3d in scale_3ds:
-         data["projected_pix_{}".format(scale_3d)] = []
-         data["fov_mask_{}".format(scale_3d)] = []
-
-     for idx, input_dict in enumerate(batch):
-         cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
-         T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
-
-         if "frustums_masks" in input_dict:
-             frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"]))
-             frustums_class_dists.append(
-                 torch.from_numpy(input_dict["frustums_class_dists"]).float()
-             )
-
-         for key in data:
-             data[key].append(torch.from_numpy(input_dict[key]))
-
-         img = input_dict["img"]
-         imgs.append(img)
-
-         frame_ids.append(input_dict["frame_id"])
-         sequences.append(input_dict["sequence"])
-
-         target = torch.from_numpy(input_dict["target"])
-         targets.append(target)
-         CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
-
-     ret_data = {
-         "frame_id": frame_ids,
-         "sequence": sequences,
-         "frustums_class_dists": frustums_class_dists,
-         "frustums_masks": frustums_masks,
-         "cam_k": cam_ks,
-         "T_velo_2_cam": T_velo_2_cams,
-         "img": torch.stack(imgs),
-         "CP_mega_matrices": CP_mega_matrices,
-         "target": torch.stack(targets)
-     }
-
-     for key in data:
-         ret_data[key] = data[key]
-     return ret_data
monoscene/data/semantic_kitti/io_data.py DELETED
@@ -1,239 +0,0 @@
- """
- Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
- """
-
- import numpy as np
- import yaml
- import imageio
-
-
- def unpack(compressed):
-     '''given a bit-encoded voxel grid, make a normal voxel grid out of it.'''
-     uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8)
-     uncompressed[::8] = compressed[:] >> 7 & 1
-     uncompressed[1::8] = compressed[:] >> 6 & 1
-     uncompressed[2::8] = compressed[:] >> 5 & 1
-     uncompressed[3::8] = compressed[:] >> 4 & 1
-     uncompressed[4::8] = compressed[:] >> 3 & 1
-     uncompressed[5::8] = compressed[:] >> 2 & 1
-     uncompressed[6::8] = compressed[:] >> 1 & 1
-     uncompressed[7::8] = compressed[:] & 1
-
-     return uncompressed
-
-
- def img_normalize(img, mean, std):
-     img = img.astype(np.float32) / 255.0
-     img = img - mean
-     img = img / std
-
-     return img
-
-
- def pack(array):
-     """convert a boolean array into a bitwise array."""
-     array = array.reshape((-1))
-
-     # compressing bit flags.
-     # yapf: disable
-     compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8]
-     # yapf: enable
-
-     return np.array(compressed, dtype=np.uint8)
-
-
- def get_grid_coords(dims, resolution):
-     '''
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: the center coords of voxels in the grid
-     '''
-
-     # The sensor is centered in X (we go to dims/2 + 1 for the histogramdd)
-     g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1)
-     # The sensor is at Y=0 (we go to dims + 1 for the histogramdd)
-     g_yy = np.arange(0, dims[1] + 1)
-     # The sensor is at Z=1.73. I observed that the ground was two voxel levels above the grid bottom, so the Z pose is at 10
-     # if the bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22
-     # (we go to 22 + 1 for the histogramdd)
-     # ATTENTION.. It is 11 for old grids, 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49)
-     sensor_pose = 10
-     g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution/2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid, g_xx, g_yy, g_zz
-
-
- def _get_remap_lut(config_path):
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-
-     dataset_config = yaml.safe_load(open(config_path, 'r'))
-     # make lookup table for mapping
-     maxkey = max(dataset_config['learning_map'].keys())
-
-     # +100 hack making lut bigger just in case there are unknown labels
-     remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
-     remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
-
-     # in completion we have to distinguish empty and invalid voxels.
-     # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
-     remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
-     remap_lut[0] = 0  # only 'empty' stays 'empty'.
-
-     return remap_lut
-
-
- def get_inv_map():
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-     config_path = "./semantic-kitti.yaml"
-     dataset_config = yaml.safe_load(open(config_path, 'r'))
-     # make lookup table for mapping
-
-     inv_map = np.zeros(20, dtype=np.int32)
-     inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values())
-
-     return inv_map
-
-
- def _read_SemKITTI(path, dtype, do_unpack):
-     bin = np.fromfile(path, dtype=dtype)  # Flattened array
-     if do_unpack:
-         bin = unpack(bin)
-     return bin
-
-
- def _read_label_SemKITTI(path):
-     label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
-     return label
-
-
- def _read_invalid_SemKITTI(path):
-     invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
-     return invalid
-
-
- def _read_occluded_SemKITTI(path):
-     occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
-     return occluded
-
-
- def _read_occupancy_SemKITTI(path):
-     occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32)
-     return occupancy
-
-
- def _read_rgb_SemKITTI(path):
-     rgb = np.asarray(imageio.imread(path))
-     return rgb
-
-
- def _read_pointcloud_SemKITTI(path):
-     'Return pointcloud semantic kitti with remissions (x, y, z, intensity)'
-     pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
-     pointcloud = pointcloud.reshape((-1, 4))
-     return pointcloud
-
-
- def _read_calib_SemKITTI(calib_path):
-     """
-     :param calib_path: Path to a calibration text file.
-     :return: dict with calibration matrices.
-     """
-     calib_all = {}
-     with open(calib_path, 'r') as f:
-         for line in f.readlines():
-             if line == '\n':
-                 break
-             key, value = line.split(':', 1)
-             calib_all[key] = np.array([float(x) for x in value.split()])
-
-     # reshape matrices
-     calib_out = {}
-     calib_out['P2'] = calib_all['P2'].reshape(3, 4)  # 3x4 projection matrix for left camera
-     calib_out['Tr'] = np.identity(4)  # 4x4 matrix
-     calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
-     return calib_out
-
-
- def get_remap_lut(path):
-     '''
-     remap_lut to remap classes of semantic kitti for training...
-     :return:
-     '''
-
-     dataset_config = yaml.safe_load(open(path, 'r'))
-
-     # make lookup table for mapping
-     maxkey = max(dataset_config['learning_map'].keys())
-
-     # +100 hack making lut bigger just in case there are unknown labels
-     remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
-     remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
-
-     # in completion we have to distinguish empty and invalid voxels.
-     # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
-     remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
-     remap_lut[0] = 0  # only 'empty' stays 'empty'.
-
-     return remap_lut
-
-
- def data_augmentation_3Dflips(flip, data):
-     # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data
-     # in memory. This gives errors when trying to pass the array to torch tensors. Solution seen in:
-     # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663
-     # Dims -> {XZY}
-     # Flipping around the X axis...
-     if np.isclose(flip, 1):
-         data = np.flip(data, axis=0).copy()
-
-     # Flipping around the Y axis...
-     if np.isclose(flip, 2):
-         data = np.flip(data, 2).copy()
-
-     # Flipping around the X and the Y axis...
-     if np.isclose(flip, 3):
-         data = np.flip(np.flip(data, axis=0), axis=2).copy()
-
-     return data
-
-
- def get_cmap_semanticKITTI20():
-     colors = np.array([
-         # [0, 0, 0, 255],
-         [100, 150, 245, 255],
-         [100, 230, 245, 255],
-         [30, 60, 150, 255],
-         [80, 30, 180, 255],
-         [100, 80, 250, 255],
-         [255, 30, 30, 255],
-         [255, 40, 200, 255],
-         [150, 30, 90, 255],
-         [255, 0, 255, 255],
-         [255, 150, 255, 255],
-         [75, 0, 75, 255],
-         [175, 0, 75, 255],
-         [255, 200, 0, 255],
-         [255, 120, 50, 255],
-         [0, 175, 0, 255],
-         [135, 60, 0, 255],
-         [150, 240, 80, 255],
-         [255, 240, 150, 255],
-         [255, 0, 0, 255]]).astype(np.uint8)
-
-     return colors
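The pack/unpack pair above is exactly inverse for 0/1 data: unpack reads each byte's bits MSB-first, and pack writes them back. A quick round-trip check (illustrative only, not part of the commit):

import numpy as np

bits = np.random.randint(0, 2, size=256 * 256 * 32, dtype=np.uint8)  # one bit per voxel
compressed = pack(bits)                        # 8x smaller bitwise array
assert compressed.nbytes == bits.size // 8
assert np.array_equal(unpack(compressed), bits)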
monoscene/data/semantic_kitti/kitti_dataset.py DELETED
@@ -1,200 +0,0 @@
- import torch
- import os
- import glob
- from torch.utils.data import Dataset
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from monoscene.data.utils.helpers import (
-     vox2pix,
-     compute_local_frustums,
-     compute_CP_mega_matrix,
- )
-
-
- class KittiDataset(Dataset):
-     def __init__(
-         self,
-         split,
-         root,
-         preprocess_root,
-         project_scale=2,
-         frustum_size=4,
-         color_jitter=None,
-         fliplr=0.0,
-     ):
-         super().__init__()
-         self.root = root
-         self.label_root = os.path.join(preprocess_root, "labels")
-         self.n_classes = 20
-         splits = {
-             "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
-             "val": ["08"],
-             "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
-         }
-         self.split = split
-         self.sequences = splits[split]
-         self.frustum_size = frustum_size
-         self.project_scale = project_scale
-         self.output_scale = int(self.project_scale / 2)
-         self.scene_size = (51.2, 51.2, 6.4)
-         self.vox_origin = np.array([0, -25.6, -2])
-         self.fliplr = fliplr
-
-         self.voxel_size = 0.2  # 0.2m
-         self.img_W = 1220
-         self.img_H = 370
-
-         self.color_jitter = (
-             transforms.ColorJitter(*color_jitter) if color_jitter else None
-         )
-         self.scans = []
-         for sequence in self.sequences:
-             calib = self.read_calib(
-                 os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
-             )
-             P = calib["P2"]
-             T_velo_2_cam = calib["Tr"]
-             proj_matrix = P @ T_velo_2_cam
-
-             glob_path = os.path.join(
-                 self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
-             )
-             for voxel_path in glob.glob(glob_path):
-                 self.scans.append(
-                     {
-                         "sequence": sequence,
-                         "P": P,
-                         "T_velo_2_cam": T_velo_2_cam,
-                         "proj_matrix": proj_matrix,
-                         "voxel_path": voxel_path,
-                     }
-                 )
-
-         self.normalize_rgb = transforms.Compose(
-             [
-                 transforms.ToTensor(),
-                 transforms.Normalize(
-                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-                 ),
-             ]
-         )
-
-     def __getitem__(self, index):
-         scan = self.scans[index]
-         voxel_path = scan["voxel_path"]
-         sequence = scan["sequence"]
-         P = scan["P"]
-         T_velo_2_cam = scan["T_velo_2_cam"]
-         proj_matrix = scan["proj_matrix"]
-
-         filename = os.path.basename(voxel_path)
-         frame_id = os.path.splitext(filename)[0]
-
-         rgb_path = os.path.join(
-             self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png"
-         )
-
-         data = {
-             "frame_id": frame_id,
-             "sequence": sequence,
-             "P": P,
-             "T_velo_2_cam": T_velo_2_cam,
-             "proj_matrix": proj_matrix,
-         }
-         scale_3ds = [self.output_scale, self.project_scale]
-         data["scale_3ds"] = scale_3ds
-         cam_k = P[0:3, 0:3]
-         data["cam_k"] = cam_k
-         for scale_3d in scale_3ds:
-
-             # compute the 3D-2D mapping
-             projected_pix, fov_mask, pix_z = vox2pix(
-                 T_velo_2_cam,
-                 cam_k,
-                 self.vox_origin,
-                 self.voxel_size * scale_3d,
-                 self.img_W,
-                 self.img_H,
-                 self.scene_size,
-             )
-
-             data["projected_pix_{}".format(scale_3d)] = projected_pix
-             data["pix_z_{}".format(scale_3d)] = pix_z
-             data["fov_mask_{}".format(scale_3d)] = fov_mask
-
-         target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy")
-         target = np.load(target_1_path)
-         data["target"] = target
-         target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy")
-         target_1_8 = np.load(target_8_path)
-         CP_mega_matrix = compute_CP_mega_matrix(target_1_8)
-         data["CP_mega_matrix"] = CP_mega_matrix
-
-         # Compute the masks, each indicates the voxels of a local frustum
-         if self.split != "test":
-             projected_pix_output = data["projected_pix_{}".format(self.output_scale)]
-             pix_z_output = data[
-                 "pix_z_{}".format(self.output_scale)
-             ]
-             frustums_masks, frustums_class_dists = compute_local_frustums(
-                 projected_pix_output,
-                 pix_z_output,
-                 target,
-                 self.img_W,
-                 self.img_H,
-                 dataset="kitti",
-                 n_classes=20,
-                 size=self.frustum_size,
-             )
-         else:
-             frustums_masks = None
-             frustums_class_dists = None
-         data["frustums_masks"] = frustums_masks
-         data["frustums_class_dists"] = frustums_class_dists
-
-         img = Image.open(rgb_path).convert("RGB")
-
-         # Image augmentation
-         if self.color_jitter is not None:
-             img = self.color_jitter(img)
-
-         # PIL to numpy
-         img = np.array(img, dtype=np.float32, copy=False) / 255.0
-         img = img[:370, :1220, :]  # crop image
-
-         # Fliplr the image
-         if np.random.rand() < self.fliplr:
-             img = np.ascontiguousarray(np.fliplr(img))
-             for scale in scale_3ds:
-                 key = "projected_pix_" + str(scale)
-                 data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0]
-
-         data["img"] = self.normalize_rgb(img)
-         return data
-
-     def __len__(self):
-         return len(self.scans)
-
-     @staticmethod
-     def read_calib(calib_path):
-         """
-         Modified from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
-         :param calib_path: Path to a calibration text file.
-         :return: dict with calibration matrices.
-         """
-         calib_all = {}
-         with open(calib_path, "r") as f:
-             for line in f.readlines():
-                 if line == "\n":
-                     break
-                 key, value = line.split(":", 1)
-                 calib_all[key] = np.array([float(x) for x in value.split()])
-
-         # reshape matrices
-         calib_out = {}
-         # 3x4 projection matrix for left camera
-         calib_out["P2"] = calib_all["P2"].reshape(3, 4)
-         calib_out["Tr"] = np.identity(4)  # 4x4 matrix
-         calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
-         return calib_out
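read_calib returns exactly the two matrices that the constructor multiplies into proj_matrix. A sketch of the projection they encode; the calib path and the sample LiDAR point are made up, illustrative only:

import numpy as np

calib = KittiDataset.read_calib("dataset/sequences/00/calib.txt")  # hypothetical path
proj_matrix = calib["P2"] @ calib["Tr"]        # same 3x4 product as in __init__

pt_velo = np.array([10.0, 2.0, -1.0, 1.0])     # homogeneous point in the LiDAR frame
u, v, w = proj_matrix @ pt_velo
print(u / w, v / w)                            # pixel coordinates after the perspective divide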
monoscene/data/semantic_kitti/kitti_dm.py DELETED
@@ -1,91 +0,0 @@
- from torch.utils.data.dataloader import DataLoader
- from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset
- import pytorch_lightning as pl
- from monoscene.data.semantic_kitti.collate import collate_fn
- from monoscene.data.utils.torch_util import worker_init_fn
-
-
- class KittiDataModule(pl.LightningDataModule):
-     def __init__(
-         self,
-         root,
-         preprocess_root,
-         project_scale=2,
-         frustum_size=4,
-         batch_size=4,
-         num_workers=6,
-     ):
-         super().__init__()
-         self.root = root
-         self.preprocess_root = preprocess_root
-         self.project_scale = project_scale
-         self.batch_size = batch_size
-         self.num_workers = num_workers
-         self.frustum_size = frustum_size
-
-     def setup(self, stage=None):
-         self.train_ds = KittiDataset(
-             split="train",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0.5,
-             color_jitter=(0.4, 0.4, 0.4),
-         )
-
-         self.val_ds = KittiDataset(
-             split="val",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0,
-             color_jitter=None,
-         )
-
-         self.test_ds = KittiDataset(
-             split="test",
-             root=self.root,
-             preprocess_root=self.preprocess_root,
-             project_scale=self.project_scale,
-             frustum_size=self.frustum_size,
-             fliplr=0,
-             color_jitter=None,
-         )
-
-     def train_dataloader(self):
-         return DataLoader(
-             self.train_ds,
-             batch_size=self.batch_size,
-             drop_last=True,
-             num_workers=self.num_workers,
-             shuffle=True,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def val_dataloader(self):
-         return DataLoader(
-             self.val_ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
-
-     def test_dataloader(self):
-         return DataLoader(
-             self.test_ds,
-             batch_size=self.batch_size,
-             drop_last=False,
-             num_workers=self.num_workers,
-             shuffle=False,
-             pin_memory=True,
-             worker_init_fn=worker_init_fn,
-             collate_fn=collate_fn,
-         )
monoscene/data/semantic_kitti/params.py DELETED
@@ -1,48 +0,0 @@
- import numpy as np
-
- semantic_kitti_class_frequencies = np.array(
-     [
-         5.41773033e09,
-         1.57835390e07,
-         1.25136000e05,
-         1.18809000e05,
-         6.46799000e05,
-         8.21951000e05,
-         2.62978000e05,
-         2.83696000e05,
-         2.04750000e05,
-         6.16887030e07,
-         4.50296100e06,
-         4.48836500e07,
-         2.26992300e06,
-         5.68402180e07,
-         1.57196520e07,
-         1.58442623e08,
-         2.06162300e06,
-         3.69705220e07,
-         1.15198800e06,
-         3.34146000e05,
-     ]
- )
- kitti_class_names = [
-     "empty",
-     "car",
-     "bicycle",
-     "motorcycle",
-     "truck",
-     "other-vehicle",
-     "person",
-     "bicyclist",
-     "motorcyclist",
-     "road",
-     "parking",
-     "sidewalk",
-     "other-ground",
-     "building",
-     "fence",
-     "vegetation",
-     "trunk",
-     "terrain",
-     "pole",
-     "traffic-sign",
- ]
monoscene/data/semantic_kitti/preprocess.py DELETED
@@ -1,102 +0,0 @@
- """
- Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py
- """
- import numpy as np
- from tqdm import tqdm
- import numpy.matlib
- import os
- import glob
- import hydra
- from omegaconf import DictConfig
- import monoscene.data.semantic_kitti.io_data as SemanticKittiIO
- from hydra.utils import get_original_cwd
- from monoscene.data.NYU.preprocess import _downsample_label
-
-
- def majority_pooling(grid, k_size=2):
-     result = np.zeros(
-         (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
-     )
-     for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
-         for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
-             for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
-
-                 sub_m = grid[
-                     (xx * k_size) : (xx * k_size) + k_size,
-                     (yy * k_size) : (yy * k_size) + k_size,
-                     (zz * k_size) : (zz * k_size) + k_size,
-                 ]
-                 unique, counts = np.unique(sub_m, return_counts=True)
-                 if True in ((unique != 0) & (unique != 255)):
-                     # Remove counts with 0 and 255
-                     counts = counts[((unique != 0) & (unique != 255))]
-                     unique = unique[((unique != 0) & (unique != 255))]
-                 else:
-                     if True in (unique == 0):
-                         counts = counts[(unique != 255)]
-                         unique = unique[(unique != 255)]
-                 value = unique[np.argmax(counts)]
-                 result[xx, yy, zz] = value
-     return result
-
-
- @hydra.main(config_name="../../config/monoscene.yaml")
- def main(config: DictConfig):
-     scene_size = (256, 256, 32)
-     sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
-     remap_lut = SemanticKittiIO.get_remap_lut(
-         os.path.join(
-             get_original_cwd(),
-             "monoscene",
-             "data",
-             "semantic_kitti",
-             "semantic-kitti.yaml",
-         )
-     )
-
-     for sequence in sequences:
-         sequence_path = os.path.join(
-             config.kitti_root, "dataset", "sequences", sequence
-         )
-         label_paths = sorted(
-             glob.glob(os.path.join(sequence_path, "voxels", "*.label"))
-         )
-         invalid_paths = sorted(
-             glob.glob(os.path.join(sequence_path, "voxels", "*.invalid"))
-         )
-         out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence)
-         os.makedirs(out_dir, exist_ok=True)
-
-         downscaling = {"1_1": 1, "1_8": 8}
-
-         for i in tqdm(range(len(label_paths))):
-
-             frame_id, extension = os.path.splitext(os.path.basename(label_paths[i]))
-
-             LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i])
-             INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i])
-             LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
-                 np.float32
-             )  # Remap to the 20 semanticKITTI SSC classes
-             LABEL[
-                 np.isclose(INVALID, 1)
-             ] = 255  # Set all voxels marked in the invalid mask to unknown
-             LABEL = LABEL.reshape([256, 256, 32])
-
-             for scale in downscaling:
-                 filename = frame_id + "_" + scale + ".npy"
-                 label_filename = os.path.join(out_dir, filename)
-                 # If files have not been created...
-                 if not os.path.exists(label_filename):
-                     if scale == "1_8":
-                         LABEL_ds = _downsample_label(
-                             LABEL, (256, 256, 32), downscaling[scale]
-                         )
-                     else:
-                         LABEL_ds = LABEL
-                     np.save(label_filename, LABEL_ds)
-                     print("wrote to", label_filename)
-
-
- if __name__ == "__main__":
-     main()
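majority_pooling above keeps, per k x k x k block, the most frequent label that is neither 0 (empty) nor 255 (invalid) whenever one exists. A toy check (illustrative only, not part of the commit):

import numpy as np

grid = np.array([[[1, 1], [2, 255]],
                 [[1, 0], [2, 2]]], dtype=np.uint8)
print(majority_pooling(grid, k_size=2))  # [[[1.]]]: labels 1 and 2 tie, np.argmax keeps the first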
monoscene/data/semantic_kitti/semantic-kitti.yaml DELETED
@@ -1,213 +0,0 @@
- # This file is covered by the LICENSE file in the root of this project.
- nbr_classes: 20
- grid_dims: [256, 32, 256] # (W, H, D)
- labels:
-   0 : "unlabeled"
-   1 : "outlier"
-   10: "car"
-   11: "bicycle"
-   13: "bus"
-   15: "motorcycle"
-   16: "on-rails"
-   18: "truck"
-   20: "other-vehicle"
-   30: "person"
-   31: "bicyclist"
-   32: "motorcyclist"
-   40: "road"
-   44: "parking"
-   48: "sidewalk"
-   49: "other-ground"
-   50: "building"
-   51: "fence"
-   52: "other-structure"
-   60: "lane-marking"
-   70: "vegetation"
-   71: "trunk"
-   72: "terrain"
-   80: "pole"
-   81: "traffic-sign"
-   99: "other-object"
-   252: "moving-car"
-   253: "moving-bicyclist"
-   254: "moving-person"
-   255: "moving-motorcyclist"
-   256: "moving-on-rails"
-   257: "moving-bus"
-   258: "moving-truck"
-   259: "moving-other-vehicle"
- color_map: # bgr
-   0 : [0, 0, 0]
-   1 : [0, 0, 255]
-   10: [245, 150, 100]
-   11: [245, 230, 100]
-   13: [250, 80, 100]
-   15: [150, 60, 30]
-   16: [255, 0, 0]
-   18: [180, 30, 80]
-   20: [255, 0, 0]
-   30: [30, 30, 255]
-   31: [200, 40, 255]
-   32: [90, 30, 150]
-   40: [255, 0, 255]
-   44: [255, 150, 255]
-   48: [75, 0, 75]
-   49: [75, 0, 175]
-   50: [0, 200, 255]
-   51: [50, 120, 255]
-   52: [0, 150, 255]
-   60: [170, 255, 150]
-   70: [0, 175, 0]
-   71: [0, 60, 135]
-   72: [80, 240, 150]
-   80: [150, 240, 255]
-   81: [0, 0, 255]
-   99: [255, 255, 50]
-   252: [245, 150, 100]
-   256: [255, 0, 0]
-   253: [200, 40, 255]
-   254: [30, 30, 255]
-   255: [90, 30, 150]
-   257: [250, 80, 100]
-   258: [180, 30, 80]
-   259: [255, 0, 0]
- content: # as a ratio with the total number of points
-   0: 0.018889854628292943
-   1: 0.0002937197336781505
-   10: 0.040818519255974316
-   11: 0.00016609538710764618
-   13: 2.7879693665067774e-05
-   15: 0.00039838616015114444
-   16: 0.0
-   18: 0.0020633612104619787
-   20: 0.0016218197275284021
-   30: 0.00017698551338515307
-   31: 1.1065903904919655e-08
-   32: 5.532951952459828e-09
-   40: 0.1987493871255525
-   44: 0.014717169549888214
-   48: 0.14392298360372
-   49: 0.0039048553037472045
-   50: 0.1326861944777486
-   51: 0.0723592229456223
-   52: 0.002395131480328884
-   60: 4.7084144280367186e-05
-   70: 0.26681502148037506
-   71: 0.006035012012626033
-   72: 0.07814222006271769
-   80: 0.002855498193863172
-   81: 0.0006155958086189918
-   99: 0.009923127583046915
-   252: 0.001789309418528068
-   253: 0.00012709999297008662
-   254: 0.00016059776092534436
-   255: 3.745553104802113e-05
-   256: 0.0
-   257: 0.00011351574470342043
-   258: 0.00010157861367183268
-   259: 4.3840131989471124e-05
- # classes that are indistinguishable from single scan or inconsistent in
- # ground truth are mapped to their closest equivalent
- learning_map:
-   0 : 0    # "unlabeled"
-   1 : 0    # "outlier" mapped to "unlabeled" --------------------------mapped
-   10: 1    # "car"
-   11: 2    # "bicycle"
-   13: 5    # "bus" mapped to "other-vehicle" --------------------------mapped
-   15: 3    # "motorcycle"
-   16: 5    # "on-rails" mapped to "other-vehicle" ---------------------mapped
-   18: 4    # "truck"
-   20: 5    # "other-vehicle"
-   30: 6    # "person"
-   31: 7    # "bicyclist"
-   32: 8    # "motorcyclist"
-   40: 9    # "road"
-   44: 10   # "parking"
-   48: 11   # "sidewalk"
-   49: 12   # "other-ground"
-   50: 13   # "building"
-   51: 14   # "fence"
-   52: 0    # "other-structure" mapped to "unlabeled" ------------------mapped
-   60: 9    # "lane-marking" to "road" ---------------------------------mapped
-   70: 15   # "vegetation"
-   71: 16   # "trunk"
-   72: 17   # "terrain"
-   80: 18   # "pole"
-   81: 19   # "traffic-sign"
-   99: 0    # "other-object" to "unlabeled" ----------------------------mapped
-   252: 1   # "moving-car" to "car" ------------------------------------mapped
-   253: 7   # "moving-bicyclist" to "bicyclist" ------------------------mapped
-   254: 6   # "moving-person" to "person" ------------------------------mapped
-   255: 8   # "moving-motorcyclist" to "motorcyclist" ------------------mapped
-   256: 5   # "moving-on-rails" mapped to "other-vehicle" --------------mapped
-   257: 5   # "moving-bus" mapped to "other-vehicle" -------------------mapped
-   258: 4   # "moving-truck" to "truck" --------------------------------mapped
-   259: 5   # "moving-other"-vehicle to "other-vehicle" ----------------mapped
- learning_map_inv: # inverse of previous map
-   0: 0    # "unlabeled", and others ignored
-   1: 10   # "car"
-   2: 11   # "bicycle"
-   3: 15   # "motorcycle"
-   4: 18   # "truck"
-   5: 20   # "other-vehicle"
-   6: 30   # "person"
-   7: 31   # "bicyclist"
-   8: 32   # "motorcyclist"
-   9: 40   # "road"
-   10: 44  # "parking"
-   11: 48  # "sidewalk"
-   12: 49  # "other-ground"
-   13: 50  # "building"
-   14: 51  # "fence"
-   15: 70  # "vegetation"
-   16: 71  # "trunk"
-   17: 72  # "terrain"
-   18: 80  # "pole"
-   19: 81  # "traffic-sign"
- learning_ignore: # Ignore classes
-   0: True   # "unlabeled", and others ignored
-   1: False  # "car"
-   2: False  # "bicycle"
-   3: False  # "motorcycle"
-   4: False  # "truck"
-   5: False  # "other-vehicle"
-   6: False  # "person"
-   7: False  # "bicyclist"
-   8: False  # "motorcyclist"
-   9: False  # "road"
-   10: False # "parking"
-   11: False # "sidewalk"
-   12: False # "other-ground"
-   13: False # "building"
-   14: False # "fence"
-   15: False # "vegetation"
-   16: False # "trunk"
-   17: False # "terrain"
-   18: False # "pole"
-   19: False # "traffic-sign"
- split: # sequence numbers
-   train:
-     - 0
-     - 1
-     - 2
-     - 3
-     - 4
-     - 5
-     - 6
-     - 7
-     - 9
-     - 10
-   valid:
-     - 8
-   test:
-     - 11
-     - 12
-     - 13
-     - 14
-     - 15
-     - 16
-     - 17
-     - 18
-     - 19
-     - 20
-     - 21
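For context, configs like this one are usually consumed by building a lookup table over the raw label ids — the `remap_lut` indexed in the deleted preprocess.py above. A hedged sketch (the repo's own helper lives in the deleted io_data.py, which is not shown in this diff, so treat the details as assumptions):

import numpy as np
import yaml

# Assumed path; point this at wherever semantic-kitti.yaml lives.
with open("semantic-kitti.yaml") as f:
    cfg = yaml.safe_load(f)

# Size the LUT past the largest raw id so raw labels can index it directly.
maxkey = max(cfg["learning_map"].keys())
remap_lut = np.zeros(maxkey + 100, dtype=np.int32)
remap_lut[list(cfg["learning_map"].keys())] = list(cfg["learning_map"].values())

# Then, as in the deleted preprocess.py:
# LABEL = remap_lut[LABEL.astype(np.uint16)]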
monoscene/data/utils/fusion.py DELETED
@@ -1,507 +0,0 @@
- """
- Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
-
- @inproceedings{zeng20163dmatch,
-     title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
-     author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
-     booktitle={CVPR},
-     year={2017}
- }
- """
-
- import numpy as np
-
- from numba import njit, prange
- from skimage import measure
-
- FUSION_GPU_MODE = 0
-
-
- class TSDFVolume:
-     """Volumetric TSDF Fusion of RGB-D Images."""
-
-     def __init__(self, vol_bnds, voxel_size, use_gpu=True):
-         """Constructor.
-
-         Args:
-             vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
-                 xyz bounds (min/max) in meters.
-             voxel_size (float): The volume discretization in meters.
-         """
-         vol_bnds = np.asarray(vol_bnds)
-         assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
-
-         # Define voxel volume parameters
-         self._vol_bnds = vol_bnds
-         self._voxel_size = float(voxel_size)
-         self._trunc_margin = 5 * self._voxel_size  # truncation on SDF
-         # self._trunc_margin = 10  # truncation on SDF
-         self._color_const = 256 * 256
-
-         # Adjust volume bounds and ensure C-order contiguous
-         self._vol_dim = (
-             np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
-             .copy(order="C")
-             .astype(int)
-         )
-         self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
-         self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
-
-         print(
-             "Voxel volume size: {} x {} x {} - # points: {:,}".format(
-                 self._vol_dim[0],
-                 self._vol_dim[1],
-                 self._vol_dim[2],
-                 self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
-             )
-         )
-
-         # Initialize pointers to voxel volume in CPU memory
-         self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-         # for computing the cumulative moving average of observations per voxel
-         self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-         self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
-
-         self.gpu_mode = use_gpu and FUSION_GPU_MODE
-
-         # Copy voxel volumes to GPU
-         if self.gpu_mode:
-             self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
-             self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
-             self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
-             cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
-
-             # Cuda kernel function (C++)
-             self._cuda_src_mod = SourceModule(
-                 """
-                 __global__ void integrate(float * tsdf_vol,
-                                           float * weight_vol,
-                                           float * color_vol,
-                                           float * vol_dim,
-                                           float * vol_origin,
-                                           float * cam_intr,
-                                           float * cam_pose,
-                                           float * other_params,
-                                           float * color_im,
-                                           float * depth_im) {
-                     // Get voxel index
-                     int gpu_loop_idx = (int) other_params[0];
-                     int max_threads_per_block = blockDim.x;
-                     int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
-                     int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
-                     int vol_dim_x = (int) vol_dim[0];
-                     int vol_dim_y = (int) vol_dim[1];
-                     int vol_dim_z = (int) vol_dim[2];
-                     if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
-                         return;
-                     // Get voxel grid coordinates (note: be careful when casting)
-                     float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
-                     float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
-                     float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
-                     // Voxel grid coordinates to world coordinates
-                     float voxel_size = other_params[1];
-                     float pt_x = vol_origin[0]+voxel_x*voxel_size;
-                     float pt_y = vol_origin[1]+voxel_y*voxel_size;
-                     float pt_z = vol_origin[2]+voxel_z*voxel_size;
-                     // World coordinates to camera coordinates
-                     float tmp_pt_x = pt_x-cam_pose[0*4+3];
-                     float tmp_pt_y = pt_y-cam_pose[1*4+3];
-                     float tmp_pt_z = pt_z-cam_pose[2*4+3];
-                     float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
-                     float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
-                     float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
-                     // Camera coordinates to image pixels
-                     int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
-                     int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
-                     // Skip if outside view frustum
-                     int im_h = (int) other_params[2];
-                     int im_w = (int) other_params[3];
-                     if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
-                         return;
-                     // Skip invalid depth
-                     float depth_value = depth_im[pixel_y*im_w+pixel_x];
-                     if (depth_value == 0)
-                         return;
-                     // Integrate TSDF
-                     float trunc_margin = other_params[4];
-                     float depth_diff = depth_value-cam_pt_z;
-                     if (depth_diff < -trunc_margin)
-                         return;
-                     float dist = fmin(1.0f,depth_diff/trunc_margin);
-                     float w_old = weight_vol[voxel_idx];
-                     float obs_weight = other_params[5];
-                     float w_new = w_old + obs_weight;
-                     weight_vol[voxel_idx] = w_new;
-                     tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
-                     // Integrate color
-                     float old_color = color_vol[voxel_idx];
-                     float old_b = floorf(old_color/(256*256));
-                     float old_g = floorf((old_color-old_b*256*256)/256);
-                     float old_r = old_color-old_b*256*256-old_g*256;
-                     float new_color = color_im[pixel_y*im_w+pixel_x];
-                     float new_b = floorf(new_color/(256*256));
-                     float new_g = floorf((new_color-new_b*256*256)/256);
-                     float new_r = new_color-new_b*256*256-new_g*256;
-                     new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
-                     new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
-                     new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
-                     color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
-                 }"""
-             )
-
-             self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
-
-             # Determine block/grid size on GPU
-             gpu_dev = cuda.Device(0)
-             self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
-             n_blocks = int(
-                 np.ceil(
-                     float(np.prod(self._vol_dim))
-                     / float(self._max_gpu_threads_per_block)
-                 )
-             )
-             grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
-             grid_dim_y = min(
-                 gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
-             )
-             grid_dim_z = min(
-                 gpu_dev.MAX_GRID_DIM_Z,
-                 int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
-             )
-             self._max_gpu_grid_dim = np.array(
-                 [grid_dim_x, grid_dim_y, grid_dim_z]
-             ).astype(int)
-             self._n_gpu_loops = int(
-                 np.ceil(
-                     float(np.prod(self._vol_dim))
-                     / float(
-                         np.prod(self._max_gpu_grid_dim)
-                         * self._max_gpu_threads_per_block
-                     )
-                 )
-             )
-
-         else:
-             # Get voxel grid coordinates
-             xv, yv, zv = np.meshgrid(
-                 range(self._vol_dim[0]),
-                 range(self._vol_dim[1]),
-                 range(self._vol_dim[2]),
-                 indexing="ij",
-             )
-             self.vox_coords = (
-                 np.concatenate(
-                     [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
-                 )
-                 .astype(int)
-                 .T
-             )
-
-     @staticmethod
-     @njit(parallel=True)
-     def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
-         """Convert voxel grid coordinates to world coordinates."""
-         vol_origin = vol_origin.astype(np.float32)
-         vox_coords = vox_coords.astype(np.float32)
-         # print(np.min(vox_coords))
-         cam_pts = np.empty_like(vox_coords, dtype=np.float32)
-
-         for i in prange(vox_coords.shape[0]):
-             for j in range(3):
-                 cam_pts[i, j] = (
-                     vol_origin[j]
-                     + (vox_size * vox_coords[i, j])
-                     + vox_size * offsets[j]
-                 )
-         return cam_pts
-
-     @staticmethod
-     @njit(parallel=True)
-     def cam2pix(cam_pts, intr):
-         """Convert camera coordinates to pixel coordinates."""
-         intr = intr.astype(np.float32)
-         fx, fy = intr[0, 0], intr[1, 1]
-         cx, cy = intr[0, 2], intr[1, 2]
-         pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
-         for i in prange(cam_pts.shape[0]):
-             pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
-             pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
-         return pix
-
-     @staticmethod
-     @njit(parallel=True)
-     def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
-         """Integrate the TSDF volume."""
-         tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
-         # print(tsdf_vol.shape)
-         w_new = np.empty_like(w_old, dtype=np.float32)
-         for i in prange(len(tsdf_vol)):
-             w_new[i] = w_old[i] + obs_weight
-             tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
-         return tsdf_vol_int, w_new
-
-     def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
-         """Integrate an RGB-D frame into the TSDF volume.
-
-         Args:
-             color_im (ndarray): An RGB image of shape (H, W, 3).
-             depth_im (ndarray): A depth image of shape (H, W).
-             cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
-             cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
-             obs_weight (float): The weight to assign for the current observation. A higher
-                 value
-         """
-         im_h, im_w = depth_im.shape
-
-         # Fold RGB color image into a single channel image
-         color_im = color_im.astype(np.float32)
-         color_im = np.floor(
-             color_im[..., 2] * self._color_const
-             + color_im[..., 1] * 256
-             + color_im[..., 0]
-         )
-
-         if self.gpu_mode:  # GPU mode: integrate voxel volume (calls CUDA kernel)
-             for gpu_loop_idx in range(self._n_gpu_loops):
-                 self._cuda_integrate(
-                     self._tsdf_vol_gpu,
-                     self._weight_vol_gpu,
-                     self._color_vol_gpu,
-                     cuda.InOut(self._vol_dim.astype(np.float32)),
-                     cuda.InOut(self._vol_origin.astype(np.float32)),
-                     cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
-                     cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
-                     cuda.InOut(
-                         np.asarray(
-                             [
-                                 gpu_loop_idx,
-                                 self._voxel_size,
-                                 im_h,
-                                 im_w,
-                                 self._trunc_margin,
-                                 obs_weight,
-                             ],
-                             np.float32,
-                         )
-                     ),
-                     cuda.InOut(color_im.reshape(-1).astype(np.float32)),
-                     cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
-                     block=(self._max_gpu_threads_per_block, 1, 1),
-                     grid=(
-                         int(self._max_gpu_grid_dim[0]),
-                         int(self._max_gpu_grid_dim[1]),
-                         int(self._max_gpu_grid_dim[2]),
-                     ),
-                 )
-         else:  # CPU mode: integrate voxel volume (vectorized implementation)
-             # Convert voxel grid coordinates to pixel coordinates
-             cam_pts = self.vox2world(
-                 self._vol_origin, self.vox_coords, self._voxel_size
-             )
-             cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
-             pix_z = cam_pts[:, 2]
-             pix = self.cam2pix(cam_pts, cam_intr)
-             pix_x, pix_y = pix[:, 0], pix[:, 1]
-
-             # Eliminate pixels outside view frustum
-             valid_pix = np.logical_and(
-                 pix_x >= 0,
-                 np.logical_and(
-                     pix_x < im_w,
-                     np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
-                 ),
-             )
-             depth_val = np.zeros(pix_x.shape)
-             depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
-
-             # Integrate TSDF
-             depth_diff = depth_val - pix_z
-
-             valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
-             dist = depth_diff
-
-             valid_vox_x = self.vox_coords[valid_pts, 0]
-             valid_vox_y = self.vox_coords[valid_pts, 1]
-             valid_vox_z = self.vox_coords[valid_pts, 2]
-             w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             valid_dist = dist[valid_pts]
-             tsdf_vol_new, w_new = self.integrate_tsdf(
-                 tsdf_vals, valid_dist, w_old, obs_weight
-             )
-             self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
-             self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
-
-             # Integrate color
-             old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
-             old_b = np.floor(old_color / self._color_const)
-             old_g = np.floor((old_color - old_b * self._color_const) / 256)
-             old_r = old_color - old_b * self._color_const - old_g * 256
-             new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
-             new_b = np.floor(new_color / self._color_const)
-             new_g = np.floor((new_color - new_b * self._color_const) / 256)
-             new_r = new_color - new_b * self._color_const - new_g * 256
-             new_b = np.minimum(
-                 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
-             )
-             new_g = np.minimum(
-                 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
-             )
-             new_r = np.minimum(
-                 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
-             )
-             self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
-                 new_b * self._color_const + new_g * 256 + new_r
-             )
-
-     def get_volume(self):
-         if self.gpu_mode:
-             cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
-             cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
-         return self._tsdf_vol_cpu, self._color_vol_cpu
-
-     def get_point_cloud(self):
-         """Extract a point cloud from the voxel volume."""
-         tsdf_vol, color_vol = self.get_volume()
-
-         # Marching cubes
-         verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
-         verts_ind = np.round(verts).astype(int)
-         verts = verts * self._voxel_size + self._vol_origin
-
-         # Get vertex colors
-         rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
-         colors_b = np.floor(rgb_vals / self._color_const)
-         colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
-         colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
-         colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
-         colors = colors.astype(np.uint8)
-
-         pc = np.hstack([verts, colors])
-         return pc
-
-     def get_mesh(self):
-         """Compute a mesh from the voxel volume using marching cubes."""
-         tsdf_vol, color_vol = self.get_volume()
-
-         # Marching cubes
-         verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
-         verts_ind = np.round(verts).astype(int)
-         verts = (
-             verts * self._voxel_size + self._vol_origin
-         )  # voxel grid coordinates to world coordinates
-
-         # Get vertex colors
-         rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
-         colors_b = np.floor(rgb_vals / self._color_const)
-         colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
-         colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
-         colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
-         colors = colors.astype(np.uint8)
-         return verts, faces, norms, colors
-
-
- def rigid_transform(xyz, transform):
-     """Applies a rigid transform to an (N, 3) pointcloud."""
-     xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
-     xyz_t_h = np.dot(transform, xyz_h.T).T
-     return xyz_t_h[:, :3]
-
-
- def get_view_frustum(depth_im, cam_intr, cam_pose):
-     """Get corners of 3D camera view frustum of depth image"""
-     im_h = depth_im.shape[0]
-     im_w = depth_im.shape[1]
-     max_depth = np.max(depth_im)
-     view_frust_pts = np.array(
-         [
-             (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
-             * np.array([0, max_depth, max_depth, max_depth, max_depth])
-             / cam_intr[0, 0],
-             (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
-             * np.array([0, max_depth, max_depth, max_depth, max_depth])
-             / cam_intr[1, 1],
-             np.array([0, max_depth, max_depth, max_depth, max_depth]),
-         ]
-     )
-     view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
-     return view_frust_pts
-
-
- def meshwrite(filename, verts, faces, norms, colors):
-     """Save a 3D mesh to a polygon .ply file."""
-     # Write header
-     ply_file = open(filename, "w")
-     ply_file.write("ply\n")
-     ply_file.write("format ascii 1.0\n")
-     ply_file.write("element vertex %d\n" % (verts.shape[0]))
-     ply_file.write("property float x\n")
-     ply_file.write("property float y\n")
-     ply_file.write("property float z\n")
-     ply_file.write("property float nx\n")
-     ply_file.write("property float ny\n")
-     ply_file.write("property float nz\n")
-     ply_file.write("property uchar red\n")
-     ply_file.write("property uchar green\n")
-     ply_file.write("property uchar blue\n")
-     ply_file.write("element face %d\n" % (faces.shape[0]))
-     ply_file.write("property list uchar int vertex_index\n")
-     ply_file.write("end_header\n")
-
-     # Write vertex list
-     for i in range(verts.shape[0]):
-         ply_file.write(
-             "%f %f %f %f %f %f %d %d %d\n"
-             % (
-                 verts[i, 0],
-                 verts[i, 1],
-                 verts[i, 2],
-                 norms[i, 0],
-                 norms[i, 1],
-                 norms[i, 2],
-                 colors[i, 0],
-                 colors[i, 1],
-                 colors[i, 2],
-             )
-         )
-
-     # Write face list
-     for i in range(faces.shape[0]):
-         ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
-
-     ply_file.close()
-
-
- def pcwrite(filename, xyzrgb):
-     """Save a point cloud to a polygon .ply file."""
-     xyz = xyzrgb[:, :3]
-     rgb = xyzrgb[:, 3:].astype(np.uint8)
-
-     # Write header
-     ply_file = open(filename, "w")
-     ply_file.write("ply\n")
-     ply_file.write("format ascii 1.0\n")
-     ply_file.write("element vertex %d\n" % (xyz.shape[0]))
-     ply_file.write("property float x\n")
-     ply_file.write("property float y\n")
-     ply_file.write("property float z\n")
-     ply_file.write("property uchar red\n")
-     ply_file.write("property uchar green\n")
-     ply_file.write("property uchar blue\n")
-     ply_file.write("end_header\n")
-
-     # Write vertex list
-     for i in range(xyz.shape[0]):
-         ply_file.write(
-             "%f %f %f %d %d %d\n"
-             % (
-                 xyz[i, 0],
-                 xyz[i, 1],
-                 xyz[i, 2],
-                 rgb[i, 0],
-                 rgb[i, 1],
-                 rgb[i, 2],
-             )
-         )
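The deleted module is driven roughly as follows. A usage sketch in CPU mode, where `frames` and the intrinsics file are placeholders, not artifacts of this repo:

import numpy as np
# import fusion  # i.e. the module deleted above

cam_intr = np.loadtxt("camera-intrinsics.txt")  # assumed 3x3 intrinsics file
vol_bnds = np.array([[0.0, 4.8], [0.0, 4.8], [0.0, 2.88]])  # NYU-sized scene

tsdf = fusion.TSDFVolume(vol_bnds, voxel_size=0.02, use_gpu=False)
for color_im, depth_im, cam_pose in frames:  # placeholder RGB-D sequence
    tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)

verts, faces, norms, colors = tsdf.get_mesh()
fusion.meshwrite("mesh.ply", verts, faces, norms, colors)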
monoscene/data/utils/helpers.py DELETED
@@ -1,185 +0,0 @@
- import numpy as np
- import monoscene.data.utils.fusion as fusion
- import torch
-
-
- def compute_CP_mega_matrix(target, is_binary=False):
-     """
-     Parameters
-     ---------
-     target: (H, W, D)
-         contains voxels semantic labels
-
-     is_binary: bool
-         if True, return binary voxels relations else return 4-way relations
-     """
-     label = target.reshape(-1)
-     label_row = label
-     N = label.shape[0]
-     super_voxel_size = [i//2 for i in target.shape]
-     if is_binary:
-         matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
-     else:
-         matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
-
-     for xx in range(super_voxel_size[0]):
-         for yy in range(super_voxel_size[1]):
-             for zz in range(super_voxel_size[2]):
-                 col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz
-                 label_col_megas = np.array([
-                     target[xx * 2, yy * 2, zz * 2],
-                     target[xx * 2 + 1, yy * 2, zz * 2],
-                     target[xx * 2, yy * 2 + 1, zz * 2],
-                     target[xx * 2, yy * 2, zz * 2 + 1],
-                     target[xx * 2 + 1, yy * 2 + 1, zz * 2],
-                     target[xx * 2 + 1, yy * 2, zz * 2 + 1],
-                     target[xx * 2, yy * 2 + 1, zz * 2 + 1],
-                     target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1],
-                 ])
-                 label_col_megas = label_col_megas[label_col_megas != 255]
-                 for label_col_mega in label_col_megas:
-                     label_col = np.ones(N) * label_col_mega
-                     if not is_binary:
-                         matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0  # non non same
-                         matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0  # non non diff
-                         matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0  # empty empty
-                         matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0  # nonempty empty
-                     else:
-                         matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0  # diff
-                         matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0  # same
-     return matrix
-
-
- def vox2pix(cam_E, cam_k,
-             vox_origin, voxel_size,
-             img_W, img_H,
-             scene_size):
-     """
-     compute the 2D projection of voxels centroids
-
-     Parameters:
-     ----------
-     cam_E: 4x4
-         =camera pose in case of NYUv2 dataset
-         =Transformation from camera to lidar coordinate in case of SemKITTI
-     cam_k: 3x3
-         camera intrinsics
-     vox_origin: (3,)
-         world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0)
-     img_W: int
-         image width
-     img_H: int
-         image height
-     scene_size: (3,)
-         scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
-
-     Returns
-     -------
-     projected_pix: (N, 2)
-         Projected 2D positions of voxels
-     fov_mask: (N,)
-         Voxels mask indice voxels inside image's FOV
-     pix_z: (N,)
-         Voxels'distance to the sensor in meter
-     """
-     # Compute the x, y, z bounding of the scene in meter
-     vol_bnds = np.zeros((3,2))
-     vol_bnds[:,0] = vox_origin
-     vol_bnds[:,1] = vox_origin + np.array(scene_size)
-
-     # Compute the voxels centroids in lidar cooridnates
-     vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
-     xv, yv, zv = np.meshgrid(
-         range(vol_dim[0]),
-         range(vol_dim[1]),
-         range(vol_dim[2]),
-         indexing='ij'
-     )
-     vox_coords = np.concatenate([
-         xv.reshape(1,-1),
-         yv.reshape(1,-1),
-         zv.reshape(1,-1)
-     ], axis=0).astype(int).T
-
-     # Project voxels'centroid from lidar coordinates to camera coordinates
-     cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
-     cam_pts = fusion.rigid_transform(cam_pts, cam_E)
-
-     # Project camera coordinates to pixel positions
-     projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
-     pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
-
-     # Eliminate pixels outside view frustum
-     pix_z = cam_pts[:, 2]
-     fov_mask = np.logical_and(pix_x >= 0,
-                 np.logical_and(pix_x < img_W,
-                 np.logical_and(pix_y >= 0,
-                 np.logical_and(pix_y < img_H,
-                 pix_z > 0))))
-
-
-     return projected_pix, fov_mask, pix_z
-
-
- def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, pix_z):
-     valid_pix = np.logical_and(pix_x >= min_x,
-                 np.logical_and(pix_x < max_x,
-                 np.logical_and(pix_y >= min_y,
-                 np.logical_and(pix_y < max_y,
-                 pix_z > 0))))
-     return valid_pix
-
- def compute_local_frustums(projected_pix, pix_z, target, img_W, img_H, dataset, n_classes, size=4):
-     """
-     Compute the local frustums mask and their class frequencies
-
-     Parameters:
-     ----------
-     projected_pix: (N, 2)
-         2D projected pix of all voxels
-     pix_z: (N,)
-         Distance of the camera sensor to voxels
-     target: (H, W, D)
-         Voxelized sematic labels
-     img_W: int
-         Image width
-     img_H: int
-         Image height
-     dataset: str
-         ="NYU" or "kitti" (for both SemKITTI and KITTI-360)
-     n_classes: int
-         Number of classes (12 for NYU and 20 for SemKITTI)
-     size: int
-         determine the number of local frustums i.e. size * size
-
-     Returns
-     -------
-     frustums_masks: (n_frustums, N)
-         List of frustums_masks, each indicates the belonging voxels
-     frustums_class_dists: (n_frustums, n_classes)
-         Contains the class frequencies in each frustum
-     """
-     H, W, D = target.shape
-     ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)]
-     local_frustum_masks = []
-     local_frustum_class_dists = []
-     pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
-     for y in ranges:
-         for x in ranges:
-             start_x = x[0] * img_W
-             end_x = x[1] * img_W
-             start_y = y[0] * img_H
-             end_y = y[1] * img_H
-             local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, pix_z)
-             if dataset == "NYU":
-                 mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1])
-             elif dataset == "kitti":
-                 mask = (target != 255) & local_frustum.reshape(H, W, D)
-
-             local_frustum_masks.append(mask)
-             classes, cnts = np.unique(target[mask], return_counts=True)
-             class_counts = np.zeros(n_classes)
-             class_counts[classes.astype(int)] = cnts
-             local_frustum_class_dists.append(class_counts)
-     frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists)
-     return frustums_masks, frustums_class_dists
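A sketch of a `vox2pix` call at SemanticKITTI scale, using the sizes quoted in the docstring above; the intrinsics and extrinsics here are placeholder values, not calibration shipped with this repo:

import numpy as np

cam_E = np.eye(4, dtype=np.float32)  # placeholder camera->lidar transform
cam_k = np.array([[718.856, 0.0, 607.19],   # placeholder KITTI-like intrinsics
                  [0.0, 718.856, 185.21],
                  [0.0, 0.0, 1.0]])
vox_origin = np.array([0.0, -25.6, -2.0])   # assumed lidar-frame grid origin

projected_pix, fov_mask, pix_z = vox2pix(
    cam_E, cam_k, vox_origin,
    voxel_size=0.2, img_W=1220, img_H=370,
    scene_size=(51.2, 51.2, 6.4),
)
# projected_pix[fov_mask] are the voxel centroids that land inside the image.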
monoscene/data/utils/torch_util.py DELETED
@@ -1,15 +0,0 @@
- import numpy as np
- import torch
-
-
- def worker_init_fn(worker_id):
-     """The function is designed for pytorch multi-process dataloader.
-     Note that we use the pytorch random generator to generate a base_seed.
-     Please try to be consistent.
-
-     References:
-         https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed
-
-     """
-     base_seed = torch.IntTensor(1).random_().item()
-     np.random.seed(base_seed + worker_id)
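For reference, this helper is meant to be passed straight to a DataLoader so each worker process reseeds NumPy independently (sketch; `dataset` is a placeholder Dataset instance):

from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,                        # placeholder Dataset
    batch_size=4,
    num_workers=3,
    worker_init_fn=worker_init_fn,  # each worker seeds with base_seed + worker_id
)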
monoscene/{models/flosp.py β†’ flosp.py} RENAMED
File without changes
monoscene/loss/CRP_loss.py DELETED
@@ -1,24 +0,0 @@
- import torch
-
-
- def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices):
-     logits = []
-     labels = []
-     bs, n_relations, _, _ = pred_logits.shape
-     for i in range(bs):
-         pred_logit = pred_logits[i, :, :, :].permute(
-             0, 2, 1
-         )  # n_relations, N, n_mega_voxels
-         CP_mega_matrix = CP_mega_matrices[i]  # n_relations, N, n_mega_voxels
-         logits.append(pred_logit.reshape(n_relations, -1))
-         labels.append(CP_mega_matrix.reshape(n_relations, -1))
-
-     logits = torch.cat(logits, dim=1).T  # M, 4
-     labels = torch.cat(labels, dim=1).T  # M, 4
-
-     cnt_neg = (labels == 0).sum(0)
-     cnt_pos = labels.sum(0)
-     pos_weight = cnt_neg / cnt_pos
-     criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
-     loss_bce = criterion(logits, labels.float())
-     return loss_bce
monoscene/loss/sscMetrics.py DELETED
@@ -1,204 +0,0 @@
- """
- Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py
- """
- import numpy as np
- from sklearn.metrics import accuracy_score, precision_recall_fscore_support
-
-
- def get_iou(iou_sum, cnt_class):
-     _C = iou_sum.shape[0]  # 12
-     iou = np.zeros(_C, dtype=np.float32)  # iou for each class
-     for idx in range(_C):
-         iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0
-
-     mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:])
-     return iou, mean_iou
-
-
- def get_accuracy(predict, target, weight=None):  # 0.05s
-     _bs = predict.shape[0]  # batch size
-     _C = predict.shape[1]  # _C = 12
-     target = np.int32(target)
-     target = target.reshape(_bs, -1)  # (_bs, 60*36*60) 129600
-     predict = predict.reshape(_bs, _C, -1)  # (_bs, _C, 60*36*60)
-     predict = np.argmax(
-         predict, axis=1
-     )  # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60.
-
-     correct = predict == target  # (_bs, 129600)
-     if weight:  # 0.04s, add class weights
-         weight_k = np.ones(target.shape)
-         for i in range(_bs):
-             for n in range(target.shape[1]):
-                 idx = 0 if target[i, n] == 255 else target[i, n]
-                 weight_k[i, n] = weight[idx]
-         correct = correct * weight_k
-     acc = correct.sum() / correct.size
-     return acc
-
-
- class SSCMetrics:
-     def __init__(self, n_classes):
-         self.n_classes = n_classes
-         self.reset()
-
-     def hist_info(self, n_cl, pred, gt):
-         assert pred.shape == gt.shape
-         k = (gt >= 0) & (gt < n_cl)  # exclude 255
-         labeled = np.sum(k)
-         correct = np.sum((pred[k] == gt[k]))
-
-         return (
-             np.bincount(
-                 n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2
-             ).reshape(n_cl, n_cl),
-             correct,
-             labeled,
-         )
-
-     @staticmethod
-     def compute_score(hist, correct, labeled):
-         iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
-         mean_IU = np.nanmean(iu)
-         mean_IU_no_back = np.nanmean(iu[1:])
-         freq = hist.sum(1) / hist.sum()
-         freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
-         mean_pixel_acc = correct / labeled if labeled != 0 else 0
-
-         return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
-
-     def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
-         self.count += 1
-         mask = y_true != 255
-         if nonempty is not None:
-             mask = mask & nonempty
-         if nonsurface is not None:
-             mask = mask & nonsurface
-         tp, fp, fn = self.get_score_completion(y_pred, y_true, mask)
-
-         self.completion_tp += tp
-         self.completion_fp += fp
-         self.completion_fn += fn
-
-         mask = y_true != 255
-         if nonempty is not None:
-             mask = mask & nonempty
-         tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
-             y_pred, y_true, mask
-         )
-         self.tps += tp_sum
-         self.fps += fp_sum
-         self.fns += fn_sum
-
-     def get_stats(self):
-         if self.completion_tp != 0:
-             precision = self.completion_tp / (self.completion_tp + self.completion_fp)
-             recall = self.completion_tp / (self.completion_tp + self.completion_fn)
-             iou = self.completion_tp / (
-                 self.completion_tp + self.completion_fp + self.completion_fn
-             )
-         else:
-             precision, recall, iou = 0, 0, 0
-         iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
-         return {
-             "precision": precision,
-             "recall": recall,
-             "iou": iou,
-             "iou_ssc": iou_ssc,
-             "iou_ssc_mean": np.mean(iou_ssc[1:]),
-         }
-
-     def reset(self):
-
-         self.completion_tp = 0
-         self.completion_fp = 0
-         self.completion_fn = 0
-         self.tps = np.zeros(self.n_classes)
-         self.fps = np.zeros(self.n_classes)
-         self.fns = np.zeros(self.n_classes)
-
-         self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
-         self.labeled_ssc = 0
-         self.correct_ssc = 0
-
-         self.precision = 0
-         self.recall = 0
-         self.iou = 0
-         self.count = 1e-8
-         self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
-         self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
-
-     def get_score_completion(self, predict, target, nonempty=None):
-         predict = np.copy(predict)
-         target = np.copy(target)
-
-         """for scene completion, treat the task as two-classes problem, just empty or occupancy"""
-         _bs = predict.shape[0]  # batch size
-         # ---- ignore
-         predict[target == 255] = 0
-         target[target == 255] = 0
-         # ---- flatten
-         target = target.reshape(_bs, -1)  # (_bs, 129600)
-         predict = predict.reshape(_bs, -1)  # (_bs, _C, 129600), 60*36*60=129600
-         # ---- treat all non-empty object class as one category, set them to label 1
-         b_pred = np.zeros(predict.shape)
-         b_true = np.zeros(target.shape)
-         b_pred[predict > 0] = 1
-         b_true[target > 0] = 1
-         p, r, iou = 0.0, 0.0, 0.0
-         tp_sum, fp_sum, fn_sum = 0, 0, 0
-         for idx in range(_bs):
-             y_true = b_true[idx, :]  # GT
-             y_pred = b_pred[idx, :]
-             if nonempty is not None:
-                 nonempty_idx = nonempty[idx, :].reshape(-1)
-                 y_true = y_true[nonempty_idx == 1]
-                 y_pred = y_pred[nonempty_idx == 1]
-
-             tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
-             fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
-             fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
-             tp_sum += tp
-             fp_sum += fp
-             fn_sum += fn
-         return tp_sum, fp_sum, fn_sum
-
-     def get_score_semantic_and_completion(self, predict, target, nonempty=None):
-         target = np.copy(target)
-         predict = np.copy(predict)
-         _bs = predict.shape[0]  # batch size
-         _C = self.n_classes  # _C = 12
-         # ---- ignore
-         predict[target == 255] = 0
-         target[target == 255] = 0
-         # ---- flatten
-         target = target.reshape(_bs, -1)  # (_bs, 129600)
-         predict = predict.reshape(_bs, -1)  # (_bs, 129600), 60*36*60=129600
-
-         cnt_class = np.zeros(_C, dtype=np.int32)  # count for each class
-         iou_sum = np.zeros(_C, dtype=np.float32)  # sum of iou for each class
-         tp_sum = np.zeros(_C, dtype=np.int32)  # tp
-         fp_sum = np.zeros(_C, dtype=np.int32)  # fp
-         fn_sum = np.zeros(_C, dtype=np.int32)  # fn
-
-         for idx in range(_bs):
-             y_true = target[idx, :]  # GT
-             y_pred = predict[idx, :]
-             if nonempty is not None:
-                 nonempty_idx = nonempty[idx, :].reshape(-1)
-                 y_pred = y_pred[
-                     np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
-                 ]
-                 y_true = y_true[
-                     np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
-                 ]
-             for j in range(_C):  # for each class
-                 tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
-                 fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
-                 fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
-
-                 tp_sum[j] += tp
-                 fp_sum[j] += fp
-                 fn_sum[j] += fn
-
-         return tp_sum, fp_sum, fn_sum
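A sketch of the evaluation loop this class supported, mirroring the `step()` that the monoscene.py diff below removes. `model` and `val_loader` are placeholders, and the `"ssc_logit"` key assumes the pre-commit `forward()` that returned a dict of logits:

import numpy as np

metric = SSCMetrics(n_classes=20)
for batch in val_loader:                    # placeholder dataloader
    out_dict = model(batch)                 # pre-commit forward() (assumption)
    y_pred = np.argmax(out_dict["ssc_logit"].detach().cpu().numpy(), axis=1)
    y_true = batch["target"].cpu().numpy()
    metric.add_batch(y_pred, y_true)

stats = metric.get_stats()                  # precision / recall / IoU / per-class IoU
print(stats["iou"], stats["iou_ssc_mean"])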
monoscene/loss/ssc_loss.py DELETED
@@ -1,99 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- def KL_sep(p, target):
-     """
-     KL divergence on nonzeros classes
-     """
-     nonzeros = target != 0
-     nonzero_p = p[nonzeros]
-     kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
-     return kl_term
-
-
- def geo_scal_loss(pred, ssc_target):
-
-     # Get softmax probabilities
-     pred = F.softmax(pred, dim=1)
-
-     # Compute empty and nonempty probabilities
-     empty_probs = pred[:, 0, :, :, :]
-     nonempty_probs = 1 - empty_probs
-
-     # Remove unknown voxels
-     mask = ssc_target != 255
-     nonempty_target = ssc_target != 0
-     nonempty_target = nonempty_target[mask].float()
-     nonempty_probs = nonempty_probs[mask]
-     empty_probs = empty_probs[mask]
-
-     intersection = (nonempty_target * nonempty_probs).sum()
-     precision = intersection / nonempty_probs.sum()
-     recall = intersection / nonempty_target.sum()
-     spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum()
-     return (
-         F.binary_cross_entropy(precision, torch.ones_like(precision))
-         + F.binary_cross_entropy(recall, torch.ones_like(recall))
-         + F.binary_cross_entropy(spec, torch.ones_like(spec))
-     )
-
-
- def sem_scal_loss(pred, ssc_target):
-     # Get softmax probabilities
-     pred = F.softmax(pred, dim=1)
-     loss = 0
-     count = 0
-     mask = ssc_target != 255
-     n_classes = pred.shape[1]
-     for i in range(0, n_classes):
-
-         # Get probability of class i
-         p = pred[:, i, :, :, :]
-
-         # Remove unknown voxels
-         target_ori = ssc_target
-         p = p[mask]
-         target = ssc_target[mask]
-
-         completion_target = torch.ones_like(target)
-         completion_target[target != i] = 0
-         completion_target_ori = torch.ones_like(target_ori).float()
-         completion_target_ori[target_ori != i] = 0
-         if torch.sum(completion_target) > 0:
-             count += 1.0
-             nominator = torch.sum(p * completion_target)
-             loss_class = 0
-             if torch.sum(p) > 0:
-                 precision = nominator / (torch.sum(p))
-                 loss_precision = F.binary_cross_entropy(
-                     precision, torch.ones_like(precision)
-                 )
-                 loss_class += loss_precision
-             if torch.sum(completion_target) > 0:
-                 recall = nominator / (torch.sum(completion_target))
-                 loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
-                 loss_class += loss_recall
-             if torch.sum(1 - completion_target) > 0:
-                 specificity = torch.sum((1 - p) * (1 - completion_target)) / (
-                     torch.sum(1 - completion_target)
-                 )
-                 loss_specificity = F.binary_cross_entropy(
-                     specificity, torch.ones_like(specificity)
-                 )
-                 loss_class += loss_specificity
-             loss += loss_class
-     return loss / count
-
-
- def CE_ssc_loss(pred, target, class_weights):
-     """
-     :param: prediction: the predicted tensor, must be [BS, C, H, W, D]
-     """
-     criterion = nn.CrossEntropyLoss(
-         weight=class_weights, ignore_index=255, reduction="mean"
-     )
-     loss = criterion(pred, target.long())
-
-     return loss
monoscene/{models/modules.py β†’ modules.py} RENAMED
@@ -1,6 +1,6 @@
  import torch
  import torch.nn as nn
- from monoscene.models.DDR import Bottleneck3D
+ from monoscene.DDR import Bottleneck3D


  class ASPP(nn.Module):
monoscene/{models/monoscene.py β†’ monoscene.py} RENAMED
@@ -1,32 +1,26 @@
  import pytorch_lightning as pl
  import torch
  import torch.nn as nn
- from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU
- from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti
- from monoscene.loss.sscMetrics import SSCMetrics
- from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss
- from monoscene.models.flosp import FLoSP
- from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss
+ from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+ from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+ from monoscene.flosp import FLoSP
  import numpy as np
  import torch.nn.functional as F
- from monoscene.models.unet2d import UNet2D
- from torch.optim.lr_scheduler import MultiStepLR
+ from monoscene.unet2d import UNet2D


  class MonoScene(pl.LightningModule):
      def __init__(
          self,
          n_classes,
-         class_names,
          feature,
-         class_weights,
          project_scale,
          full_scene_size,
          dataset,
+         project_res=["1", "2", "4", "8"],
          n_relations=4,
          context_prior=True,
          fp_loss=True,
-         project_res=[],
          frustum_size=4,
          relation_loss=False,
          CE_ssc_loss=True,
@@ -42,13 +36,11 @@ class MonoScene(pl.LightningModule):
          self.dataset = dataset
          self.context_prior = context_prior
          self.frustum_size = frustum_size
-         self.class_names = class_names
          self.relation_loss = relation_loss
          self.CE_ssc_loss = CE_ssc_loss
          self.sem_scal_loss = sem_scal_loss
          self.geo_scal_loss = geo_scal_loss
          self.project_scale = project_scale
-         self.class_weights = class_weights
          self.lr = lr
          self.weight_decay = weight_decay

@@ -81,13 +73,6 @@ class MonoScene(pl.LightningModule):
          )
          self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)

-         # log hyperparameters
-         self.save_hyperparameters()
-
-         self.train_metrics = SSCMetrics(self.n_classes)
-         self.val_metrics = SSCMetrics(self.n_classes)
-         self.test_metrics = SSCMetrics(self.n_classes)
-
      def forward(self, batch):

          img = batch["img"]
@@ -104,19 +89,21 @@ class MonoScene(pl.LightningModule):

              # project features at each 2D scale to target 3D scale
              scale_2d = int(scale_2d)
-             projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda()
-             fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda()
+             projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
+             fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()

              # Sum all the 3D features
              if x3d is None:
                  x3d = self.projects[str(scale_2d)](
                      x_rgb["1_" + str(scale_2d)][i],
+                     # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
                      projected_pix // scale_2d,
                      fov_mask,
                  )
              else:
                  x3d += self.projects[str(scale_2d)](
                      x_rgb["1_" + str(scale_2d)][i],
+                     # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
                      projected_pix // scale_2d,
                      fov_mask,
                  )
@@ -126,165 +113,13 @@ class MonoScene(pl.LightningModule):
              "x3d": torch.stack(x3ds),
          }

-         out = self.net_3d_decoder(input_dict)
+         out_dict = self.net_3d_decoder(input_dict)

-         return out
-
-     def step(self, batch, step_type, metric):
-         bs = len(batch["img"])
-         loss = 0
-         out_dict = self(batch)
          ssc_pred = out_dict["ssc_logit"]
-         target = batch["target"]
-
-         if self.context_prior:
-             P_logits = out_dict["P_logits"]
-             CP_mega_matrices = batch["CP_mega_matrices"]
-
-             if self.relation_loss:
-                 loss_rel_ce = compute_super_CP_multilabel_loss(
-                     P_logits, CP_mega_matrices
-                 )
-                 loss += loss_rel_ce
-                 self.log(
-                     step_type + "/loss_relation_ce_super",
-                     loss_rel_ce.detach(),
-                     on_epoch=True,
-                     sync_dist=True,
-                 )
-
-         class_weight = self.class_weights.type_as(batch["img"])
-         if self.CE_ssc_loss:
-             loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight)
-             loss += loss_ssc
-             self.log(
-                 step_type + "/loss_ssc",
-                 loss_ssc.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.sem_scal_loss:
-             loss_sem_scal = sem_scal_loss(ssc_pred, target)
-             loss += loss_sem_scal
-             self.log(
-                 step_type + "/loss_sem_scal",
-                 loss_sem_scal.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.geo_scal_loss:
-             loss_geo_scal = geo_scal_loss(ssc_pred, target)
-             loss += loss_geo_scal
-             self.log(
-                 step_type + "/loss_geo_scal",
-                 loss_geo_scal.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         if self.fp_loss and step_type != "test":
-             frustums_masks = torch.stack(batch["frustums_masks"])
-             frustums_class_dists = torch.stack(
-                 batch["frustums_class_dists"]
-             ).float()  # (bs, n_frustums, n_classes)
-             n_frustums = frustums_class_dists.shape[1]
-
-             pred_prob = F.softmax(ssc_pred, dim=1)
-             batch_cnt = frustums_class_dists.sum(0)  # (n_frustums, n_classes)
-
-             frustum_loss = 0
-             frustum_nonempty = 0
-             for frus in range(n_frustums):
-                 frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float()
-                 prob = frustum_mask * pred_prob  # bs, n_classes, H, W, D
-                 prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2)
-                 prob = prob.reshape(self.n_classes, -1)
-                 cum_prob = prob.sum(dim=1)  # n_classes
-
-                 total_cnt = torch.sum(batch_cnt[frus])
-                 total_prob = prob.sum()
-                 if total_prob > 0 and total_cnt > 0:
-                     frustum_target_proportion = batch_cnt[frus] / total_cnt
-                     cum_prob = cum_prob / total_prob  # n_classes
-                     frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion)
-                     frustum_loss += frustum_loss_i
-                     frustum_nonempty += 1
-             frustum_loss = frustum_loss / frustum_nonempty
-             loss += frustum_loss
-             self.log(
-                 step_type + "/loss_frustums",
-                 frustum_loss.detach(),
-                 on_epoch=True,
-                 sync_dist=True,
-             )
-
-         y_true = target.cpu().numpy()
+
          y_pred = ssc_pred.detach().cpu().numpy()
          y_pred = np.argmax(y_pred, axis=1)
-         metric.add_batch(y_pred, y_true)
-
-         self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True)
-
-         return loss
-
-     def training_step(self, batch, batch_idx):
-         return self.step(batch, "train", self.train_metrics)
-
-     def validation_step(self, batch, batch_idx):
-         self.step(batch, "val", self.val_metrics)
-
-     def validation_epoch_end(self, outputs):
-         metric_list = [("train", self.train_metrics), ("val", self.val_metrics)]

-         for prefix, metric in metric_list:
-             stats = metric.get_stats()
-             for i, class_name in enumerate(self.class_names):
-                 self.log(
-                     "{}_SemIoU/{}".format(prefix, class_name),
-                     stats["iou_ssc"][i],
-                     sync_dist=True,
-                 )
-             self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True)
-             self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True)
-             self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True)
-             self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True)
-             metric.reset()
+         return y_pred

-     def test_step(self, batch, batch_idx):
-         self.step(batch, "test", self.test_metrics)
-
-     def test_epoch_end(self, outputs):
-         classes = self.class_names
-         metric_list = [("test", self.test_metrics)]
-         for prefix, metric in metric_list:
-             print("{}======".format(prefix))
-             stats = metric.get_stats()
-             print(
-                 "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format(
-                     stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100
-                 )
-             )
-             print("class IoU: {}, ".format(classes))
-             print(
-                 " ".join(["{:.4f}, "] * len(classes)).format(
-                     *(stats["iou_ssc"] * 100).tolist()
-                 )
-             )
-             print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100))
-             metric.reset()
-
-     def configure_optimizers(self):
-         if self.dataset == "NYU":
-             optimizer = torch.optim.AdamW(
-                 self.parameters(), lr=self.lr, weight_decay=self.weight_decay
-             )
-             scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
-             return [optimizer], [scheduler]
-         elif self.dataset == "kitti":
-             optimizer = torch.optim.AdamW(
-                 self.parameters(), lr=self.lr, weight_decay=self.weight_decay
-             )
-             scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
-             return [optimizer], [scheduler]
monoscene/monoscene_model.py ADDED
@@ -0,0 +1,21 @@
+ from transformers import PreTrainedModel
+ from .config import MonoSceneConfig
+ from monoscene.monoscene import MonoScene
+
+
+ class MonoSceneModel(PreTrainedModel):
+     config_class = MonoSceneConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = MonoScene(
+             dataset=config.dataset,
+             n_classes=config.n_classes,
+             feature=config.feature,
+             project_scale=config.project_scale,
+             full_scene_size=config.full_scene_size
+         )
+
+
+     def forward(self, tensor):
+         return self.model.forward(tensor)
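With the wrapper registered against `MonoSceneConfig`, the standard transformers loading pattern should apply. A sketch: the constructor keywords mirror the config fields read above, but the exact `MonoSceneConfig` signature lives in the added config.py (not shown in this hunk) and the Hub repo id is hypothetical:

from monoscene.config import MonoSceneConfig
from monoscene.monoscene_model import MonoSceneModel

# Field names assumed from the wrapper above; verify against config.py.
config = MonoSceneConfig(dataset="kitti", n_classes=20, feature=64,
                         project_scale=2, full_scene_size=(256, 256, 32))
model = MonoSceneModel(config)

# Or, once weights are pushed to the Hub (hypothetical repo id):
# model = MonoSceneModel.from_pretrained("user/monoscene_kitti")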
monoscene/scripts/eval_monoscene.py DELETED
@@ -1,71 +0,0 @@
- from pytorch_lightning import Trainer
- from monoscene.models.monoscene import MonoScene
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- import hydra
- from omegaconf import DictConfig
- import torch
- import os
- from hydra.utils import get_original_cwd
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     torch.set_grad_enabled(False)
-     if config.dataset == "kitti":
-         config.batch_size = 1
-         n_classes = 20
-         feature = 64
-         project_scale = 2
-         full_scene_size = (256, 256, 32)
-         data_module = KittiDataModule(
-             root=config.kitti_root,
-             preprocess_root=config.kitti_preprocess_root,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     elif config.dataset == "NYU":
-         config.batch_size = 2
-         project_scale = 1
-         n_classes = 12
-         feature = 200
-         full_scene_size = (60, 36, 60)
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     trainer = Trainer(
-         sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp"
-     )
-
-     if config.dataset == "NYU":
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
-         )
-     else:
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
-         )
-
-     model = MonoScene.load_from_checkpoint(
-         model_path,
-         feature=feature,
-         project_scale=project_scale,
-         fp_loss=config.fp_loss,
-         full_scene_size=full_scene_size,
-     )
-     model.eval()
-     data_module.setup()
-     val_dataloader = data_module.val_dataloader()
-     trainer.test(model, test_dataloaders=val_dataloader)
-
-
- if __name__ == "__main__":
-     main()
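One Lightning detail worth noting in the deleted evaluation script: keyword arguments passed to `load_from_checkpoint` override the hyperparameters stored in the checkpoint, so only the dataset-dependent ones had to be re-supplied. A condensed sketch with the NYUv2 values from above (checkpoint path assumed relative to the working directory):

```python
from monoscene.models.monoscene import MonoScene  # pre-rename import path

# kwargs override the hparams saved in the checkpoint (Lightning semantics)
model = MonoScene.load_from_checkpoint(
    "trained_models/monoscene_nyu.ckpt",
    feature=200,
    project_scale=1,
    full_scene_size=(60, 36, 60),
)
model.eval()  # disable dropout, freeze batch-norm statistics
```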
monoscene/scripts/generate_output.py DELETED
@@ -1,127 +0,0 @@
- from pytorch_lightning import Trainer
- from monoscene.models.monoscene import MonoScene
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
- import hydra
- from omegaconf import DictConfig
- import torch
- import numpy as np
- import os
- from hydra.utils import get_original_cwd
- from tqdm import tqdm
- import pickle
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     torch.set_grad_enabled(False)
-
-     # Setup dataloader
-     if config.dataset == "kitti" or config.dataset == "kitti_360":
-         feature = 64
-         project_scale = 2
-         full_scene_size = (256, 256, 32)
-
-         if config.dataset == "kitti":
-             data_module = KittiDataModule(
-                 root=config.kitti_root,
-                 preprocess_root=config.kitti_preprocess_root,
-                 frustum_size=config.frustum_size,
-                 batch_size=int(config.batch_size / config.n_gpus),
-                 num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-             )
-             data_module.setup()
-             data_loader = data_module.val_dataloader()
-             # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
-         else:
-             data_module = Kitti360DataModule(
-                 root=config.kitti_360_root,
-                 sequences=[config.kitti_360_sequence],
-                 n_scans=2000,
-                 batch_size=1,
-                 num_workers=3,
-             )
-             data_module.setup()
-             data_loader = data_module.dataloader()
-
-     elif config.dataset == "NYU":
-         project_scale = 1
-         feature = 200
-         full_scene_size = (60, 36, 60)
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-         data_module.setup()
-         data_loader = data_module.val_dataloader()
-         # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
-     else:
-         print("dataset not support")
-
-     # Load pretrained models
-     if config.dataset == "NYU":
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
-         )
-     else:
-         model_path = os.path.join(
-             get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
-         )
-
-     model = MonoScene.load_from_checkpoint(
-         model_path,
-         feature=feature,
-         project_scale=project_scale,
-         fp_loss=config.fp_loss,
-         full_scene_size=full_scene_size,
-     )
-     model.cuda()
-     model.eval()
-
-     # Save prediction and additional data
-     # to draw the viewing frustum and remove scene outside the room for NYUv2
-     output_path = os.path.join(config.output_path, config.dataset)
-     with torch.no_grad():
-         for batch in tqdm(data_loader):
-             batch["img"] = batch["img"].cuda()
-             pred = model(batch)
-             y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
-             y_pred = np.argmax(y_pred, axis=1)
-             for i in range(config.batch_size):
-                 out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
-                 if "target" in batch:
-                     out_dict["target"] = (
-                         batch["target"][i].detach().cpu().numpy().astype(np.uint16)
-                     )
-
-                 if config.dataset == "NYU":
-                     write_path = output_path
-                     filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
-                     out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
-                     out_dict["vox_origin"] = (
-                         batch["vox_origin"][i].detach().cpu().numpy()
-                     )
-                 else:
-                     write_path = os.path.join(output_path, batch["sequence"][i])
-                     filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
-                     out_dict["fov_mask_1"] = (
-                         batch["fov_mask_1"][i].detach().cpu().numpy()
-                     )
-                     out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
-                     out_dict["T_velo_2_cam"] = (
-                         batch["T_velo_2_cam"][i].detach().cpu().numpy()
-                     )
-
-                 os.makedirs(write_path, exist_ok=True)
-                 with open(filepath, "wb") as handle:
-                     pickle.dump(out_dict, handle)
-                 print("wrote to", filepath)
-
-
- if __name__ == "__main__":
-     main()
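The inference core of the deleted script is a softmax over the class dimension followed by an argmax. Since softmax is monotonic per voxel, the argmax of the probabilities equals the argmax of the raw logits, so the softmax only matters if the probabilities themselves are kept. A toy-shaped sketch:

```python
import numpy as np
import torch

logits = torch.randn(1, 20, 8, 8, 4)                 # (batch, classes, X, Y, Z)
y_prob = torch.softmax(logits, dim=1).cpu().numpy()  # per-voxel class distribution
y_pred = np.argmax(y_prob, axis=1)                   # (1, 8, 8, 4) label volume
assert (y_pred == logits.argmax(dim=1).numpy()).all()  # softmax preserves the argmax
```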
monoscene/scripts/train_monoscene.py DELETED
@@ -1,173 +0,0 @@
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
- from monoscene.data.semantic_kitti.params import (
-     semantic_kitti_class_frequencies,
-     kitti_class_names,
- )
- from monoscene.data.NYU.params import (
-     class_weights as NYU_class_weights,
-     NYU_class_names,
- )
- from monoscene.data.NYU.nyu_dm import NYUDataModule
- from torch.utils.data.dataloader import DataLoader
- from monoscene.models.monoscene import MonoScene
- from pytorch_lightning import Trainer
- from pytorch_lightning.loggers import TensorBoardLogger
- from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
- import os
- import hydra
- from omegaconf import DictConfig
- import numpy as np
- import torch
-
- hydra.output_subdir = None
-
-
- @hydra.main(config_name="../config/monoscene.yaml")
- def main(config: DictConfig):
-     exp_name = config.exp_prefix
-     exp_name += "_{}_{}".format(config.dataset, config.run)
-     exp_name += "_FrusSize_{}".format(config.frustum_size)
-     exp_name += "_nRelations{}".format(config.n_relations)
-     exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr)
-
-     if config.CE_ssc_loss:
-         exp_name += "_CEssc"
-     if config.geo_scal_loss:
-         exp_name += "_geoScalLoss"
-     if config.sem_scal_loss:
-         exp_name += "_semScalLoss"
-     if config.fp_loss:
-         exp_name += "_fpLoss"
-
-     if config.relation_loss:
-         exp_name += "_CERel"
-     if config.context_prior:
-         exp_name += "_3DCRP"
-
-     # Setup dataloaders
-     if config.dataset == "kitti":
-         class_names = kitti_class_names
-         max_epochs = 30
-         logdir = config.kitti_logdir
-         full_scene_size = (256, 256, 32)
-         project_scale = 2
-         feature = 64
-         n_classes = 20
-         class_weights = torch.from_numpy(
-             1 / np.log(semantic_kitti_class_frequencies + 0.001)
-         )
-         data_module = KittiDataModule(
-             root=config.kitti_root,
-             preprocess_root=config.kitti_preprocess_root,
-             frustum_size=config.frustum_size,
-             project_scale=project_scale,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu),
-         )
-
-     elif config.dataset == "NYU":
-         class_names = NYU_class_names
-         max_epochs = 30
-         logdir = config.logdir
-         full_scene_size = (60, 36, 60)
-         project_scale = 1
-         feature = 200
-         n_classes = 12
-         class_weights = NYU_class_weights
-         data_module = NYUDataModule(
-             root=config.NYU_root,
-             preprocess_root=config.NYU_preprocess_root,
-             n_relations=config.n_relations,
-             frustum_size=config.frustum_size,
-             batch_size=int(config.batch_size / config.n_gpus),
-             num_workers=int(config.num_workers_per_gpu * config.n_gpus),
-         )
-
-     project_res = ["1"]
-     if config.project_1_2:
-         exp_name += "_Proj_2"
-         project_res.append("2")
-     if config.project_1_4:
-         exp_name += "_4"
-         project_res.append("4")
-     if config.project_1_8:
-         exp_name += "_8"
-         project_res.append("8")
-
-     print(exp_name)
-
-     # Initialize MonoScene model
-     model = MonoScene(
-         dataset=config.dataset,
-         frustum_size=config.frustum_size,
-         project_scale=project_scale,
-         n_relations=config.n_relations,
-         fp_loss=config.fp_loss,
-         feature=feature,
-         full_scene_size=full_scene_size,
-         project_res=project_res,
-         n_classes=n_classes,
-         class_names=class_names,
-         context_prior=config.context_prior,
-         relation_loss=config.relation_loss,
-         CE_ssc_loss=config.CE_ssc_loss,
-         sem_scal_loss=config.sem_scal_loss,
-         geo_scal_loss=config.geo_scal_loss,
-         lr=config.lr,
-         weight_decay=config.weight_decay,
-         class_weights=class_weights,
-     )
-
-     if config.enable_log:
-         logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="")
-         lr_monitor = LearningRateMonitor(logging_interval="step")
-         checkpoint_callbacks = [
-             ModelCheckpoint(
-                 save_last=True,
-                 monitor="val/mIoU",
-                 save_top_k=1,
-                 mode="max",
-                 filename="{epoch:03d}-{val/mIoU:.5f}",
-             ),
-             lr_monitor,
-         ]
-     else:
-         logger = False
-         checkpoint_callbacks = False
-
-     model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt")
-     if os.path.isfile(model_path):
-         # Continue training from last.ckpt
-         trainer = Trainer(
-             callbacks=checkpoint_callbacks,
-             resume_from_checkpoint=model_path,
-             sync_batchnorm=True,
-             deterministic=False,
-             max_epochs=max_epochs,
-             gpus=config.n_gpus,
-             logger=logger,
-             check_val_every_n_epoch=1,
-             log_every_n_steps=10,
-             flush_logs_every_n_steps=100,
-             accelerator="ddp",
-         )
-     else:
-         # Train from scratch
-         trainer = Trainer(
-             callbacks=checkpoint_callbacks,
-             sync_batchnorm=True,
-             deterministic=False,
-             max_epochs=max_epochs,
-             gpus=config.n_gpus,
-             logger=logger,
-             check_val_every_n_epoch=1,
-             log_every_n_steps=10,
-             flush_logs_every_n_steps=100,
-             accelerator="ddp",
-         )
-
-     trainer.fit(model, data_module)
-
-
- if __name__ == "__main__":
-     main()
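The class weighting in the deleted training script is worth spelling out: weights are the reciprocal log of the smoothed per-class voxel frequencies, so rare classes are up-weighted, but only logarithmically. A sketch with made-up counts (the real frequencies live in `monoscene/data/semantic_kitti/params.py`):

```python
import numpy as np
import torch

frequencies = np.array([5.4e9, 3.2e7, 1.1e5])  # made-up voxel counts per class
class_weights = torch.from_numpy(1 / np.log(frequencies + 0.001))
# -> approx. [0.045, 0.058, 0.086]: the rarest class weighs ~2x the most common one
```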
monoscene/scripts/visualization/NYU_vis_pred.py DELETED
@@ -1,156 +0,0 @@
- import pickle
- import os
- from omegaconf import DictConfig
- import numpy as np
- import hydra
- from mayavi import mlab
-
-
- def get_grid_coords(dims, resolution):
-     """
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: is the center coords of voxels in the grid
-     """
-
-     g_xx = np.arange(0, dims[0] + 1)
-     g_yy = np.arange(0, dims[1] + 1)
-
-     g_zz = np.arange(0, dims[2] + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution / 2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid
-
-
- def draw(
-     voxels,
-     cam_pose,
-     vox_origin,
-     voxel_size=0.08,
-     d=0.75, # 0.75m - determine the size of the mesh representing the camera
- ):
-     # Compute the coordinates of the mesh representing camera
-     y = d * 480 / (2 * 518.8579)
-     x = d * 640 / (2 * 518.8579)
-     tri_points = np.array(
-         [
-             [0, 0, 0],
-             [x, y, d],
-             [-x, y, d],
-             [-x, -y, d],
-             [x, -y, d],
-         ]
-     )
-     tri_points = np.hstack([tri_points, np.ones((5, 1))])
-
-     tri_points = (cam_pose @ tri_points.T).T
-     x = tri_points[:, 0] - vox_origin[0]
-     y = tri_points[:, 1] - vox_origin[1]
-     z = tri_points[:, 2] - vox_origin[2]
-     triangles = [
-         (0, 1, 2),
-         (0, 1, 4),
-         (0, 3, 4),
-         (0, 2, 3),
-     ]
-
-     # Compute the voxels coordinates
-     grid_coords = get_grid_coords(
-         [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size
-     )
-
-     # Attach the predicted class to every voxel
-     grid_coords = np.vstack(
-         (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1))
-     ).T
-
-     # Remove empty and unknown voxels
-     occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)]
-     figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))
-
-     # Draw the camera
-     mlab.triangular_mesh(
-         x,
-         y,
-         z,
-         triangles,
-         representation="wireframe",
-         color=(0, 0, 0),
-         line_width=5,
-     )
-
-     # Draw occupied voxels
-     plt_plot = mlab.points3d(
-         occupied_voxels[:, 0],
-         occupied_voxels[:, 1],
-         occupied_voxels[:, 2],
-         occupied_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.1 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=0,
-         vmax=12,
-     )
-
-     colors = np.array(
-         [
-             [22, 191, 206, 255],
-             [214, 38, 40, 255],
-             [43, 160, 43, 255],
-             [158, 216, 229, 255],
-             [114, 158, 206, 255],
-             [204, 204, 91, 255],
-             [255, 186, 119, 255],
-             [147, 102, 188, 255],
-             [30, 119, 181, 255],
-             [188, 188, 33, 255],
-             [255, 127, 12, 255],
-             [196, 175, 214, 255],
-             [153, 153, 153, 255],
-         ]
-     )
-
-     plt_plot.glyph.scale_mode = "scale_by_vector"
-
-     plt_plot.module_manager.scalar_lut_manager.lut.table = colors
-
-     mlab.show()
-
-
- @hydra.main(config_path=None)
- def main(config: DictConfig):
-     scan = config.file
-
-     with open(scan, "rb") as handle:
-         b = pickle.load(handle)
-
-     cam_pose = b["cam_pose"]
-     vox_origin = b["vox_origin"]
-     gt_scene = b["target"]
-     pred_scene = b["y_pred"]
-     scan = os.path.basename(scan)[:12]
-
-     pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room
-
-     draw(
-         pred_scene,
-         cam_pose,
-         vox_origin,
-         voxel_size=0.08,
-         d=0.75,
-     )
-
-
- if __name__ == "__main__":
-     main()
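`get_grid_coords` above converts integer voxel indices into metric voxel-center coordinates: scale by the resolution, offset by half a voxel, then swap the x/y columns for display. Note that it calls `np.float`, an alias removed in NumPy >= 1.24, so the deleted script would need plain `float` (or `np.float64`) on current NumPy. A condensed sketch of the same computation:

```python
import numpy as np

dims, res = (2, 2, 1), 0.08  # tiny grid, 8 cm voxels
xx, yy, zz = np.meshgrid(np.arange(dims[0]), np.arange(dims[1]), np.arange(dims[2]))
centers = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=1).astype(float)
centers = centers * res + res / 2        # index 0 -> 0.04 m, index 1 -> 0.12 m
centers[:, [0, 1]] = centers[:, [1, 0]]  # swap x and y, as the original does
```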
monoscene/scripts/visualization/kitti_vis_pred.py DELETED
@@ -1,201 +0,0 @@
- # from operator import gt
- import pickle
- import numpy as np
- from omegaconf import DictConfig
- import hydra
- from mayavi import mlab
-
-
- def get_grid_coords(dims, resolution):
-     """
-     :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
-     :return coords_grid: is the center coords of voxels in the grid
-     """
-
-     g_xx = np.arange(0, dims[0] + 1)
-     g_yy = np.arange(0, dims[1] + 1)
-     sensor_pose = 10
-     g_zz = np.arange(0, dims[2] + 1)
-
-     # Obtaining the grid with coords...
-     xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
-     coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
-     coords_grid = coords_grid.astype(np.float)
-
-     coords_grid = (coords_grid * resolution) + resolution / 2
-
-     temp = np.copy(coords_grid)
-     temp[:, 0] = coords_grid[:, 1]
-     temp[:, 1] = coords_grid[:, 0]
-     coords_grid = np.copy(temp)
-
-     return coords_grid
-
-
- def draw(
-     voxels,
-     T_velo_2_cam,
-     vox_origin,
-     fov_mask,
-     img_size,
-     f,
-     voxel_size=0.2,
-     d=7, # 7m - determine the size of the mesh representing the camera
- ):
-     # Compute the coordinates of the mesh representing camera
-     x = d * img_size[0] / (2 * f)
-     y = d * img_size[1] / (2 * f)
-     tri_points = np.array(
-         [
-             [0, 0, 0],
-             [x, y, d],
-             [-x, y, d],
-             [-x, -y, d],
-             [x, -y, d],
-         ]
-     )
-     tri_points = np.hstack([tri_points, np.ones((5, 1))])
-     tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T
-     x = tri_points[:, 0] - vox_origin[0]
-     y = tri_points[:, 1] - vox_origin[1]
-     z = tri_points[:, 2] - vox_origin[2]
-     triangles = [
-         (0, 1, 2),
-         (0, 1, 4),
-         (0, 3, 4),
-         (0, 2, 3),
-     ]
-
-     # Compute the voxels coordinates
-     grid_coords = get_grid_coords(
-         [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
-     )
-
-     # Attach the predicted class to every voxel
-     grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
-
-     # Get the voxels inside FOV
-     fov_grid_coords = grid_coords[fov_mask, :]
-
-     # Get the voxels outside FOV
-     outfov_grid_coords = grid_coords[~fov_mask, :]
-
-     # Remove empty and unknown voxels
-     fov_voxels = fov_grid_coords[
-         (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
-     ]
-     outfov_voxels = outfov_grid_coords[
-         (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
-     ]
-
-     figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
-
-     # Draw the camera
-     mlab.triangular_mesh(
-         x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5
-     )
-
-     # Draw occupied inside FOV voxels
-     plt_plot_fov = mlab.points3d(
-         fov_voxels[:, 0],
-         fov_voxels[:, 1],
-         fov_voxels[:, 2],
-         fov_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.05 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=1,
-         vmax=19,
-     )
-
-     # Draw occupied outside FOV voxels
-     plt_plot_outfov = mlab.points3d(
-         outfov_voxels[:, 0],
-         outfov_voxels[:, 1],
-         outfov_voxels[:, 2],
-         outfov_voxels[:, 3],
-         colormap="viridis",
-         scale_factor=voxel_size - 0.05 * voxel_size,
-         mode="cube",
-         opacity=1.0,
-         vmin=1,
-         vmax=19,
-     )
-
-     colors = np.array(
-         [
-             [100, 150, 245, 255],
-             [100, 230, 245, 255],
-             [30, 60, 150, 255],
-             [80, 30, 180, 255],
-             [100, 80, 250, 255],
-             [255, 30, 30, 255],
-             [255, 40, 200, 255],
-             [150, 30, 90, 255],
-             [255, 0, 255, 255],
-             [255, 150, 255, 255],
-             [75, 0, 75, 255],
-             [175, 0, 75, 255],
-             [255, 200, 0, 255],
-             [255, 120, 50, 255],
-             [0, 175, 0, 255],
-             [135, 60, 0, 255],
-             [150, 240, 80, 255],
-             [255, 240, 150, 255],
-             [255, 0, 0, 255],
-         ]
-     ).astype(np.uint8)
-
-     plt_plot_fov.glyph.scale_mode = "scale_by_vector"
-     plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
-
-     plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
-
-     outfov_colors = colors
-     outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
-     plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
-
-     mlab.show()
-
-
- @hydra.main(config_path=None)
- def main(config: DictConfig):
-     scan = config.file
-     with open(scan, "rb") as handle:
-         b = pickle.load(handle)
-
-     fov_mask_1 = b["fov_mask_1"]
-     T_velo_2_cam = b["T_velo_2_cam"]
-     vox_origin = np.array([0, -25.6, -2])
-
-     y_pred = b["y_pred"]
-
-     if config.dataset == "kitti_360":
-         # Visualize KITTI-360
-         draw(
-             y_pred,
-             T_velo_2_cam,
-             vox_origin,
-             fov_mask_1,
-             voxel_size=0.2,
-             f=552.55426,
-             img_size=(1408, 376),
-             d=7,
-         )
-     else:
-         # Visualize Semantic KITTI
-         draw(
-             y_pred,
-             T_velo_2_cam,
-             vox_origin,
-             fov_mask_1,
-             img_size=(1220, 370),
-             f=707.0912,
-             voxel_size=0.2,
-             d=7,
-         )
-
-
- if __name__ == "__main__":
-     main()
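The camera wireframe in `draw` comes straight from the pinhole model: the image plane at depth d spans d*W/f by d*H/f, so the half-extents used for the frustum corners are d*W/(2f) and d*H/(2f). Plugging in the Semantic KITTI numbers from the script:

```python
f = 707.0912      # focal length in pixels (Semantic KITTI value from the script)
W, H = 1220, 370  # image width and height in pixels
d = 7.0           # depth of the frustum face in metres

half_w = d * W / (2 * f)  # ~6.04 m
half_h = d * H / (2 * f)  # ~1.83 m
print(half_w, half_h)
```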
monoscene/{models/unet2d.py → unet2d.py} RENAMED
File without changes
monoscene/{models/unet3d_kitti.py → unet3d_kitti.py} RENAMED
@@ -2,9 +2,9 @@
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from monoscene.models.modules import SegmentationHead
- from monoscene.models.CRP3D import CPMegaVoxels
- from monoscene.models.modules import Process, Upsample, Downsample
+ from monoscene.modules import SegmentationHead
+ from monoscene.CRP3D import CPMegaVoxels
+ from monoscene.modules import Process, Upsample, Downsample


  class UNet3D(nn.Module):
monoscene/{models/unet3d_nyu.py → unet3d_nyu.py} RENAMED
@@ -3,8 +3,8 @@ import torch
  import torch.nn as nn
  import torch.nn.functional as F
  import numpy as np
- from monoscene.models.CRP3D import CPMegaVoxels
- from monoscene.models.modules import (
+ from monoscene.CRP3D import CPMegaVoxels
+ from monoscene.modules import (
      Process,
      Upsample,
      Downsample,
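With the `models/` subpackage flattened away, everything imports directly from `monoscene.*`. A quick import smoke test for the new layout (module and class names taken from the renamed files above; assumes the repo root is on `PYTHONPATH`):

```python
# These names all appear in the renamed files in this commit
from monoscene.modules import Process, Upsample, Downsample
from monoscene.CRP3D import CPMegaVoxels
from monoscene.unet3d_kitti import UNet3D
```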