Sohaib36 committed on
Commit 6705a8b • 1 Parent(s): e5f4906

add: adding monoscene

Files changed (46)
  1. monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py +0 -97
  2. monoscene/.ipynb_checkpoints/config-checkpoint.py +0 -34
  3. monoscene/.ipynb_checkpoints/modules-checkpoint.py +0 -194
  4. monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py +0 -22
  5. monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py +0 -88
  6. monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py +0 -90
  7. monoscene/__init__.py +0 -0
  8. monoscene/app.py +0 -138
  9. monoscene/config.py +0 -26
  10. monoscene/config/monoscene.yaml +35 -0
  11. monoscene/data/NYU/collate.py +50 -0
  12. monoscene/data/NYU/nyu_dataset.py +133 -0
  13. monoscene/data/NYU/nyu_dm.py +78 -0
  14. monoscene/data/NYU/params.py +54 -0
  15. monoscene/data/NYU/preprocess.py +182 -0
  16. monoscene/data/kitti_360/collate.py +47 -0
  17. monoscene/data/kitti_360/kitti_360_dataset.py +125 -0
  18. monoscene/data/kitti_360/kitti_360_dm.py +32 -0
  19. monoscene/data/semantic_kitti/collate.py +61 -0
  20. monoscene/data/semantic_kitti/io_data.py +239 -0
  21. monoscene/data/semantic_kitti/kitti_dataset.py +200 -0
  22. monoscene/data/semantic_kitti/kitti_dm.py +91 -0
  23. monoscene/data/semantic_kitti/params.py +48 -0
  24. monoscene/data/semantic_kitti/preprocess.py +102 -0
  25. monoscene/data/semantic_kitti/semantic-kitti.yaml +213 -0
  26. monoscene/data/utils/fusion.py +507 -0
  27. monoscene/data/utils/helpers.py +185 -0
  28. monoscene/data/utils/torch_util.py +15 -0
  29. monoscene/loss/CRP_loss.py +24 -0
  30. monoscene/loss/sscMetrics.py +204 -0
  31. monoscene/loss/ssc_loss.py +99 -0
  32. monoscene/{CRP3D.py → models/CRP3D.py} +1 -1
  33. monoscene/{DDR.py → models/DDR.py} +0 -0
  34. monoscene/{flosp.py → models/flosp.py} +0 -0
  35. monoscene/{modules.py → models/modules.py} +1 -1
  36. monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py → models/monoscene.py} +174 -7
  37. monoscene/{unet2d.py → models/unet2d.py} +0 -0
  38. monoscene/{unet3d_kitti.py → models/unet3d_kitti.py} +3 -3
  39. monoscene/{unet3d_nyu.py → models/unet3d_nyu.py} +2 -2
  40. monoscene/monoscene.py +0 -125
  41. monoscene/monoscene_model.py +0 -21
  42. monoscene/scripts/eval_monoscene.py +71 -0
  43. monoscene/scripts/generate_output.py +127 -0
  44. monoscene/scripts/train_monoscene.py +173 -0
  45. monoscene/scripts/visualization/NYU_vis_pred.py +156 -0
  46. monoscene/scripts/visualization/kitti_vis_pred.py +201 -0
monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py DELETED
@@ -1,97 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from monoscene.modules import (
4
- Process,
5
- ASPP,
6
- )
7
-
8
-
9
- class CPMegaVoxels(nn.Module):
10
- def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
11
- super().__init__()
12
- self.size = size
13
- self.n_relations = n_relations
14
- print("n_relations", self.n_relations)
15
- self.flatten_size = size[0] * size[1] * size[2]
16
- self.feature = feature
17
- self.context_feature = feature * 2
18
- self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
19
- padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
20
-
21
- self.mega_context = nn.Sequential(
22
- nn.Conv3d(
23
- feature, self.context_feature, stride=2, padding=padding, kernel_size=3
24
- ),
25
- )
26
- self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
27
-
28
- self.context_prior_logits = nn.ModuleList(
29
- [
30
- nn.Sequential(
31
- nn.Conv3d(
32
- self.feature,
33
- self.flatten_context_size,
34
- padding=0,
35
- kernel_size=1,
36
- ),
37
- )
38
- for i in range(n_relations)
39
- ]
40
- )
41
- self.aspp = ASPP(feature, [1, 2, 3])
42
-
43
- self.resize = nn.Sequential(
44
- nn.Conv3d(
45
- self.context_feature * self.n_relations + feature,
46
- feature,
47
- kernel_size=1,
48
- padding=0,
49
- bias=False,
50
- ),
51
- Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
52
- )
53
-
54
- def forward(self, input):
55
- ret = {}
56
- bs = input.shape[0]
57
-
58
- x_agg = self.aspp(input)
59
-
60
- # get the mega context
61
- x_mega_context_raw = self.mega_context(x_agg)
62
- x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
63
- x_mega_context = x_mega_context.permute(0, 2, 1)
64
-
65
- # get context prior map
66
- x_context_prior_logits = []
67
- x_context_rels = []
68
- for rel in range(self.n_relations):
69
-
70
- # Compute the relation matrices
71
- x_context_prior_logit = self.context_prior_logits[rel](x_agg)
72
- x_context_prior_logit = x_context_prior_logit.reshape(
73
- bs, self.flatten_context_size, self.flatten_size
74
- )
75
- x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
76
-
77
- x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
78
- x_context_prior = torch.sigmoid(x_context_prior_logit)
79
-
80
- # Multiply the relation matrices with the mega context to gather context features
81
- x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
82
- x_context_rels.append(x_context_rel)
83
-
84
- x_context = torch.cat(x_context_rels, dim=2)
85
- x_context = x_context.permute(0, 2, 1)
86
- x_context = x_context.reshape(
87
- bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
88
- )
89
-
90
- x = torch.cat([input, x_context], dim=1)
91
- x = self.resize(x)
92
-
93
- x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
94
- ret["P_logits"] = x_context_prior_logits
95
- ret["x"] = x
96
-
97
- return ret
monoscene/.ipynb_checkpoints/config-checkpoint.py DELETED
@@ -1,34 +0,0 @@
1
- from transformers import PretrainedConfig
2
- from typing import List
3
-
4
-
5
- class MonoSceneConfig(PretrainedConfig):
6
-
7
- def __init__(
8
- self,
9
- block_type="bottleneck",
10
- layers: List[int] = [3, 4, 6, 3],
11
- num_classes: int = 1000,
12
- input_channels: int = 3,
13
- cardinality: int = 1,
14
- base_width: int = 64,
15
- stem_width: int = 64,
16
- stem_type: str = "",
17
- avg_down: bool = False,
18
- **kwargs,
19
- ):
20
- self.block_type = block_type
21
- self.layers = layers
22
- self.num_classes = num_classes
23
- self.input_channels = input_channels
24
- self.cardinality = cardinality
25
- self.base_width = base_width
26
- self.stem_width = stem_width
27
- self.stem_type = stem_type
28
- self.avg_down = avg_down
29
- super().__init__(**kwargs)
30
-
31
-
32
-
33
-
34
-
monoscene/.ipynb_checkpoints/modules-checkpoint.py DELETED
@@ -1,194 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from monoscene.DDR import Bottleneck3D
4
-
5
-
6
- class ASPP(nn.Module):
7
- """
8
- ASPP 3D
9
- Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
10
- """
11
-
12
- def __init__(self, planes, dilations_conv_list):
13
- super().__init__()
14
-
15
- # ASPP Block
16
- self.conv_list = dilations_conv_list
17
- self.conv1 = nn.ModuleList(
18
- [
19
- nn.Conv3d(
20
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
21
- )
22
- for dil in dilations_conv_list
23
- ]
24
- )
25
- self.bn1 = nn.ModuleList(
26
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
27
- )
28
- self.conv2 = nn.ModuleList(
29
- [
30
- nn.Conv3d(
31
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
32
- )
33
- for dil in dilations_conv_list
34
- ]
35
- )
36
- self.bn2 = nn.ModuleList(
37
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
38
- )
39
- self.relu = nn.ReLU()
40
-
41
- def forward(self, x_in):
42
-
43
- y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
44
- for i in range(1, len(self.conv_list)):
45
- y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
46
- x_in = self.relu(y + x_in) # modified
47
-
48
- return x_in
49
-
50
-
51
- class SegmentationHead(nn.Module):
52
- """
53
- 3D Segmentation heads to retrieve semantic segmentation at each scale.
54
- Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
55
- Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
56
- """
57
-
58
- def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
59
- super().__init__()
60
-
61
- # First convolution
62
- self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
63
-
64
- # ASPP Block
65
- self.conv_list = dilations_conv_list
66
- self.conv1 = nn.ModuleList(
67
- [
68
- nn.Conv3d(
69
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
70
- )
71
- for dil in dilations_conv_list
72
- ]
73
- )
74
- self.bn1 = nn.ModuleList(
75
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
76
- )
77
- self.conv2 = nn.ModuleList(
78
- [
79
- nn.Conv3d(
80
- planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
81
- )
82
- for dil in dilations_conv_list
83
- ]
84
- )
85
- self.bn2 = nn.ModuleList(
86
- [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
87
- )
88
- self.relu = nn.ReLU()
89
-
90
- self.conv_classes = nn.Conv3d(
91
- planes, nbr_classes, kernel_size=3, padding=1, stride=1
92
- )
93
-
94
- def forward(self, x_in):
95
-
96
- # Convolution to go from inplanes to planes features...
97
- x_in = self.relu(self.conv0(x_in))
98
-
99
- y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
100
- for i in range(1, len(self.conv_list)):
101
- y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
102
- x_in = self.relu(y + x_in) # modified
103
-
104
- x_in = self.conv_classes(x_in)
105
-
106
- return x_in
107
-
108
-
109
- class ProcessKitti(nn.Module):
110
- def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
111
- super(Process, self).__init__()
112
- self.main = nn.Sequential(
113
- *[
114
- Bottleneck3D(
115
- feature,
116
- feature // 4,
117
- bn_momentum=bn_momentum,
118
- norm_layer=norm_layer,
119
- dilation=[i, i, i],
120
- )
121
- for i in dilations
122
- ]
123
- )
124
-
125
- def forward(self, x):
126
- return self.main(x)
127
-
128
-
129
- class Process(nn.Module):
130
- def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
131
- super(Process, self).__init__()
132
- self.main = nn.Sequential(
133
- *[
134
- Bottleneck3D(
135
- feature,
136
- feature // 4,
137
- bn_momentum=bn_momentum,
138
- norm_layer=norm_layer,
139
- dilation=[i, i, i],
140
- )
141
- for i in dilations
142
- ]
143
- )
144
-
145
- def forward(self, x):
146
- return self.main(x)
147
-
148
-
149
- class Upsample(nn.Module):
150
- def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
151
- super(Upsample, self).__init__()
152
- self.main = nn.Sequential(
153
- nn.ConvTranspose3d(
154
- in_channels,
155
- out_channels,
156
- kernel_size=3,
157
- stride=2,
158
- padding=1,
159
- dilation=1,
160
- output_padding=1,
161
- ),
162
- norm_layer(out_channels, momentum=bn_momentum),
163
- nn.ReLU(),
164
- )
165
-
166
- def forward(self, x):
167
- return self.main(x)
168
-
169
-
170
- class Downsample(nn.Module):
171
- def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
172
- super(Downsample, self).__init__()
173
- self.main = Bottleneck3D(
174
- feature,
175
- feature // 4,
176
- bn_momentum=bn_momentum,
177
- expansion=expansion,
178
- stride=2,
179
- downsample=nn.Sequential(
180
- nn.AvgPool3d(kernel_size=2, stride=2),
181
- nn.Conv3d(
182
- feature,
183
- int(feature * expansion / 4),
184
- kernel_size=1,
185
- stride=1,
186
- bias=False,
187
- ),
188
- norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
189
- ),
190
- norm_layer=norm_layer,
191
- )
192
-
193
- def forward(self, x):
194
- return self.main(x)
monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py DELETED
@@ -1,22 +0,0 @@
1
- from transformers import PreTrainedModel
2
- from .config import MonoSceneConfig
3
- from monoscene.monoscene import MonoScene
4
-
5
-
6
-
7
- class MonoSceneModel(PreTrainedModel):
8
- config_class = ResnetConfig
9
-
10
- def __init__(self, config):
11
- super().__init__(config)
12
- self.model = MonoScene(
13
- dataset=config.dataset,
14
- n_classes=config.n_classes,
15
- feature=config.feature,
16
- project_scale=config.project_scale,
17
- full_scene_size=config.full_scene_size
18
- )
19
-
20
-
21
- def forward(self, tensor):
22
- return self.model.forward(tensor)
monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py DELETED
@@ -1,88 +0,0 @@
1
- # encoding: utf-8
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- from monoscene.modules import SegmentationHead
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import Process, Upsample, Downsample
8
-
9
-
10
- class UNet3D(nn.Module):
11
- def __init__(
12
- self,
13
- class_num,
14
- norm_layer,
15
- full_scene_size,
16
- feature,
17
- project_scale,
18
- context_prior=None,
19
- bn_momentum=0.1,
20
- ):
21
- super(UNet3D, self).__init__()
22
- self.business_layer = []
23
- self.project_scale = project_scale
24
- self.full_scene_size = full_scene_size
25
- self.feature = feature
26
-
27
- size_l1 = (
28
- int(self.full_scene_size[0] / project_scale),
29
- int(self.full_scene_size[1] / project_scale),
30
- int(self.full_scene_size[2] / project_scale),
31
- )
32
- size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
33
- size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
34
-
35
- dilations = [1, 2, 3]
36
- self.process_l1 = nn.Sequential(
37
- Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
38
- Downsample(self.feature, norm_layer, bn_momentum),
39
- )
40
- self.process_l2 = nn.Sequential(
41
- Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
42
- Downsample(self.feature * 2, norm_layer, bn_momentum),
43
- )
44
-
45
- self.up_13_l2 = Upsample(
46
- self.feature * 4, self.feature * 2, norm_layer, bn_momentum
47
- )
48
- self.up_12_l1 = Upsample(
49
- self.feature * 2, self.feature, norm_layer, bn_momentum
50
- )
51
- self.up_l1_lfull = Upsample(
52
- self.feature, self.feature // 2, norm_layer, bn_momentum
53
- )
54
-
55
- self.ssc_head = SegmentationHead(
56
- self.feature // 2, self.feature // 2, class_num, dilations
57
- )
58
-
59
- self.context_prior = context_prior
60
- if context_prior:
61
- self.CP_mega_voxels = CPMegaVoxels(
62
- self.feature * 4, size_l3, bn_momentum=bn_momentum
63
- )
64
-
65
- def forward(self, input_dict):
66
- res = {}
67
-
68
- x3d_l1 = input_dict["x3d"]
69
-
70
- x3d_l2 = self.process_l1(x3d_l1)
71
-
72
- x3d_l3 = self.process_l2(x3d_l2)
73
-
74
- if self.context_prior:
75
- ret = self.CP_mega_voxels(x3d_l3)
76
- x3d_l3 = ret["x"]
77
- for k in ret.keys():
78
- res[k] = ret[k]
79
-
80
- x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
81
- x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
82
- x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
83
-
84
- ssc_logit_full = self.ssc_head(x3d_up_lfull)
85
-
86
- res["ssc_logit"] = ssc_logit_full
87
-
88
- return res
monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py DELETED
@@ -1,90 +0,0 @@
1
- # encoding: utf-8
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- import numpy as np
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import (
8
- Process,
9
- Upsample,
10
- Downsample,
11
- SegmentationHead,
12
- ASPP,
13
- )
14
-
15
-
16
- class UNet3D(nn.Module):
17
- def __init__(
18
- self,
19
- class_num,
20
- norm_layer,
21
- feature,
22
- full_scene_size,
23
- n_relations=4,
24
- project_res=[],
25
- context_prior=True,
26
- bn_momentum=0.1,
27
- ):
28
- super(UNet3D, self).__init__()
29
- self.business_layer = []
30
- self.project_res = project_res
31
-
32
- self.feature_1_4 = feature
33
- self.feature_1_8 = feature * 2
34
- self.feature_1_16 = feature * 4
35
-
36
- self.feature_1_16_dec = self.feature_1_16
37
- self.feature_1_8_dec = self.feature_1_8
38
- self.feature_1_4_dec = self.feature_1_4
39
-
40
- self.process_1_4 = nn.Sequential(
41
- Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
42
- Downsample(self.feature_1_4, norm_layer, bn_momentum),
43
- )
44
- self.process_1_8 = nn.Sequential(
45
- Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
46
- Downsample(self.feature_1_8, norm_layer, bn_momentum),
47
- )
48
- self.up_1_16_1_8 = Upsample(
49
- self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
50
- )
51
- self.up_1_8_1_4 = Upsample(
52
- self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
53
- )
54
- self.ssc_head_1_4 = SegmentationHead(
55
- self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
56
- )
57
-
58
- self.context_prior = context_prior
59
- size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
60
-
61
- if context_prior:
62
- self.CP_mega_voxels = CPMegaVoxels(
63
- self.feature_1_16,
64
- size_1_16,
65
- n_relations=n_relations,
66
- bn_momentum=bn_momentum,
67
- )
68
-
69
- #
70
- def forward(self, input_dict):
71
- res = {}
72
-
73
- x3d_1_4 = input_dict["x3d"]
74
- x3d_1_8 = self.process_1_4(x3d_1_4)
75
- x3d_1_16 = self.process_1_8(x3d_1_8)
76
-
77
- if self.context_prior:
78
- ret = self.CP_mega_voxels(x3d_1_16)
79
- x3d_1_16 = ret["x"]
80
- for k in ret.keys():
81
- res[k] = ret[k]
82
-
83
- x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
84
- x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
85
-
86
- ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
87
-
88
- res["ssc_logit"] = ssc_logit_1_4
89
-
90
- return res
monoscene/__init__.py DELETED
File without changes
monoscene/app.py DELETED
@@ -1,138 +0,0 @@
1
- from pytorch_lightning import Trainer
2
- from monoscene.models.monoscene import MonoScene
3
- from monoscene.data.NYU.nyu_dm import NYUDataModule
4
- from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
- from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
6
- # import hydra
7
- from omegaconf import DictConfig
8
- import torch
9
- import numpy as np
10
- import os
11
- from hydra.utils import get_original_cwd
12
- import gradio as gr
13
- import numpy as np
14
- import plotly.express as px
15
- import pandas as pd
16
-
17
-
18
- # @hydra.main(config_name="../config/monoscene.yaml")
19
- def plot(input_img):
20
- torch.set_grad_enabled(False)
21
-
22
- # Setup dataloader
23
- # if config.dataset == "kitti" or config.dataset == "kitti_360":
24
- feature = 64
25
- project_scale = 2
26
- full_scene_size = (256, 256, 32)
27
-
28
- # if config.dataset == "kitti":
29
- # data_module = KittiDataModule(
30
- # root=config.kitti_root,
31
- # preprocess_root=config.kitti_preprocess_root,
32
- # frustum_size=config.frustum_size,
33
- # batch_size=int(config.batch_size / config.n_gpus),
34
- # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
35
- # )
36
- # data_module.setup()
37
- # data_loader = data_module.val_dataloader()
38
- # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
39
- # else:
40
- # data_module = Kitti360DataModule(
41
- # root=config.kitti_360_root,
42
- # sequences=[config.kitti_360_sequence],
43
- # n_scans=2000,
44
- # batch_size=1,
45
- # num_workers=3,
46
- # )
47
- # data_module.setup()
48
- # data_loader = data_module.dataloader()
49
-
50
- # elif config.dataset == "NYU":
51
- # project_scale = 1
52
- # feature = 200
53
- # full_scene_size = (60, 36, 60)
54
- # data_module = NYUDataModule(
55
- # root=config.NYU_root,
56
- # preprocess_root=config.NYU_preprocess_root,
57
- # n_relations=config.n_relations,
58
- # frustum_size=config.frustum_size,
59
- # batch_size=int(config.batch_size / config.n_gpus),
60
- # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
61
- # )
62
- # data_module.setup()
63
- # data_loader = data_module.val_dataloader()
64
- # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
65
- # else:
66
- # print("dataset not support")
67
-
68
- # Load pretrained models
69
- # if config.dataset == "NYU":
70
- # model_path = os.path.join(
71
- # get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
72
- # )
73
- # else:
74
- # model_path = os.path.join(
75
- # get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
76
- # )
77
- model_path = "trained_models/monoscene_kitti.ckpt"
78
-
79
- model = MonoScene.load_from_checkpoint(
80
- model_path,
81
- feature=feature,
82
- project_scale=project_scale,
83
- fp_loss=False,
84
- full_scene_size=full_scene_size,
85
- )
86
- model.cuda()
87
- model.eval()
88
-
89
- print(input_img.shape)
90
-
91
- x = np.arange(12).reshape(4, 3) / 12
92
- data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
93
- fig = px.scatter_3d(data, x="x", y="y", z="z")
94
- return fig
95
-
96
- demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
97
- demo.launch()
98
-
99
-
100
-
101
- # Save prediction and additional data
102
- # to draw the viewing frustum and remove scene outside the room for NYUv2
103
- # output_path = os.path.join(config.output_path, config.dataset)
104
- # with torch.no_grad():
105
- # for batch in tqdm(data_loader):
106
- # batch["img"] = batch["img"].cuda()
107
- # pred = model(batch)
108
- # y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
109
- # y_pred = np.argmax(y_pred, axis=1)
110
- # for i in range(config.batch_size):
111
- # out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
112
- # if "target" in batch:
113
- # out_dict["target"] = (
114
- # batch["target"][i].detach().cpu().numpy().astype(np.uint16)
115
- # )
116
-
117
- # if config.dataset == "NYU":
118
- # write_path = output_path
119
- # filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
120
- # out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
121
- # out_dict["vox_origin"] = (
122
- # batch["vox_origin"][i].detach().cpu().numpy()
123
- # )
124
- # else:
125
- # write_path = os.path.join(output_path, batch["sequence"][i])
126
- # filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
127
- # out_dict["fov_mask_1"] = (
128
- # batch["fov_mask_1"][i].detach().cpu().numpy()
129
- # )
130
- # out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
131
- # out_dict["T_velo_2_cam"] = (
132
- # batch["T_velo_2_cam"][i].detach().cpu().numpy()
133
- # )
134
-
135
- # os.makedirs(write_path, exist_ok=True)
136
- # with open(filepath, "wb") as handle:
137
- # pickle.dump(out_dict, handle)
138
- # print("wrote to", filepath)
monoscene/config.py DELETED
@@ -1,26 +0,0 @@
1
- from transformers import PretrainedConfig
2
- from typing import List
3
-
4
-
5
- class MonoSceneConfig(PretrainedConfig):
6
-
7
- def __init__(
8
- self,
9
- dataset="kitti",
10
- n_classes=20,
11
- feature=64,
12
- project_scale=2,
13
- full_scene_size=(256, 256, 32),
14
- **kwargs,
15
- ):
16
- self.dataset = dataset
17
- self.n_classes = n_classes
18
- self.feature = feature
19
- self.project_scale = project_scale
20
- self.full_scene_size = full_scene_size
21
- super().__init__(**kwargs)
22
-
23
-
24
-
25
-
26
-
monoscene/config/monoscene.yaml ADDED
@@ -0,0 +1,35 @@
1
+ #dataset: "NYU" # "kitti", "kitti_360"
2
+ dataset: "kitti_360"
3
+
4
+ n_relations: 4
5
+
6
+ enable_log: false
7
+ kitti_root: '/path/to/semantic_kitti'
8
+ kitti_preprocess_root: '/path/to/kitti/preprocess/folder'
9
+ kitti_logdir: '/path/to/semantic_kitti/logdir'
10
+
11
+ NYU_root: '/path/to/NYU/depthbin'
12
+ NYU_preprocess_root: '/path/to/NYU/preprocess/folder'
13
+ logdir: '/path/to/NYU/logdir'
14
+
15
+
16
+ fp_loss: true
17
+ frustum_size: 8
18
+ batch_size: 1
19
+ n_gpus: 1
20
+ num_workers_per_gpu: 3
21
+ exp_prefix: "exp"
22
+ run: 1
23
+ lr: 1e-4
24
+ weight_decay: 1e-4
25
+
26
+ context_prior: true
27
+
28
+ relation_loss: true
29
+ CE_ssc_loss: true
30
+ sem_scal_loss: true
31
+ geo_scal_loss: true
32
+
33
+ project_1_2: true
34
+ project_1_4: true
35
+ project_1_8: true
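For context, the Hydra-based scripts added in this commit (e.g. monoscene/data/NYU/preprocess.py below) read this file through @hydra.main. A minimal sketch of how the config values are consumed, mirroring the decorator used in those scripts; the relative path assumes the sketch lives alongside preprocess.py, and the printed fields are only examples:

import hydra
from omegaconf import DictConfig


# Sketch only: resolve monoscene.yaml the same way the repo's scripts do
# and read a few of the fields defined above.
@hydra.main(config_name="../../config/monoscene.yaml")
def show_config(config: DictConfig):
    print(config.dataset)                # "kitti_360" in this commit
    print(config.batch_size, config.lr)  # 1 and 1e-4
    print(config.kitti_root)             # '/path/to/semantic_kitti'


if __name__ == "__main__":
    show_config()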
monoscene/data/NYU/collate.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ targets = []
8
+ names = []
9
+ cam_poses = []
10
+
11
+ vox_origins = []
12
+ cam_ks = []
13
+
14
+ CP_mega_matrices = []
15
+
16
+ data["projected_pix_1"] = []
17
+ data["fov_mask_1"] = []
18
+ data["frustums_masks"] = []
19
+ data["frustums_class_dists"] = []
20
+
21
+ for idx, input_dict in enumerate(batch):
22
+ CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
23
+ for key in data:
24
+ if key in input_dict:
25
+ data[key].append(torch.from_numpy(input_dict[key]))
26
+
27
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
28
+ cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float())
29
+ vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double())
30
+
31
+ names.append(input_dict["name"])
32
+
33
+ img = input_dict["img"]
34
+ imgs.append(img)
35
+
36
+ target = torch.from_numpy(input_dict["target"])
37
+ targets.append(target)
38
+
39
+ ret_data = {
40
+ "CP_mega_matrices": CP_mega_matrices,
41
+ "cam_pose": torch.stack(cam_poses),
42
+ "cam_k": torch.stack(cam_ks),
43
+ "vox_origin": torch.stack(vox_origins),
44
+ "name": names,
45
+ "img": torch.stack(imgs),
46
+ "target": torch.stack(targets),
47
+ }
48
+ for key in data:
49
+ ret_data[key] = data[key]
50
+ return ret_data
monoscene/data/NYU/nyu_dataset.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+ from monoscene.data.utils.helpers import (
9
+ vox2pix,
10
+ compute_local_frustums,
11
+ compute_CP_mega_matrix,
12
+ )
13
+ import pickle
14
+ import torch.nn.functional as F
15
+
16
+
17
+ class NYUDataset(Dataset):
18
+ def __init__(
19
+ self,
20
+ split,
21
+ root,
22
+ preprocess_root,
23
+ n_relations=4,
24
+ color_jitter=None,
25
+ frustum_size=4,
26
+ fliplr=0.0,
27
+ ):
28
+ self.n_relations = n_relations
29
+ self.frustum_size = frustum_size
30
+ self.n_classes = 12
31
+ self.root = os.path.join(root, "NYU" + split)
32
+ self.preprocess_root = preprocess_root
33
+ self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split)
34
+ self.fliplr = fliplr
35
+
36
+ self.voxel_size = 0.08 # 0.08m
37
+ self.scene_size = (4.8, 4.8, 2.88) # (4.8m, 4.8m, 2.88m)
38
+ self.img_W = 640
39
+ self.img_H = 480
40
+ self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])
41
+
42
+ self.color_jitter = (
43
+ transforms.ColorJitter(*color_jitter) if color_jitter else None
44
+ )
45
+
46
+ self.scan_names = glob.glob(os.path.join(self.root, "*.bin"))
47
+
48
+ self.normalize_rgb = transforms.Compose(
49
+ [
50
+ transforms.ToTensor(),
51
+ transforms.Normalize(
52
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
53
+ ),
54
+ ]
55
+ )
56
+
57
+ def __getitem__(self, index):
58
+ file_path = self.scan_names[index]
59
+ filename = os.path.basename(file_path)
60
+ name = filename[:-4]
61
+
62
+ os.makedirs(self.base_dir, exist_ok=True)
63
+ filepath = os.path.join(self.base_dir, name + ".pkl")
64
+
65
+ with open(filepath, "rb") as handle:
66
+ data = pickle.load(handle)
67
+
68
+ cam_pose = data["cam_pose"]
69
+ T_world_2_cam = np.linalg.inv(cam_pose)
70
+ vox_origin = data["voxel_origin"]
71
+ data["cam_k"] = self.cam_k
72
+ target = data[
73
+ "target_1_4"
74
+ ] # Following SSC literature, the output resolution on NYUv2 is set to 1:4
75
+ data["target"] = target
76
+ target_1_4 = data["target_1_16"]
77
+
78
+ CP_mega_matrix = compute_CP_mega_matrix(
79
+ target_1_4, is_binary=self.n_relations == 2
80
+ )
81
+ data["CP_mega_matrix"] = CP_mega_matrix
82
+
83
+ # compute the 3D-2D mapping
84
+ projected_pix, fov_mask, pix_z = vox2pix(
85
+ T_world_2_cam,
86
+ self.cam_k,
87
+ vox_origin,
88
+ self.voxel_size,
89
+ self.img_W,
90
+ self.img_H,
91
+ self.scene_size,
92
+ )
93
+
94
+ data["projected_pix_1"] = projected_pix
95
+ data["fov_mask_1"] = fov_mask
96
+
97
+ # compute the masks, each indicates voxels inside a frustum
98
+ frustums_masks, frustums_class_dists = compute_local_frustums(
99
+ projected_pix,
100
+ pix_z,
101
+ target,
102
+ self.img_W,
103
+ self.img_H,
104
+ dataset="NYU",
105
+ n_classes=12,
106
+ size=self.frustum_size,
107
+ )
108
+ data["frustums_masks"] = frustums_masks
109
+ data["frustums_class_dists"] = frustums_class_dists
110
+
111
+ rgb_path = os.path.join(self.root, name + "_color.jpg")
112
+ img = Image.open(rgb_path).convert("RGB")
113
+
114
+ # Image augmentation
115
+ if self.color_jitter is not None:
116
+ img = self.color_jitter(img)
117
+
118
+ # PIL to numpy
119
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
120
+
121
+ # randomly fliplr the image
122
+ if np.random.rand() < self.fliplr:
123
+ img = np.ascontiguousarray(np.fliplr(img))
124
+ data["projected_pix_1"][:, 0] = (
125
+ img.shape[1] - 1 - data["projected_pix_1"][:, 0]
126
+ )
127
+
128
+ data["img"] = self.normalize_rgb(img) # (3, img_H, img_W)
129
+
130
+ return data
131
+
132
+ def __len__(self):
133
+ return len(self.scan_names)
monoscene/data/NYU/nyu_dm.py ADDED
@@ -0,0 +1,78 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.NYU.nyu_dataset import NYUDataset
3
+ from monoscene.data.NYU.collate import collate_fn
4
+ import pytorch_lightning as pl
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class NYUDataModule(pl.LightningDataModule):
9
+ def __init__(
10
+ self,
11
+ root,
12
+ preprocess_root,
13
+ n_relations=4,
14
+ batch_size=4,
15
+ frustum_size=4,
16
+ num_workers=6,
17
+ ):
18
+ super().__init__()
19
+ self.n_relations = n_relations
20
+ self.preprocess_root = preprocess_root
21
+ self.root = root
22
+ self.batch_size = batch_size
23
+ self.num_workers = num_workers
24
+ self.frustum_size = frustum_size
25
+
26
+ def setup(self, stage=None):
27
+ self.train_ds = NYUDataset(
28
+ split="train",
29
+ preprocess_root=self.preprocess_root,
30
+ n_relations=self.n_relations,
31
+ root=self.root,
32
+ fliplr=0.5,
33
+ frustum_size=self.frustum_size,
34
+ color_jitter=(0.4, 0.4, 0.4),
35
+ )
36
+ self.test_ds = NYUDataset(
37
+ split="test",
38
+ preprocess_root=self.preprocess_root,
39
+ n_relations=self.n_relations,
40
+ root=self.root,
41
+ frustum_size=self.frustum_size,
42
+ fliplr=0.0,
43
+ color_jitter=None,
44
+ )
45
+
46
+ def train_dataloader(self):
47
+ return DataLoader(
48
+ self.train_ds,
49
+ batch_size=self.batch_size,
50
+ drop_last=True,
51
+ num_workers=self.num_workers,
52
+ shuffle=True,
53
+ pin_memory=True,
54
+ worker_init_fn=worker_init_fn,
55
+ collate_fn=collate_fn,
56
+ )
57
+
58
+ def val_dataloader(self):
59
+ return DataLoader(
60
+ self.test_ds,
61
+ batch_size=self.batch_size,
62
+ num_workers=self.num_workers,
63
+ drop_last=False,
64
+ shuffle=False,
65
+ pin_memory=True,
66
+ collate_fn=collate_fn,
67
+ )
68
+
69
+ def test_dataloader(self):
70
+ return DataLoader(
71
+ self.test_ds,
72
+ batch_size=self.batch_size,
73
+ num_workers=self.num_workers,
74
+ drop_last=False,
75
+ shuffle=False,
76
+ pin_memory=True,
77
+ collate_fn=collate_fn,
78
+ )
monoscene/data/NYU/params.py ADDED
@@ -0,0 +1,54 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ NYU_class_names = [
5
+ "empty",
6
+ "ceiling",
7
+ "floor",
8
+ "wall",
9
+ "window",
10
+ "chair",
11
+ "bed",
12
+ "sofa",
13
+ "table",
14
+ "tvs",
15
+ "furn",
16
+ "objs",
17
+ ]
18
+ class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
19
+
20
+ class_freq_1_4 = np.array(
21
+ [
22
+ 43744234,
23
+ 80205,
24
+ 1070052,
25
+ 905632,
26
+ 116952,
27
+ 180994,
28
+ 436852,
29
+ 279714,
30
+ 254611,
31
+ 28247,
32
+ 1805949,
33
+ 850724,
34
+ ]
35
+ )
36
+ class_freq_1_8 = np.array(
37
+ [
38
+ 5176253,
39
+ 17277,
40
+ 220105,
41
+ 183849,
42
+ 21827,
43
+ 33520,
44
+ 67022,
45
+ 44248,
46
+ 46615,
47
+ 4419,
48
+ 290218,
49
+ 142573,
50
+ ]
51
+ )
52
+ class_freq_1_16 = np.array(
53
+ [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864]
54
+ )
monoscene/data/NYU/preprocess.py ADDED
@@ -0,0 +1,182 @@
1
+ import numpy as np
2
+ from tqdm import tqdm
3
+ import numpy.matlib
4
+ import os
5
+ import glob
6
+ import pickle
7
+ import hydra
8
+ from omegaconf import DictConfig
9
+
10
+
11
+ seg_class_map = [
12
+ 0,
13
+ 1,
14
+ 2,
15
+ 3,
16
+ 4,
17
+ 11,
18
+ 5,
19
+ 6,
20
+ 7,
21
+ 8,
22
+ 8,
23
+ 10,
24
+ 10,
25
+ 10,
26
+ 11,
27
+ 11,
28
+ 9,
29
+ 8,
30
+ 11,
31
+ 11,
32
+ 11,
33
+ 11,
34
+ 11,
35
+ 11,
36
+ 11,
37
+ 11,
38
+ 11,
39
+ 10,
40
+ 10,
41
+ 11,
42
+ 8,
43
+ 10,
44
+ 11,
45
+ 9,
46
+ 11,
47
+ 11,
48
+ 11,
49
+ ]
50
+
51
+
52
+ def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""):
53
+ r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels.
54
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172
55
+ In the data loader of pytorch, only single thread is allowed.
56
+ For multi-threads version and more details, see 'readRLE.py'.
57
+ output: seg_label: 3D numpy array, size 240 x 144 x 240
58
+ """
59
+ seg_label = np.zeros(
60
+ int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8
61
+ ) # segmentation label
62
+ vox_idx = 0
63
+ for idx in range(int(rle.shape[0] / 2)):
64
+ check_val = rle[idx * 2]
65
+ check_iter = rle[idx * 2 + 1]
66
+ if check_val >= 37 and check_val != 255: # 37 classes to 12 classes
67
+ print("RLE {} check_val: {}".format(rle_filename, check_val))
68
+ seg_label_val = (
69
+ seg_class_map[check_val] if check_val != 255 else 255
70
+ ) # 37 classes to 12 classes
71
+ seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat(
72
+ seg_label_val, 1, check_iter
73
+ )
74
+ vox_idx = vox_idx + check_iter
75
+ seg_label = seg_label.reshape(voxel_size) # 3D array, size 240 x 144 x 240
76
+ return seg_label
77
+
78
+
79
+ def _read_rle(rle_filename): # 0.0005s
80
+ """Read RLE compression data
81
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153
82
+ Return:
83
+ vox_origin,
84
+ cam_pose,
85
+ vox_rle, voxel label data from file
86
+ Shape:
87
+ vox_rle, (240, 144, 240)
88
+ """
89
+ fid = open(rle_filename, "rb")
90
+ vox_origin = np.fromfile(
91
+ fid, np.float32, 3
92
+ ).T # Read voxel origin in world coordinates
93
+ cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4)) # Read camera pose
94
+ vox_rle = (
95
+ np.fromfile(fid, np.uint32).reshape((-1, 1)).T
96
+ ) # Read voxel label data from file
97
+ vox_rle = np.squeeze(vox_rle) # 2d array: (1 x N), to 1d array: (N , )
98
+ fid.close()
99
+ return vox_origin, cam_pose, vox_rle
100
+
101
+
102
+ def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4):
103
+ r"""downsample the labeled data,
104
+ code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262
105
+ Shape:
106
+ label, (240, 144, 240)
107
+ label_downscale, if downsample==4, then (60, 36, 60)
108
+ """
109
+ if downscale == 1:
110
+ return label
111
+ ds = downscale
112
+ small_size = (
113
+ voxel_size[0] // ds,
114
+ voxel_size[1] // ds,
115
+ voxel_size[2] // ds,
116
+ ) # small size
117
+ label_downscale = np.zeros(small_size, dtype=np.uint8)
118
+ empty_t = 0.95 * ds * ds * ds # threshold
119
+ s01 = small_size[0] * small_size[1]
120
+ label_i = np.zeros((ds, ds, ds), dtype=np.int32)
121
+
122
+ for i in range(small_size[0] * small_size[1] * small_size[2]):
123
+ z = int(i / s01)
124
+ y = int((i - z * s01) / small_size[0])
125
+ x = int(i - z * s01 - y * small_size[0])
126
+
127
+ label_i[:, :, :] = label[
128
+ x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds
129
+ ]
130
+ label_bin = label_i.flatten()
131
+
132
+ zero_count_0 = np.array(np.where(label_bin == 0)).size
133
+ zero_count_255 = np.array(np.where(label_bin == 255)).size
134
+
135
+ zero_count = zero_count_0 + zero_count_255
136
+ if zero_count > empty_t:
137
+ label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255
138
+ else:
139
+ label_i_s = label_bin[
140
+ np.where(np.logical_and(label_bin > 0, label_bin < 255))
141
+ ]
142
+ label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s))
143
+ return label_downscale
144
+
145
+
146
+ @hydra.main(config_name="../../config/monoscene.yaml")
147
+ def main(config: DictConfig):
148
+ scene_size = (240, 144, 240)
149
+ for split in ["train", "test"]:
150
+ root = os.path.join(config.NYU_root, "NYU" + split)
151
+ base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split)
152
+ os.makedirs(base_dir, exist_ok=True)
153
+
154
+ scans = glob.glob(os.path.join(root, "*.bin"))
155
+ for scan in tqdm(scans):
156
+ filename = os.path.basename(scan)
157
+ name = filename[:-4]
158
+ filepath = os.path.join(base_dir, name + ".pkl")
159
+ if os.path.exists(filepath):
160
+ continue
161
+
162
+ vox_origin, cam_pose, rle = _read_rle(scan)
163
+
164
+ target_1_1 = _rle2voxel(rle, scene_size, scan)
165
+ target_1_4 = _downsample_label(target_1_1, scene_size, 4)
166
+ target_1_16 = _downsample_label(target_1_1, scene_size, 16)
167
+
168
+ data = {
169
+ "cam_pose": cam_pose,
170
+ "voxel_origin": vox_origin,
171
+ "name": name,
172
+ "target_1_4": target_1_4,
173
+ "target_1_16": target_1_16,
174
+ }
175
+
176
+ with open(filepath, "wb") as handle:
177
+ pickle.dump(data, handle)
178
+ print("wrote to", filepath)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
monoscene/data/kitti_360/collate.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ frame_ids = []
8
+ img_paths = []
9
+ sequences = []
10
+
11
+ cam_ks = []
12
+ T_velo_2_cams = []
13
+
14
+ scale_3ds = batch[0]["scale_3ds"]
15
+ for scale_3d in scale_3ds:
16
+ data["projected_pix_{}".format(scale_3d)] = []
17
+ data["fov_mask_{}".format(scale_3d)] = []
18
+
19
+ for _, input_dict in enumerate(batch):
20
+ if "img_path" in input_dict:
21
+ img_paths.append(input_dict["img_path"])
22
+
23
+ for key in data:
24
+ data[key].append(torch.from_numpy(input_dict[key]))
25
+
26
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float())
27
+ T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
28
+
29
+ sequences.append(input_dict["sequence"])
30
+
31
+ img = input_dict["img"]
32
+ imgs.append(img)
33
+
34
+ frame_ids.append(input_dict["frame_id"])
35
+
36
+ ret_data = {
37
+ "sequence": sequences,
38
+ "frame_id": frame_ids,
39
+ "cam_k": cam_ks,
40
+ "T_velo_2_cam": T_velo_2_cams,
41
+ "img": torch.stack(imgs),
42
+ "img_path": img_paths,
43
+ }
44
+ for key in data:
45
+ ret_data[key] = data[key]
46
+
47
+ return ret_data
monoscene/data/kitti_360/kitti_360_dataset.py ADDED
@@ -0,0 +1,125 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from monoscene.data.utils.helpers import vox2pix
7
+ from PIL import Image
8
+ from torchvision import transforms
9
+
10
+
11
+ class Kitti360Dataset(Dataset):
12
+ def __init__(self, root, sequences, n_scans):
13
+ """
14
+ Paramters
15
+ --------
16
+ root: str
17
+ Path to KITTI-360 dataset i.e. contain sequences such as 2013_05_28_drive_0009_sync
18
+ sequence: str
19
+ KITTI-360 sequence e.g. 2013_05_28_drive_0009_sync
20
+ n_scans: int
21
+ Only use the first n_scans since KITTI-360 sequence is very long
22
+ """
23
+ self.root = root
24
+ self.img_H = 376
25
+ self.img_W = 1408
26
+ self.project_scale = 2
27
+ self.output_scale = 1
28
+ self.voxel_size = 0.2
29
+ self.vox_origin = np.array([0, -25.6, -2])
30
+ self.scene_size = (51.2, 51.2, 6.4)
31
+ self.T_velo_2_cam = self.get_velo2cam()
32
+ self.cam_k = self.get_cam_k()
33
+ self.scans = []
34
+ for sequence in sequences:
35
+ glob_path = os.path.join(
36
+ self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png"
37
+ )
38
+ for img_path in glob.glob(glob_path):
39
+ self.scans.append({"img_path": img_path, "sequence": sequence})
40
+ self.scans = self.scans[:n_scans]
41
+ self.normalize_rgb = transforms.Compose(
42
+ [
43
+ transforms.ToTensor(),
44
+ transforms.Normalize(
45
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
46
+ ),
47
+ ]
48
+ )
49
+
50
+ def __len__(self):
51
+ return len(self.scans)
52
+
53
+ def get_cam_k(self):
54
+ cam_k = np.array(
55
+ [
56
+ 552.554261,
57
+ 0.000000,
58
+ 682.049453,
59
+ 0.000000,
60
+ 0.000000,
61
+ 552.554261,
62
+ 238.769549,
63
+ 0.000000,
64
+ 0.000000,
65
+ 0.000000,
66
+ 1.000000,
67
+ 0.000000,
68
+ ]
69
+ ).reshape(3, 4)
70
+ return cam_k[:3, :3]
71
+
72
+ def get_velo2cam(self):
73
+ cam2velo = np.array(
74
+ [
75
+ 0.04307104361,
76
+ -0.08829286498,
77
+ 0.995162929,
78
+ 0.8043914418,
79
+ -0.999004371,
80
+ 0.007784614041,
81
+ 0.04392796942,
82
+ 0.2993489574,
83
+ -0.01162548558,
84
+ -0.9960641394,
85
+ -0.08786966659,
86
+ -0.1770225824,
87
+ ]
88
+ ).reshape(3, 4)
89
+ cam2velo = np.concatenate(
90
+ [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0
91
+ )
92
+ return np.linalg.inv(cam2velo)
93
+
94
+ def __getitem__(self, index):
95
+ data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k}
96
+ scan = self.scans[index]
97
+ img_path = scan["img_path"]
98
+ sequence = scan["sequence"]
99
+ filename = os.path.basename(img_path)
100
+ frame_id = os.path.splitext(filename)[0]
101
+ data["frame_id"] = frame_id
102
+ data["img_path"] = img_path
103
+ data["sequence"] = sequence
104
+
105
+ img = Image.open(img_path).convert("RGB")
106
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
107
+ img = self.normalize_rgb(img)
108
+ data["img"] = img
109
+
110
+ scale_3ds = [self.project_scale, self.output_scale]
111
+ data["scale_3ds"] = scale_3ds
112
+
113
+ for scale_3d in scale_3ds:
114
+ projected_pix, fov_mask, _ = vox2pix(
115
+ self.T_velo_2_cam,
116
+ self.cam_k,
117
+ self.vox_origin,
118
+ self.voxel_size * scale_3d,
119
+ self.img_W,
120
+ self.img_H,
121
+ self.scene_size,
122
+ )
123
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
124
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
125
+ return data
monoscene/data/kitti_360/kitti_360_dm.py ADDED
@@ -0,0 +1,32 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset
3
+ import pytorch_lightning as pl
4
+ from monoscene.data.kitti_360.collate import collate_fn
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class Kitti360DataModule(pl.LightningDataModule):
9
+ def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3):
10
+ super().__init__()
11
+ self.root = root
12
+ self.batch_size = batch_size
13
+ self.num_workers = num_workers
14
+ self.sequences = sequences
15
+ self.n_scans = n_scans
16
+
17
+ def setup(self, stage=None):
18
+ self.ds = Kitti360Dataset(
19
+ root=self.root, sequences=self.sequences, n_scans=self.n_scans
20
+ )
21
+
22
+ def dataloader(self):
23
+ return DataLoader(
24
+ self.ds,
25
+ batch_size=self.batch_size,
26
+ drop_last=False,
27
+ num_workers=self.num_workers,
28
+ shuffle=False,
29
+ pin_memory=True,
30
+ worker_init_fn=worker_init_fn,
31
+ collate_fn=collate_fn,
32
+ )
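For context, the commented-out code in the removed monoscene/app.py above builds this data module with n_scans=2000, batch_size=1 and num_workers=3. A minimal sketch along those lines; the root path is a placeholder and the sequence name is taken from the Kitti360Dataset docstring:

from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule

# Sketch only: placeholder path, values mirror the example in the removed app.py.
dm = Kitti360DataModule(
    root="/path/to/KITTI-360",
    sequences=["2013_05_28_drive_0009_sync"],
    n_scans=2000,
    batch_size=1,
    num_workers=3,
)
dm.setup()
batch = next(iter(dm.dataloader()))
print(batch["img"].shape)  # torch.Size([1, 3, 376, 1408]) for batch_size=1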
monoscene/data/semantic_kitti/collate.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+
3
+
4
+ def collate_fn(batch):
5
+ data = {}
6
+ imgs = []
7
+ CP_mega_matrices = []
8
+ targets = []
9
+ frame_ids = []
10
+ sequences = []
11
+
12
+ cam_ks = []
13
+ T_velo_2_cams = []
14
+ frustums_masks = []
15
+ frustums_class_dists = []
16
+
17
+ scale_3ds = batch[0]["scale_3ds"]
18
+ for scale_3d in scale_3ds:
19
+ data["projected_pix_{}".format(scale_3d)] = []
20
+ data["fov_mask_{}".format(scale_3d)] = []
21
+
22
+ for idx, input_dict in enumerate(batch):
23
+ cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
24
+ T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float())
25
+
26
+ if "frustums_masks" in input_dict:
27
+ frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"]))
28
+ frustums_class_dists.append(
29
+ torch.from_numpy(input_dict["frustums_class_dists"]).float()
30
+ )
31
+
32
+ for key in data:
33
+ data[key].append(torch.from_numpy(input_dict[key]))
34
+
35
+ img = input_dict["img"]
36
+ imgs.append(img)
37
+
38
+ frame_ids.append(input_dict["frame_id"])
39
+ sequences.append(input_dict["sequence"])
40
+
41
+
42
+ target = torch.from_numpy(input_dict["target"])
43
+ targets.append(target)
44
+ CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
45
+
46
+ ret_data = {
47
+ "frame_id": frame_ids,
48
+ "sequence": sequences,
49
+ "frustums_class_dists": frustums_class_dists,
50
+ "frustums_masks": frustums_masks,
51
+ "cam_k": cam_ks,
52
+ "T_velo_2_cam": T_velo_2_cams,
53
+ "img": torch.stack(imgs),
54
+ "CP_mega_matrices": CP_mega_matrices,
55
+ "target": torch.stack(targets)
56
+ }
57
+
58
+
59
+ for key in data:
60
+ ret_data[key] = data[key]
61
+ return ret_data
monoscene/data/semantic_kitti/io_data.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
3
+ """
4
+
5
+ import numpy as np
6
+ import yaml
7
+ import imageio
8
+
9
+
10
+ def unpack(compressed):
11
+ ''' given a bit encoded voxel grid, make a normal voxel grid out of it. '''
12
+ uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8)
13
+ uncompressed[::8] = compressed[:] >> 7 & 1
14
+ uncompressed[1::8] = compressed[:] >> 6 & 1
15
+ uncompressed[2::8] = compressed[:] >> 5 & 1
16
+ uncompressed[3::8] = compressed[:] >> 4 & 1
17
+ uncompressed[4::8] = compressed[:] >> 3 & 1
18
+ uncompressed[5::8] = compressed[:] >> 2 & 1
19
+ uncompressed[6::8] = compressed[:] >> 1 & 1
20
+ uncompressed[7::8] = compressed[:] & 1
21
+
22
+ return uncompressed
23
+
24
+
25
+ def img_normalize(img, mean, std):
26
+ img = img.astype(np.float32) / 255.0
27
+ img = img - mean
28
+ img = img / std
29
+
30
+ return img
31
+
32
+
33
+ def pack(array):
34
+ """ convert a boolean array into a bitwise array. """
35
+ array = array.reshape((-1))
36
+
37
+ #compressing bit flags.
38
+ # yapf: disable
39
+ compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8]
40
+ # yapf: enable
41
+
42
+ return np.array(compressed, dtype=np.uint8)
43
+
44
+
45
+ def get_grid_coords(dims, resolution):
46
+ '''
47
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
48
+ :return coords_grid: is the center coords of voxels in the grid
49
+ '''
50
+
51
+ # The sensor in centered in X (we go to dims/2 + 1 for the histogramdd)
52
+ g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1)
53
+ # The sensor is in Y=0 (we go to dims + 1 for the histogramdd)
54
+ g_yy = np.arange(0, dims[1] + 1)
55
+ # The sensor is in Z=1.73. I observed that the ground was to voxel levels above the grid bottom, so Z pose is at 10
56
+ # if bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22
57
+ # (we go to 22 + 1 for the histogramdd)
58
+ # ATTENTION.. Is 11 for old grids.. 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49)
59
+ sensor_pose = 10
60
+ g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
61
+
62
+ # Obtaining the grid with coords...
63
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
64
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
65
+ coords_grid = coords_grid.astype(np.float)
66
+
67
+ coords_grid = (coords_grid * resolution) + resolution/2
68
+
69
+ temp = np.copy(coords_grid)
70
+ temp[:, 0] = coords_grid[:, 1]
71
+ temp[:, 1] = coords_grid[:, 0]
72
+ coords_grid = np.copy(temp)
73
+
74
+ return coords_grid, g_xx, g_yy, g_zz
75
+
76
+
77
+ def _get_remap_lut(config_path):
78
+ '''
79
+ remap_lut to remap classes of semantic kitti for training...
80
+ :return:
81
+ '''
82
+
83
+ dataset_config = yaml.safe_load(open(config_path, 'r'))
84
+ # make lookup table for mapping
85
+ maxkey = max(dataset_config['learning_map'].keys())
86
+
87
+ # +100 hack making lut bigger just in case there are unknown labels
88
+ remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
89
+ remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
90
+
91
+ # in completion we have to distinguish empty and invalid voxels.
92
+ # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
93
+ remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid'
94
+ remap_lut[0] = 0 # only 'empty' stays 'empty'.
95
+
96
+ return remap_lut
97
+
98
+
99
+ def get_inv_map():
100
+ '''
101
+ remap_lut to remap classes of semantic kitti for training...
102
+ :return:
103
+ '''
104
+ config_path = "./semantic-kitti.yaml"
105
+ dataset_config = yaml.safe_load(open(config_path, 'r'))
106
+ # make lookup table for mapping
107
+
108
+ inv_map = np.zeros(20, dtype=np.int32)
109
+ inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values())
110
+
111
+ return inv_map
112
+
113
+ def _read_SemKITTI(path, dtype, do_unpack):
114
+ bin = np.fromfile(path, dtype=dtype) # Flattened array
115
+ if do_unpack:
116
+ bin = unpack(bin)
117
+ return bin
118
+
119
+
120
+ def _read_label_SemKITTI(path):
121
+ label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
122
+ return label
123
+
124
+
125
+ def _read_invalid_SemKITTI(path):
126
+ invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
127
+ return invalid
128
+
129
+
130
+ def _read_occluded_SemKITTI(path):
131
+ occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
132
+ return occluded
133
+
134
+
135
+ def _read_occupancy_SemKITTI(path):
136
+ occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32)
137
+ return occupancy
138
+
139
+
140
+ def _read_rgb_SemKITTI(path):
141
+ rgb = np.asarray(imageio.imread(path))
142
+ return rgb
143
+
144
+
145
+ def _read_pointcloud_SemKITTI(path):
146
+ 'Return pointcloud semantic kitti with remissions (x, y, z, intensity)'
147
+ pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
148
+ pointcloud = pointcloud.reshape((-1, 4))
149
+ return pointcloud
150
+
151
+
152
+ def _read_calib_SemKITTI(calib_path):
153
+ """
154
+ :param calib_path: Path to a calibration text file.
155
+ :return: dict with calibration matrices.
156
+ """
157
+ calib_all = {}
158
+ with open(calib_path, 'r') as f:
159
+ for line in f.readlines():
160
+ if line == '\n':
161
+ break
162
+ key, value = line.split(':', 1)
163
+ calib_all[key] = np.array([float(x) for x in value.split()])
164
+
165
+ # reshape matrices
166
+ calib_out = {}
167
+ calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera
168
+ calib_out['Tr'] = np.identity(4) # 4x4 matrix
169
+ calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
170
+ return calib_out
171
+
172
+
173
+ def get_remap_lut(path):
174
+ '''
175
+ remap_lut to remap classes of semantic kitti for training...
176
+ :return:
177
+ '''
178
+
179
+ dataset_config = yaml.safe_load(open(path, 'r'))
180
+
181
+ # make lookup table for mapping
182
+ maxkey = max(dataset_config['learning_map'].keys())
183
+
184
+ # +100 hack making lut bigger just in case there are unknown labels
185
+ remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
186
+ remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values())
187
+
188
+ # in completion we have to distinguish empty and invalid voxels.
189
+ # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
190
+ remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid'
191
+ remap_lut[0] = 0 # only 'empty' stays 'empty'.
192
+
193
+ return remap_lut
194
+
195
+
196
+ def data_augmentation_3Dflips(flip, data):
197
+ # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data
198
+ # into memory. This gives errors when trying to pass the array to torch sensors.. Solution seen in:
199
+ # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663
200
+ # Dims -> {XZY}
201
+ # Flipping around the X axis...
202
+ if np.isclose(flip, 1):
203
+ data = np.flip(data, axis=0).copy()
204
+
205
+ # Flipping around the Y axis...
206
+ if np.isclose(flip, 2):
207
+ data = np.flip(data, 2).copy()
208
+
209
+ # Flipping around the X and the Y axis...
210
+ if np.isclose(flip, 3):
211
+ data = np.flip(np.flip(data, axis=0), axis=2).copy()
212
+
213
+ return data
214
+
215
+
216
+ def get_cmap_semanticKITTI20():
217
+ colors = np.array([
218
+ # [0 , 0 , 0, 255],
219
+ [100, 150, 245, 255],
220
+ [100, 230, 245, 255],
221
+ [30, 60, 150, 255],
222
+ [80, 30, 180, 255],
223
+ [100, 80, 250, 255],
224
+ [255, 30, 30, 255],
225
+ [255, 40, 200, 255],
226
+ [150, 30, 90, 255],
227
+ [255, 0, 255, 255],
228
+ [255, 150, 255, 255],
229
+ [75, 0, 75, 255],
230
+ [175, 0, 75, 255],
231
+ [255, 200, 0, 255],
232
+ [255, 120, 50, 255],
233
+ [0, 175, 0, 255],
234
+ [135, 60, 0, 255],
235
+ [150, 240, 80, 255],
236
+ [255, 240, 150, 255],
237
+ [255, 0, 0, 255]]).astype(np.uint8)
238
+
239
+ return colors
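For context, pack and unpack above implement the bit-packed voxel encoding used by the SemanticKITTI .bin/.invalid/.occluded files that _read_SemKITTI loads. A minimal round-trip sketch, assuming the monoscene package from this commit is on the Python path:

import numpy as np

from monoscene.data.semantic_kitti.io_data import pack, unpack

# Sketch only: 8 binary voxels per byte, matching the SemanticKITTI voxel format.
occupancy = (np.random.rand(256 * 256 * 32) > 0.5).astype(np.uint8)
compressed = pack(occupancy)   # uint8 array, one byte per 8 voxels
restored = unpack(compressed)  # back to one 0/1 value per voxel
assert np.array_equal(occupancy, restored)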
monoscene/data/semantic_kitti/kitti_dataset.py ADDED
@@ -0,0 +1,200 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ from torch.utils.data import Dataset
5
+ import numpy as np
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+ from monoscene.data.utils.helpers import (
9
+ vox2pix,
10
+ compute_local_frustums,
11
+ compute_CP_mega_matrix,
12
+ )
13
+
14
+
15
+ class KittiDataset(Dataset):
16
+ def __init__(
17
+ self,
18
+ split,
19
+ root,
20
+ preprocess_root,
21
+ project_scale=2,
22
+ frustum_size=4,
23
+ color_jitter=None,
24
+ fliplr=0.0,
25
+ ):
26
+ super().__init__()
27
+ self.root = root
28
+ self.label_root = os.path.join(preprocess_root, "labels")
29
+ self.n_classes = 20
30
+ splits = {
31
+ "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
32
+ "val": ["08"],
33
+ "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
34
+ }
35
+ self.split = split
36
+ self.sequences = splits[split]
37
+ self.frustum_size = frustum_size
38
+ self.project_scale = project_scale
39
+ self.output_scale = int(self.project_scale / 2)
40
+ self.scene_size = (51.2, 51.2, 6.4)
41
+ self.vox_origin = np.array([0, -25.6, -2])
42
+ self.fliplr = fliplr
43
+
44
+ self.voxel_size = 0.2 # 0.2m
45
+ self.img_W = 1220
46
+ self.img_H = 370
47
+
48
+ self.color_jitter = (
49
+ transforms.ColorJitter(*color_jitter) if color_jitter else None
50
+ )
51
+ self.scans = []
52
+ for sequence in self.sequences:
53
+ calib = self.read_calib(
54
+ os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
55
+ )
56
+ P = calib["P2"]
57
+ T_velo_2_cam = calib["Tr"]
58
+ proj_matrix = P @ T_velo_2_cam
59
+
60
+ glob_path = os.path.join(
61
+ self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
62
+ )
63
+ for voxel_path in glob.glob(glob_path):
64
+ self.scans.append(
65
+ {
66
+ "sequence": sequence,
67
+ "P": P,
68
+ "T_velo_2_cam": T_velo_2_cam,
69
+ "proj_matrix": proj_matrix,
70
+ "voxel_path": voxel_path,
71
+ }
72
+ )
73
+
74
+ self.normalize_rgb = transforms.Compose(
75
+ [
76
+ transforms.ToTensor(),
77
+ transforms.Normalize(
78
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
79
+ ),
80
+ ]
81
+ )
82
+
83
+ def __getitem__(self, index):
84
+ scan = self.scans[index]
85
+ voxel_path = scan["voxel_path"]
86
+ sequence = scan["sequence"]
87
+ P = scan["P"]
88
+ T_velo_2_cam = scan["T_velo_2_cam"]
89
+ proj_matrix = scan["proj_matrix"]
90
+
91
+ filename = os.path.basename(voxel_path)
92
+ frame_id = os.path.splitext(filename)[0]
93
+
94
+ rgb_path = os.path.join(
95
+ self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png"
96
+ )
97
+
98
+ data = {
99
+ "frame_id": frame_id,
100
+ "sequence": sequence,
101
+ "P": P,
102
+ "T_velo_2_cam": T_velo_2_cam,
103
+ "proj_matrix": proj_matrix,
104
+ }
105
+ scale_3ds = [self.output_scale, self.project_scale]
106
+ data["scale_3ds"] = scale_3ds
107
+ cam_k = P[0:3, 0:3]
108
+ data["cam_k"] = cam_k
109
+ for scale_3d in scale_3ds:
110
+
111
+ # compute the 3D-2D mapping
112
+ projected_pix, fov_mask, pix_z = vox2pix(
113
+ T_velo_2_cam,
114
+ cam_k,
115
+ self.vox_origin,
116
+ self.voxel_size * scale_3d,
117
+ self.img_W,
118
+ self.img_H,
119
+ self.scene_size,
120
+ )
121
+
122
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
123
+ data["pix_z_{}".format(scale_3d)] = pix_z
124
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
125
+
126
+ target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy")
127
+ target = np.load(target_1_path)
128
+ data["target"] = target
129
+ target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy")
130
+ target_1_8 = np.load(target_8_path)
131
+ CP_mega_matrix = compute_CP_mega_matrix(target_1_8)
132
+ data["CP_mega_matrix"] = CP_mega_matrix
133
+
134
+ # Compute the masks, each indicating the voxels of a local frustum
135
+ if self.split != "test":
136
+ projected_pix_output = data["projected_pix_{}".format(self.output_scale)]
137
+ pix_z_output = data[
138
+ "pix_z_{}".format(self.output_scale)
139
+ ]
140
+ frustums_masks, frustums_class_dists = compute_local_frustums(
141
+ projected_pix_output,
142
+ pix_z_output,
143
+ target,
144
+ self.img_W,
145
+ self.img_H,
146
+ dataset="kitti",
147
+ n_classes=20,
148
+ size=self.frustum_size,
149
+ )
150
+ else:
151
+ frustums_masks = None
152
+ frustums_class_dists = None
153
+ data["frustums_masks"] = frustums_masks
154
+ data["frustums_class_dists"] = frustums_class_dists
155
+
156
+ img = Image.open(rgb_path).convert("RGB")
157
+
158
+ # Image augmentation
159
+ if self.color_jitter is not None:
160
+ img = self.color_jitter(img)
161
+
162
+ # PIL to numpy
163
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
164
+ img = img[:370, :1220, :] # crop image
165
+
166
+ # Fliplr the image
167
+ if np.random.rand() < self.fliplr:
168
+ img = np.ascontiguousarray(np.fliplr(img))
169
+ for scale in scale_3ds:
170
+ key = "projected_pix_" + str(scale)
171
+ data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0]
172
+
173
+ data["img"] = self.normalize_rgb(img)
174
+ return data
175
+
176
+ def __len__(self):
177
+ return len(self.scans)
178
+
179
+ @staticmethod
180
+ def read_calib(calib_path):
181
+ """
182
+ Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
183
+ :param calib_path: Path to a calibration text file.
184
+ :return: dict with calibration matrices.
185
+ """
186
+ calib_all = {}
187
+ with open(calib_path, "r") as f:
188
+ for line in f.readlines():
189
+ if line == "\n":
190
+ break
191
+ key, value = line.split(":", 1)
192
+ calib_all[key] = np.array([float(x) for x in value.split()])
193
+
194
+ # reshape matrices
195
+ calib_out = {}
196
+ # 3x4 projection matrix for left camera
197
+ calib_out["P2"] = calib_all["P2"].reshape(3, 4)
198
+ calib_out["Tr"] = np.identity(4) # 4x4 matrix
199
+ calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
200
+ return calib_out
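A rough sketch of loading one validation sample, assuming the SemanticKITTI data sits under /data/semantic_kitti and the preprocessed labels (written by preprocess.py below) under /data/kitti_preprocess; both paths are hypothetical:

from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset

val_ds = KittiDataset(
    split="val",
    root="/data/semantic_kitti",
    preprocess_root="/data/kitti_preprocess",
    project_scale=2,
    frustum_size=4,
)
sample = val_ds[0]
print(sample["img"].shape)              # torch.Size([3, 370, 1220])
print(sample["target"].shape)           # (256, 256, 32) full-resolution voxel labels
print(sample["projected_pix_1"].shape)  # (N, 2) voxel centroids projected to pixels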
monoscene/data/semantic_kitti/kitti_dm.py ADDED
@@ -0,0 +1,91 @@
1
+ from torch.utils.data.dataloader import DataLoader
2
+ from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset
3
+ import pytorch_lightning as pl
4
+ from monoscene.data.semantic_kitti.collate import collate_fn
5
+ from monoscene.data.utils.torch_util import worker_init_fn
6
+
7
+
8
+ class KittiDataModule(pl.LightningDataModule):
9
+ def __init__(
10
+ self,
11
+ root,
12
+ preprocess_root,
13
+ project_scale=2,
14
+ frustum_size=4,
15
+ batch_size=4,
16
+ num_workers=6,
17
+ ):
18
+ super().__init__()
19
+ self.root = root
20
+ self.preprocess_root = preprocess_root
21
+ self.project_scale = project_scale
22
+ self.batch_size = batch_size
23
+ self.num_workers = num_workers
24
+ self.frustum_size = frustum_size
25
+
26
+ def setup(self, stage=None):
27
+ self.train_ds = KittiDataset(
28
+ split="train",
29
+ root=self.root,
30
+ preprocess_root=self.preprocess_root,
31
+ project_scale=self.project_scale,
32
+ frustum_size=self.frustum_size,
33
+ fliplr=0.5,
34
+ color_jitter=(0.4, 0.4, 0.4),
35
+ )
36
+
37
+ self.val_ds = KittiDataset(
38
+ split="val",
39
+ root=self.root,
40
+ preprocess_root=self.preprocess_root,
41
+ project_scale=self.project_scale,
42
+ frustum_size=self.frustum_size,
43
+ fliplr=0,
44
+ color_jitter=None,
45
+ )
46
+
47
+ self.test_ds = KittiDataset(
48
+ split="test",
49
+ root=self.root,
50
+ preprocess_root=self.preprocess_root,
51
+ project_scale=self.project_scale,
52
+ frustum_size=self.frustum_size,
53
+ fliplr=0,
54
+ color_jitter=None,
55
+ )
56
+
57
+ def train_dataloader(self):
58
+ return DataLoader(
59
+ self.train_ds,
60
+ batch_size=self.batch_size,
61
+ drop_last=True,
62
+ num_workers=self.num_workers,
63
+ shuffle=True,
64
+ pin_memory=True,
65
+ worker_init_fn=worker_init_fn,
66
+ collate_fn=collate_fn,
67
+ )
68
+
69
+ def val_dataloader(self):
70
+ return DataLoader(
71
+ self.val_ds,
72
+ batch_size=self.batch_size,
73
+ drop_last=False,
74
+ num_workers=self.num_workers,
75
+ shuffle=False,
76
+ pin_memory=True,
77
+ worker_init_fn=worker_init_fn,
78
+ collate_fn=collate_fn,
79
+ )
80
+
81
+ def test_dataloader(self):
82
+ return DataLoader(
83
+ self.test_ds,
84
+ batch_size=self.batch_size,
85
+ drop_last=False,
86
+ num_workers=self.num_workers,
87
+ shuffle=False,
88
+ pin_memory=True,
89
+ worker_init_fn=worker_init_fn,
90
+ collate_fn=collate_fn,
91
+ )
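A sketch of wiring the datamodule manually (paths hypothetical; a pytorch_lightning Trainer could be handed the datamodule directly instead):

from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule

dm = KittiDataModule(
    root="/data/semantic_kitti",
    preprocess_root="/data/kitti_preprocess",
    batch_size=1,
    num_workers=4,
)
dm.setup()
batch = next(iter(dm.train_dataloader()))  # one collated training batch (see collate_fn)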
monoscene/data/semantic_kitti/params.py ADDED
@@ -0,0 +1,48 @@
1
+ import numpy as np
2
+
3
+ semantic_kitti_class_frequencies = np.array(
4
+ [
5
+ 5.41773033e09,
6
+ 1.57835390e07,
7
+ 1.25136000e05,
8
+ 1.18809000e05,
9
+ 6.46799000e05,
10
+ 8.21951000e05,
11
+ 2.62978000e05,
12
+ 2.83696000e05,
13
+ 2.04750000e05,
14
+ 6.16887030e07,
15
+ 4.50296100e06,
16
+ 4.48836500e07,
17
+ 2.26992300e06,
18
+ 5.68402180e07,
19
+ 1.57196520e07,
20
+ 1.58442623e08,
21
+ 2.06162300e06,
22
+ 3.69705220e07,
23
+ 1.15198800e06,
24
+ 3.34146000e05,
25
+ ]
26
+ )
27
+ kitti_class_names = [
28
+ "empty",
29
+ "car",
30
+ "bicycle",
31
+ "motorcycle",
32
+ "truck",
33
+ "other-vehicle",
34
+ "person",
35
+ "bicyclist",
36
+ "motorcyclist",
37
+ "road",
38
+ "parking",
39
+ "sidewalk",
40
+ "other-ground",
41
+ "building",
42
+ "fence",
43
+ "vegetation",
44
+ "trunk",
45
+ "terrain",
46
+ "pole",
47
+ "traffic-sign",
48
+ ]
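These raw voxel counts are typically turned into class weights for the cross-entropy loss; the inverse-log-frequency weighting below is one such choice, shown only as an illustration rather than the exact scheme used by the training script:

import numpy as np
import torch
from monoscene.data.semantic_kitti.params import (
    semantic_kitti_class_frequencies,
    kitti_class_names,
)

# Rare classes (e.g. bicycle) get larger weights than frequent ones (e.g. road)
class_weights = torch.from_numpy(
    1.0 / np.log(semantic_kitti_class_frequencies + 0.001)
).float()
for name, w in zip(kitti_class_names, class_weights):
    print(f"{name:>15s}: {w:.3f}")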
monoscene/data/semantic_kitti/preprocess.py ADDED
@@ -0,0 +1,102 @@
1
+ """
2
+ Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py
3
+ """
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ import numpy.matlib
7
+ import os
8
+ import glob
9
+ import hydra
10
+ from omegaconf import DictConfig
11
+ import monoscene.data.semantic_kitti.io_data as SemanticKittiIO
12
+ from hydra.utils import get_original_cwd
13
+ from monoscene.data.NYU.preprocess import _downsample_label
14
+
15
+
16
+ def majority_pooling(grid, k_size=2):
17
+ result = np.zeros(
18
+ (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
19
+ )
20
+ for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
21
+ for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
22
+ for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
23
+
24
+ sub_m = grid[
25
+ (xx * k_size) : (xx * k_size) + k_size,
26
+ (yy * k_size) : (yy * k_size) + k_size,
27
+ (zz * k_size) : (zz * k_size) + k_size,
28
+ ]
29
+ unique, counts = np.unique(sub_m, return_counts=True)
30
+ if True in ((unique != 0) & (unique != 255)):
31
+ # Remove counts with 0 and 255
32
+ counts = counts[((unique != 0) & (unique != 255))]
33
+ unique = unique[((unique != 0) & (unique != 255))]
34
+ else:
35
+ if True in (unique == 0):
36
+ counts = counts[(unique != 255)]
37
+ unique = unique[(unique != 255)]
38
+ value = unique[np.argmax(counts)]
39
+ result[xx, yy, zz] = value
40
+ return result
41
+
42
+
43
+ @hydra.main(config_name="../../config/monoscene.yaml")
44
+ def main(config: DictConfig):
45
+ scene_size = (256, 256, 32)
46
+ sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
47
+ remap_lut = SemanticKittiIO.get_remap_lut(
48
+ os.path.join(
49
+ get_original_cwd(),
50
+ "monoscene",
51
+ "data",
52
+ "semantic_kitti",
53
+ "semantic-kitti.yaml",
54
+ )
55
+ )
56
+
57
+ for sequence in sequences:
58
+ sequence_path = os.path.join(
59
+ config.kitti_root, "dataset", "sequences", sequence
60
+ )
61
+ label_paths = sorted(
62
+ glob.glob(os.path.join(sequence_path, "voxels", "*.label"))
63
+ )
64
+ invalid_paths = sorted(
65
+ glob.glob(os.path.join(sequence_path, "voxels", "*.invalid"))
66
+ )
67
+ out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence)
68
+ os.makedirs(out_dir, exist_ok=True)
69
+
70
+ downscaling = {"1_1": 1, "1_8": 8}
71
+
72
+ for i in tqdm(range(len(label_paths))):
73
+
74
+ frame_id, extension = os.path.splitext(os.path.basename(label_paths[i]))
75
+
76
+ LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i])
77
+ INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i])
78
+ LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
79
+ np.float32
80
+ ) # Remap 20 classes semanticKITTI SSC
81
+ LABEL[
82
+ np.isclose(INVALID, 1)
83
+ ] = 255 # Setting to unknown all voxels marked on invalid mask...
84
+ LABEL = LABEL.reshape([256, 256, 32])
85
+
86
+ for scale in downscaling:
87
+ filename = frame_id + "_" + scale + ".npy"
88
+ label_filename = os.path.join(out_dir, filename)
89
+ # If files have not been created...
90
+ if not os.path.exists(label_filename):
91
+ if scale == "1_8":
92
+ LABEL_ds = _downsample_label(
93
+ LABEL, (256, 256, 32), downscaling[scale]
94
+ )
95
+ else:
96
+ LABEL_ds = LABEL
97
+ np.save(label_filename, LABEL_ds)
98
+ print("wrote to", label_filename)
99
+
100
+
101
+ if __name__ == "__main__":
102
+ main()
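The behaviour of majority_pooling on a toy grid, assuming the package and its dependencies (hydra, omegaconf, tqdm) are installed so the module imports cleanly:

import numpy as np
from monoscene.data.semantic_kitti.preprocess import majority_pooling

grid = np.zeros((4, 4, 4), dtype=np.uint8)  # mostly empty (label 0)
grid[:2, :2, :2] = 9                        # a 2x2x2 "road" block
grid[2, 2, 2] = 13                          # one "building" voxel...
grid[3, 2, 2] = 255                         # ...next to an invalid voxel
pooled = majority_pooling(grid, k_size=2)
print(pooled.shape)     # (2, 2, 2)
print(pooled[0, 0, 0])  # 9.0  - majority label of the first block
print(pooled[1, 1, 1])  # 13.0 - invalid (255) voxels are dropped when a real class is present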
monoscene/data/semantic_kitti/semantic-kitti.yaml ADDED
@@ -0,0 +1,213 @@
1
+ # This file is covered by the LICENSE file in the root of this project.
2
+ nbr_classes: 20
3
+ grid_dims: [256, 32, 256] # (W, H, D)
4
+ labels:
5
+ 0 : "unlabeled"
6
+ 1 : "outlier"
7
+ 10: "car"
8
+ 11: "bicycle"
9
+ 13: "bus"
10
+ 15: "motorcycle"
11
+ 16: "on-rails"
12
+ 18: "truck"
13
+ 20: "other-vehicle"
14
+ 30: "person"
15
+ 31: "bicyclist"
16
+ 32: "motorcyclist"
17
+ 40: "road"
18
+ 44: "parking"
19
+ 48: "sidewalk"
20
+ 49: "other-ground"
21
+ 50: "building"
22
+ 51: "fence"
23
+ 52: "other-structure"
24
+ 60: "lane-marking"
25
+ 70: "vegetation"
26
+ 71: "trunk"
27
+ 72: "terrain"
28
+ 80: "pole"
29
+ 81: "traffic-sign"
30
+ 99: "other-object"
31
+ 252: "moving-car"
32
+ 253: "moving-bicyclist"
33
+ 254: "moving-person"
34
+ 255: "moving-motorcyclist"
35
+ 256: "moving-on-rails"
36
+ 257: "moving-bus"
37
+ 258: "moving-truck"
38
+ 259: "moving-other-vehicle"
39
+ color_map: # bgr
40
+ 0 : [0, 0, 0]
41
+ 1 : [0, 0, 255]
42
+ 10: [245, 150, 100]
43
+ 11: [245, 230, 100]
44
+ 13: [250, 80, 100]
45
+ 15: [150, 60, 30]
46
+ 16: [255, 0, 0]
47
+ 18: [180, 30, 80]
48
+ 20: [255, 0, 0]
49
+ 30: [30, 30, 255]
50
+ 31: [200, 40, 255]
51
+ 32: [90, 30, 150]
52
+ 40: [255, 0, 255]
53
+ 44: [255, 150, 255]
54
+ 48: [75, 0, 75]
55
+ 49: [75, 0, 175]
56
+ 50: [0, 200, 255]
57
+ 51: [50, 120, 255]
58
+ 52: [0, 150, 255]
59
+ 60: [170, 255, 150]
60
+ 70: [0, 175, 0]
61
+ 71: [0, 60, 135]
62
+ 72: [80, 240, 150]
63
+ 80: [150, 240, 255]
64
+ 81: [0, 0, 255]
65
+ 99: [255, 255, 50]
66
+ 252: [245, 150, 100]
67
+ 256: [255, 0, 0]
68
+ 253: [200, 40, 255]
69
+ 254: [30, 30, 255]
70
+ 255: [90, 30, 150]
71
+ 257: [250, 80, 100]
72
+ 258: [180, 30, 80]
73
+ 259: [255, 0, 0]
74
+ content: # as a ratio with the total number of points
75
+ 0: 0.018889854628292943
76
+ 1: 0.0002937197336781505
77
+ 10: 0.040818519255974316
78
+ 11: 0.00016609538710764618
79
+ 13: 2.7879693665067774e-05
80
+ 15: 0.00039838616015114444
81
+ 16: 0.0
82
+ 18: 0.0020633612104619787
83
+ 20: 0.0016218197275284021
84
+ 30: 0.00017698551338515307
85
+ 31: 1.1065903904919655e-08
86
+ 32: 5.532951952459828e-09
87
+ 40: 0.1987493871255525
88
+ 44: 0.014717169549888214
89
+ 48: 0.14392298360372
90
+ 49: 0.0039048553037472045
91
+ 50: 0.1326861944777486
92
+ 51: 0.0723592229456223
93
+ 52: 0.002395131480328884
94
+ 60: 4.7084144280367186e-05
95
+ 70: 0.26681502148037506
96
+ 71: 0.006035012012626033
97
+ 72: 0.07814222006271769
98
+ 80: 0.002855498193863172
99
+ 81: 0.0006155958086189918
100
+ 99: 0.009923127583046915
101
+ 252: 0.001789309418528068
102
+ 253: 0.00012709999297008662
103
+ 254: 0.00016059776092534436
104
+ 255: 3.745553104802113e-05
105
+ 256: 0.0
106
+ 257: 0.00011351574470342043
107
+ 258: 0.00010157861367183268
108
+ 259: 4.3840131989471124e-05
109
+ # classes that are indistinguishable from single scan or inconsistent in
110
+ # ground truth are mapped to their closest equivalent
111
+ learning_map:
112
+ 0 : 0 # "unlabeled"
113
+ 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped
114
+ 10: 1 # "car"
115
+ 11: 2 # "bicycle"
116
+ 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped
117
+ 15: 3 # "motorcycle"
118
+ 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped
119
+ 18: 4 # "truck"
120
+ 20: 5 # "other-vehicle"
121
+ 30: 6 # "person"
122
+ 31: 7 # "bicyclist"
123
+ 32: 8 # "motorcyclist"
124
+ 40: 9 # "road"
125
+ 44: 10 # "parking"
126
+ 48: 11 # "sidewalk"
127
+ 49: 12 # "other-ground"
128
+ 50: 13 # "building"
129
+ 51: 14 # "fence"
130
+ 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped
131
+ 60: 9 # "lane-marking" to "road" ---------------------------------mapped
132
+ 70: 15 # "vegetation"
133
+ 71: 16 # "trunk"
134
+ 72: 17 # "terrain"
135
+ 80: 18 # "pole"
136
+ 81: 19 # "traffic-sign"
137
+ 99: 0 # "other-object" to "unlabeled" ----------------------------mapped
138
+ 252: 1 # "moving-car" to "car" ------------------------------------mapped
139
+ 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped
140
+ 254: 6 # "moving-person" to "person" ------------------------------mapped
141
+ 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped
142
+ 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped
143
+ 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped
144
+ 258: 4 # "moving-truck" to "truck" --------------------------------mapped
145
+ 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped
146
+ learning_map_inv: # inverse of previous map
147
+ 0: 0 # "unlabeled", and others ignored
148
+ 1: 10 # "car"
149
+ 2: 11 # "bicycle"
150
+ 3: 15 # "motorcycle"
151
+ 4: 18 # "truck"
152
+ 5: 20 # "other-vehicle"
153
+ 6: 30 # "person"
154
+ 7: 31 # "bicyclist"
155
+ 8: 32 # "motorcyclist"
156
+ 9: 40 # "road"
157
+ 10: 44 # "parking"
158
+ 11: 48 # "sidewalk"
159
+ 12: 49 # "other-ground"
160
+ 13: 50 # "building"
161
+ 14: 51 # "fence"
162
+ 15: 70 # "vegetation"
163
+ 16: 71 # "trunk"
164
+ 17: 72 # "terrain"
165
+ 18: 80 # "pole"
166
+ 19: 81 # "traffic-sign"
167
+ learning_ignore: # Ignore classes
168
+ 0: True # "unlabeled", and others ignored
169
+ 1: False # "car"
170
+ 2: False # "bicycle"
171
+ 3: False # "motorcycle"
172
+ 4: False # "truck"
173
+ 5: False # "other-vehicle"
174
+ 6: False # "person"
175
+ 7: False # "bicyclist"
176
+ 8: False # "motorcyclist"
177
+ 9: False # "road"
178
+ 10: False # "parking"
179
+ 11: False # "sidewalk"
180
+ 12: False # "other-ground"
181
+ 13: False # "building"
182
+ 14: False # "fence"
183
+ 15: False # "vegetation"
184
+ 16: False # "trunk"
185
+ 17: False # "terrain"
186
+ 18: False # "pole"
187
+ 19: False # "traffic-sign"
188
+ split: # sequence numbers
189
+ train:
190
+ - 0
191
+ - 1
192
+ - 2
193
+ - 3
194
+ - 4
195
+ - 5
196
+ - 6
197
+ - 7
198
+ - 9
199
+ - 10
200
+ valid:
201
+ - 8
202
+ test:
203
+ - 11
204
+ - 12
205
+ - 13
206
+ - 14
207
+ - 15
208
+ - 16
209
+ - 17
210
+ - 18
211
+ - 19
212
+ - 20
213
+ - 21
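A short sketch of reading this file and mapping 20-class training ids back to raw SemanticKITTI ids via learning_map_inv (the file path is hypothetical):

import numpy as np
import yaml

cfg = yaml.safe_load(open("/path/to/semantic-kitti.yaml", "r"))
inv_map = np.zeros(cfg["nbr_classes"], dtype=np.uint16)
inv_map[list(cfg["learning_map_inv"].keys())] = list(cfg["learning_map_inv"].values())

pred = np.array([0, 1, 9, 13])  # empty, car, road, building (training ids)
print(inv_map[pred])            # [ 0 10 40 50]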
monoscene/data/utils/fusion.py ADDED
@@ -0,0 +1,507 @@
1
+ """
2
+ Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
3
+
4
+ @inproceedings{zeng20163dmatch,
5
+ title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
6
+ author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
7
+ booktitle={CVPR},
8
+ year={2017}
9
+ }
10
+ """
11
+
12
+ import numpy as np
13
+
14
+ from numba import njit, prange
15
+ from skimage import measure
16
+
17
+ FUSION_GPU_MODE = 0
18
+
19
+
20
+ class TSDFVolume:
21
+ """Volumetric TSDF Fusion of RGB-D Images."""
22
+
23
+ def __init__(self, vol_bnds, voxel_size, use_gpu=True):
24
+ """Constructor.
25
+
26
+ Args:
27
+ vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
28
+ xyz bounds (min/max) in meters.
29
+ voxel_size (float): The volume discretization in meters.
30
+ """
31
+ vol_bnds = np.asarray(vol_bnds)
32
+ assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
33
+
34
+ # Define voxel volume parameters
35
+ self._vol_bnds = vol_bnds
36
+ self._voxel_size = float(voxel_size)
37
+ self._trunc_margin = 5 * self._voxel_size # truncation on SDF
38
+ # self._trunc_margin = 10 # truncation on SDF
39
+ self._color_const = 256 * 256
40
+
41
+ # Adjust volume bounds and ensure C-order contiguous
42
+ self._vol_dim = (
43
+ np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
44
+ .copy(order="C")
45
+ .astype(int)
46
+ )
47
+ self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
48
+ self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
49
+
50
+ print(
51
+ "Voxel volume size: {} x {} x {} - # points: {:,}".format(
52
+ self._vol_dim[0],
53
+ self._vol_dim[1],
54
+ self._vol_dim[2],
55
+ self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
56
+ )
57
+ )
58
+
59
+ # Initialize pointers to voxel volume in CPU memory
60
+ self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
61
+ # for computing the cumulative moving average of observations per voxel
62
+ self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
63
+ self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
64
+
65
+ self.gpu_mode = use_gpu and FUSION_GPU_MODE
66
+
67
+ # Copy voxel volumes to GPU
68
+ if self.gpu_mode:
69
+ self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
70
+ cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
71
+ self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
72
+ cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
73
+ self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
74
+ cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
75
+
76
+ # Cuda kernel function (C++)
77
+ self._cuda_src_mod = SourceModule(
78
+ """
79
+ __global__ void integrate(float * tsdf_vol,
80
+ float * weight_vol,
81
+ float * color_vol,
82
+ float * vol_dim,
83
+ float * vol_origin,
84
+ float * cam_intr,
85
+ float * cam_pose,
86
+ float * other_params,
87
+ float * color_im,
88
+ float * depth_im) {
89
+ // Get voxel index
90
+ int gpu_loop_idx = (int) other_params[0];
91
+ int max_threads_per_block = blockDim.x;
92
+ int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
93
+ int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
94
+ int vol_dim_x = (int) vol_dim[0];
95
+ int vol_dim_y = (int) vol_dim[1];
96
+ int vol_dim_z = (int) vol_dim[2];
97
+ if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
98
+ return;
99
+ // Get voxel grid coordinates (note: be careful when casting)
100
+ float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
101
+ float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
102
+ float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
103
+ // Voxel grid coordinates to world coordinates
104
+ float voxel_size = other_params[1];
105
+ float pt_x = vol_origin[0]+voxel_x*voxel_size;
106
+ float pt_y = vol_origin[1]+voxel_y*voxel_size;
107
+ float pt_z = vol_origin[2]+voxel_z*voxel_size;
108
+ // World coordinates to camera coordinates
109
+ float tmp_pt_x = pt_x-cam_pose[0*4+3];
110
+ float tmp_pt_y = pt_y-cam_pose[1*4+3];
111
+ float tmp_pt_z = pt_z-cam_pose[2*4+3];
112
+ float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
113
+ float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
114
+ float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
115
+ // Camera coordinates to image pixels
116
+ int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
117
+ int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
118
+ // Skip if outside view frustum
119
+ int im_h = (int) other_params[2];
120
+ int im_w = (int) other_params[3];
121
+ if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
122
+ return;
123
+ // Skip invalid depth
124
+ float depth_value = depth_im[pixel_y*im_w+pixel_x];
125
+ if (depth_value == 0)
126
+ return;
127
+ // Integrate TSDF
128
+ float trunc_margin = other_params[4];
129
+ float depth_diff = depth_value-cam_pt_z;
130
+ if (depth_diff < -trunc_margin)
131
+ return;
132
+ float dist = fmin(1.0f,depth_diff/trunc_margin);
133
+ float w_old = weight_vol[voxel_idx];
134
+ float obs_weight = other_params[5];
135
+ float w_new = w_old + obs_weight;
136
+ weight_vol[voxel_idx] = w_new;
137
+ tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
138
+ // Integrate color
139
+ float old_color = color_vol[voxel_idx];
140
+ float old_b = floorf(old_color/(256*256));
141
+ float old_g = floorf((old_color-old_b*256*256)/256);
142
+ float old_r = old_color-old_b*256*256-old_g*256;
143
+ float new_color = color_im[pixel_y*im_w+pixel_x];
144
+ float new_b = floorf(new_color/(256*256));
145
+ float new_g = floorf((new_color-new_b*256*256)/256);
146
+ float new_r = new_color-new_b*256*256-new_g*256;
147
+ new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
148
+ new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
149
+ new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
150
+ color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
151
+ }"""
152
+ )
153
+
154
+ self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
155
+
156
+ # Determine block/grid size on GPU
157
+ gpu_dev = cuda.Device(0)
158
+ self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
159
+ n_blocks = int(
160
+ np.ceil(
161
+ float(np.prod(self._vol_dim))
162
+ / float(self._max_gpu_threads_per_block)
163
+ )
164
+ )
165
+ grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
166
+ grid_dim_y = min(
167
+ gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
168
+ )
169
+ grid_dim_z = min(
170
+ gpu_dev.MAX_GRID_DIM_Z,
171
+ int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
172
+ )
173
+ self._max_gpu_grid_dim = np.array(
174
+ [grid_dim_x, grid_dim_y, grid_dim_z]
175
+ ).astype(int)
176
+ self._n_gpu_loops = int(
177
+ np.ceil(
178
+ float(np.prod(self._vol_dim))
179
+ / float(
180
+ np.prod(self._max_gpu_grid_dim)
181
+ * self._max_gpu_threads_per_block
182
+ )
183
+ )
184
+ )
185
+
186
+ else:
187
+ # Get voxel grid coordinates
188
+ xv, yv, zv = np.meshgrid(
189
+ range(self._vol_dim[0]),
190
+ range(self._vol_dim[1]),
191
+ range(self._vol_dim[2]),
192
+ indexing="ij",
193
+ )
194
+ self.vox_coords = (
195
+ np.concatenate(
196
+ [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
197
+ )
198
+ .astype(int)
199
+ .T
200
+ )
201
+
202
+ @staticmethod
203
+ @njit(parallel=True)
204
+ def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
205
+ """Convert voxel grid coordinates to world coordinates."""
206
+ vol_origin = vol_origin.astype(np.float32)
207
+ vox_coords = vox_coords.astype(np.float32)
208
+ # print(np.min(vox_coords))
209
+ cam_pts = np.empty_like(vox_coords, dtype=np.float32)
210
+
211
+ for i in prange(vox_coords.shape[0]):
212
+ for j in range(3):
213
+ cam_pts[i, j] = (
214
+ vol_origin[j]
215
+ + (vox_size * vox_coords[i, j])
216
+ + vox_size * offsets[j]
217
+ )
218
+ return cam_pts
219
+
220
+ @staticmethod
221
+ @njit(parallel=True)
222
+ def cam2pix(cam_pts, intr):
223
+ """Convert camera coordinates to pixel coordinates."""
224
+ intr = intr.astype(np.float32)
225
+ fx, fy = intr[0, 0], intr[1, 1]
226
+ cx, cy = intr[0, 2], intr[1, 2]
227
+ pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
228
+ for i in prange(cam_pts.shape[0]):
229
+ pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
230
+ pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
231
+ return pix
232
+
233
+ @staticmethod
234
+ @njit(parallel=True)
235
+ def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
236
+ """Integrate the TSDF volume."""
237
+ tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
238
+ # print(tsdf_vol.shape)
239
+ w_new = np.empty_like(w_old, dtype=np.float32)
240
+ for i in prange(len(tsdf_vol)):
241
+ w_new[i] = w_old[i] + obs_weight
242
+ tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
243
+ return tsdf_vol_int, w_new
244
+
245
+ def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
246
+ """Integrate an RGB-D frame into the TSDF volume.
247
+
248
+ Args:
249
+ color_im (ndarray): An RGB image of shape (H, W, 3).
250
+ depth_im (ndarray): A depth image of shape (H, W).
251
+ cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
252
+ cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
253
+ obs_weight (float): The weight to assign to the current observation. A higher
254
+ value gives this observation more influence in the running weighted average.
255
+ """
256
+ im_h, im_w = depth_im.shape
257
+
258
+ # Fold RGB color image into a single channel image
259
+ color_im = color_im.astype(np.float32)
260
+ color_im = np.floor(
261
+ color_im[..., 2] * self._color_const
262
+ + color_im[..., 1] * 256
263
+ + color_im[..., 0]
264
+ )
265
+
266
+ if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel)
267
+ for gpu_loop_idx in range(self._n_gpu_loops):
268
+ self._cuda_integrate(
269
+ self._tsdf_vol_gpu,
270
+ self._weight_vol_gpu,
271
+ self._color_vol_gpu,
272
+ cuda.InOut(self._vol_dim.astype(np.float32)),
273
+ cuda.InOut(self._vol_origin.astype(np.float32)),
274
+ cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
275
+ cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
276
+ cuda.InOut(
277
+ np.asarray(
278
+ [
279
+ gpu_loop_idx,
280
+ self._voxel_size,
281
+ im_h,
282
+ im_w,
283
+ self._trunc_margin,
284
+ obs_weight,
285
+ ],
286
+ np.float32,
287
+ )
288
+ ),
289
+ cuda.InOut(color_im.reshape(-1).astype(np.float32)),
290
+ cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
291
+ block=(self._max_gpu_threads_per_block, 1, 1),
292
+ grid=(
293
+ int(self._max_gpu_grid_dim[0]),
294
+ int(self._max_gpu_grid_dim[1]),
295
+ int(self._max_gpu_grid_dim[2]),
296
+ ),
297
+ )
298
+ else: # CPU mode: integrate voxel volume (vectorized implementation)
299
+ # Convert voxel grid coordinates to pixel coordinates
300
+ cam_pts = self.vox2world(
301
+ self._vol_origin, self.vox_coords, self._voxel_size
302
+ )
303
+ cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
304
+ pix_z = cam_pts[:, 2]
305
+ pix = self.cam2pix(cam_pts, cam_intr)
306
+ pix_x, pix_y = pix[:, 0], pix[:, 1]
307
+
308
+ # Eliminate pixels outside view frustum
309
+ valid_pix = np.logical_and(
310
+ pix_x >= 0,
311
+ np.logical_and(
312
+ pix_x < im_w,
313
+ np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
314
+ ),
315
+ )
316
+ depth_val = np.zeros(pix_x.shape)
317
+ depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
318
+
319
+ # Integrate TSDF
320
+ depth_diff = depth_val - pix_z
321
+
322
+ valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
323
+ dist = depth_diff
324
+
325
+ valid_vox_x = self.vox_coords[valid_pts, 0]
326
+ valid_vox_y = self.vox_coords[valid_pts, 1]
327
+ valid_vox_z = self.vox_coords[valid_pts, 2]
328
+ w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
329
+ tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
330
+ valid_dist = dist[valid_pts]
331
+ tsdf_vol_new, w_new = self.integrate_tsdf(
332
+ tsdf_vals, valid_dist, w_old, obs_weight
333
+ )
334
+ self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
335
+ self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
336
+
337
+ # Integrate color
338
+ old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
339
+ old_b = np.floor(old_color / self._color_const)
340
+ old_g = np.floor((old_color - old_b * self._color_const) / 256)
341
+ old_r = old_color - old_b * self._color_const - old_g * 256
342
+ new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
343
+ new_b = np.floor(new_color / self._color_const)
344
+ new_g = np.floor((new_color - new_b * self._color_const) / 256)
345
+ new_r = new_color - new_b * self._color_const - new_g * 256
346
+ new_b = np.minimum(
347
+ 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
348
+ )
349
+ new_g = np.minimum(
350
+ 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
351
+ )
352
+ new_r = np.minimum(
353
+ 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
354
+ )
355
+ self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
356
+ new_b * self._color_const + new_g * 256 + new_r
357
+ )
358
+
359
+ def get_volume(self):
360
+ if self.gpu_mode:
361
+ cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
362
+ cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
363
+ return self._tsdf_vol_cpu, self._color_vol_cpu
364
+
365
+ def get_point_cloud(self):
366
+ """Extract a point cloud from the voxel volume."""
367
+ tsdf_vol, color_vol = self.get_volume()
368
+
369
+ # Marching cubes
370
+ verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
371
+ verts_ind = np.round(verts).astype(int)
372
+ verts = verts * self._voxel_size + self._vol_origin
373
+
374
+ # Get vertex colors
375
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
376
+ colors_b = np.floor(rgb_vals / self._color_const)
377
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
378
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
379
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
380
+ colors = colors.astype(np.uint8)
381
+
382
+ pc = np.hstack([verts, colors])
383
+ return pc
384
+
385
+ def get_mesh(self):
386
+ """Compute a mesh from the voxel volume using marching cubes."""
387
+ tsdf_vol, color_vol = self.get_volume()
388
+
389
+ # Marching cubes
390
+ verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
391
+ verts_ind = np.round(verts).astype(int)
392
+ verts = (
393
+ verts * self._voxel_size + self._vol_origin
394
+ ) # voxel grid coordinates to world coordinates
395
+
396
+ # Get vertex colors
397
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
398
+ colors_b = np.floor(rgb_vals / self._color_const)
399
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
400
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
401
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
402
+ colors = colors.astype(np.uint8)
403
+ return verts, faces, norms, colors
404
+
405
+
406
+ def rigid_transform(xyz, transform):
407
+ """Applies a rigid transform to an (N, 3) pointcloud."""
408
+ xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
409
+ xyz_t_h = np.dot(transform, xyz_h.T).T
410
+ return xyz_t_h[:, :3]
411
+
412
+
413
+ def get_view_frustum(depth_im, cam_intr, cam_pose):
414
+ """Get corners of 3D camera view frustum of depth image"""
415
+ im_h = depth_im.shape[0]
416
+ im_w = depth_im.shape[1]
417
+ max_depth = np.max(depth_im)
418
+ view_frust_pts = np.array(
419
+ [
420
+ (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
421
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
422
+ / cam_intr[0, 0],
423
+ (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
424
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
425
+ / cam_intr[1, 1],
426
+ np.array([0, max_depth, max_depth, max_depth, max_depth]),
427
+ ]
428
+ )
429
+ view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
430
+ return view_frust_pts
431
+
432
+
433
+ def meshwrite(filename, verts, faces, norms, colors):
434
+ """Save a 3D mesh to a polygon .ply file."""
435
+ # Write header
436
+ ply_file = open(filename, "w")
437
+ ply_file.write("ply\n")
438
+ ply_file.write("format ascii 1.0\n")
439
+ ply_file.write("element vertex %d\n" % (verts.shape[0]))
440
+ ply_file.write("property float x\n")
441
+ ply_file.write("property float y\n")
442
+ ply_file.write("property float z\n")
443
+ ply_file.write("property float nx\n")
444
+ ply_file.write("property float ny\n")
445
+ ply_file.write("property float nz\n")
446
+ ply_file.write("property uchar red\n")
447
+ ply_file.write("property uchar green\n")
448
+ ply_file.write("property uchar blue\n")
449
+ ply_file.write("element face %d\n" % (faces.shape[0]))
450
+ ply_file.write("property list uchar int vertex_index\n")
451
+ ply_file.write("end_header\n")
452
+
453
+ # Write vertex list
454
+ for i in range(verts.shape[0]):
455
+ ply_file.write(
456
+ "%f %f %f %f %f %f %d %d %d\n"
457
+ % (
458
+ verts[i, 0],
459
+ verts[i, 1],
460
+ verts[i, 2],
461
+ norms[i, 0],
462
+ norms[i, 1],
463
+ norms[i, 2],
464
+ colors[i, 0],
465
+ colors[i, 1],
466
+ colors[i, 2],
467
+ )
468
+ )
469
+
470
+ # Write face list
471
+ for i in range(faces.shape[0]):
472
+ ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
473
+
474
+ ply_file.close()
475
+
476
+
477
+ def pcwrite(filename, xyzrgb):
478
+ """Save a point cloud to a polygon .ply file."""
479
+ xyz = xyzrgb[:, :3]
480
+ rgb = xyzrgb[:, 3:].astype(np.uint8)
481
+
482
+ # Write header
483
+ ply_file = open(filename, "w")
484
+ ply_file.write("ply\n")
485
+ ply_file.write("format ascii 1.0\n")
486
+ ply_file.write("element vertex %d\n" % (xyz.shape[0]))
487
+ ply_file.write("property float x\n")
488
+ ply_file.write("property float y\n")
489
+ ply_file.write("property float z\n")
490
+ ply_file.write("property uchar red\n")
491
+ ply_file.write("property uchar green\n")
492
+ ply_file.write("property uchar blue\n")
493
+ ply_file.write("end_header\n")
494
+
495
+ # Write vertex list
496
+ for i in range(xyz.shape[0]):
497
+ ply_file.write(
498
+ "%f %f %f %d %d %d\n"
499
+ % (
500
+ xyz[i, 0],
501
+ xyz[i, 1],
502
+ xyz[i, 2],
503
+ rgb[i, 0],
504
+ rgb[i, 1],
505
+ rgb[i, 2],
506
+ )
507
+ )
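A minimal CPU-mode sketch (the shapes, intrinsics and the flat synthetic depth image are made up; numba and scikit-image must be installed):

import numpy as np
from monoscene.data.utils.fusion import TSDFVolume

# A 1m cube discretized at 5cm -> a 20x20x20 voxel volume
vol_bnds = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
tsdf = TSDFVolume(vol_bnds, voxel_size=0.05, use_gpu=False)

H, W = 64, 64
depth_im = np.full((H, W), 0.5, dtype=np.float32)  # a flat wall 0.5m in front of the camera
color_im = np.full((H, W, 3), 128, dtype=np.uint8)
cam_intr = np.array([[32.0, 0.0, 32.0], [0.0, 32.0, 32.0], [0.0, 0.0, 1.0]])
cam_pose = np.eye(4)                               # camera sits at the volume origin

tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)
tsdf_vol, color_vol = tsdf.get_volume()
print(tsdf_vol.shape)  # (20, 20, 20)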
monoscene/data/utils/helpers.py ADDED
@@ -0,0 +1,185 @@
1
+ import numpy as np
2
+ import monoscene.data.utils.fusion as fusion
3
+ import torch
4
+
5
+
6
+ def compute_CP_mega_matrix(target, is_binary=False):
7
+ """
8
+ Parameters
9
+ ---------
10
+ target: (H, W, D)
11
+ contains voxels semantic labels
12
+
13
+ is_binary: bool
14
+ if True, return binary voxels relations else return 4-way relations
15
+ """
16
+ label = target.reshape(-1)
17
+ label_row = label
18
+ N = label.shape[0]
19
+ super_voxel_size = [i//2 for i in target.shape]
20
+ if is_binary:
21
+ matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
22
+ else:
23
+ matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8)
24
+
25
+ for xx in range(super_voxel_size[0]):
26
+ for yy in range(super_voxel_size[1]):
27
+ for zz in range(super_voxel_size[2]):
28
+ col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz
29
+ label_col_megas = np.array([
30
+ target[xx * 2, yy * 2, zz * 2],
31
+ target[xx * 2 + 1, yy * 2, zz * 2],
32
+ target[xx * 2, yy * 2 + 1, zz * 2],
33
+ target[xx * 2, yy * 2, zz * 2 + 1],
34
+ target[xx * 2 + 1, yy * 2 + 1, zz * 2],
35
+ target[xx * 2 + 1, yy * 2, zz * 2 + 1],
36
+ target[xx * 2, yy * 2 + 1, zz * 2 + 1],
37
+ target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1],
38
+ ])
39
+ label_col_megas = label_col_megas[label_col_megas != 255]
40
+ for label_col_mega in label_col_megas:
41
+ label_col = np.ones(N) * label_col_mega
42
+ if not is_binary:
43
+ matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0 # non non same
44
+ matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0 # non non diff
45
+ matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0 # empty empty
46
+ matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0 # nonempty empty
47
+ else:
48
+ matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0 # diff
49
+ matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0 # same
50
+ return matrix
51
+
52
+
53
+ def vox2pix(cam_E, cam_k,
54
+ vox_origin, voxel_size,
55
+ img_W, img_H,
56
+ scene_size):
57
+ """
58
+ Compute the 2D projection of voxel centroids
59
+
60
+ Parameters:
61
+ ----------
62
+ cam_E: 4x4
63
+ camera pose in the case of the NYUv2 dataset,
64
+ or the transformation from camera to lidar coordinates in the case of SemKITTI
65
+ cam_k: 3x3
66
+ camera intrinsics
67
+ vox_origin: (3,)
68
+ world (NYU) / lidar (SemKITTI) coordinates of the voxel at index (0, 0, 0)
69
+ img_W: int
70
+ image width
71
+ img_H: int
72
+ image height
73
+ scene_size: (3,)
74
+ scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
75
+
76
+ Returns
77
+ -------
78
+ projected_pix: (N, 2)
79
+ Projected 2D positions of voxels
80
+ fov_mask: (N,)
81
+ Boolean mask indicating which voxels fall inside the image's FOV
82
+ pix_z: (N,)
83
+ Voxels' distance to the sensor in meters
84
+ """
85
+ # Compute the x, y, z bounds of the scene in meters
86
+ vol_bnds = np.zeros((3,2))
87
+ vol_bnds[:,0] = vox_origin
88
+ vol_bnds[:,1] = vox_origin + np.array(scene_size)
89
+
90
+ # Compute the voxel centroids in lidar coordinates
91
+ vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
92
+ xv, yv, zv = np.meshgrid(
93
+ range(vol_dim[0]),
94
+ range(vol_dim[1]),
95
+ range(vol_dim[2]),
96
+ indexing='ij'
97
+ )
98
+ vox_coords = np.concatenate([
99
+ xv.reshape(1,-1),
100
+ yv.reshape(1,-1),
101
+ zv.reshape(1,-1)
102
+ ], axis=0).astype(int).T
103
+
104
+ # Project voxel centroids from lidar coordinates to camera coordinates
105
+ cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
106
+ cam_pts = fusion.rigid_transform(cam_pts, cam_E)
107
+
108
+ # Project camera coordinates to pixel positions
109
+ projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
110
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
111
+
112
+ # Eliminate pixels outside view frustum
113
+ pix_z = cam_pts[:, 2]
114
+ fov_mask = np.logical_and(pix_x >= 0,
115
+ np.logical_and(pix_x < img_W,
116
+ np.logical_and(pix_y >= 0,
117
+ np.logical_and(pix_y < img_H,
118
+ pix_z > 0))))
119
+
120
+
121
+ return projected_pix, fov_mask, pix_z
122
+
123
+
124
+ def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, pix_z):
125
+ valid_pix = np.logical_and(pix_x >= min_x,
126
+ np.logical_and(pix_x < max_x,
127
+ np.logical_and(pix_y >= min_y,
128
+ np.logical_and(pix_y < max_y,
129
+ pix_z > 0))))
130
+ return valid_pix
131
+
132
+ def compute_local_frustums(projected_pix, pix_z, target, img_W, img_H, dataset, n_classes, size=4):
133
+ """
134
+ Compute the local frustums mask and their class frequencies
135
+
136
+ Parameters:
137
+ ----------
138
+ projected_pix: (N, 2)
139
+ 2D projected pix of all voxels
140
+ pix_z: (N,)
141
+ Distance of the camera sensor to voxels
142
+ target: (H, W, D)
143
+ Voxelized semantic labels
144
+ img_W: int
145
+ Image width
146
+ img_H: int
147
+ Image height
148
+ dataset: str
149
+ ="NYU" or "kitti" (for both SemKITTI and KITTI-360)
150
+ n_classes: int
151
+ Number of classes (12 for NYU and 20 for SemKITTI)
152
+ size: int
153
+ determines the number of local frustums, i.e. size * size
154
+
155
+ Returns
156
+ -------
157
+ frustums_masks: (n_frustums, N)
158
+ List of frustums_masks, each indicates the belonging voxels
159
+ frustums_class_dists: (n_frustums, n_classes)
160
+ Contains the class frequencies in each frustum
161
+ """
162
+ H, W, D = target.shape
163
+ ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)]
164
+ local_frustum_masks = []
165
+ local_frustum_class_dists = []
166
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
167
+ for y in ranges:
168
+ for x in ranges:
169
+ start_x = x[0] * img_W
170
+ end_x = x[1] * img_W
171
+ start_y = y[0] * img_H
172
+ end_y = y[1] * img_H
173
+ local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, pix_z)
174
+ if dataset == "NYU":
175
+ mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1])
176
+ elif dataset == "kitti":
177
+ mask = (target != 255) & local_frustum.reshape(H, W, D)
178
+
179
+ local_frustum_masks.append(mask)
180
+ classes, cnts = np.unique(target[mask], return_counts=True)
181
+ class_counts = np.zeros(n_classes)
182
+ class_counts[classes.astype(int)] = cnts
183
+ local_frustum_class_dists.append(class_counts)
184
+ frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists)
185
+ return frustums_masks, frustums_class_dists
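A sketch of calling vox2pix with SemanticKITTI-like parameters at the 1:2 scale; the identity extrinsics and the intrinsics below are placeholders for the values normally read from calib.txt:

import numpy as np
from monoscene.data.utils.helpers import vox2pix

T_velo_2_cam = np.eye(4)  # placeholder extrinsics
cam_k = np.array([[707.0, 0.0, 604.0], [0.0, 707.0, 180.0], [0.0, 0.0, 1.0]])
vox_origin = np.array([0.0, -25.6, -2.0])
projected_pix, fov_mask, pix_z = vox2pix(
    T_velo_2_cam, cam_k, vox_origin,
    voxel_size=0.4, img_W=1220, img_H=370,
    scene_size=(51.2, 51.2, 6.4),
)
print(projected_pix.shape)  # (128*128*16, 2) = (262144, 2)
print(int(fov_mask.sum()), "voxels project inside the image")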
monoscene/data/utils/torch_util.py ADDED
@@ -0,0 +1,15 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ def worker_init_fn(worker_id):
6
+ """The function is designed for pytorch multi-process dataloader.
7
+ Note that we use the pytorch random generator to generate a base_seed.
8
+ Please try to be consistent.
9
+
10
+ References:
11
+ https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed
12
+
13
+ """
14
+ base_seed = torch.IntTensor(1).random_().item()
15
+ np.random.seed(base_seed + worker_id)
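How this hooks into a DataLoader (toy dataset, purely illustrative):

import torch
from torch.utils.data import DataLoader, TensorDataset
from monoscene.data.utils.torch_util import worker_init_fn

ds = TensorDataset(torch.arange(8).float())
loader = DataLoader(ds, batch_size=2, num_workers=2, worker_init_fn=worker_init_fn)
for (batch,) in loader:
    print(batch)  # each worker seeds numpy from the torch base seed + its worker id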
monoscene/loss/CRP_loss.py ADDED
@@ -0,0 +1,24 @@
1
+ import torch
2
+
3
+
4
+ def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices):
5
+ logits = []
6
+ labels = []
7
+ bs, n_relations, _, _ = pred_logits.shape
8
+ for i in range(bs):
9
+ pred_logit = pred_logits[i, :, :, :].permute(
10
+ 0, 2, 1
11
+ ) # n_relations, N, n_mega_voxels
12
+ CP_mega_matrix = CP_mega_matrices[i] # n_relations, N, n_mega_voxels
13
+ logits.append(pred_logit.reshape(n_relations, -1))
14
+ labels.append(CP_mega_matrix.reshape(n_relations, -1))
15
+
16
+ logits = torch.cat(logits, dim=1).T # M, 4
17
+ labels = torch.cat(labels, dim=1).T # M, 4
18
+
19
+ cnt_neg = (labels == 0).sum(0)
20
+ cnt_pos = labels.sum(0)
21
+ pos_weight = cnt_neg / cnt_pos
22
+ criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
23
+ loss_bce = criterion(logits, labels.float())
24
+ return loss_bce
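A sketch with dummy shapes (one sample, 4 relations, 64 voxels, 8 mega voxels; the real shapes come from the CRP module):

import torch
from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss

bs, n_relations, N, n_mega = 1, 4, 64, 8
pred_logits = torch.randn(bs, n_relations, n_mega, N)               # raw relation logits
CP_mega_matrices = [torch.randint(0, 2, (n_relations, N, n_mega))]  # one relation matrix per sample
loss = compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices)
print(loss)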
monoscene/loss/sscMetrics.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py
3
+ """
4
+ import numpy as np
5
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
6
+
7
+
8
+ def get_iou(iou_sum, cnt_class):
9
+ _C = iou_sum.shape[0] # 12
10
+ iou = np.zeros(_C, dtype=np.float32) # iou for each class
11
+ for idx in range(_C):
12
+ iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0
13
+
14
+ mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:])
15
+ return iou, mean_iou
16
+
17
+
18
+ def get_accuracy(predict, target, weight=None): # 0.05s
19
+ _bs = predict.shape[0] # batch size
20
+ _C = predict.shape[1] # _C = 12
21
+ target = np.int32(target)
22
+ target = target.reshape(_bs, -1) # (_bs, 60*36*60) 129600
23
+ predict = predict.reshape(_bs, _C, -1) # (_bs, _C, 60*36*60)
24
+ predict = np.argmax(
25
+ predict, axis=1
26
+ ) # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60.
27
+
28
+ correct = predict == target # (_bs, 129600)
29
+ if weight: # 0.04s, add class weights
30
+ weight_k = np.ones(target.shape)
31
+ for i in range(_bs):
32
+ for n in range(target.shape[1]):
33
+ idx = 0 if target[i, n] == 255 else target[i, n]
34
+ weight_k[i, n] = weight[idx]
35
+ correct = correct * weight_k
36
+ acc = correct.sum() / correct.size
37
+ return acc
38
+
39
+
40
+ class SSCMetrics:
41
+ def __init__(self, n_classes):
42
+ self.n_classes = n_classes
43
+ self.reset()
44
+
45
+ def hist_info(self, n_cl, pred, gt):
46
+ assert pred.shape == gt.shape
47
+ k = (gt >= 0) & (gt < n_cl) # exclude 255
48
+ labeled = np.sum(k)
49
+ correct = np.sum((pred[k] == gt[k]))
50
+
51
+ return (
52
+ np.bincount(
53
+ n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2
54
+ ).reshape(n_cl, n_cl),
55
+ correct,
56
+ labeled,
57
+ )
58
+
59
+ @staticmethod
60
+ def compute_score(hist, correct, labeled):
61
+ iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
62
+ mean_IU = np.nanmean(iu)
63
+ mean_IU_no_back = np.nanmean(iu[1:])
64
+ freq = hist.sum(1) / hist.sum()
65
+ freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
66
+ mean_pixel_acc = correct / labeled if labeled != 0 else 0
67
+
68
+ return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
69
+
70
+ def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
71
+ self.count += 1
72
+ mask = y_true != 255
73
+ if nonempty is not None:
74
+ mask = mask & nonempty
75
+ if nonsurface is not None:
76
+ mask = mask & nonsurface
77
+ tp, fp, fn = self.get_score_completion(y_pred, y_true, mask)
78
+
79
+ self.completion_tp += tp
80
+ self.completion_fp += fp
81
+ self.completion_fn += fn
82
+
83
+ mask = y_true != 255
84
+ if nonempty is not None:
85
+ mask = mask & nonempty
86
+ tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
87
+ y_pred, y_true, mask
88
+ )
89
+ self.tps += tp_sum
90
+ self.fps += fp_sum
91
+ self.fns += fn_sum
92
+
93
+ def get_stats(self):
94
+ if self.completion_tp != 0:
95
+ precision = self.completion_tp / (self.completion_tp + self.completion_fp)
96
+ recall = self.completion_tp / (self.completion_tp + self.completion_fn)
97
+ iou = self.completion_tp / (
98
+ self.completion_tp + self.completion_fp + self.completion_fn
99
+ )
100
+ else:
101
+ precision, recall, iou = 0, 0, 0
102
+ iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
103
+ return {
104
+ "precision": precision,
105
+ "recall": recall,
106
+ "iou": iou,
107
+ "iou_ssc": iou_ssc,
108
+ "iou_ssc_mean": np.mean(iou_ssc[1:]),
109
+ }
110
+
111
+ def reset(self):
112
+
113
+ self.completion_tp = 0
114
+ self.completion_fp = 0
115
+ self.completion_fn = 0
116
+ self.tps = np.zeros(self.n_classes)
117
+ self.fps = np.zeros(self.n_classes)
118
+ self.fns = np.zeros(self.n_classes)
119
+
120
+ self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
121
+ self.labeled_ssc = 0
122
+ self.correct_ssc = 0
123
+
124
+ self.precision = 0
125
+ self.recall = 0
126
+ self.iou = 0
127
+ self.count = 1e-8
128
+ self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
129
+ self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
130
+
131
+ def get_score_completion(self, predict, target, nonempty=None):
132
+ predict = np.copy(predict)
133
+ target = np.copy(target)
134
+
135
+ """for scene completion, treat the task as two-classes problem, just empty or occupancy"""
136
+ _bs = predict.shape[0] # batch size
137
+ # ---- ignore
138
+ predict[target == 255] = 0
139
+ target[target == 255] = 0
140
+ # ---- flatten
141
+ target = target.reshape(_bs, -1) # (_bs, 129600)
142
+ predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600
143
+ # ---- treat all non-empty object class as one category, set them to label 1
144
+ b_pred = np.zeros(predict.shape)
145
+ b_true = np.zeros(target.shape)
146
+ b_pred[predict > 0] = 1
147
+ b_true[target > 0] = 1
148
+ p, r, iou = 0.0, 0.0, 0.0
149
+ tp_sum, fp_sum, fn_sum = 0, 0, 0
150
+ for idx in range(_bs):
151
+ y_true = b_true[idx, :] # GT
152
+ y_pred = b_pred[idx, :]
153
+ if nonempty is not None:
154
+ nonempty_idx = nonempty[idx, :].reshape(-1)
155
+ y_true = y_true[nonempty_idx == 1]
156
+ y_pred = y_pred[nonempty_idx == 1]
157
+
158
+ tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
159
+ fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
160
+ fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
161
+ tp_sum += tp
162
+ fp_sum += fp
163
+ fn_sum += fn
164
+ return tp_sum, fp_sum, fn_sum
165
+
166
+ def get_score_semantic_and_completion(self, predict, target, nonempty=None):
167
+ target = np.copy(target)
168
+ predict = np.copy(predict)
169
+ _bs = predict.shape[0] # batch size
170
+ _C = self.n_classes # _C = 12
171
+ # ---- ignore
172
+ predict[target == 255] = 0
173
+ target[target == 255] = 0
174
+ # ---- flatten
175
+ target = target.reshape(_bs, -1) # (_bs, 129600)
176
+ predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600
177
+
178
+ cnt_class = np.zeros(_C, dtype=np.int32) # count for each class
179
+ iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class
180
+ tp_sum = np.zeros(_C, dtype=np.int32) # tp
181
+ fp_sum = np.zeros(_C, dtype=np.int32) # fp
182
+ fn_sum = np.zeros(_C, dtype=np.int32) # fn
183
+
184
+ for idx in range(_bs):
185
+ y_true = target[idx, :] # GT
186
+ y_pred = predict[idx, :]
187
+ if nonempty is not None:
188
+ nonempty_idx = nonempty[idx, :].reshape(-1)
189
+ y_pred = y_pred[
190
+ np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
191
+ ]
192
+ y_true = y_true[
193
+ np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
194
+ ]
195
+ for j in range(_C): # for each class
196
+ tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
197
+ fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
198
+ fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
199
+
200
+ tp_sum[j] += tp
201
+ fp_sum[j] += fp
202
+ fn_sum[j] += fn
203
+
204
+ return tp_sum, fp_sum, fn_sum
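A quick self-check of the metrics on random labels (scikit-learn must be installed for the module import; the shapes are arbitrary):

import numpy as np
from monoscene.loss.sscMetrics import SSCMetrics

metric = SSCMetrics(n_classes=20)
y_true = np.random.randint(0, 20, size=(1, 32, 32, 4))
y_pred = y_true.copy()
y_pred[:, :8] = 0  # corrupt part of the prediction
metric.add_batch(y_pred, y_true)
stats = metric.get_stats()
print(stats["precision"], stats["recall"], stats["iou"], stats["iou_ssc_mean"])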
monoscene/loss/ssc_loss.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ def KL_sep(p, target):
7
+ """
8
+ KL divergence on nonzeros classes
9
+ """
10
+ nonzeros = target != 0
11
+ nonzero_p = p[nonzeros]
12
+ kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
13
+ return kl_term
14
+
15
+
16
+ def geo_scal_loss(pred, ssc_target):
17
+
18
+ # Get softmax probabilities
19
+ pred = F.softmax(pred, dim=1)
20
+
21
+ # Compute empty and nonempty probabilities
22
+ empty_probs = pred[:, 0, :, :, :]
23
+ nonempty_probs = 1 - empty_probs
24
+
25
+ # Remove unknown voxels
26
+ mask = ssc_target != 255
27
+ nonempty_target = ssc_target != 0
28
+ nonempty_target = nonempty_target[mask].float()
29
+ nonempty_probs = nonempty_probs[mask]
30
+ empty_probs = empty_probs[mask]
31
+
32
+ intersection = (nonempty_target * nonempty_probs).sum()
33
+ precision = intersection / nonempty_probs.sum()
34
+ recall = intersection / nonempty_target.sum()
35
+ spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum()
36
+ return (
37
+ F.binary_cross_entropy(precision, torch.ones_like(precision))
38
+ + F.binary_cross_entropy(recall, torch.ones_like(recall))
39
+ + F.binary_cross_entropy(spec, torch.ones_like(spec))
40
+ )
41
+
42
+
43
+ def sem_scal_loss(pred, ssc_target):
44
+ # Get softmax probabilities
45
+ pred = F.softmax(pred, dim=1)
46
+ loss = 0
47
+ count = 0
48
+ mask = ssc_target != 255
49
+ n_classes = pred.shape[1]
50
+ for i in range(0, n_classes):
51
+
52
+ # Get probability of class i
53
+ p = pred[:, i, :, :, :]
54
+
55
+ # Remove unknown voxels
56
+ target_ori = ssc_target
57
+ p = p[mask]
58
+ target = ssc_target[mask]
59
+
60
+ completion_target = torch.ones_like(target)
61
+ completion_target[target != i] = 0
62
+ completion_target_ori = torch.ones_like(target_ori).float()
63
+ completion_target_ori[target_ori != i] = 0
64
+ if torch.sum(completion_target) > 0:
65
+ count += 1.0
66
+ nominator = torch.sum(p * completion_target)
67
+ loss_class = 0
68
+ if torch.sum(p) > 0:
69
+ precision = nominator / (torch.sum(p))
70
+ loss_precision = F.binary_cross_entropy(
71
+ precision, torch.ones_like(precision)
72
+ )
73
+ loss_class += loss_precision
74
+ if torch.sum(completion_target) > 0:
75
+ recall = nominator / (torch.sum(completion_target))
76
+ loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
77
+ loss_class += loss_recall
78
+ if torch.sum(1 - completion_target) > 0:
79
+ specificity = torch.sum((1 - p) * (1 - completion_target)) / (
80
+ torch.sum(1 - completion_target)
81
+ )
82
+ loss_specificity = F.binary_cross_entropy(
83
+ specificity, torch.ones_like(specificity)
84
+ )
85
+ loss_class += loss_specificity
86
+ loss += loss_class
87
+ return loss / count
88
+
89
+
90
+ def CE_ssc_loss(pred, target, class_weights):
91
+ """
92
+ :param: prediction: the predicted tensor, must be [BS, C, H, W, D]
93
+ """
94
+ criterion = nn.CrossEntropyLoss(
95
+ weight=class_weights, ignore_index=255, reduction="mean"
96
+ )
97
+ loss = criterion(pred, target.long())
98
+
99
+ return loss
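The three losses above can be smoke-tested on dummy tensors. A minimal sketch, assuming NYU-like shapes (12 classes, 60x36x60 voxels) and the import path introduced by this commit:

import torch
from monoscene.loss.ssc_loss import CE_ssc_loss, sem_scal_loss, geo_scal_loss

bs, n_classes = 1, 12
pred = torch.randn(bs, n_classes, 60, 36, 60)            # logits [BS, C, H, W, D]
target = torch.randint(0, n_classes, (bs, 60, 36, 60))   # labels; 255 would mark unknown voxels
class_weights = torch.ones(n_classes)

loss = (CE_ssc_loss(pred, target, class_weights)
        + sem_scal_loss(pred, target)
        + geo_scal_loss(pred, target))
print(loss.item())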
monoscene/{CRP3D.py β†’ models/CRP3D.py} RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
  import torch.nn as nn
3
- from monoscene.modules import (
4
  Process,
5
  ASPP,
6
  )
 
1
  import torch
2
  import torch.nn as nn
3
+ from monoscene.models.modules import (
4
  Process,
5
  ASPP,
6
  )
monoscene/{DDR.py β†’ models/DDR.py} RENAMED
File without changes
monoscene/{flosp.py β†’ models/flosp.py} RENAMED
File without changes
monoscene/{modules.py β†’ models/modules.py} RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
  import torch.nn as nn
3
- from monoscene.DDR import Bottleneck3D
4
 
5
 
6
  class ASPP(nn.Module):
 
1
  import torch
2
  import torch.nn as nn
3
+ from monoscene.models.DDR import Bottleneck3D
4
 
5
 
6
  class ASPP(nn.Module):
monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py β†’ models/monoscene.py} RENAMED
@@ -1,19 +1,25 @@
1
  import pytorch_lightning as pl
2
  import torch
3
  import torch.nn as nn
4
- from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
5
- from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
6
- from monoscene.flosp import FLoSP
 
 
 
7
  import numpy as np
8
  import torch.nn.functional as F
9
- from monoscene.unet2d import UNet2D
 
10
 
11
 
12
  class MonoScene(pl.LightningModule):
13
  def __init__(
14
  self,
15
  n_classes,
 
16
  feature,
 
17
  project_scale,
18
  full_scene_size,
19
  dataset,
@@ -36,11 +42,13 @@ class MonoScene(pl.LightningModule):
36
  self.dataset = dataset
37
  self.context_prior = context_prior
38
  self.frustum_size = frustum_size
 
39
  self.relation_loss = relation_loss
40
  self.CE_ssc_loss = CE_ssc_loss
41
  self.sem_scal_loss = sem_scal_loss
42
  self.geo_scal_loss = geo_scal_loss
43
  self.project_scale = project_scale
 
44
  self.lr = lr
45
  self.weight_decay = weight_decay
46
 
@@ -73,6 +81,13 @@ class MonoScene(pl.LightningModule):
73
  )
74
  self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
75
 
 
 
 
 
 
 
 
76
  def forward(self, batch):
77
 
78
  img = batch["img"]
@@ -111,13 +126,165 @@ class MonoScene(pl.LightningModule):
111
  "x3d": torch.stack(x3ds),
112
  }
113
 
114
- out_dict = self.net_3d_decoder(input_dict)
115
 
 
 
 
 
 
 
116
  ssc_pred = out_dict["ssc_logit"]
117
-
118
  y_pred = ssc_pred.detach().cpu().numpy()
119
  y_pred = np.argmax(y_pred, axis=1)
120
 
121
- return y_pred
122
 
 
 
123
1
  import pytorch_lightning as pl
2
  import torch
3
  import torch.nn as nn
4
+ from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU
5
+ from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti
6
+ from monoscene.loss.sscMetrics import SSCMetrics
7
+ from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss
8
+ from monoscene.models.flosp import FLoSP
9
+ from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss
10
  import numpy as np
11
  import torch.nn.functional as F
12
+ from monoscene.models.unet2d import UNet2D
13
+ from torch.optim.lr_scheduler import MultiStepLR
14
 
15
 
16
  class MonoScene(pl.LightningModule):
17
  def __init__(
18
  self,
19
  n_classes,
20
+ class_names,
21
  feature,
22
+ class_weights,
23
  project_scale,
24
  full_scene_size,
25
  dataset,
 
42
  self.dataset = dataset
43
  self.context_prior = context_prior
44
  self.frustum_size = frustum_size
45
+ self.class_names = class_names
46
  self.relation_loss = relation_loss
47
  self.CE_ssc_loss = CE_ssc_loss
48
  self.sem_scal_loss = sem_scal_loss
49
  self.geo_scal_loss = geo_scal_loss
50
  self.project_scale = project_scale
51
+ self.class_weights = class_weights
52
  self.lr = lr
53
  self.weight_decay = weight_decay
54
 
 
81
  )
82
  self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
83
 
84
+ # log hyperparameters
85
+ self.save_hyperparameters()
86
+
87
+ self.train_metrics = SSCMetrics(self.n_classes)
88
+ self.val_metrics = SSCMetrics(self.n_classes)
89
+ self.test_metrics = SSCMetrics(self.n_classes)
90
+
91
  def forward(self, batch):
92
 
93
  img = batch["img"]
 
126
  "x3d": torch.stack(x3ds),
127
  }
128
 
129
+ out = self.net_3d_decoder(input_dict)
130
 
131
+ return out
132
+
133
+ def step(self, batch, step_type, metric):
134
+ bs = len(batch["img"])
135
+ loss = 0
136
+ out_dict = self(batch)
137
  ssc_pred = out_dict["ssc_logit"]
138
+ target = batch["target"]
139
+
140
+ if self.context_prior:
141
+ P_logits = out_dict["P_logits"]
142
+ CP_mega_matrices = batch["CP_mega_matrices"]
143
+
144
+ if self.relation_loss:
145
+ loss_rel_ce = compute_super_CP_multilabel_loss(
146
+ P_logits, CP_mega_matrices
147
+ )
148
+ loss += loss_rel_ce
149
+ self.log(
150
+ step_type + "/loss_relation_ce_super",
151
+ loss_rel_ce.detach(),
152
+ on_epoch=True,
153
+ sync_dist=True,
154
+ )
155
+
156
+ class_weight = self.class_weights.type_as(batch["img"])
157
+ if self.CE_ssc_loss:
158
+ loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight)
159
+ loss += loss_ssc
160
+ self.log(
161
+ step_type + "/loss_ssc",
162
+ loss_ssc.detach(),
163
+ on_epoch=True,
164
+ sync_dist=True,
165
+ )
166
+
167
+ if self.sem_scal_loss:
168
+ loss_sem_scal = sem_scal_loss(ssc_pred, target)
169
+ loss += loss_sem_scal
170
+ self.log(
171
+ step_type + "/loss_sem_scal",
172
+ loss_sem_scal.detach(),
173
+ on_epoch=True,
174
+ sync_dist=True,
175
+ )
176
+
177
+ if self.geo_scal_loss:
178
+ loss_geo_scal = geo_scal_loss(ssc_pred, target)
179
+ loss += loss_geo_scal
180
+ self.log(
181
+ step_type + "/loss_geo_scal",
182
+ loss_geo_scal.detach(),
183
+ on_epoch=True,
184
+ sync_dist=True,
185
+ )
186
+
187
+ if self.fp_loss and step_type != "test":
188
+ frustums_masks = torch.stack(batch["frustums_masks"])
189
+ frustums_class_dists = torch.stack(
190
+ batch["frustums_class_dists"]
191
+ ).float() # (bs, n_frustums, n_classes)
192
+ n_frustums = frustums_class_dists.shape[1]
193
+
194
+ pred_prob = F.softmax(ssc_pred, dim=1)
195
+ batch_cnt = frustums_class_dists.sum(0) # (n_frustums, n_classes)
196
+
197
+ frustum_loss = 0
198
+ frustum_nonempty = 0
199
+ for frus in range(n_frustums):
200
+ frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float()
201
+ prob = frustum_mask * pred_prob # bs, n_classes, H, W, D
202
+ prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2)
203
+ prob = prob.reshape(self.n_classes, -1)
204
+ cum_prob = prob.sum(dim=1) # n_classes
205
+
206
+ total_cnt = torch.sum(batch_cnt[frus])
207
+ total_prob = prob.sum()
208
+ if total_prob > 0 and total_cnt > 0:
209
+ frustum_target_proportion = batch_cnt[frus] / total_cnt
210
+ cum_prob = cum_prob / total_prob # n_classes
211
+ frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion)
212
+ frustum_loss += frustum_loss_i
213
+ frustum_nonempty += 1
214
+ frustum_loss = frustum_loss / frustum_nonempty
215
+ loss += frustum_loss
216
+ self.log(
217
+ step_type + "/loss_frustums",
218
+ frustum_loss.detach(),
219
+ on_epoch=True,
220
+ sync_dist=True,
221
+ )
222
+
223
+ y_true = target.cpu().numpy()
224
  y_pred = ssc_pred.detach().cpu().numpy()
225
  y_pred = np.argmax(y_pred, axis=1)
226
+ metric.add_batch(y_pred, y_true)
227
+
228
+ self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True)
229
+
230
+ return loss
231
+
232
+ def training_step(self, batch, batch_idx):
233
+ return self.step(batch, "train", self.train_metrics)
234
+
235
+ def validation_step(self, batch, batch_idx):
236
+ self.step(batch, "val", self.val_metrics)
237
+
238
+ def validation_epoch_end(self, outputs):
239
+ metric_list = [("train", self.train_metrics), ("val", self.val_metrics)]
240
 
241
+ for prefix, metric in metric_list:
242
+ stats = metric.get_stats()
243
+ for i, class_name in enumerate(self.class_names):
244
+ self.log(
245
+ "{}_SemIoU/{}".format(prefix, class_name),
246
+ stats["iou_ssc"][i],
247
+ sync_dist=True,
248
+ )
249
+ self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True)
250
+ self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True)
251
+ self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True)
252
+ self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True)
253
+ metric.reset()
254
 
255
+ def test_step(self, batch, batch_idx):
256
+ self.step(batch, "test", self.test_metrics)
257
 
258
+ def test_epoch_end(self, outputs):
259
+ classes = self.class_names
260
+ metric_list = [("test", self.test_metrics)]
261
+ for prefix, metric in metric_list:
262
+ print("{}======".format(prefix))
263
+ stats = metric.get_stats()
264
+ print(
265
+ "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format(
266
+ stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100
267
+ )
268
+ )
269
+ print("class IoU: {}, ".format(classes))
270
+ print(
271
+ " ".join(["{:.4f}, "] * len(classes)).format(
272
+ *(stats["iou_ssc"] * 100).tolist()
273
+ )
274
+ )
275
+ print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100))
276
+ metric.reset()
277
+
278
+ def configure_optimizers(self):
279
+ if self.dataset == "NYU":
280
+ optimizer = torch.optim.AdamW(
281
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
282
+ )
283
+ scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
284
+ return [optimizer], [scheduler]
285
+ elif self.dataset == "kitti":
286
+ optimizer = torch.optim.AdamW(
287
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
288
+ )
289
+ scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
290
+ return [optimizer], [scheduler]
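The frustum proportion term in step() normalises the per-frustum predicted class mass and compares it to the ground-truth class distribution with KL_sep. A minimal numeric sketch with made-up values:

import torch
from monoscene.loss.ssc_loss import KL_sep

cum_prob = torch.tensor([4.0, 3.0, 1.0])                   # summed softmax mass per class in one frustum
frustum_target_proportion = torch.tensor([0.5, 0.4, 0.1])  # ground-truth class distribution
cum_prob = cum_prob / cum_prob.sum()                        # -> [0.500, 0.375, 0.125]
print(KL_sep(cum_prob, frustum_target_proportion))          # small positive divergence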
monoscene/{unet2d.py β†’ models/unet2d.py} RENAMED
File without changes
monoscene/{unet3d_kitti.py β†’ models/unet3d_kitti.py} RENAMED
@@ -2,9 +2,9 @@
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
- from monoscene.modules import SegmentationHead
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import Process, Upsample, Downsample
8
 
9
 
10
  class UNet3D(nn.Module):
 
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
+ from monoscene.models.modules import SegmentationHead
6
+ from monoscene.models.CRP3D import CPMegaVoxels
7
+ from monoscene.models.modules import Process, Upsample, Downsample
8
 
9
 
10
  class UNet3D(nn.Module):
monoscene/{unet3d_nyu.py β†’ models/unet3d_nyu.py} RENAMED
@@ -3,8 +3,8 @@ import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
  import numpy as np
6
- from monoscene.CRP3D import CPMegaVoxels
7
- from monoscene.modules import (
8
  Process,
9
  Upsample,
10
  Downsample,
 
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
  import numpy as np
6
+ from monoscene.models.CRP3D import CPMegaVoxels
7
+ from monoscene.models.modules import (
8
  Process,
9
  Upsample,
10
  Downsample,
monoscene/monoscene.py DELETED
@@ -1,125 +0,0 @@
1
- import pytorch_lightning as pl
2
- import torch
3
- import torch.nn as nn
4
- from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
5
- from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
6
- from monoscene.flosp import FLoSP
7
- import numpy as np
8
- import torch.nn.functional as F
9
- from monoscene.unet2d import UNet2D
10
-
11
-
12
- class MonoScene(pl.LightningModule):
13
- def __init__(
14
- self,
15
- n_classes,
16
- feature,
17
- project_scale,
18
- full_scene_size,
19
- dataset,
20
- project_res=["1", "2", "4", "8"],
21
- n_relations=4,
22
- context_prior=True,
23
- fp_loss=True,
24
- frustum_size=4,
25
- relation_loss=False,
26
- CE_ssc_loss=True,
27
- geo_scal_loss=True,
28
- sem_scal_loss=True,
29
- lr=1e-4,
30
- weight_decay=1e-4,
31
- ):
32
- super().__init__()
33
-
34
- self.project_res = project_res
35
- self.fp_loss = fp_loss
36
- self.dataset = dataset
37
- self.context_prior = context_prior
38
- self.frustum_size = frustum_size
39
- self.relation_loss = relation_loss
40
- self.CE_ssc_loss = CE_ssc_loss
41
- self.sem_scal_loss = sem_scal_loss
42
- self.geo_scal_loss = geo_scal_loss
43
- self.project_scale = project_scale
44
- self.lr = lr
45
- self.weight_decay = weight_decay
46
-
47
- self.projects = {}
48
- self.scale_2ds = [1, 2, 4, 8] # 2D scales
49
- for scale_2d in self.scale_2ds:
50
- self.projects[str(scale_2d)] = FLoSP(
51
- full_scene_size, project_scale=self.project_scale, dataset=self.dataset
52
- )
53
- self.projects = nn.ModuleDict(self.projects)
54
-
55
- self.n_classes = n_classes
56
- if self.dataset == "NYU":
57
- self.net_3d_decoder = UNet3DNYU(
58
- self.n_classes,
59
- nn.BatchNorm3d,
60
- n_relations=n_relations,
61
- feature=feature,
62
- full_scene_size=full_scene_size,
63
- context_prior=context_prior,
64
- )
65
- elif self.dataset == "kitti":
66
- self.net_3d_decoder = UNet3DKitti(
67
- self.n_classes,
68
- nn.BatchNorm3d,
69
- project_scale=project_scale,
70
- feature=feature,
71
- full_scene_size=full_scene_size,
72
- context_prior=context_prior,
73
- )
74
- self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
75
-
76
- def forward(self, batch):
77
-
78
- img = batch["img"]
79
- bs = len(img)
80
-
81
- out = {}
82
-
83
- x_rgb = self.net_rgb(img)
84
-
85
- x3ds = []
86
- for i in range(bs):
87
- x3d = None
88
- for scale_2d in self.project_res:
89
-
90
- # project features at each 2D scale to target 3D scale
91
- scale_2d = int(scale_2d)
92
- projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
93
- fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()
94
-
95
- # Sum all the 3D features
96
- if x3d is None:
97
- x3d = self.projects[str(scale_2d)](
98
- x_rgb["1_" + str(scale_2d)][i],
99
- # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
100
- projected_pix // scale_2d,
101
- fov_mask,
102
- )
103
- else:
104
- x3d += self.projects[str(scale_2d)](
105
- x_rgb["1_" + str(scale_2d)][i],
106
- # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
107
- projected_pix // scale_2d,
108
- fov_mask,
109
- )
110
- x3ds.append(x3d)
111
-
112
- input_dict = {
113
- "x3d": torch.stack(x3ds),
114
- }
115
-
116
- out_dict = self.net_3d_decoder(input_dict)
117
-
118
- ssc_pred = out_dict["ssc_logit"]
119
-
120
- y_pred = ssc_pred.detach().cpu().numpy()
121
- y_pred = np.argmax(y_pred, axis=1)
122
-
123
- return y_pred
124
-
125
-
monoscene/monoscene_model.py DELETED
@@ -1,21 +0,0 @@
1
- from transformers import PreTrainedModel
2
- from .config import MonoSceneConfig
3
- from monoscene.monoscene import MonoScene
4
-
5
-
6
- class MonoSceneModel(PreTrainedModel):
7
- config_class = MonoSceneConfig
8
-
9
- def __init__(self, config):
10
- super().__init__(config)
11
- self.model = MonoScene(
12
- dataset=config.dataset,
13
- n_classes=config.n_classes,
14
- feature=config.feature,
15
- project_scale=config.project_scale,
16
- full_scene_size=config.full_scene_size
17
- )
18
-
19
-
20
- def forward(self, tensor):
21
- return self.model.forward(tensor)
monoscene/scripts/eval_monoscene.py ADDED
@@ -0,0 +1,71 @@
1
+ from pytorch_lightning import Trainer
2
+ from monoscene.models.monoscene import MonoScene
3
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
4
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
+ import hydra
6
+ from omegaconf import DictConfig
7
+ import torch
8
+ import os
9
+ from hydra.utils import get_original_cwd
10
+
11
+
12
+ @hydra.main(config_name="../config/monoscene.yaml")
13
+ def main(config: DictConfig):
14
+ torch.set_grad_enabled(False)
15
+ if config.dataset == "kitti":
16
+ config.batch_size = 1
17
+ n_classes = 20
18
+ feature = 64
19
+ project_scale = 2
20
+ full_scene_size = (256, 256, 32)
21
+ data_module = KittiDataModule(
22
+ root=config.kitti_root,
23
+ preprocess_root=config.kitti_preprocess_root,
24
+ frustum_size=config.frustum_size,
25
+ batch_size=int(config.batch_size / config.n_gpus),
26
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
27
+ )
28
+
29
+ elif config.dataset == "NYU":
30
+ config.batch_size = 2
31
+ project_scale = 1
32
+ n_classes = 12
33
+ feature = 200
34
+ full_scene_size = (60, 36, 60)
35
+ data_module = NYUDataModule(
36
+ root=config.NYU_root,
37
+ preprocess_root=config.NYU_preprocess_root,
38
+ n_relations=config.n_relations,
39
+ frustum_size=config.frustum_size,
40
+ batch_size=int(config.batch_size / config.n_gpus),
41
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
42
+ )
43
+
44
+ trainer = Trainer(
45
+ sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp"
46
+ )
47
+
48
+ if config.dataset == "NYU":
49
+ model_path = os.path.join(
50
+ get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
51
+ )
52
+ else:
53
+ model_path = os.path.join(
54
+ get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
55
+ )
56
+
57
+ model = MonoScene.load_from_checkpoint(
58
+ model_path,
59
+ feature=feature,
60
+ project_scale=project_scale,
61
+ fp_loss=config.fp_loss,
62
+ full_scene_size=full_scene_size,
63
+ )
64
+ model.eval()
65
+ data_module.setup()
66
+ val_dataloader = data_module.val_dataloader()
67
+ trainer.test(model, test_dataloaders=val_dataloader)
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
monoscene/scripts/generate_output.py ADDED
@@ -0,0 +1,127 @@
1
+ from pytorch_lightning import Trainer
2
+ from monoscene.models.monoscene import MonoScene
3
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
4
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
5
+ from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
6
+ import hydra
7
+ from omegaconf import DictConfig
8
+ import torch
9
+ import numpy as np
10
+ import os
11
+ from hydra.utils import get_original_cwd
12
+ from tqdm import tqdm
13
+ import pickle
14
+
15
+
16
+ @hydra.main(config_name="../config/monoscene.yaml")
17
+ def main(config: DictConfig):
18
+ torch.set_grad_enabled(False)
19
+
20
+ # Setup dataloader
21
+ if config.dataset == "kitti" or config.dataset == "kitti_360":
22
+ feature = 64
23
+ project_scale = 2
24
+ full_scene_size = (256, 256, 32)
25
+
26
+ if config.dataset == "kitti":
27
+ data_module = KittiDataModule(
28
+ root=config.kitti_root,
29
+ preprocess_root=config.kitti_preprocess_root,
30
+ frustum_size=config.frustum_size,
31
+ batch_size=int(config.batch_size / config.n_gpus),
32
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
33
+ )
34
+ data_module.setup()
35
+ data_loader = data_module.val_dataloader()
36
+ # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
37
+ else:
38
+ data_module = Kitti360DataModule(
39
+ root=config.kitti_360_root,
40
+ sequences=[config.kitti_360_sequence],
41
+ n_scans=2000,
42
+ batch_size=1,
43
+ num_workers=3,
44
+ )
45
+ data_module.setup()
46
+ data_loader = data_module.dataloader()
47
+
48
+ elif config.dataset == "NYU":
49
+ project_scale = 1
50
+ feature = 200
51
+ full_scene_size = (60, 36, 60)
52
+ data_module = NYUDataModule(
53
+ root=config.NYU_root,
54
+ preprocess_root=config.NYU_preprocess_root,
55
+ n_relations=config.n_relations,
56
+ frustum_size=config.frustum_size,
57
+ batch_size=int(config.batch_size / config.n_gpus),
58
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
59
+ )
60
+ data_module.setup()
61
+ data_loader = data_module.val_dataloader()
62
+ # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
63
+ else:
64
+ print("dataset not support")
65
+
66
+ # Load pretrained models
67
+ if config.dataset == "NYU":
68
+ model_path = os.path.join(
69
+ get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
70
+ )
71
+ else:
72
+ model_path = os.path.join(
73
+ get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
74
+ )
75
+
76
+ model = MonoScene.load_from_checkpoint(
77
+ model_path,
78
+ feature=feature,
79
+ project_scale=project_scale,
80
+ fp_loss=config.fp_loss,
81
+ full_scene_size=full_scene_size,
82
+ )
83
+ model.cuda()
84
+ model.eval()
85
+
86
+ # Save prediction and additional data
87
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
88
+ output_path = os.path.join(config.output_path, config.dataset)
89
+ with torch.no_grad():
90
+ for batch in tqdm(data_loader):
91
+ batch["img"] = batch["img"].cuda()
92
+ pred = model(batch)
93
+ y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
94
+ y_pred = np.argmax(y_pred, axis=1)
95
+ for i in range(config.batch_size):
96
+ out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
97
+ if "target" in batch:
98
+ out_dict["target"] = (
99
+ batch["target"][i].detach().cpu().numpy().astype(np.uint16)
100
+ )
101
+
102
+ if config.dataset == "NYU":
103
+ write_path = output_path
104
+ filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
105
+ out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
106
+ out_dict["vox_origin"] = (
107
+ batch["vox_origin"][i].detach().cpu().numpy()
108
+ )
109
+ else:
110
+ write_path = os.path.join(output_path, batch["sequence"][i])
111
+ filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
112
+ out_dict["fov_mask_1"] = (
113
+ batch["fov_mask_1"][i].detach().cpu().numpy()
114
+ )
115
+ out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
116
+ out_dict["T_velo_2_cam"] = (
117
+ batch["T_velo_2_cam"][i].detach().cpu().numpy()
118
+ )
119
+
120
+ os.makedirs(write_path, exist_ok=True)
121
+ with open(filepath, "wb") as handle:
122
+ pickle.dump(out_dict, handle)
123
+ print("wrote to", filepath)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
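generate_output.py writes one pickle per frame containing the prediction plus the metadata consumed by the visualization scripts below. A minimal sketch of reading one back (the file path and name are hypothetical):

import pickle
import numpy as np

with open("output/NYU/NYU0001_0000.pkl", "rb") as handle:   # hypothetical output path
    b = pickle.load(handle)

y_pred = b["y_pred"]                      # (60, 36, 60) uint16 class ids for NYU
print(y_pred.shape, np.unique(y_pred))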
monoscene/scripts/train_monoscene.py ADDED
@@ -0,0 +1,173 @@
1
+ from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
2
+ from monoscene.data.semantic_kitti.params import (
3
+ semantic_kitti_class_frequencies,
4
+ kitti_class_names,
5
+ )
6
+ from monoscene.data.NYU.params import (
7
+ class_weights as NYU_class_weights,
8
+ NYU_class_names,
9
+ )
10
+ from monoscene.data.NYU.nyu_dm import NYUDataModule
11
+ from torch.utils.data.dataloader import DataLoader
12
+ from monoscene.models.monoscene import MonoScene
13
+ from pytorch_lightning import Trainer
14
+ from pytorch_lightning.loggers import TensorBoardLogger
15
+ from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
16
+ import os
17
+ import hydra
18
+ from omegaconf import DictConfig
19
+ import numpy as np
20
+ import torch
21
+
22
+ hydra.output_subdir = None
23
+
24
+
25
+ @hydra.main(config_name="../config/monoscene.yaml")
26
+ def main(config: DictConfig):
27
+ exp_name = config.exp_prefix
28
+ exp_name += "_{}_{}".format(config.dataset, config.run)
29
+ exp_name += "_FrusSize_{}".format(config.frustum_size)
30
+ exp_name += "_nRelations{}".format(config.n_relations)
31
+ exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr)
32
+
33
+ if config.CE_ssc_loss:
34
+ exp_name += "_CEssc"
35
+ if config.geo_scal_loss:
36
+ exp_name += "_geoScalLoss"
37
+ if config.sem_scal_loss:
38
+ exp_name += "_semScalLoss"
39
+ if config.fp_loss:
40
+ exp_name += "_fpLoss"
41
+
42
+ if config.relation_loss:
43
+ exp_name += "_CERel"
44
+ if config.context_prior:
45
+ exp_name += "_3DCRP"
46
+
47
+ # Setup dataloaders
48
+ if config.dataset == "kitti":
49
+ class_names = kitti_class_names
50
+ max_epochs = 30
51
+ logdir = config.kitti_logdir
52
+ full_scene_size = (256, 256, 32)
53
+ project_scale = 2
54
+ feature = 64
55
+ n_classes = 20
56
+ class_weights = torch.from_numpy(
57
+ 1 / np.log(semantic_kitti_class_frequencies + 0.001)
58
+ )
59
+ data_module = KittiDataModule(
60
+ root=config.kitti_root,
61
+ preprocess_root=config.kitti_preprocess_root,
62
+ frustum_size=config.frustum_size,
63
+ project_scale=project_scale,
64
+ batch_size=int(config.batch_size / config.n_gpus),
65
+ num_workers=int(config.num_workers_per_gpu),
66
+ )
67
+
68
+ elif config.dataset == "NYU":
69
+ class_names = NYU_class_names
70
+ max_epochs = 30
71
+ logdir = config.logdir
72
+ full_scene_size = (60, 36, 60)
73
+ project_scale = 1
74
+ feature = 200
75
+ n_classes = 12
76
+ class_weights = NYU_class_weights
77
+ data_module = NYUDataModule(
78
+ root=config.NYU_root,
79
+ preprocess_root=config.NYU_preprocess_root,
80
+ n_relations=config.n_relations,
81
+ frustum_size=config.frustum_size,
82
+ batch_size=int(config.batch_size / config.n_gpus),
83
+ num_workers=int(config.num_workers_per_gpu * config.n_gpus),
84
+ )
85
+
86
+ project_res = ["1"]
87
+ if config.project_1_2:
88
+ exp_name += "_Proj_2"
89
+ project_res.append("2")
90
+ if config.project_1_4:
91
+ exp_name += "_4"
92
+ project_res.append("4")
93
+ if config.project_1_8:
94
+ exp_name += "_8"
95
+ project_res.append("8")
96
+
97
+ print(exp_name)
98
+
99
+ # Initialize MonoScene model
100
+ model = MonoScene(
101
+ dataset=config.dataset,
102
+ frustum_size=config.frustum_size,
103
+ project_scale=project_scale,
104
+ n_relations=config.n_relations,
105
+ fp_loss=config.fp_loss,
106
+ feature=feature,
107
+ full_scene_size=full_scene_size,
108
+ project_res=project_res,
109
+ n_classes=n_classes,
110
+ class_names=class_names,
111
+ context_prior=config.context_prior,
112
+ relation_loss=config.relation_loss,
113
+ CE_ssc_loss=config.CE_ssc_loss,
114
+ sem_scal_loss=config.sem_scal_loss,
115
+ geo_scal_loss=config.geo_scal_loss,
116
+ lr=config.lr,
117
+ weight_decay=config.weight_decay,
118
+ class_weights=class_weights,
119
+ )
120
+
121
+ if config.enable_log:
122
+ logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="")
123
+ lr_monitor = LearningRateMonitor(logging_interval="step")
124
+ checkpoint_callbacks = [
125
+ ModelCheckpoint(
126
+ save_last=True,
127
+ monitor="val/mIoU",
128
+ save_top_k=1,
129
+ mode="max",
130
+ filename="{epoch:03d}-{val/mIoU:.5f}",
131
+ ),
132
+ lr_monitor,
133
+ ]
134
+ else:
135
+ logger = False
136
+ checkpoint_callbacks = False
137
+
138
+ model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt")
139
+ if os.path.isfile(model_path):
140
+ # Continue training from last.ckpt
141
+ trainer = Trainer(
142
+ callbacks=checkpoint_callbacks,
143
+ resume_from_checkpoint=model_path,
144
+ sync_batchnorm=True,
145
+ deterministic=False,
146
+ max_epochs=max_epochs,
147
+ gpus=config.n_gpus,
148
+ logger=logger,
149
+ check_val_every_n_epoch=1,
150
+ log_every_n_steps=10,
151
+ flush_logs_every_n_steps=100,
152
+ accelerator="ddp",
153
+ )
154
+ else:
155
+ # Train from scratch
156
+ trainer = Trainer(
157
+ callbacks=checkpoint_callbacks,
158
+ sync_batchnorm=True,
159
+ deterministic=False,
160
+ max_epochs=max_epochs,
161
+ gpus=config.n_gpus,
162
+ logger=logger,
163
+ check_val_every_n_epoch=1,
164
+ log_every_n_steps=10,
165
+ flush_logs_every_n_steps=100,
166
+ accelerator="ddp",
167
+ )
168
+
169
+ trainer.fit(model, data_module)
170
+
171
+
172
+ if __name__ == "__main__":
173
+ main()
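The kitti branch above derives class weights as 1 / log(frequency + 0.001). A small numeric illustration (the frequencies here are made up; the real values come from semantic_kitti_class_frequencies):

import numpy as np
import torch

freq = np.array([5.0e9, 1.0e7, 3.0e5])                 # hypothetical voxel counts per class
class_weights = torch.from_numpy(1 / np.log(freq + 0.001))
print(class_weights)                                    # rarer classes receive larger weights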
monoscene/scripts/visualization/NYU_vis_pred.py ADDED
@@ -0,0 +1,156 @@
1
+ import pickle
2
+ import os
3
+ from omegaconf import DictConfig
4
+ import numpy as np
5
+ import hydra
6
+ from mayavi import mlab
7
+
8
+
9
+ def get_grid_coords(dims, resolution):
10
+ """
11
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
12
+ :return coords_grid: is the center coords of voxels in the grid
13
+ """
14
+
15
+ g_xx = np.arange(0, dims[0] + 1)
16
+ g_yy = np.arange(0, dims[1] + 1)
17
+
18
+ g_zz = np.arange(0, dims[2] + 1)
19
+
20
+ # Obtaining the grid with coords...
21
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
22
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
23
+ coords_grid = coords_grid.astype(np.float64)  # np.float is removed in recent NumPy
24
+
25
+ coords_grid = (coords_grid * resolution) + resolution / 2
26
+
27
+ temp = np.copy(coords_grid)
28
+ temp[:, 0] = coords_grid[:, 1]
29
+ temp[:, 1] = coords_grid[:, 0]
30
+ coords_grid = np.copy(temp)
31
+
32
+ return coords_grid
33
+
34
+
35
+ def draw(
36
+ voxels,
37
+ cam_pose,
38
+ vox_origin,
39
+ voxel_size=0.08,
40
+ d=0.75, # 0.75m - determine the size of the mesh representing the camera
41
+ ):
42
+ # Compute the coordinates of the mesh representing camera
43
+ y = d * 480 / (2 * 518.8579)
44
+ x = d * 640 / (2 * 518.8579)
45
+ tri_points = np.array(
46
+ [
47
+ [0, 0, 0],
48
+ [x, y, d],
49
+ [-x, y, d],
50
+ [-x, -y, d],
51
+ [x, -y, d],
52
+ ]
53
+ )
54
+ tri_points = np.hstack([tri_points, np.ones((5, 1))])
55
+
56
+ tri_points = (cam_pose @ tri_points.T).T
57
+ x = tri_points[:, 0] - vox_origin[0]
58
+ y = tri_points[:, 1] - vox_origin[1]
59
+ z = tri_points[:, 2] - vox_origin[2]
60
+ triangles = [
61
+ (0, 1, 2),
62
+ (0, 1, 4),
63
+ (0, 3, 4),
64
+ (0, 2, 3),
65
+ ]
66
+
67
+ # Compute the voxels coordinates
68
+ grid_coords = get_grid_coords(
69
+ [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size
70
+ )
71
+
72
+ # Attach the predicted class to every voxel
73
+ grid_coords = np.vstack(
74
+ (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1))
75
+ ).T
76
+
77
+ # Remove empty and unknown voxels
78
+ occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)]
79
+ figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))
80
+
81
+ # Draw the camera
82
+ mlab.triangular_mesh(
83
+ x,
84
+ y,
85
+ z,
86
+ triangles,
87
+ representation="wireframe",
88
+ color=(0, 0, 0),
89
+ line_width=5,
90
+ )
91
+
92
+ # Draw occupied voxels
93
+ plt_plot = mlab.points3d(
94
+ occupied_voxels[:, 0],
95
+ occupied_voxels[:, 1],
96
+ occupied_voxels[:, 2],
97
+ occupied_voxels[:, 3],
98
+ colormap="viridis",
99
+ scale_factor=voxel_size - 0.1 * voxel_size,
100
+ mode="cube",
101
+ opacity=1.0,
102
+ vmin=0,
103
+ vmax=12,
104
+ )
105
+
106
+ colors = np.array(
107
+ [
108
+ [22, 191, 206, 255],
109
+ [214, 38, 40, 255],
110
+ [43, 160, 43, 255],
111
+ [158, 216, 229, 255],
112
+ [114, 158, 206, 255],
113
+ [204, 204, 91, 255],
114
+ [255, 186, 119, 255],
115
+ [147, 102, 188, 255],
116
+ [30, 119, 181, 255],
117
+ [188, 188, 33, 255],
118
+ [255, 127, 12, 255],
119
+ [196, 175, 214, 255],
120
+ [153, 153, 153, 255],
121
+ ]
122
+ )
123
+
124
+ plt_plot.glyph.scale_mode = "scale_by_vector"
125
+
126
+ plt_plot.module_manager.scalar_lut_manager.lut.table = colors
127
+
128
+ mlab.show()
129
+
130
+
131
+ @hydra.main(config_path=None)
132
+ def main(config: DictConfig):
133
+ scan = config.file
134
+
135
+ with open(scan, "rb") as handle:
136
+ b = pickle.load(handle)
137
+
138
+ cam_pose = b["cam_pose"]
139
+ vox_origin = b["vox_origin"]
140
+ gt_scene = b["target"]
141
+ pred_scene = b["y_pred"]
142
+ scan = os.path.basename(scan)[:12]
143
+
144
+ pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room
145
+
146
+ draw(
147
+ pred_scene,
148
+ cam_pose,
149
+ vox_origin,
150
+ voxel_size=0.08,
151
+ d=0.75,
152
+ )
153
+
154
+
155
+ if __name__ == "__main__":
156
+ main()
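get_grid_coords above places each voxel centre at index * resolution + resolution / 2 (with the x/y axes swapped). A quick numeric check with two indices at the NYU voxel size of 0.08 m:

import numpy as np

res = 0.08
idx = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 0.0]])
print(idx * res + res / 2)   # [[0.04 0.04 0.04] [0.12 0.12 0.04]]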
monoscene/scripts/visualization/kitti_vis_pred.py ADDED
@@ -0,0 +1,201 @@
1
+ # from operator import gt
2
+ import pickle
3
+ import numpy as np
4
+ from omegaconf import DictConfig
5
+ import hydra
6
+ from mayavi import mlab
7
+
8
+
9
+ def get_grid_coords(dims, resolution):
10
+ """
11
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
12
+ :return coords_grid: is the center coords of voxels in the grid
13
+ """
14
+
15
+ g_xx = np.arange(0, dims[0] + 1)
16
+ g_yy = np.arange(0, dims[1] + 1)
17
+ sensor_pose = 10
18
+ g_zz = np.arange(0, dims[2] + 1)
19
+
20
+ # Obtaining the grid with coords...
21
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
22
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
23
+ coords_grid = coords_grid.astype(np.float64)  # np.float is removed in recent NumPy
24
+
25
+ coords_grid = (coords_grid * resolution) + resolution / 2
26
+
27
+ temp = np.copy(coords_grid)
28
+ temp[:, 0] = coords_grid[:, 1]
29
+ temp[:, 1] = coords_grid[:, 0]
30
+ coords_grid = np.copy(temp)
31
+
32
+ return coords_grid
33
+
34
+
35
+ def draw(
36
+ voxels,
37
+ T_velo_2_cam,
38
+ vox_origin,
39
+ fov_mask,
40
+ img_size,
41
+ f,
42
+ voxel_size=0.2,
43
+ d=7, # 7m - determine the size of the mesh representing the camera
44
+ ):
45
+ # Compute the coordinates of the mesh representing camera
46
+ x = d * img_size[0] / (2 * f)
47
+ y = d * img_size[1] / (2 * f)
48
+ tri_points = np.array(
49
+ [
50
+ [0, 0, 0],
51
+ [x, y, d],
52
+ [-x, y, d],
53
+ [-x, -y, d],
54
+ [x, -y, d],
55
+ ]
56
+ )
57
+ tri_points = np.hstack([tri_points, np.ones((5, 1))])
58
+ tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T
59
+ x = tri_points[:, 0] - vox_origin[0]
60
+ y = tri_points[:, 1] - vox_origin[1]
61
+ z = tri_points[:, 2] - vox_origin[2]
62
+ triangles = [
63
+ (0, 1, 2),
64
+ (0, 1, 4),
65
+ (0, 3, 4),
66
+ (0, 2, 3),
67
+ ]
68
+
69
+ # Compute the voxels coordinates
70
+ grid_coords = get_grid_coords(
71
+ [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
72
+ )
73
+
74
+ # Attach the predicted class to every voxel
75
+ grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
76
+
77
+ # Get the voxels inside FOV
78
+ fov_grid_coords = grid_coords[fov_mask, :]
79
+
80
+ # Get the voxels outside FOV
81
+ outfov_grid_coords = grid_coords[~fov_mask, :]
82
+
83
+ # Remove empty and unknown voxels
84
+ fov_voxels = fov_grid_coords[
85
+ (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
86
+ ]
87
+ outfov_voxels = outfov_grid_coords[
88
+ (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
89
+ ]
90
+
91
+ figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
92
+
93
+ # Draw the camera
94
+ mlab.triangular_mesh(
95
+ x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5
96
+ )
97
+
98
+ # Draw occupied inside FOV voxels
99
+ plt_plot_fov = mlab.points3d(
100
+ fov_voxels[:, 0],
101
+ fov_voxels[:, 1],
102
+ fov_voxels[:, 2],
103
+ fov_voxels[:, 3],
104
+ colormap="viridis",
105
+ scale_factor=voxel_size - 0.05 * voxel_size,
106
+ mode="cube",
107
+ opacity=1.0,
108
+ vmin=1,
109
+ vmax=19,
110
+ )
111
+
112
+ # Draw occupied outside FOV voxels
113
+ plt_plot_outfov = mlab.points3d(
114
+ outfov_voxels[:, 0],
115
+ outfov_voxels[:, 1],
116
+ outfov_voxels[:, 2],
117
+ outfov_voxels[:, 3],
118
+ colormap="viridis",
119
+ scale_factor=voxel_size - 0.05 * voxel_size,
120
+ mode="cube",
121
+ opacity=1.0,
122
+ vmin=1,
123
+ vmax=19,
124
+ )
125
+
126
+ colors = np.array(
127
+ [
128
+ [100, 150, 245, 255],
129
+ [100, 230, 245, 255],
130
+ [30, 60, 150, 255],
131
+ [80, 30, 180, 255],
132
+ [100, 80, 250, 255],
133
+ [255, 30, 30, 255],
134
+ [255, 40, 200, 255],
135
+ [150, 30, 90, 255],
136
+ [255, 0, 255, 255],
137
+ [255, 150, 255, 255],
138
+ [75, 0, 75, 255],
139
+ [175, 0, 75, 255],
140
+ [255, 200, 0, 255],
141
+ [255, 120, 50, 255],
142
+ [0, 175, 0, 255],
143
+ [135, 60, 0, 255],
144
+ [150, 240, 80, 255],
145
+ [255, 240, 150, 255],
146
+ [255, 0, 0, 255],
147
+ ]
148
+ ).astype(np.uint8)
149
+
150
+ plt_plot_fov.glyph.scale_mode = "scale_by_vector"
151
+ plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
152
+
153
+ plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
154
+
155
+ outfov_colors = colors
156
+ outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
157
+ plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
158
+
159
+ mlab.show()
160
+
161
+
162
+ @hydra.main(config_path=None)
163
+ def main(config: DictConfig):
164
+ scan = config.file
165
+ with open(scan, "rb") as handle:
166
+ b = pickle.load(handle)
167
+
168
+ fov_mask_1 = b["fov_mask_1"]
169
+ T_velo_2_cam = b["T_velo_2_cam"]
170
+ vox_origin = np.array([0, -25.6, -2])
171
+
172
+ y_pred = b["y_pred"]
173
+
174
+ if config.dataset == "kitti_360":
175
+ # Visualize KITTI-360
176
+ draw(
177
+ y_pred,
178
+ T_velo_2_cam,
179
+ vox_origin,
180
+ fov_mask_1,
181
+ voxel_size=0.2,
182
+ f=552.55426,
183
+ img_size=(1408, 376),
184
+ d=7,
185
+ )
186
+ else:
187
+ # Visualize Semantic KITTI
188
+ draw(
189
+ y_pred,
190
+ T_velo_2_cam,
191
+ vox_origin,
192
+ fov_mask_1,
193
+ img_size=(1220, 370),
194
+ f=707.0912,
195
+ voxel_size=0.2,
196
+ d=7,
197
+ )
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
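The camera wireframe in draw() spans x = d * W / (2 f) and y = d * H / (2 f) at depth d. A quick check with the Semantic KITTI values used above (d = 7 m, f = 707.0912, image 1220 x 370):

d, f = 7.0, 707.0912
w, h = 1220, 370
print(d * w / (2 * f), d * h / (2 * f))   # approx. 6.04 m and 1.83 m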