Spaces:

mkalia
/

DepthPoseEstimation

Sleeping

App Files Files Community

mkalia committed on Jun 27, 2024

Commit

a50312e

verified ·

1 Parent(s): bf4a223

Upload 4 files

Browse files

Files changed (4) hide show

depth_decoder.py +80 -0
pose_cnn.py +52 -0
pose_decoder.py +54 -0
resnet_encoder.py +114 -0

depth_decoder.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright Niantic 2019. Patent Pending. All rights reserved.
+#
+# This software is licensed under the terms of the Monodepth2 licence
+# which allows for non-commercial use only, the full terms of which are made
+# available in the LICENSE file.
+from __future__ import absolute_import, division, print_function
+import numpy as np
+import torch
+import torch.nn as nn
+from collections import OrderedDict
+from layers import *
class DepthDecoder(nn.Module):
    """Decode a list of encoder feature maps into sigmoid disparity maps.

    Args:
        num_ch_enc: per-level channel counts of the encoder features. It is
            indexed with ``[-1]`` for the deepest level and ``[i - 1]`` for
            the skip connection used at decoder level ``i``.
        scales: decoder levels (subset of 0..4) at which a disparity map is
            produced.
        num_output_channels: channels of every disparity output.
        use_skips: when true, the encoder feature of level ``i - 1`` is
            concatenated to the upsampled tensor at decoder level ``i > 0``.
        batch_norm: when true, a ``batchNorm`` layer (from ``layers``) is
            applied after the second conv block of every level.

    NOTE(review): the batch-norm layers live in the plain dict ``self.bn``,
    not in a registered container, so they are absent from ``parameters()``
    and ``state_dict()`` and are not affected by ``.to()``, ``.train()`` or
    ``.eval()`` on this module. That is left unchanged here because
    registering them would alter the ``state_dict`` keys; confirm whether
    existing checkpoints depend on the current layout before changing it.
    """

    def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True, batch_norm = True):
        super(DepthDecoder, self).__init__()
        self.num_output_channels = num_output_channels
        self.use_skips = use_skips
        self.upsample_mode = 'nearest'
        self.scales = scales
        self.batch_norm = batch_norm
        self.num_ch_enc = num_ch_enc
        self.num_ch_dec = np.array([16, 32, 64, 128, 256])

        # Insertion order of `self.convs` fixes the indices inside
        # `self.decoder` (and therefore the state_dict keys), so the
        # construction order below must not be changed.
        self.convs = OrderedDict()
        self.bn = {}

        # Transposed convs are registered but `forward` does not call them
        # (it uses `upsample` instead); they are kept so the module list
        # layout stays the same.
        for i in range(4, -1, -1):
            self.convs[("deconv", i, 0)] = nn.ConvTranspose2d(self.num_ch_dec[i], self.num_ch_dec[i], 3, stride=2, padding = 1, output_padding = 1)
            if self.batch_norm:
                self.bn[('bn', i)] = batchNorm(self.num_ch_dec[i])

        # Two conv blocks per decoder level, deepest level first.
        for i in range(4, -1, -1):
            # upconv_0: consumes the previous level (or the deepest encoder map).
            num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1]
            num_ch_out = self.num_ch_dec[i]
            self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out)

            # upconv_1: consumes the upsampled tensor plus the optional skip.
            num_ch_in = self.num_ch_dec[i]
            if self.use_skips and i > 0:
                num_ch_in += self.num_ch_enc[i - 1]
            num_ch_out = self.num_ch_dec[i]
            self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out)

        for s in self.scales:
            self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels)

        self.decoder = nn.ModuleList(list(self.convs.values()))
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_features):
        """Run the decoder.

        Args:
            input_features: sequence of encoder feature maps; the last entry
                is the decoder input and entry ``i - 1`` is the skip for
                level ``i``.

        Returns:
            dict mapping ``("disp", i)`` to the sigmoid output of level ``i``
            for every ``i`` in ``self.scales``. The same dict is also stored
            on ``self.outputs``.
        """
        self.outputs = {}

        x = input_features[-1]
        for i in range(4, -1, -1):
            x = self.convs[("upconv", i, 0)](x)
            x = [upsample(x)]
            if self.use_skips and i > 0:
                x += [input_features[i - 1]]
            x = torch.cat(x, 1)
            x = self.convs[("upconv", i, 1)](x)
            if self.batch_norm:
                # The bn layers are not registered submodules, so they must be
                # moved by hand. Follow the activation's device instead of
                # hard-coding CUDA so the decoder also runs on CPU and on
                # non-default GPUs.
                # presumably `batchNorm` returns an nn.Module — confirm in layers.py
                x = self.bn[('bn', i)].to(x.device)(x)
            if i in self.scales:
                self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x))
        return self.outputs

pose_cnn.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# Copyright Niantic 2019. Patent Pending. All rights reserved.
+#
+# This software is licensed under the terms of the Monodepth2 licence
+# which allows for non-commercial use only, the full terms of which are made
+# available in the LICENSE file.
+from __future__ import absolute_import, division, print_function
+import torch
+import torch.nn as nn
class PoseCNN(nn.Module):
    """Stack of strided convolutions that regresses a 6-vector per frame pair.

    Args:
        num_input_frames: number of RGB frames stacked along the channel
            axis; the first conv expects ``3 * num_input_frames`` channels
            and the head emits ``6 * (num_input_frames - 1)`` values.
    """

    def __init__(self, num_input_frames):
        super(PoseCNN, self).__init__()

        self.num_input_frames = num_input_frames

        # (in_channels, out_channels, kernel_size, stride, padding)
        layer_specs = (
            (3 * num_input_frames, 16, 7, 2, 3),
            (16, 32, 5, 2, 2),
            (32, 64, 3, 2, 1),
            (64, 128, 3, 2, 1),
            (128, 256, 3, 2, 1),
            (256, 256, 3, 2, 1),
            (256, 256, 3, 2, 1),
        )
        # Integer-keyed dict, filled in layer order.
        self.convs = {}
        for index, spec in enumerate(layer_specs):
            self.convs[index] = nn.Conv2d(*spec)

        # 1x1 head producing six values per predicted frame pair.
        self.pose_conv = nn.Conv2d(256, 6 * (num_input_frames - 1), 1)

        self.num_convs = len(self.convs)

        self.relu = nn.ReLU(True)

        # Registers the conv stack so its parameters belong to the module.
        self.net = nn.ModuleList(list(self.convs.values()))

    def forward(self, out):
        """Return ``(axisangle, translation)`` for the stacked input frames.

        Both outputs have shape ``(-1, num_input_frames - 1, 1, 3)``: the
        first and the last three entries of each predicted 6-vector.
        """
        for index in range(self.num_convs):
            out = self.relu(self.convs[index](out))

        out = self.pose_conv(out)

        # Average over the two spatial axes (width first, then height).
        out = out.mean(3).mean(2)

        # No extra scaling is applied to the prediction here.
        out = out.view(-1, self.num_input_frames - 1, 1, 6)

        axisangle, translation = out[..., :3], out[..., 3:]
        return axisangle, translation

pose_decoder.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# Copyright Niantic 2019. Patent Pending. All rights reserved.
+#
+# This software is licensed under the terms of the Monodepth2 licence
+# which allows for non-commercial use only, the full terms of which are made
+# available in the LICENSE file.
+from __future__ import absolute_import, division, print_function
+import torch
+import torch.nn as nn
+from collections import OrderedDict
class PoseDecoder(nn.Module):
    """Convolutional head that regresses 6-vectors from encoder features.

    Args:
        num_ch_enc: per-level encoder channel counts; only the last entry is
            read (input width of the 1x1 "squeeze" conv).
        num_input_features: how many feature lists are passed to ``forward``
            and concatenated after squeezing.
        num_frames_to_predict_for: number of 6-vectors to output; defaults
            to ``num_input_features - 1`` when ``None``.
        stride: stride of the two 3x3 convolutions.
    """

    def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1):
        super(PoseDecoder, self).__init__()

        self.num_ch_enc = num_ch_enc
        self.num_input_features = num_input_features
        self.num_frames_to_predict_for = (
            num_input_features - 1
            if num_frames_to_predict_for is None
            else num_frames_to_predict_for
        )

        # Built in this exact order so the indices inside `self.net` line up
        # with squeeze, pose0, pose1, pose2.
        self.convs = OrderedDict([
            ("squeeze", nn.Conv2d(self.num_ch_enc[-1], 256, 1)),
            (("pose", 0), nn.Conv2d(num_input_features * 256, 256, 3, stride, 1)),
            (("pose", 1), nn.Conv2d(256, 256, 3, stride, 1)),
            (("pose", 2), nn.Conv2d(256, 6 * self.num_frames_to_predict_for, 1)),
        ])

        self.relu = nn.ReLU()

        self.net = nn.ModuleList(list(self.convs.values()))

    def forward(self, input_features):
        """Return ``(axisangle, translation)`` from a list of feature lists.

        Only the last tensor of every inner list is used. Each output has
        shape ``(-1, num_frames_to_predict_for, 1, 3)``.
        """
        squeeze = self.convs["squeeze"]
        squeezed = [self.relu(squeeze(features[-1])) for features in input_features]
        out = torch.cat(squeezed, 1)

        # ReLU follows the first two pose convs; the last one stays linear.
        out = self.relu(self.convs[("pose", 0)](out))
        out = self.relu(self.convs[("pose", 1)](out))
        out = self.convs[("pose", 2)](out)

        # Average over the two spatial axes, then scale the prediction down.
        out = out.mean(3).mean(2)
        out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6)

        return out[..., :3], out[..., 3:]

resnet_encoder.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# Copyright Niantic 2019. Patent Pending. All rights reserved.
+#
+# This software is licensed under the terms of the Monodepth2 licence
+# which allows for non-commercial use only, the full terms of which are made
+# available in the LICENSE file.
+from __future__ import absolute_import, division, print_function
+import numpy as np
+import torch
+import torch.nn as nn
+import torchvision.models as models
+import torch.utils.model_zoo as model_zoo
+from torchvision.models.resnet import ResNet18_Weights, ResNet50_Weights
class ResNetMultiImageInput(models.ResNet):
    """ResNet whose first convolution accepts several stacked RGB images.

    Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

    Args:
        block: residual block class handed to the torchvision base class.
        layers: number of blocks in each of the four stages.
        num_classes: accepted for signature compatibility; not forwarded to
            the base class.
        num_input_images: number of images stacked along the channel axis;
            ``conv1`` takes ``3 * num_input_images`` channels.
    """

    def __init__(self, block, layers, num_classes=1000, num_input_images=1):
        super(ResNetMultiImageInput, self).__init__(block, layers)

        # Reset the running channel count before rebuilding the stages;
        # `_make_layer` reads and updates it.
        self.inplanes = 64

        # Stem, rebuilt with the widened input convolution.
        self.conv1 = nn.Conv2d(
            3 * num_input_images, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # (planes, stride) for layer1..layer4, built strictly in order.
        stage_settings = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage, (planes, stride) in enumerate(stage_settings):
            setattr(
                self,
                "layer{}".format(stage + 1),
                self._make_layer(block, planes, layers[stage], stride=stride),
            )

        # Re-initialise every convolution and batch-norm layer of the model.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1):
    """Constructs a ResNet model that takes several stacked images as input.

    Args:
        num_layers (int): Number of resnet layers. Must be 18 or 50
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        num_input_images (int): Number of frames stacked as input

    Returns:
        A ``ResNetMultiImageInput`` instance.

    Raises:
        AssertionError: if ``num_layers`` is neither 18 nor 50.
    """
    assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet"
    blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers]
    block_type = {18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck}[num_layers]
    model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images)

    if pretrained:
        # Pick the checkpoint that matches the requested depth. Loading the
        # ResNet-18 weights unconditionally made `num_layers=50` fail in
        # `load_state_dict` because the parameter shapes differ.
        weights = {
            18: ResNet18_Weights.IMAGENET1K_V1,
            50: ResNet50_Weights.IMAGENET1K_V1,
        }[num_layers]
        loaded = model_zoo.load_url(weights.url)
        # Tile the RGB stem filters across all stacked images and divide by
        # the image count.
        loaded['conv1.weight'] = torch.cat(
            [loaded['conv1.weight']] * num_input_images, 1) / num_input_images
        model.load_state_dict(loaded)
    return model
class ResnetEncoder(nn.Module):
    """Pytorch module for a resnet encoder

    Args:
        num_layers: ResNet depth; one of 18, 34, 50, 101, 152.
        pretrained: if truthy, start from ImageNet weights.
        num_input_images: number of images stacked along the channel axis.
            Values above 1 build the encoder via ``resnet_multiimage_input``.
        batch_norm_apply: accepted for signature compatibility; not used.

    Raises:
        ValueError: if ``num_layers`` is not a supported depth.
    """

    def __init__(self, num_layers, pretrained, num_input_images=1, batch_norm_apply = False):
        super(ResnetEncoder, self).__init__()

        # Channel count of each of the five feature maps returned by forward().
        self.num_ch_enc = np.array([64, 64, 128, 256, 512])

        resnets = {18: models.resnet18,
                   34: models.resnet34,
                   50: models.resnet50,
                   101: models.resnet101,
                   152: models.resnet152}

        if num_layers not in resnets:
            raise ValueError("{} is not a valid number of resnet layers".format(num_layers))

        if num_input_images > 1:
            self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images)
        else:
            # Use the `weights=` keyword: passing `pretrained` positionally is
            # the deprecated legacy interface, which maps True to the
            # IMAGENET1K_V1 weights.
            self.encoder = resnets[num_layers](
                weights="IMAGENET1K_V1" if pretrained else None)

        # Depths above 34 have four times as many channels after the stem.
        if num_layers > 34:
            self.num_ch_enc[1:] *= 4

        # When `drop` is true, dropout is applied to the inputs of layer2-4.
        self.drop = True
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, input_image):
        """Return the list of five feature maps (stem, layer1..layer4).

        The list is also stored on ``self.features``. No input normalisation
        is applied here.
        """
        self.features = []
        x = self.encoder.conv1(input_image)
        x = self.encoder.bn1(x)
        self.features.append(self.encoder.relu(x))
        self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1])))

        for stage in (self.encoder.layer2, self.encoder.layer3, self.encoder.layer4):
            x = self.features[-1]
            if self.drop:
                x = self.dropout(x)
            self.features.append(stage(x))

        return self.features