Tej3 committed
Commit 54d726d · 1 Parent(s): 1f8842a

Adding Application, models and ckpt files

app.py ADDED
@@ -0,0 +1,116 @@
+ import gradio as gr
+ import torch
+ from models.pretrained_decv2 import enc_dec_model
+ from models.densenet_v2 import Densenet
+ from models.unet_resnet18 import ResNet18UNet
+ from models.unet_resnet50 import UNetWithResnet50Encoder
+ import numpy as np
+ import cv2
+
+ # kb (KITTI benchmark) cropping: take a 352x1216 window anchored to the
+ # bottom edge and centered horizontally
+ def cropping(img):
+     h_im, w_im = img.shape[:2]
+
+     margin_top = int(h_im - 352)
+     margin_left = int((w_im - 1216) / 2)
+
+     img = img[margin_top: margin_top + 352,
+               margin_left: margin_left + 1216]
+
+     return img
+
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(DEVICE)
+ CWD = "."
+ CKPT_FILE_NAMES = {
+     'Indoor': {
+         'Resnet_enc': 'resnet_nyu_best.ckpt',
+         'Unet': 'resnet18_unet_epoch_08_model_kitti_and_nyu.ckpt',
+         'Densenet_enc': 'densenet_epoch_15_model.ckpt'
+     },
+     'Outdoor': {
+         'Resnet_enc': 'resnet_encdecmodel_epoch_05_model_nyu_and_kitti.ckpt',
+         'Unet': 'resnet50_unet_epoch_02_model_nyuandkitti.ckpt',
+         'Densenet_enc': 'densenet_nyu_then_kitti_epoch_10_model.ckpt'
+     }
+ }
+ MODEL_CLASSES = {
+     'Indoor': {
+         'Resnet_enc': enc_dec_model,
+         'Unet': ResNet18UNet,
+         'Densenet_enc': Densenet
+     },
+     'Outdoor': {
+         'Resnet_enc': enc_dec_model,
+         'Unet': UNetWithResnet50Encoder,
+         'Densenet_enc': Densenet
+     },
+ }
+
+ def load_model(ckpt, model, optimizer=None):
+     ckpt_dict = torch.load(ckpt, map_location='cpu')
+     # keep backward compatibility with checkpoints that store a bare state_dict
+     if 'model' not in ckpt_dict and 'optimizer' not in ckpt_dict:
+         state_dict = ckpt_dict
+     else:
+         state_dict = ckpt_dict['model']
+     # strip the 'module.' prefix left behind by nn.DataParallel
+     weights = {}
+     for key, value in state_dict.items():
+         if key.startswith('module.'):
+             weights[key[len('module.'):]] = value
+         else:
+             weights[key] = value
+
+     model.load_state_dict(weights)
+
+     if optimizer is not None:
+         optimizer_state = ckpt_dict['optimizer']
+         optimizer.load_state_dict(optimizer_state)
+
+
+ def predict(location, model_name, img):
+     ckpt_dir = f"{CWD}/ckpt/{CKPT_FILE_NAMES[location][model_name]}"
+     # NYU (indoor) scenes are capped at 10 m, KITTI (outdoor) at 80 m
+     if location == 'Indoor':
+         max_depth = 10
+     else:
+         max_depth = 80
+     model = MODEL_CLASSES[location][model_name](max_depth).to(DEVICE)
+     load_model(ckpt_dir, model)
+     model.eval()  # inference mode so BatchNorm uses its running statistics
+     # full-size KITTI frames arrive as 375x1242; crop them to the benchmark size
+     if img.shape == (375, 1242, 3):
+         img = cropping(img)
+     img = torch.tensor(img).permute(2, 0, 1).float().to(DEVICE)
+     input_RGB = img.unsqueeze(0)
+     print(input_RGB.shape)
+     with torch.no_grad():
+         pred = model(input_RGB)
+     pred_d = pred['pred_d']
+     pred_d_numpy = pred_d.squeeze().cpu().numpy()
+     # scale to 0-255, taking the max outside the top 15 rows (often sky/outliers)
+     pred_d_numpy = np.clip((pred_d_numpy / pred_d_numpy[15:, :].max()) * 255, 0, 255)
+     pred_d_numpy = pred_d_numpy.astype(np.uint8)
+     pred_d_color = cv2.applyColorMap(pred_d_numpy, cv2.COLORMAP_RAINBOW)
+     pred_d_color = cv2.cvtColor(pred_d_color, cv2.COLOR_BGR2RGB)
+     return pred_d_color
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Monocular Depth Estimation")
+     with gr.Row():
+         location = gr.Radio(choices=['Indoor', 'Outdoor'], value='Indoor', label="Select Location Type")
+         model_name = gr.Radio(['Unet', 'Resnet_enc', 'Densenet_enc'], value="Densenet_enc", label="Select model")
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(label="Input Image for Depth Estimation")
+         with gr.Column():
+             output_depth_map = gr.Image(label="Depth prediction Heatmap")
+     with gr.Row():
+         predict_btn = gr.Button("Generate Depthmap")
+         predict_btn.click(fn=predict, inputs=[location, model_name, input_image], outputs=output_depth_map)
+     with gr.Row():
+         gr.Examples(['./demo_data/Bathroom.jpg', './demo_data/Bedroom.jpg', './demo_data/Bookstore.jpg', './demo_data/Classroom.jpg', './demo_data/Computerlab.jpg', './demo_data/kitti_1.png'], inputs=input_image)
+ demo.launch()
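
For quick testing outside the UI, the same inference path can be replayed against the committed modules. A minimal headless sketch (illustrative, not part of the commit; importing app.py directly would call demo.launch(), so the model is built and loaded by hand here, mirroring load_model's 'module.'-stripping):

```python
import cv2
import torch
from models.densenet_v2 import Densenet

# Load a demo image as RGB, the same layout Gradio hands to predict().
img = cv2.cvtColor(cv2.imread('./demo_data/Bedroom.jpg'), cv2.COLOR_BGR2RGB)

model = Densenet(max_depth=10)  # indoor / NYU depth cap of 10 m
ckpt = torch.load('./ckpt/densenet_epoch_15_model.ckpt', map_location='cpu')
state = ckpt['model'] if 'model' in ckpt else ckpt
model.load_state_dict({k.removeprefix('module.'): v for k, v in state.items()})
model.eval()

x = torch.tensor(img).permute(2, 0, 1).float().unsqueeze(0)  # (1, 3, H, W)
with torch.no_grad():
    depth = model(x)['pred_d']  # (1, 1, H, W), metres in [0, 10]
```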
ckpt/densenet_epoch_15_model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e50a3c6bb7a24e3ece8f323dc759e0822145e81e2770dad0d00e12ac306c37c
+ size 1748720589
ckpt/densenet_nyu_then_kitti_epoch_10_model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bd5ad153cd4363c061d6ca666899b1fe8a2c425fc26423081907cc144d204f5
+ size 1748720589
ckpt/nyudepthv2_swin_base.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1c748dc3e0add9ee18b43dcfa1f2c8d5734d3e523ab7872398f203d5d36b605
+ size 493044547
ckpt/resnet18_unet_epoch_08_model_kitti_and_nyu.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae171c2ab1a22570395a284eca5ecd392b653ab4bbf0f9b5edd6a9dbdbd8d2fc
+ size 215834813
ckpt/resnet50_unet_epoch_02_model_nyuandkitti.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a26301214906364877b1b71bc1dca3e40b4013948a36b8ea1eb8e99cb56ce49
+ size 1774319297
ckpt/resnet_encdecmodel_epoch_05_model_nyu_and_kitti.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa4f12fc178424a4205f241fa9081a8769fe10c36bcc7839dda53bacaa3676d1
+ size 174548419
ckpt/resnet_nyu_best.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fa8bbc121457976cb1c4d2b1396f517befe46a1e578afe53fa9c6ce920ffe48
+ size 210256970
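
The .ckpt entries above are Git LFS pointer files, not the weights themselves: each records the pointer spec version, a sha256 oid, and the payload size in bytes. A small illustrative parser for that three-line format:

```python
# Parse a Git LFS pointer file into its key/value fields.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            fields[key] = value
    return fields

ptr = parse_lfs_pointer('ckpt/resnet_nyu_best.ckpt')
print(ptr['oid'])                    # sha256:7fa8bbc1...
print(int(ptr['size']) / 1e6, 'MB')  # roughly 210 MB
```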
demo_data/Bathroom.jpg ADDED
demo_data/Bedroom.jpg ADDED
demo_data/Bookstore.jpg ADDED
demo_data/Classroom.jpg ADDED
demo_data/Computerlab.jpg ADDED
demo_data/kitti_1.png ADDED
demo_data/kitti_2.png ADDED
demo_data/kitti_3.png ADDED
models/densenet_v2.py ADDED
@@ -0,0 +1,179 @@
+ import torch
+ import torch.nn as nn
+ import torchvision
+ from torchinfo import summary
+
+ class attention_gate(nn.Module):
+     # NOTE: not used by the Densenet model below; kept for reference
+     def __init__(self, in_c, out_c):
+         super().__init__()
+
+         self.Wg = nn.Sequential(
+             nn.Conv2d(in_c[0], out_c, kernel_size=1, padding=0),
+             nn.BatchNorm2d(out_c)
+         )
+         self.Ws = nn.Sequential(
+             nn.Conv2d(in_c[1], out_c, kernel_size=1, padding=0),
+             nn.BatchNorm2d(out_c)
+         )
+         self.relu = nn.ReLU(inplace=True)
+         self.output = nn.Sequential(
+             nn.Conv2d(out_c, out_c, kernel_size=1, padding=0),
+             nn.Sigmoid()
+         )
+
+     def forward(self, g, s):
+         Wg = self.Wg(g)
+         Ws = self.Ws(s)
+         out = self.relu(Wg + Ws)
+         out = self.output(out)
+         return out
+
+ class Conv_Block(nn.Module):
+     def __init__(self, in_c, out_c, activation_fn=nn.LeakyReLU):
+         super().__init__()
+
+         self.conv1 = nn.Conv2d(in_c, out_c, kernel_size=3, padding=1)
+         self.bn1 = nn.BatchNorm2d(out_c)
+
+         self.conv2 = nn.Conv2d(out_c, out_c, kernel_size=3, padding=1)
+         self.bn2 = nn.BatchNorm2d(out_c)
+
+         self.activfn = activation_fn()
+
+         self.dropout = nn.Dropout(0.25)
+
+     def forward(self, inputs):
+         x = self.conv1(inputs)
+         x = self.bn1(x)
+         x = self.activfn(x)
+         # x = self.dropout(x)
+
+         x = self.conv2(x)
+         x = self.bn2(x)
+         x = self.activfn(x)
+         # x = self.dropout(x)
+
+         return x
+
+ class Encoder_Block(nn.Module):
+     def __init__(self, in_c, out_c):
+         super().__init__()
+
+         self.conv = Conv_Block(in_c, out_c)
+         self.pool = nn.MaxPool2d((2, 2))
+
+     def forward(self, inputs):
+         x = self.conv(inputs)
+         p = self.pool(x)
+
+         return x, p
+
+ class Enc_Dec_Model(nn.Module):
+     # NOTE: not used by the demo (app.py imports Densenet below); as written it
+     # expects a skip-connection Decoder_Block variant, so it is kept only for reference.
+     def __init__(self):
+         super(Enc_Dec_Model, self).__init__()
+         self.encoder1 = Encoder_Block(3, 64)
+         self.encoder2 = Encoder_Block(64, 128)
+         self.encoder3 = Encoder_Block(128, 256)
+         # Bottleneck
+         self.bottleneck = Conv_Block(256, 512)
+
+         # Decoder
+         self.d1 = Decoder_Block([512, 256], 256)
+         self.d2 = Decoder_Block([256, 128], 128)
+         self.d3 = Decoder_Block([128, 64], 64)
+
+         # Classifier
+         self.outputs = nn.Conv2d(64, 1, kernel_size=1, padding=0)
+
+     def forward(self, x):
+         # Encoder
+         s1, p1 = self.encoder1(x)
+         s2, p2 = self.encoder2(p1)
+         s3, p3 = self.encoder3(p2)
+
+         # Bottleneck
+         b = self.bottleneck(p3)
+
+         # Decoder
+         d1 = self.d1(b, s3)
+         d2 = self.d2(d1, s2)
+         d3 = self.d3(d2, s1)
+
+         # Classifier
+         outputs = self.outputs(d3)
+         out_depth = torch.sigmoid(outputs)
+         return out_depth
+
+ class Decoder(nn.Module):
+     def __init__(self):
+         super(Decoder, self).__init__()
+
+         # Decoder: five 2x upsampling stages from the 1920-channel DenseNet201 feature map
+         self.d1 = Decoder_Block(1920, 2048)
+         self.d2 = Decoder_Block(2048, 1024)
+         self.d3 = Decoder_Block(1024, 512)
+         self.d4 = Decoder_Block(512, 256)
+         self.d5 = Decoder_Block(256, 128)
+
+         # Classifier
+         self.outputs = nn.Conv2d(128, 1, kernel_size=1, padding=0)
+
+     def forward(self, x):
+         x = self.d1(x)
+         x = self.d2(x)
+         x = self.d3(x)
+         x = self.d4(x)
+         x = self.d5(x)
+
+         # Classifier
+         outputs = self.outputs(x)
+         out_depth = torch.sigmoid(outputs)
+         return out_depth
+
+ class Decoder_Block(nn.Module):
+     def __init__(self, in_c, out_c, activation_fn=nn.LeakyReLU):
+         super().__init__()
+
+         # kernel_size=2, stride=2 doubles the spatial size exactly
+         self.up = nn.ConvTranspose2d(in_c, out_c, kernel_size=2, stride=2, padding=0)
+         self.conv = Conv_Block(out_c, out_c, activation_fn)
+
+     def forward(self, inputs):
+         x = self.up(inputs)
+         x = self.conv(x)
+
+         return x
+
+
+ class Densenet(nn.Module):
+     def __init__(self, max_depth) -> None:
+         super().__init__()
+         self.densenet = torchvision.models.densenet201(weights=torchvision.models.DenseNet201_Weights.DEFAULT)
+         # freeze the pretrained feature extractor
+         for param in self.densenet.features.parameters():
+             param.requires_grad = False
+
+         # keep only the feature extractor, dropping the classification head
+         self.densenet = torch.nn.Sequential(*(list(self.densenet.children())[:-1]))
+         self.decoder = Decoder()
+         self.max_depth = max_depth
+
+     def forward(self, x):
+         x = self.densenet(x)
+         x = self.decoder(x)
+         # sigmoid output in [0, 1] scaled to metres
+         x = x * self.max_depth
+         return {'pred_d': x}
+
+ if __name__ == "__main__":
+     model = Densenet(max_depth=10).cuda()
+     print(model)
+     summary(model, input_size=(64, 3, 448, 448))
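
As a sanity check on the channel bookkeeping (illustrative; downloads the DenseNet201 weights): the truncated backbone maps a 448x448 input to a 1920-channel map at 1/32 resolution, and the five Decoder_Blocks each double the spatial size, so pred_d comes back at the input resolution:

```python
import torch
from models.densenet_v2 import Densenet

model = Densenet(max_depth=10).eval()
x = torch.zeros(1, 3, 448, 448)
with torch.no_grad():
    feats = model.densenet(x)
    print(feats.shape)        # torch.Size([1, 1920, 14, 14])
    pred = model(x)['pred_d']
    print(pred.shape)         # torch.Size([1, 1, 448, 448])
```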
models/pretrained_decv2.py ADDED
@@ -0,0 +1,126 @@
+ import torch
+ import torch.nn as nn
+ import torchvision
+ from torchinfo import summary
+
+ class conv_block(nn.Module):
+     def __init__(self, in_c, out_c, act):
+         super().__init__()
+
+         self.conv1 = nn.Conv2d(in_c, out_c, kernel_size=3, padding=1)
+         self.bn1 = nn.BatchNorm2d(out_c)
+
+         self.conv2 = nn.Conv2d(out_c, out_c, kernel_size=3, padding=1)
+         self.bn2 = nn.BatchNorm2d(out_c)
+
+         if act == 'relu':
+             self.activation = nn.ReLU()
+         elif act == 'sigmoid':
+             self.activation = nn.Sigmoid()
+         else:
+             self.activation = nn.Identity()
+
+     def forward(self, inputs):
+         x = self.conv1(inputs)
+         x = self.bn1(x)
+         x = self.activation(x)
+
+         x = self.conv2(x)
+         x = self.bn2(x)
+         x = self.activation(x)
+
+         return x
+
+ class Decoder_block(nn.Module):
+     def __init__(self, in_channel, out_channel, kernel, stride, padding=1, out_padding=1, act='relu') -> None:
+         super().__init__()
+         # with kernel=3, stride=2, padding=1, output_padding=1 this doubles H and W
+         self.upsample = nn.ConvTranspose2d(in_channels=in_channel,
+                                            out_channels=out_channel,
+                                            kernel_size=kernel,
+                                            stride=stride,
+                                            padding=padding,
+                                            output_padding=out_padding)
+         if act == 'relu':
+             self.activation = nn.ReLU()
+         elif act == 'sigmoid':
+             self.activation = nn.Sigmoid()
+         else:
+             self.activation = nn.Identity()
+
+     def forward(self, x):
+         return self.activation(self.upsample(x))
+
+ class Decoder(nn.Module):
+     def __init__(self, num_layers, channels, kernels, strides, activations) -> None:
+         super().__init__()
+         assert len(channels) - 1 == len(kernels) and len(strides) == len(kernels) and num_layers == len(strides)
+         assert num_layers == len(activations)
+         self.layers = []
+         for i in range(num_layers):
+             self.layers.append(Decoder_block(in_channel=channels[i],
+                                              out_channel=channels[i + 1],
+                                              kernel=kernels[i],
+                                              stride=strides[i],
+                                              act=activations[i]))
+             # refine each upsampled map with a conv block at the same scale
+             self.layers.append(conv_block(in_c=channels[i + 1], out_c=channels[i + 1], act=activations[i]))
+         self.model = nn.Sequential(*self.layers)
+
+     def forward(self, x):
+         return self.model(x)
+
+ class enc_dec_model(nn.Module):
+     def __init__(self, max_depth=10, backbone='resnet', unfreeze=False) -> None:
+         super().__init__()
+         if backbone == 'resnet':
+             self.encoder = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
+             num_layers = 5
+             channels = [2048, 256, 128, 64, 32, 1]
+             kernels = [3, 3, 3, 3, 3]
+             strides = [2, 2, 2, 2, 2]
+             activations = ['relu', 'relu', 'relu', 'relu', 'sigmoid']
+             if unfreeze:
+                 for param in self.encoder.parameters():
+                     param.requires_grad = True
+             else:
+                 # freeze the backbone except the last bottleneck of layer4
+                 for param in self.encoder.parameters():
+                     param.requires_grad = False
+                 for i, child in enumerate(self.encoder.children()):
+                     if i == 7:  # layer4
+                         for j, child2 in enumerate(child.children()):
+                             if j == 2:  # its final Bottleneck
+                                 for param in child2.parameters():
+                                     param.requires_grad = True
+                     if i >= 8:  # avgpool / fc, dropped below anyway
+                         for param in child.parameters():
+                             param.requires_grad = True
+             # drop avgpool and fc, keeping the convolutional feature extractor
+             self.encoder = torch.nn.Sequential(*(list(self.encoder.children())[:-2]))
+
+         self.decoder = Decoder(num_layers=num_layers,
+                                channels=channels,
+                                kernels=kernels,
+                                strides=strides,
+                                activations=activations)
+         self.max_depth = max_depth
+
+     def forward(self, x):
+         x = self.encoder(x)
+         x = self.decoder(x)
+         # sigmoid output in [0, 1] scaled to metres
+         x = x * self.max_depth
+         return {'pred_d': x}
+
+ if __name__ == "__main__":
+     model = enc_dec_model(unfreeze=True).cuda()
+     print(model)
+     summary(model, input_size=(64, 3, 448, 448))
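
Each Decoder_block doubles the spatial size: for nn.ConvTranspose2d, H_out = (H_in - 1) * stride - 2 * padding + kernel_size + output_padding, which with kernel=3, stride=2, padding=1, output_padding=1 reduces to 2 * H_in. Five such blocks carry the 14x14 map that ResNet-50 produces from a 448x448 input (448 / 32) back up to 448x448. A quick illustrative check:

```python
import torch
from models.pretrained_decv2 import Decoder_block

up = Decoder_block(in_channel=2048, out_channel=256, kernel=3, stride=2).eval()
x = torch.zeros(1, 2048, 14, 14)
print(up(x).shape)  # torch.Size([1, 256, 28, 28]): (14-1)*2 - 2 + 3 + 1 = 28
```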
models/unet_resnet18.py ADDED
@@ -0,0 +1,94 @@
+ import torch.nn as nn
+ from torchinfo import summary
+ import torchvision.models
+ import torch
+
+
+ def convrelu(in_channels, out_channels, kernel, padding):
+     return nn.Sequential(
+         nn.Conv2d(in_channels, out_channels, kernel, padding=padding),
+         nn.ReLU(inplace=True),
+     )
+
+
+ class ResNet18UNet(nn.Module):
+     def __init__(self, max_depth, n_class=1):
+         super().__init__()
+
+         self.base_model = torchvision.models.resnet18(pretrained=True)
+         self.base_layers = list(self.base_model.children())
+
+         self.layer0 = nn.Sequential(*self.base_layers[:3])  # size=(N, 64, x.H/2, x.W/2)
+         self.layer0_1x1 = convrelu(64, 64, 1, 0)
+         self.layer1 = nn.Sequential(*self.base_layers[3:5])  # size=(N, 64, x.H/4, x.W/4)
+         self.layer1_1x1 = convrelu(64, 64, 1, 0)
+         self.layer2 = self.base_layers[5]  # size=(N, 128, x.H/8, x.W/8)
+         self.layer2_1x1 = convrelu(128, 128, 1, 0)
+         self.layer3 = self.base_layers[6]  # size=(N, 256, x.H/16, x.W/16)
+         self.layer3_1x1 = convrelu(256, 256, 1, 0)
+         self.layer4 = self.base_layers[7]  # size=(N, 512, x.H/32, x.W/32)
+         self.layer4_1x1 = convrelu(512, 512, 1, 0)
+
+         self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+
+         self.conv_up3 = convrelu(256 + 512, 512, 3, 1)
+         self.conv_up2 = convrelu(128 + 512, 256, 3, 1)
+         self.conv_up1 = convrelu(64 + 256, 256, 3, 1)
+         self.conv_up0 = convrelu(64 + 256, 128, 3, 1)
+
+         self.conv_original_size0 = convrelu(3, 64, 3, 1)
+         self.conv_original_size1 = convrelu(64, 64, 3, 1)
+         self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1)
+
+         self.conv_last = nn.Conv2d(64, n_class, 1)
+
+         self.max_depth = max_depth
+
+     def forward(self, input):
+         # full-resolution branch that is concatenated back in at the end
+         x_original = self.conv_original_size0(input)
+         x_original = self.conv_original_size1(x_original)
+
+         layer0 = self.layer0(input)
+         layer1 = self.layer1(layer0)
+         layer2 = self.layer2(layer1)
+         layer3 = self.layer3(layer2)
+         layer4 = self.layer4(layer3)
+
+         # decoder: upsample, then concatenate the matching encoder skip
+         layer4 = self.layer4_1x1(layer4)
+         x = self.upsample(layer4)
+         layer3 = self.layer3_1x1(layer3)
+         x = torch.cat([x, layer3], dim=1)
+         x = self.conv_up3(x)
+
+         x = self.upsample(x)
+         layer2 = self.layer2_1x1(layer2)
+         x = torch.cat([x, layer2], dim=1)
+         x = self.conv_up2(x)
+
+         x = self.upsample(x)
+         layer1 = self.layer1_1x1(layer1)
+         x = torch.cat([x, layer1], dim=1)
+         x = self.conv_up1(x)
+
+         x = self.upsample(x)
+         layer0 = self.layer0_1x1(layer0)
+         x = torch.cat([x, layer0], dim=1)
+         x = self.conv_up0(x)
+
+         x = self.upsample(x)
+         x = torch.cat([x, x_original], dim=1)
+         x = self.conv_original_size2(x)
+
+         out = self.conv_last(x)
+
+         out_depth = torch.sigmoid(out) * self.max_depth
+
+         return {'pred_d': out_depth}
+
+ if __name__ == "__main__":
+     model = ResNet18UNet(max_depth=10).cuda()
+     summary(model, input_size=(1, 3, 256, 256))
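
Because each decoder stage concatenates the upsampled map with an encoder skip at the matching scale, the input height and width must be divisible by 32 (the encoder's total stride) or the torch.cat calls will see mismatched sizes; the kb cropping in app.py to 352x1216 satisfies this. An illustrative check (downloads the ResNet-18 weights):

```python
import torch
from models.unet_resnet18 import ResNet18UNet

model = ResNet18UNet(max_depth=10).eval()
with torch.no_grad():
    out = model(torch.zeros(1, 3, 224, 224))['pred_d']
print(out.shape)  # torch.Size([1, 1, 224, 224]); 224 = 7 * 32, so the skips align
```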
models/unet_resnet50.py ADDED
@@ -0,0 +1,150 @@
+ import torch
+ import torch.nn as nn
+ from torchinfo import summary
+ import torchvision
+
+
+ class ConvBlock(nn.Module):
+     """
+     Helper module that consists of a Conv -> BN -> ReLU
+     """
+
+     def __init__(self, in_channels, out_channels, padding=1, kernel_size=3, stride=1, with_nonlinearity=True):
+         super().__init__()
+         self.conv = nn.Conv2d(in_channels, out_channels, padding=padding, kernel_size=kernel_size, stride=stride)
+         self.bn = nn.BatchNorm2d(out_channels)
+         self.relu = nn.ReLU()
+         self.with_nonlinearity = with_nonlinearity
+
+     def forward(self, x):
+         x = self.conv(x)
+         x = self.bn(x)
+         if self.with_nonlinearity:
+             x = self.relu(x)
+         return x
+
+
+ class Bridge(nn.Module):
+     """
+     This is the middle layer of the UNet, which just consists of two ConvBlocks.
+     """
+
+     def __init__(self, in_channels, out_channels):
+         super().__init__()
+         self.bridge = nn.Sequential(
+             ConvBlock(in_channels, out_channels),
+             ConvBlock(out_channels, out_channels)
+         )
+
+     def forward(self, x):
+         return self.bridge(x)
+
+
+ class UpBlockForUNetWithResNet50(nn.Module):
+     """
+     Up block that encapsulates one up-sampling step which consists of Upsample -> ConvBlock -> ConvBlock
+     """
+
+     def __init__(self, in_channels, out_channels, up_conv_in_channels=None, up_conv_out_channels=None,
+                  upsampling_method="conv_transpose"):
+         super().__init__()
+
+         if up_conv_in_channels is None:
+             up_conv_in_channels = in_channels
+         if up_conv_out_channels is None:
+             up_conv_out_channels = out_channels
+
+         if upsampling_method == "conv_transpose":
+             self.upsample = nn.ConvTranspose2d(up_conv_in_channels, up_conv_out_channels, kernel_size=2, stride=2)
+         elif upsampling_method == "bilinear":
+             self.upsample = nn.Sequential(
+                 nn.Upsample(mode='bilinear', scale_factor=2),
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
+             )
+         self.conv_block_1 = ConvBlock(in_channels, out_channels)
+         self.conv_block_2 = ConvBlock(out_channels, out_channels)
+
+     def forward(self, up_x, down_x):
+         """
+         :param up_x: this is the output from the previous up block
+         :param down_x: this is the output from the down block
+         :return: upsampled feature map
+         """
+         x = self.upsample(up_x)
+         x = torch.cat([x, down_x], 1)
+         x = self.conv_block_1(x)
+         x = self.conv_block_2(x)
+         return x
+
+
+ class UNetWithResnet50Encoder(nn.Module):
+     DEPTH = 6
+
+     def __init__(self, max_depth, n_classes=1):
+         super().__init__()
+         resnet = torchvision.models.resnet.resnet50(pretrained=True)
+         down_blocks = []
+         up_blocks = []
+         self.input_block = nn.Sequential(*list(resnet.children()))[:3]
+         self.input_pool = list(resnet.children())[3]
+         for bottleneck in list(resnet.children()):
+             if isinstance(bottleneck, nn.Sequential):
+                 down_blocks.append(bottleneck)
+         self.down_blocks = nn.ModuleList(down_blocks)
+         self.bridge = Bridge(2048, 2048)
+         up_blocks.append(UpBlockForUNetWithResNet50(2048, 1024))
+         up_blocks.append(UpBlockForUNetWithResNet50(1024, 512))
+         up_blocks.append(UpBlockForUNetWithResNet50(512, 256))
+         up_blocks.append(UpBlockForUNetWithResNet50(in_channels=128 + 64, out_channels=128,
+                                                     up_conv_in_channels=256, up_conv_out_channels=128))
+         up_blocks.append(UpBlockForUNetWithResNet50(in_channels=64 + 3, out_channels=64,
+                                                     up_conv_in_channels=128, up_conv_out_channels=64))
+
+         self.up_blocks = nn.ModuleList(up_blocks)
+
+         self.out = nn.Conv2d(64, n_classes, kernel_size=1, stride=1)
+
+         self.max_depth = max_depth
+
+     def forward(self, x, with_output_feature_map=False):
+         # cache pre-pool feature maps so the up blocks can concatenate them back in
+         pre_pools = dict()
+         pre_pools["layer_0"] = x
+         x = self.input_block(x)
+         pre_pools["layer_1"] = x
+         x = self.input_pool(x)
+
+         for i, block in enumerate(self.down_blocks, 2):
+             x = block(x)
+             if i == (UNetWithResnet50Encoder.DEPTH - 1):
+                 continue
+             pre_pools[f"layer_{i}"] = x
+
+         x = self.bridge(x)
+
+         for i, block in enumerate(self.up_blocks, 1):
+             key = f"layer_{UNetWithResnet50Encoder.DEPTH - 1 - i}"
+             x = block(x, pre_pools[key])
+         output_feature_map = x
+         x = self.out(x)
+         del pre_pools
+         # if with_output_feature_map:
+         #     return x, output_feature_map
+
+         out_depth = torch.sigmoid(x) * self.max_depth
+
+         return {'pred_d': out_depth}
+
+
+ if __name__ == "__main__":
+     model = UNetWithResnet50Encoder(max_depth=10).cuda()
+     summary(model, input_size=(1, 3, 256, 256))
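
The same divisible-by-32 constraint applies here. With DEPTH = 6 the encoder caches five maps (layer_0 through layer_4); the deepest layer4 output is skipped by the `continue` at i == DEPTH - 1 because the bridge consumes it directly, and the five up blocks then pop the cached maps in reverse order. An illustrative check at the kb-cropped KITTI resolution:

```python
import torch
from models.unet_resnet50 import UNetWithResnet50Encoder

model = UNetWithResnet50Encoder(max_depth=80).eval()  # outdoor / KITTI cap of 80 m
with torch.no_grad():
    out = model(torch.zeros(1, 3, 352, 1216))['pred_d']
print(out.shape)  # torch.Size([1, 1, 352, 1216])
```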