zxhuang1698 committed on
Commit
414b431
1 Parent(s): 1a618eb

initial commit

README.md CHANGED
@@ -1,13 +1,12 @@
  ---
  title: ZeroShape
- emoji: 📚
+ emoji: 🔥
  colorFrom: green
  colorTo: blue
  sdk: gradio
- sdk_version: 4.14.0
+ sdk_version: 4.5.0
+ python_version: 3.10
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
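
For reference, a minimal sketch (not part of the commit) that sanity-checks the Spaces front matter above before pushing; it assumes PyYAML is installed and that README.md sits in the working directory:

import yaml

with open("README.md", encoding="utf-8") as f:
    text = f.read()

# the Spaces config sits between the first two '---' fences
config = yaml.safe_load(text.split("---")[1])
assert config["sdk"] == "gradio" and config["app_file"] == "app.py"
print(config["sdk_version"], config["pinned"], config["license"])
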
app.py ADDED
@@ -0,0 +1,303 @@
+ import gradio as gr
+ import torch
+ import torchvision.transforms.functional as torchvision_F
+ import numpy as np
+ import os
+ import shutil
+ import importlib
+ import trimesh
+ import tempfile
+ import subprocess
+ import utils.options as options
+ import shlex
+ import time
+ import rembg
+
+ from utils.util import EasyDict as edict
+ from PIL import Image
+ from utils.eval_3D import get_dense_3D_grid, compute_level_grid, convert_to_explicit
+
+ def get_1d_bounds(arr):
+     nz = np.flatnonzero(arr)
+     return nz[0], nz[-1]
+
+ def get_bbox_from_mask(mask, thr):
+     masks_for_box = (mask > thr).astype(np.float32)
+     assert masks_for_box.sum() > 0, "Empty mask!"
+     x0, x1 = get_1d_bounds(masks_for_box.sum(axis=-2))
+     y0, y1 = get_1d_bounds(masks_for_box.sum(axis=-1))
+
+     return x0, y0, x1, y1
+
+ def square_crop(image, bbox, crop_ratio=1.):
+     x1, y1, x2, y2 = bbox
+     h, w = y2-y1, x2-x1
+     yc, xc = (y1+y2)/2, (x1+x2)/2
+     S = max(h, w)*1.2
+     scale = S*crop_ratio
+     image = torchvision_F.crop(image, top=int(yc-scale/2), left=int(xc-scale/2), height=int(scale), width=int(scale))
+     return image
+
+ def preprocess_image(opt, image, bbox):
+     image = square_crop(image, bbox=bbox)
+     if image.size[0] != opt.W or image.size[1] != opt.H:
+         image = image.resize((opt.W, opt.H))
+     image = torchvision_F.to_tensor(image)
+     rgb, mask = image[:3], image[3:]
+     if opt.data.bgcolor is not None:
+         # replace background color using mask
+         rgb = rgb * mask + opt.data.bgcolor * (1 - mask)
+     mask = (mask > 0.5).float()
+     return rgb, mask
+
+ def get_image(opt, image_fname, mask_fname):
+     image = Image.open(image_fname).convert("RGB")
+     mask = Image.open(mask_fname).convert("L")
+     mask_np = np.array(mask)
+
+     # binarize
+     mask_np[mask_np <= 127] = 0
+     mask_np[mask_np >= 127] = 1.0
+
+     image = Image.merge("RGBA", (*image.split(), mask))
+     bbox = get_bbox_from_mask(mask_np, 0.5)
+     rgb_input_map, mask_input_map = preprocess_image(opt, image, bbox=bbox)
+     return rgb_input_map, mask_input_map
+
+ def get_intr(opt):
+     # load camera
+     f = 1.3875
+     K = torch.tensor([[f*opt.W, 0, opt.W/2],
+                       [0, f*opt.H, opt.H/2],
+                       [0, 0, 1]]).float()
+     return K
+
+ def get_pixel_grid(H, W, device='cuda'):
+     y_range = torch.arange(H, dtype=torch.float32).to(device)
+     x_range = torch.arange(W, dtype=torch.float32).to(device)
+     Y, X = torch.meshgrid(y_range, x_range, indexing='ij')
+     Z = torch.ones_like(Y).to(device)
+     xyz_grid = torch.stack([X, Y, Z], dim=-1).view(-1, 3)
+     return xyz_grid
+
+ def unproj_depth(depth, intr):
+     '''
+     depth: [B, H, W]
+     intr: [B, 3, 3]
+     '''
+     batch_size, H, W = depth.shape
+     intr = intr.to(depth.device)
+
+     # [B, 3, 3]
+     K_inv = torch.linalg.inv(intr).float()
+     # [1, H*W, 3]
+     pixel_grid = get_pixel_grid(H, W, depth.device).unsqueeze(0)
+     # [B, H*W, 3]
+     pixel_grid = pixel_grid.repeat(batch_size, 1, 1)
+     # [B, 3, H*W]
+     ray_dirs = K_inv @ pixel_grid.permute(0, 2, 1).contiguous()
+     # [B, H*W, 3], in camera coordinates
+     seen_points = ray_dirs.permute(0, 2, 1).contiguous() * depth.view(batch_size, H*W, 1)
+     # [B, H, W, 3]
+     seen_points = seen_points.view(batch_size, H, W, 3)
+     return seen_points
+
+ def prepare_data(opt, image_path, mask_path):
+     var = edict()
+     rgb_input_map, mask_input_map = get_image(opt, image_path, mask_path)
+     intr = get_intr(opt)
+     var.rgb_input_map = rgb_input_map.unsqueeze(0).to(opt.device)
+     var.mask_input_map = mask_input_map.unsqueeze(0).to(opt.device)
+     var.intr = intr.unsqueeze(0).to(opt.device)
+     var.idx = torch.tensor([0]).to(opt.device).long()
+     var.pose_gt = False
+     return var
+
+ @torch.no_grad()
+ def marching_cubes(opt, var, impl_network, visualize_attn=False):
+     points_3D = get_dense_3D_grid(opt, var) # [B, N, N, N, 3]
+     level_vox, attn_vis = compute_level_grid(opt, impl_network, var.latent_depth, var.latent_semantic,
+                                              points_3D, var.rgb_input_map, visualize_attn)
+     if attn_vis: var.attn_vis = attn_vis
+     # occ_grids: a list of length B, each is [N, N, N]
+     *level_grids, = level_vox.cpu().numpy()
+     meshes = convert_to_explicit(opt, level_grids, isoval=0.5, to_pointcloud=False)
+     var.mesh_pred = meshes
+     return var
+
+ @torch.no_grad()
+ def infer_sample(opt, var, graph):
+     var = graph.forward(opt, var, training=False, get_loss=False)
+     var = marching_cubes(opt, var, graph.impl_network, visualize_attn=True)
+     return var.mesh_pred[0]
+
+ def infer(input_image_path, input_mask_path):
+     opt_cmd = options.parse_arguments(["--yaml=options/shape.yaml", "--datadir=examples", "--eval.vox_res=128", "--ckpt=weights/shape.ckpt"])
+     opt = options.set(opt_cmd=opt_cmd, safe_check=False)
+     opt.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+     # build model
+     print("Building model...")
+     opt.pretrain.depth = None
+     opt.arch.depth.pretrained = None
+     module = importlib.import_module("model.compute_graph.graph_shape")
+     graph = module.Graph(opt).to(opt.device)
+
+     # download checkpoint
+     if not os.path.isfile(opt.ckpt):
+         print("Downloading checkpoint...")
+         subprocess.run(
+             shlex.split(
+                 "wget -q -O weights/shape.ckpt https://www.dropbox.com/scl/fi/hv3w9z59dqytievwviko4/shape.ckpt?rlkey=a2gut89kavrldmnt8b3df92oi&dl=0"
+             )
+         )
+
+     # wait if the checkpoint is still downloading
+     while not os.path.isfile(opt.ckpt):
+         time.sleep(1)
+
+     # load checkpoint
+     print("Loading checkpoint...")
+     checkpoint = torch.load(opt.ckpt, map_location=torch.device(opt.device))
+     graph.load_state_dict(checkpoint["graph"], strict=True)
+     graph.eval()
+
+     # load the data
+     print("Loading data...")
+     var = prepare_data(opt, input_image_path, input_mask_path)
+
+     # create the save dir
+     save_folder = os.path.join(opt.datadir, 'preds')
+     if os.path.isdir(save_folder):
+         shutil.rmtree(save_folder)
+     os.makedirs(save_folder)
+     opt.output_path = opt.datadir
+
+     # inference the model and save the results
+     print("Inferencing...")
+     mesh_pred = infer_sample(opt, var, graph)
+     # rotate the mesh upside down
+     mesh_pred.apply_transform(trimesh.transformations.rotation_matrix(np.pi, [1, 0, 0]))
+     mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
+     mesh_pred.export(mesh_path.name, file_type="glb")
+     return mesh_path.name
+
+ def infer_wrapper_mask(input_image_path, input_mask_path):
+     return infer(input_image_path, input_mask_path)
+
+ def infer_wrapper_nomask(input_image_path):
+     input = Image.open(input_image_path)
+     segmented = rembg.remove(input)
+     mask = segmented.split()[-1]
+     mask_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+     mask.save(mask_path.name)
+     return infer(input_image_path, mask_path.name), mask_path.name
+
+
+ def assert_input_image(input_image):
+     if input_image is None:
+         raise gr.Error("No image selected or uploaded!")
+
+ def assert_mask_image(input_mask):
+     if input_mask is None:
+         raise gr.Error("No mask selected or uploaded! Please check the box if you do not have the mask.")
+
+ def demo_gradio():
+     with gr.Blocks(analytics_enabled=False) as demo_ui:
+
+         # HEADERS
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown('# ZeroShape: Regression-based Zero-shot Shape Reconstruction')
+                 gr.Markdown("[\[Arxiv\]](https://arxiv.org/pdf/2312.14198.pdf) | [\[Project\]](https://zixuanh.com/projects/zeroshape.html) | [\[GitHub\]](https://github.com/zxhuang1698/ZeroShape)")
+                 gr.Markdown("Please switch to the \"Estimated Mask\" tab if you do not have the foreground mask. The demo will try to estimate the mask for you.")
+
+         # with mask
+         with gr.Tab("Groundtruth Mask"):
+             with gr.Row():
+                 input_image_tab1 = gr.Image(label="Input Image", image_mode="RGB", sources="upload", type="filepath", elem_id="content_image", width=300)
+                 mask_tab1 = gr.Image(label="Foreground Mask", image_mode="RGB", sources="upload", type="filepath", elem_id="content_image", width=300)
+                 output_mesh_tab1 = gr.Model3D(label="Output Mesh")
+             with gr.Row():
+                 submit_tab1 = gr.Button('Reconstruct', elem_id="recon_button_tab1", variant='primary')
+             # examples
+             with gr.Row():
+                 examples_tab1 = [
+                     ['examples/images/armchair.png', 'examples/masks/armchair.png'],
+                     ['examples/images/bolt.png', 'examples/masks/bolt.png'],
+                     ['examples/images/bucket.png', 'examples/masks/bucket.png'],
+                     ['examples/images/case.png', 'examples/masks/case.png'],
+                     ['examples/images/dispenser.png', 'examples/masks/dispenser.png'],
+                     ['examples/images/hat.png', 'examples/masks/hat.png'],
+                     ['examples/images/teddy_bear.png', 'examples/masks/teddy_bear.png'],
+                     ['examples/images/tiger.png', 'examples/masks/tiger.png'],
+                     ['examples/images/toy.png', 'examples/masks/toy.png'],
+                     ['examples/images/wedding_cake.png', 'examples/masks/wedding_cake.png'],
+                 ]
+                 gr.Examples(
+                     examples=examples_tab1,
+                     inputs=[input_image_tab1, mask_tab1],
+                     outputs=[output_mesh_tab1],
+                     fn=infer_wrapper_mask,
+                     cache_examples=False#os.getenv('SYSTEM') == 'spaces',
+                 )
+         # without mask
+         with gr.Tab("Estimated Mask"):
+             with gr.Row():
+                 input_image_tab2 = gr.Image(label="Input Image", image_mode="RGB", sources="upload", type="filepath", elem_id="content_image", width=300)
+                 mask_tab2 = gr.Image(label="Foreground Mask", image_mode="RGB", sources="upload", type="filepath", elem_id="content_image", width=300)
+                 output_mesh_tab2 = gr.Model3D(label="Output Mesh")
+             with gr.Row():
+                 submit_tab2 = gr.Button('Reconstruct', elem_id="recon_button_tab2", variant='primary')
+             # examples
+             with gr.Row():
+                 examples_tab2 = [
+                     ['examples/images/armchair.png'],
+                     ['examples/images/bolt.png'],
+                     ['examples/images/bucket.png'],
+                     ['examples/images/case.png'],
+                     ['examples/images/dispenser.png'],
+                     ['examples/images/hat.png'],
+                     ['examples/images/teddy_bear.png'],
+                     ['examples/images/tiger.png'],
+                     ['examples/images/toy.png'],
+                     ['examples/images/wedding_cake.png'],
+                 ]
+                 gr.Examples(
+                     examples=examples_tab2,
+                     inputs=[input_image_tab2],
+                     outputs=[output_mesh_tab2, mask_tab2],
+                     fn=infer_wrapper_nomask,
+                     cache_examples=False#os.getenv('SYSTEM') == 'spaces',
+                 )
+
+         submit_tab1.click(
+             fn=assert_input_image,
+             inputs=[input_image_tab1],
+             queue=False
+         ).success(
+             fn=assert_mask_image,
+             inputs=[mask_tab1],
+             queue=False
+         ).success(
+             fn=infer_wrapper_mask,
+             inputs=[input_image_tab1, mask_tab1],
+             outputs=[output_mesh_tab1],
+         )
+
+         submit_tab2.click(
+             fn=assert_input_image,
+             inputs=[input_image_tab2],
+             queue=False
+         ).success(
+             fn=infer_wrapper_nomask,
+             inputs=[input_image_tab2],
+             outputs=[output_mesh_tab2, mask_tab2],
+         )
+
+     return demo_ui
+
+ if __name__ == "__main__":
+     demo_ui = demo_gradio()
+     demo_ui.queue(max_size=10)
+     demo_ui.launch()
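
For reference, a minimal sketch (not part of the commit) of the pinhole unprojection that get_pixel_grid and unproj_depth implement above: each pixel (u, v) is lifted to depth * K^-1 [u, v, 1]^T, using the same focal convention as get_intr (f = 1.3875 times the image size). Sizes below are arbitrary.

import torch

H = W = 4
f = 1.3875
K = torch.tensor([[f * W, 0.0, W / 2],
                  [0.0, f * H, H / 2],
                  [0.0, 0.0, 1.0]])

v, u = torch.meshgrid(torch.arange(H, dtype=torch.float32),
                      torch.arange(W, dtype=torch.float32), indexing="ij")
pixels = torch.stack([u, v, torch.ones_like(u)], dim=-1).view(-1, 3)   # [H*W, 3] homogeneous pixels
depth = torch.full((H * W, 1), 2.0)                                    # constant depth map

points = (torch.linalg.inv(K) @ pixels.T).T * depth                    # camera-frame 3D points
assert torch.allclose(points[:, 2], depth.squeeze(-1))                 # z equals the input depth
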
examples/images/armchair.png ADDED
examples/images/bolt.png ADDED
examples/images/bucket.png ADDED
examples/images/case.png ADDED
examples/images/dispenser.png ADDED
examples/images/hat.png ADDED
examples/images/teddy_bear.png ADDED
examples/images/tiger.png ADDED
examples/images/toy.png ADDED
examples/images/wedding_cake.png ADDED
examples/masks/armchair.png ADDED
examples/masks/bolt.png ADDED
examples/masks/bucket.png ADDED
examples/masks/case.png ADDED
examples/masks/dispenser.png ADDED
examples/masks/hat.png ADDED
examples/masks/teddy_bear.png ADDED
examples/masks/tiger.png ADDED
examples/masks/toy.png ADDED
examples/masks/wedding_cake.png ADDED
model/compute_graph/graph_depth.py ADDED
@@ -0,0 +1,106 @@
+ import torch
+ import torch.nn as nn
+
+ from utils.util import EasyDict as edict
+ from utils.loss import Loss
+ from model.depth.dpt_depth import DPTDepthModel
+ from utils.layers import Bottleneck_Conv
+ from utils.camera import unproj_depth, valid_norm_fac
+
+ class Graph(nn.Module):
+
+     def __init__(self, opt):
+         super().__init__()
+         # define the depth pred model based on omnidata
+         self.dpt_depth = DPTDepthModel(backbone='vitb_rn50_384')
+         if opt.arch.depth.pretrained is not None:
+             checkpoint = torch.load(opt.arch.depth.pretrained, map_location="cuda:{}".format(opt.device))
+             state_dict = checkpoint['model_state_dict']
+             self.dpt_depth.load_state_dict(state_dict)
+
+         if opt.loss_weight.intr is not None:
+             self.intr_feat_channels = 768
+             self.intr_head = nn.Sequential(
+                 Bottleneck_Conv(self.intr_feat_channels, kernel_size=3),
+                 Bottleneck_Conv(self.intr_feat_channels, kernel_size=3),
+             )
+             self.intr_pool = nn.AdaptiveAvgPool2d((1, 1))
+             self.intr_proj = nn.Linear(self.intr_feat_channels, 3)
+             # init the last linear layer so it outputs zeros
+             nn.init.zeros_(self.intr_proj.weight)
+             nn.init.zeros_(self.intr_proj.bias)
+
+         self.loss_fns = Loss(opt)
+
+     def intr_param2mtx(self, opt, intr_params):
+         '''
+         Parameters:
+             opt: config
+             intr_params: [B, 3], [scale_f, delta_cx, delta_cy]
+         Return:
+             intr: [B, 3, 3]
+         '''
+         batch_size = len(intr_params)
+         f = 1.3875
+         intr = torch.zeros(3, 3).float().to(intr_params.device).unsqueeze(0).repeat(batch_size, 1, 1)
+         intr[:, 2, 2] += 1
+         # scale the focal length
+         # range: [-1, 1], symmetric
+         scale_f = torch.tanh(intr_params[:, 0])
+         # range: [1/4, 4], symmetric
+         scale_f = torch.pow(4., scale_f)
+         intr[:, 0, 0] += f * opt.W * scale_f
+         intr[:, 1, 1] += f * opt.H * scale_f
+         # shift the optic center (at most to the image border)
+         shift_cx = torch.tanh(intr_params[:, 1]) * opt.W / 2
+         shift_cy = torch.tanh(intr_params[:, 2]) * opt.H / 2
+         intr[:, 0, 2] += opt.W / 2 + shift_cx
+         intr[:, 1, 2] += opt.H / 2 + shift_cy
+         return intr
+
+     def forward(self, opt, var, training=False, get_loss=True):
+         batch_size = len(var.idx)
+
+         # predict the depth map and feature maps if needed
+         if opt.loss_weight.intr is None:
+             var.depth_pred = self.dpt_depth(var.rgb_input_map)
+         else:
+             var.depth_pred, intr_feat = self.dpt_depth(var.rgb_input_map, get_feat=True)
+             # predict the intrinsics
+             intr_feat = self.intr_head(intr_feat)
+             intr_feat = self.intr_pool(intr_feat).squeeze(-1).squeeze(-1)
+             intr_params = self.intr_proj(intr_feat)
+             # [B, 3, 3]
+             var.intr_pred = self.intr_param2mtx(opt, intr_params)
+
+             # project the predicted depth map to 3D points and normalize, [B, H*W, 3]
+             seen_points_3D_pred = unproj_depth(opt, var.depth_pred, var.intr_pred)
+             seen_points_mean_pred, seen_points_scale_pred = valid_norm_fac(seen_points_3D_pred, var.mask_input_map > 0.5)
+             var.seen_points_pred = (seen_points_3D_pred - seen_points_mean_pred.unsqueeze(1)) / seen_points_scale_pred.unsqueeze(-1).unsqueeze(-1)
+             var.seen_points_pred[(var.mask_input_map<=0.5).view(batch_size, -1)] = 0
+
+             if 'depth_input_map' in var or training:
+                 # project the ground truth depth map to 3D points and normalize, [B, H*W, 3]
+                 seen_points_3D_gt = unproj_depth(opt, var.depth_input_map, var.intr)
+                 seen_points_mean_gt, seen_points_scale_gt = valid_norm_fac(seen_points_3D_gt, var.mask_input_map > 0.5)
+                 var.seen_points_gt = (seen_points_3D_gt - seen_points_mean_gt.unsqueeze(1)) / seen_points_scale_gt.unsqueeze(-1).unsqueeze(-1)
+                 var.seen_points_gt[(var.mask_input_map<=0.5).view(batch_size, -1)] = 0
+
+             # record the validity mask, [B, H*W]
+             var.validity_mask = (var.mask_input_map>0.5).float().view(batch_size, -1)
+
+         # calculate the loss if needed
+         if get_loss:
+             loss = self.compute_loss(opt, var, training)
+             return var, loss
+
+         return var
+
+     def compute_loss(self, opt, var, training=False):
+         loss = edict()
+         if opt.loss_weight.depth is not None:
+             loss.depth = self.loss_fns.depth_loss(var.depth_pred, var.depth_input_map, var.mask_input_map)
+         if opt.loss_weight.intr is not None:
+             loss.intr = self.loss_fns.intr_loss(var.seen_points_pred, var.seen_points_gt, var.validity_mask)
+         return loss
+
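
For reference, a minimal sketch (not part of the commit) of what intr_param2mtx produces at initialization: intr_proj is zero-initialized, so intr_params is all zeros, tanh(0) = 0 gives a focal scale of 4**0 = 1 and a principal point at the image center. W and H below are hypothetical stand-ins for opt.W and opt.H.

import torch

W = H = 224
f = 1.3875
intr_params = torch.zeros(1, 3)                            # [scale_f, delta_cx, delta_cy]

scale_f = torch.pow(4.0, torch.tanh(intr_params[:, 0]))    # == 1 at init
fx, fy = f * W * scale_f, f * H * scale_f                  # 310.8, 310.8
cx = W / 2 + torch.tanh(intr_params[:, 1]) * W / 2         # 112.0
cy = H / 2 + torch.tanh(intr_params[:, 2]) * H / 2         # 112.0
print(fx.item(), fy.item(), cx.item(), cy.item())
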
model/compute_graph/graph_shape.py ADDED
@@ -0,0 +1,202 @@
+ import torch
+ import torch.nn as nn
+
+ from utils.util import EasyDict as edict
+ from utils.loss import Loss
+ from model.shape.implicit import Implicit
+ from model.shape.seen_coord_enc import CoordEncAtt, CoordEncRes
+ from model.shape.rgb_enc import RGBEncAtt, RGBEncRes
+ from model.depth.dpt_depth import DPTDepthModel
+ from utils.util import toggle_grad, interpolate_coordmap, get_child_state_dict
+ from utils.camera import unproj_depth, valid_norm_fac
+ from utils.layers import Bottleneck_Conv
+
+ class Graph(nn.Module):
+
+     def __init__(self, opt):
+         super().__init__()
+         # define the intrinsics head
+         self.intr_feat_channels = 768
+         self.intr_head = nn.Sequential(
+             Bottleneck_Conv(self.intr_feat_channels, kernel_size=3),
+             Bottleneck_Conv(self.intr_feat_channels, kernel_size=3),
+         )
+         self.intr_pool = nn.AdaptiveAvgPool2d((1, 1))
+         self.intr_proj = nn.Linear(self.intr_feat_channels, 3)
+         # init the last linear layer so it outputs zeros
+         nn.init.zeros_(self.intr_proj.weight)
+         nn.init.zeros_(self.intr_proj.bias)
+
+         # define the depth pred model based on omnidata
+         self.dpt_depth = DPTDepthModel(backbone='vitb_rn50_384')
+         # load the pretrained depth model
+         # when intrinsics need to be predicted we need to load that part as well
+         self.load_pretrained_depth(opt)
+         if opt.optim.fix_dpt:
+             toggle_grad(self.dpt_depth, False)
+             toggle_grad(self.intr_head, False)
+             toggle_grad(self.intr_proj, False)
+
+         # encoder that encodes the seen surface into the implicit conditioning vector
+         if opt.arch.depth.encoder == 'resnet':
+             opt.arch.depth.dsp = 1
+             self.coord_encoder = CoordEncRes(opt)
+         else:
+             self.coord_encoder = CoordEncAtt(embed_dim=opt.arch.latent_dim, n_blocks=opt.arch.depth.n_blocks,
+                                              num_heads=opt.arch.num_heads, win_size=opt.arch.win_size//opt.arch.depth.dsp)
+
+         # rgb branch (not used in the final model, kept here for extension)
+         if opt.arch.rgb.encoder == 'resnet':
+             self.rgb_encoder = RGBEncRes(opt)
+         elif opt.arch.rgb.encoder == 'transformer':
+             self.rgb_encoder = RGBEncAtt(img_size=opt.H, embed_dim=opt.arch.latent_dim, n_blocks=opt.arch.rgb.n_blocks,
+                                          num_heads=opt.arch.num_heads, win_size=opt.arch.win_size)
+         else:
+             self.rgb_encoder = None
+
+         # implicit function
+         feat_res = opt.H // opt.arch.win_size
+         self.impl_network = Implicit(feat_res**2, latent_dim=opt.arch.latent_dim*2 if self.rgb_encoder else opt.arch.latent_dim,
+                                      semantic=self.rgb_encoder is not None, n_channels=opt.arch.impl.n_channels,
+                                      n_blocks_attn=opt.arch.impl.att_blocks, n_layers_mlp=opt.arch.impl.mlp_layers,
+                                      num_heads=opt.arch.num_heads, posenc_3D=opt.arch.impl.posenc_3D,
+                                      mlp_ratio=opt.arch.impl.mlp_ratio, skip_in=opt.arch.impl.skip_in,
+                                      pos_perlayer=opt.arch.impl.posenc_perlayer)
+
+         # loss functions
+         self.loss_fns = Loss(opt)
+
+     def load_pretrained_depth(self, opt):
+         if opt.pretrain.depth:
+             # loading from our pretrained depth and intr model
+             if opt.device == 0:
+                 print("loading dpt depth from {}...".format(opt.pretrain.depth))
+             checkpoint = torch.load(opt.pretrain.depth, map_location="cuda:{}".format(opt.device))
+             self.dpt_depth.load_state_dict(get_child_state_dict(checkpoint["graph"], "dpt_depth"))
+             # load the intr head
+             if opt.device == 0:
+                 print("loading pretrained intr from {}...".format(opt.pretrain.depth))
+             self.intr_head.load_state_dict(get_child_state_dict(checkpoint["graph"], "intr_head"))
+             self.intr_proj.load_state_dict(get_child_state_dict(checkpoint["graph"], "intr_proj"))
+         elif opt.arch.depth.pretrained:
+             # loading from omnidata weights
+             if opt.device == 0:
+                 print("loading dpt depth from {}...".format(opt.arch.depth.pretrained))
+             checkpoint = torch.load(opt.arch.depth.pretrained, map_location="cuda:{}".format(opt.device))
+             state_dict = checkpoint['model_state_dict']
+             self.dpt_depth.load_state_dict(state_dict)
+
+     def intr_param2mtx(self, opt, intr_params):
+         '''
+         Parameters:
+             opt: config
+             intr_params: [B, 3], [scale_f, delta_cx, delta_cy]
+         Return:
+             intr: [B, 3, 3]
+         '''
+         batch_size = len(intr_params)
+         f = 1.3875
+         intr = torch.zeros(3, 3).float().to(intr_params.device).unsqueeze(0).repeat(batch_size, 1, 1)
+         intr[:, 2, 2] += 1
+         # scale the focal length
+         # range: [-1, 1], symmetric
+         scale_f = torch.tanh(intr_params[:, 0])
+         # range: [1/4, 4], symmetric
+         scale_f = torch.pow(4., scale_f)
+         intr[:, 0, 0] += f * opt.W * scale_f
+         intr[:, 1, 1] += f * opt.H * scale_f
+         # shift the optic center (at most to the image border)
+         shift_cx = torch.tanh(intr_params[:, 1]) * opt.W / 2
+         shift_cy = torch.tanh(intr_params[:, 2]) * opt.H / 2
+         intr[:, 0, 2] += opt.W / 2 + shift_cx
+         intr[:, 1, 2] += opt.H / 2 + shift_cy
+         return intr
+
+     def forward(self, opt, var, training=False, get_loss=True):
+         batch_size = len(var.idx)
+
+         # encode the rgb, [B, 3, H, W] -> [B, 1+H/(ws)*W/(ws), C], not used in our final model
+         var.latent_semantic = self.rgb_encoder(var.rgb_input_map) if self.rgb_encoder else None
+
+         # predict the depth map and intrinsics
+         var.depth_pred, intr_feat = self.dpt_depth(var.rgb_input_map, get_feat=True)
+         depth_map = var.depth_pred
+         # predict the intrinsics
+         intr_feat = self.intr_head(intr_feat)
+         intr_feat = self.intr_pool(intr_feat).squeeze(-1).squeeze(-1)
+         intr_params = self.intr_proj(intr_feat)
+         # [B, 3, 3]
+         var.intr_pred = self.intr_param2mtx(opt, intr_params)
+         intr_forward = var.intr_pred
+         # record the validity mask, [B, H*W]
+         var.validity_mask = (var.mask_input_map>0.5).float().view(batch_size, -1)
+
+         # project the depth to 3D points in view-centric frame
+         # [B, H*W, 3], in camera coordinates
+         seen_points_3D_pred = unproj_depth(opt, depth_map, intr_forward)
+         # [B, H*W, 3], [B, 1, H, W] (boolean) -> [B, 3], [B]
+         seen_points_mean_pred, seen_points_scale_pred = valid_norm_fac(seen_points_3D_pred, var.mask_input_map > 0.5)
+         # normalize the seen surface, [B, H*W, 3]
+         var.seen_points = (seen_points_3D_pred - seen_points_mean_pred.unsqueeze(1)) / seen_points_scale_pred.unsqueeze(-1).unsqueeze(-1)
+         var.seen_points[(var.mask_input_map<=0.5).view(batch_size, -1)] = 0
+         # [B, 3, H, W]
+         seen_3D_map = var.seen_points.view(batch_size, opt.H, opt.W, 3).permute(0, 3, 1, 2).contiguous()
+         seen_3D_dsp, mask_dsp = interpolate_coordmap(seen_3D_map, var.mask_input_map, (opt.H//opt.arch.depth.dsp, opt.W//opt.arch.depth.dsp))
+
+         # encode the depth, [B, 1, H/k, W/k] -> [B, 1+H/(ws)*W/(ws), C]
+         if opt.arch.depth.encoder == 'resnet':
+             var.latent_depth = self.coord_encoder(seen_3D_dsp, mask_dsp)
+         else:
+             var.latent_depth = self.coord_encoder(seen_3D_dsp.permute(0, 2, 3, 1).contiguous(), mask_dsp.squeeze(1)>0.5)
+
+
+         var.pose = var.pose_gt
+         # forward for loss calculation (only during training)
+         if 'gt_sample_points' in var and 'gt_sample_sdf' in var:
+             with torch.no_grad():
+                 # get the normalizing fac based on the GT seen surface
+                 # project the GT depth to 3D points in view-centric frame
+                 # [B, H*W, 3], in camera coordinates
+                 seen_points_3D_gt = unproj_depth(opt, var.depth_input_map, var.intr)
+                 # [B, H*W, 3], [B, 1, H, W] (boolean) -> [B, 3], [B]
+                 seen_points_mean_gt, seen_points_scale_gt = valid_norm_fac(seen_points_3D_gt, var.mask_input_map > 0.5)
+                 var.seen_points_gt = (seen_points_3D_gt - seen_points_mean_gt.unsqueeze(1)) / seen_points_scale_gt.unsqueeze(-1).unsqueeze(-1)
+                 var.seen_points_gt[(var.mask_input_map<=0.5).view(batch_size, -1)] = 0
+
+                 # transform the GT points accordingly
+                 # [B, 3, 3]
+                 R_gt = var.pose_gt[:, :, :3]
+                 # [B, 3, 1]
+                 T_gt = var.pose_gt[:, :, 3:]
+                 # [B, 3, N]
+                 gt_sample_points_transposed = var.gt_sample_points.permute(0, 2, 1).contiguous()
+                 # camera coordinates, [B, N, 3]
+                 gt_sample_points_cam = (R_gt @ gt_sample_points_transposed + T_gt).permute(0, 2, 1).contiguous()
+                 # normalize with seen std and mean, [B, N, 3]
+                 var.gt_points_cam = (gt_sample_points_cam - seen_points_mean_gt.unsqueeze(1)) / seen_points_scale_gt.unsqueeze(-1).unsqueeze(-1)
+
+                 # get near-surface points for visualization
+                 # [B, 100, 3]
+                 close_surf_idx = torch.topk(var.gt_sample_sdf.abs(), k=100, dim=1, largest=False)[1].unsqueeze(-1).repeat(1, 1, 3)
+                 # [B, 100, 3]
+                 var.gt_surf_points = torch.gather(var.gt_points_cam, dim=1, index=close_surf_idx)
+
+             # [B, N], [B, N, 1+feat_res**2], inference the impl_network for 3D loss
+             var.pred_sample_occ, attn = self.impl_network(var.latent_depth, var.latent_semantic, var.gt_points_cam)
+
+         # calculate the loss if needed
+         if get_loss:
+             loss = self.compute_loss(opt, var, training)
+             return var, loss
+
+         return var
+
+     def compute_loss(self, opt, var, training=False):
+         loss = edict()
+         if opt.loss_weight.depth is not None:
+             loss.depth = self.loss_fns.depth_loss(var.depth_pred, var.depth_input_map, var.mask_input_map)
+         if opt.loss_weight.intr is not None and training:
+             loss.intr = self.loss_fns.intr_loss(var.seen_points, var.seen_points_gt, var.validity_mask)
+         if opt.loss_weight.shape is not None and training:
+             loss.shape = self.loss_fns.shape_loss(var.pred_sample_occ, var.gt_sample_sdf)
+         return loss
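
For reference, a minimal sketch (not part of the commit) of the masked normalization applied to the seen surface in forward(). valid_norm_fac is not included in this diff, so the mean/scale computation below is an assumed stand-in: center the valid points on their mean and divide by their average distance to it, then zero out invalid positions as the forward pass does.

import torch

points = torch.randn(2, 64, 3)          # [B, H*W, 3] unprojected points
valid = torch.rand(2, 64) > 0.5         # [B, H*W] validity mask

normalized = torch.zeros_like(points)
for b in range(points.shape[0]):
    p = points[b][valid[b]]                          # valid points only
    mean = p.mean(dim=0)                             # [3]
    scale = (p - mean).norm(dim=-1).mean()           # scalar
    normalized[b] = (points[b] - mean) / scale
    normalized[b][~valid[b]] = 0                     # zero out invalid positions
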
model/depth/__init__.py ADDED
File without changes
model/depth/base_model.py ADDED
@@ -0,0 +1,17 @@
+ # modified from https://github.com/isl-org/DPT
+ import torch
+
+
+ class BaseModel(torch.nn.Module):
+     def load(self, path):
+         """Load model from file.
+
+         Args:
+             path (str): file path
+         """
+         parameters = torch.load(path, map_location=torch.device('cpu'))
+
+         if "optimizer" in parameters:
+             parameters = parameters["model"]
+
+         self.load_state_dict(parameters)
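
For reference, a minimal sketch (not part of the commit) of the checkpoint layouts BaseModel.load accepts: either a bare state_dict, or a dict that also carries an "optimizer" entry, in which case the weights are read from its "model" entry. The file name is hypothetical.

import torch

net = torch.nn.Linear(4, 2)
torch.save({"model": net.state_dict(), "optimizer": {}}, "ckpt_with_optimizer.pt")

parameters = torch.load("ckpt_with_optimizer.pt", map_location=torch.device("cpu"))
if "optimizer" in parameters:
    parameters = parameters["model"]
net.load_state_dict(parameters)
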
model/depth/blocks.py ADDED
@@ -0,0 +1,343 @@
+ # modified from https://github.com/isl-org/DPT
+ import torch
+ import torch.nn as nn
+
+ from .vit import (
+     _make_pretrained_vitb_rn50_384,
+     _make_pretrained_vitl16_384,
+     _make_pretrained_vitb16_384,
+     forward_vit,
+ )
+
+ def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
+     if backbone == "vitl16_384":
+         pretrained = _make_pretrained_vitl16_384(
+             use_pretrained, hooks=hooks, use_readout=use_readout
+         )
+         scratch = _make_scratch(
+             [256, 512, 1024, 1024], features, groups=groups, expand=expand
+         )  # ViT-L/16 - 85.0% Top1 (backbone)
+     elif backbone == "vitb_rn50_384":
+         pretrained = _make_pretrained_vitb_rn50_384(
+             use_pretrained,
+             hooks=hooks,
+             use_vit_only=use_vit_only,
+             use_readout=use_readout,
+         )
+         scratch = _make_scratch(
+             [256, 512, 768, 768], features, groups=groups, expand=expand
+         )  # ViT-H/16 - 85.0% Top1 (backbone)
+     elif backbone == "vitb16_384":
+         pretrained = _make_pretrained_vitb16_384(
+             use_pretrained, hooks=hooks, use_readout=use_readout
+         )
+         scratch = _make_scratch(
+             [96, 192, 384, 768], features, groups=groups, expand=expand
+         )  # ViT-B/16 - 84.6% Top1 (backbone)
+     elif backbone == "resnext101_wsl":
+         pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
+         scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand)  # efficientnet_lite3
+     elif backbone == "efficientnet_lite3":
+         pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
+         scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand)  # efficientnet_lite3
+     else:
+         print(f"Backbone '{backbone}' not implemented")
+         assert False
+
+     return pretrained, scratch
+
+
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+     scratch = nn.Module()
+
+     out_shape1 = out_shape
+     out_shape2 = out_shape
+     out_shape3 = out_shape
+     out_shape4 = out_shape
+     if expand==True:
+         out_shape1 = out_shape
+         out_shape2 = out_shape*2
+         out_shape3 = out_shape*4
+         out_shape4 = out_shape*8
+
+     scratch.layer1_rn = nn.Conv2d(
+         in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+     )
+     scratch.layer2_rn = nn.Conv2d(
+         in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+     )
+     scratch.layer3_rn = nn.Conv2d(
+         in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+     )
+     scratch.layer4_rn = nn.Conv2d(
+         in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+     )
+
+     return scratch
+
+
+ def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
+     efficientnet = torch.hub.load(
+         "rwightman/gen-efficientnet-pytorch",
+         "tf_efficientnet_lite3",
+         pretrained=use_pretrained,
+         exportable=exportable
+     )
+     return _make_efficientnet_backbone(efficientnet)
+
+
+ def _make_efficientnet_backbone(effnet):
+     pretrained = nn.Module()
+
+     pretrained.layer1 = nn.Sequential(
+         effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
+     )
+     pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
+     pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
+     pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
+
+     return pretrained
+
+
+ def _make_resnet_backbone(resnet):
+     pretrained = nn.Module()
+     pretrained.layer1 = nn.Sequential(
+         resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
+     )
+
+     pretrained.layer2 = resnet.layer2
+     pretrained.layer3 = resnet.layer3
+     pretrained.layer4 = resnet.layer4
+
+     return pretrained
+
+
+ def _make_pretrained_resnext101_wsl(use_pretrained):
+     resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
+     return _make_resnet_backbone(resnet)
+
+
+
+ class Interpolate(nn.Module):
+     """Interpolation module.
+     """
+
+     def __init__(self, scale_factor, mode, align_corners=False):
+         """Init.
+
+         Args:
+             scale_factor (float): scaling
+             mode (str): interpolation mode
+         """
+         super(Interpolate, self).__init__()
+
+         self.interp = nn.functional.interpolate
+         self.scale_factor = scale_factor
+         self.mode = mode
+         self.align_corners = align_corners
+
+     def forward(self, x):
+         """Forward pass.
+
+         Args:
+             x (tensor): input
+
+         Returns:
+             tensor: interpolated data
+         """
+
+         x = self.interp(
+             x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
+         )
+
+         return x
+
+
+ class ResidualConvUnit(nn.Module):
+     """Residual convolution module.
+     """
+
+     def __init__(self, features):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super().__init__()
+
+         self.conv1 = nn.Conv2d(
+             features, features, kernel_size=3, stride=1, padding=1, bias=True
+         )
+
+         self.conv2 = nn.Conv2d(
+             features, features, kernel_size=3, stride=1, padding=1, bias=True
+         )
+
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         """Forward pass.
+
+         Args:
+             x (tensor): input
+
+         Returns:
+             tensor: output
+         """
+         out = self.relu(x)
+         out = self.conv1(out)
+         out = self.relu(out)
+         out = self.conv2(out)
+
+         return out + x
+
+
+ class FeatureFusionBlock(nn.Module):
+     """Feature fusion block.
+     """
+
+     def __init__(self, features):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super(FeatureFusionBlock, self).__init__()
+
+         self.resConfUnit1 = ResidualConvUnit(features)
+         self.resConfUnit2 = ResidualConvUnit(features)
+
+     def forward(self, *xs):
+         """Forward pass.
+
+         Returns:
+             tensor: output
+         """
+         output = xs[0]
+
+         if len(xs) == 2:
+             output += self.resConfUnit1(xs[1])
+
+         output = self.resConfUnit2(output)
+
+         output = nn.functional.interpolate(
+             output, scale_factor=2, mode="bilinear", align_corners=True
+         )
+
+         return output
+
+
+
+
+ class ResidualConvUnit_custom(nn.Module):
+     """Residual convolution module.
+     """
+
+     def __init__(self, features, activation, bn):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super().__init__()
+
+         self.bn = bn
+
+         self.groups=1
+
+         self.conv1 = nn.Conv2d(
+             features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+         )
+
+         self.conv2 = nn.Conv2d(
+             features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+         )
+
+         if self.bn==True:
+             self.bn1 = nn.BatchNorm2d(features)
+             self.bn2 = nn.BatchNorm2d(features)
+
+         self.activation = activation
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+     def forward(self, x):
+         """Forward pass.
+
+         Args:
+             x (tensor): input
+
+         Returns:
+             tensor: output
+         """
+
+         out = self.activation(x)
+         out = self.conv1(out)
+         if self.bn==True:
+             out = self.bn1(out)
+
+         out = self.activation(out)
+         out = self.conv2(out)
+         if self.bn==True:
+             out = self.bn2(out)
+
+         if self.groups > 1:
+             out = self.conv_merge(out)
+
+         return self.skip_add.add(out, x)
+
+         # return out + x
+
+
+ class FeatureFusionBlock_custom(nn.Module):
+     """Feature fusion block.
+     """
+
+     def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super(FeatureFusionBlock_custom, self).__init__()
+
+         self.deconv = deconv
+         self.align_corners = align_corners
+
+         self.groups=1
+
+         self.expand = expand
+         out_features = features
+         if self.expand==True:
+             out_features = features//2
+
+         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+         self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+         self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+     def forward(self, *xs):
+         """Forward pass.
+
+         Returns:
+             tensor: output
+         """
+         output = xs[0]
+
+         if len(xs) == 2:
+             res = self.resConfUnit1(xs[1])
+             output = self.skip_add.add(output, res)
+             # output += res
+
+         output = self.resConfUnit2(output)
+
+         output = nn.functional.interpolate(
+             output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
+         )
+
+         output = self.out_conv(output)
+
+         return output
+
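
For reference, a minimal sketch (not part of the commit) of how the decoder uses FeatureFusionBlock_custom: with two inputs, the lateral feature map is refined and added to the first, the result is refined again and upsampled by 2x, and with expand=False the channel count is unchanged. Tensor sizes are arbitrary.

import torch
import torch.nn as nn
from model.depth.blocks import FeatureFusionBlock_custom

block = FeatureFusionBlock_custom(256, nn.ReLU(False), deconv=False, bn=False,
                                  expand=False, align_corners=True)
deep = torch.randn(1, 256, 12, 12)      # coarser decoder features
skip = torch.randn(1, 256, 12, 12)      # lateral features at the same resolution

fused = block(deep, skip)
print(fused.shape)                      # torch.Size([1, 256, 24, 24])
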
model/depth/dpt_depth.py ADDED
@@ -0,0 +1,123 @@
+ # modified from https://github.com/isl-org/DPT
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from .base_model import BaseModel
+ from .blocks import (
+     FeatureFusionBlock,
+     FeatureFusionBlock_custom,
+     Interpolate,
+     _make_encoder,
+     forward_vit,
+ )
+
+
+ def _make_fusion_block(features, use_bn):
+     return FeatureFusionBlock_custom(
+         features,
+         nn.ReLU(False),
+         deconv=False,
+         bn=use_bn,
+         expand=False,
+         align_corners=True,
+     )
+
+
+ class DPT(BaseModel):
+     def __init__(
+         self,
+         head,
+         features=256,
+         backbone="vitb_rn50_384",
+         readout="project",
+         channels_last=False,
+         use_bn=False,
+     ):
+
+         super(DPT, self).__init__()
+
+         self.channels_last = channels_last
+
+         hooks = {
+             "vitb_rn50_384": [0, 1, 8, 11],
+             "vitb16_384": [2, 5, 8, 11],
+             "vitl16_384": [5, 11, 17, 23],
+         }
+
+         # Instantiate backbone and reassemble blocks
+         self.pretrained, self.scratch = _make_encoder(
+             backbone,
+             features,
+             True,  # Set to true if you want to train from scratch, uses ImageNet weights
+             groups=1,
+             expand=False,
+             exportable=False,
+             hooks=hooks[backbone],
+             use_readout=readout,
+         )
+
+         self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+         self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+         self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+         self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+
+         self.scratch.output_conv = head
+
+
+     def forward(self, x, get_feat=False):
+         if self.channels_last == True:
+             x.contiguous(memory_format=torch.channels_last)
+
+         layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
+
+         # res 8x -> 4x -> 2x -> 1x base_size
+         # base_size = H / 32
+         # all n_channels same (256 by default) after these
+         layer_1_rn = self.scratch.layer1_rn(layer_1)
+         layer_2_rn = self.scratch.layer2_rn(layer_2)
+         layer_3_rn = self.scratch.layer3_rn(layer_3)
+         layer_4_rn = self.scratch.layer4_rn(layer_4)
+
+         # upsample by two without changing n_channels each time, conv-sum for fusing
+         path_4 = self.scratch.refinenet4(layer_4_rn)
+         path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+         path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+         path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+         out = self.scratch.output_conv(path_1)
+
+         # save the feat if required
+         if get_feat:
+             return out, layer_4
+
+         return out
+
+ class DPTDepthModel(DPT):
+     def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs):
+         features = kwargs["features"] if "features" in kwargs else 256
+
+         head = nn.Sequential(
+             nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
+             Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+             nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
+             nn.ReLU(True),
+             nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0),
+             nn.ReLU(True) if non_negative else nn.Identity(),
+             nn.Identity(),
+         )
+         nn.init.constant_(head[-3].bias, 0.05)
+         super().__init__(head, **kwargs)
+
+         if path is not None:
+             self.load(path)
+
+     def forward(self, image, get_feat=False):
+         x = image * 2 - 1
+         if get_feat:
+             output, feat = super().forward(x, get_feat=get_feat)
+             output = output.clamp(min=0, max=1)
+             return output, feat
+         else:
+             output = super().forward(x, get_feat=get_feat).clamp(min=0, max=1)
+             return output
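
For reference, a minimal sketch (not part of the commit) of the DPTDepthModel contract relied on by the compute graphs above: RGB input in [0, 1] (remapped internally to [-1, 1]), depth output clamped to [0, 1], and get_feat=True additionally returning the deepest backbone feature map (768 channels at 1/32 resolution for vitb_rn50_384) that feeds the intrinsics head. Note that constructing the model downloads pretrained backbone weights through timm.

import torch
from model.depth.dpt_depth import DPTDepthModel

model = DPTDepthModel(backbone="vitb_rn50_384").eval()
image = torch.rand(1, 3, 224, 224)              # RGB in [0, 1]

with torch.no_grad():
    depth, feat = model(image, get_feat=True)

print(depth.shape, float(depth.min()), float(depth.max()))   # [1, 1, 224, 224], values in [0, 1]
print(feat.shape)                                            # [1, 768, 7, 7]
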
model/depth/midas_loss.py ADDED
@@ -0,0 +1,185 @@
+ # modified from https://github.com/EPFL-VILAB/omnidata
+ import torch
+ import torch.nn as nn
+ import numpy as np
+
+ def masked_l1_loss(preds, target, mask_valid):
+     element_wise_loss = abs(preds - target)
+     element_wise_loss[~mask_valid] = 0
+     return element_wise_loss.sum() / (mask_valid.sum() + 1.e-6)
+
+ def compute_scale_and_shift(prediction, target, mask):
+     # system matrix: A = [[a_00, a_01], [a_10, a_11]]
+     a_00 = torch.sum(mask * prediction * prediction, (1, 2))
+     a_01 = torch.sum(mask * prediction, (1, 2))
+     a_11 = torch.sum(mask, (1, 2))
+
+     # right hand side: b = [b_0, b_1]
+     b_0 = torch.sum(mask * prediction * target, (1, 2))
+     b_1 = torch.sum(mask * target, (1, 2))
+
+     # solution: x = A^-1 . b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b
+     x_0 = torch.zeros_like(b_0)
+     x_1 = torch.zeros_like(b_1)
+
+     det = a_00 * a_11 - a_01 * a_01
+     valid = det.nonzero()
+
+     x_0[valid] = (a_11[valid] * b_0[valid] - a_01[valid] * b_1[valid]) / (det[valid] + 1e-6)
+     x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / (det[valid] + 1e-6)
+
+     return x_0, x_1
+
+
+ def masked_shift_and_scale(depth_preds, depth_gt, mask_valid):
+     depth_preds_nan = depth_preds.clone()
+     depth_gt_nan = depth_gt.clone()
+     depth_preds_nan[~mask_valid] = np.nan
+     depth_gt_nan[~mask_valid] = np.nan
+
+     mask_diff = mask_valid.view(mask_valid.size()[:2] + (-1,)).sum(-1, keepdims=True) + 1
+
+     # flatten spatial dimension and take valid median [B, 1, 1, 1]
+     t_gt = depth_gt_nan.view(depth_gt_nan.size()[:2] + (-1,)).nanmedian(-1, keepdims=True)[0].unsqueeze(-1)
+     t_gt[torch.isnan(t_gt)] = 0
+     # subtract median and set invalid positions to 0
+     diff_gt = torch.abs(depth_gt - t_gt)
+     diff_gt[~mask_valid] = 0
+     # get the avg abs diff value over valid regions [B, 1, 1, 1]
+     s_gt = (diff_gt.view(diff_gt.size()[:2] + (-1,)).sum(-1, keepdims=True) / mask_diff).unsqueeze(-1)
+     # normalize
+     depth_gt_aligned = (depth_gt - t_gt) / (s_gt + 1e-6)
+
+     # same as gt normalization
+     t_pred = depth_preds_nan.view(depth_preds_nan.size()[:2] + (-1,)).nanmedian(-1, keepdims=True)[0].unsqueeze(-1)
+     t_pred[torch.isnan(t_pred)] = 0
+     diff_pred = torch.abs(depth_preds - t_pred)
+     diff_pred[~mask_valid] = 0
+     s_pred = (diff_pred.view(diff_pred.size()[:2] + (-1,)).sum(-1, keepdims=True) / mask_diff).unsqueeze(-1)
+     depth_pred_aligned = (depth_preds - t_pred) / (s_pred + 1e-6)
+
+     return depth_pred_aligned, depth_gt_aligned
+
+
+ def reduction_batch_based(image_loss, M):
+     # average of all valid pixels of the batch
+
+     # avoid division by 0 (if sum(M) = sum(sum(mask)) = 0: sum(image_loss) = 0)
+     divisor = torch.sum(M)
+
+     if divisor == 0:
+         return 0
+     else:
+         return torch.sum(image_loss) / divisor
+
+
+ def reduction_image_based(image_loss, M):
+     # mean of average of valid pixels of an image
+
+     # avoid division by 0 (if M = sum(mask) = 0: image_loss = 0)
+     valid = M.nonzero()
+
+     image_loss[valid] = image_loss[valid] / M[valid]
+
+     return torch.mean(image_loss)
+
+
+
+ def gradient_loss(prediction, target, mask, reduction=reduction_batch_based):
+
+     M = torch.sum(mask, (1, 2))
+
+     diff = prediction - target
+     diff = torch.mul(mask, diff)
+
+     grad_x = torch.abs(diff[:, :, 1:] - diff[:, :, :-1])
+     mask_x = torch.mul(mask[:, :, 1:], mask[:, :, :-1])
+     grad_x = torch.mul(mask_x, grad_x)
+
+     grad_y = torch.abs(diff[:, 1:, :] - diff[:, :-1, :])
+     mask_y = torch.mul(mask[:, 1:, :], mask[:, :-1, :])
+     grad_y = torch.mul(mask_y, grad_y)
+
+     image_loss = torch.sum(grad_x, (1, 2)) + torch.sum(grad_y, (1, 2))
+
+     return reduction(image_loss, M)
+
+
+
+ class SSIMAE(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, depth_preds, depth_gt, mask_valid):
+         depth_pred_aligned, depth_gt_aligned = masked_shift_and_scale(depth_preds, depth_gt, mask_valid)
+         ssi_mae_loss = masked_l1_loss(depth_pred_aligned, depth_gt_aligned, mask_valid)
+         return ssi_mae_loss
+
+
+ class GradientMatchingTerm(nn.Module):
+     def __init__(self, scales=4, reduction='batch-based'):
+         super().__init__()
+
+         if reduction == 'batch-based':
+             self.__reduction = reduction_batch_based
+         else:
+             self.__reduction = reduction_image_based
+
+         self.__scales = scales
+
+     def forward(self, prediction, target, mask):
+         total = 0
+
+         for scale in range(self.__scales):
+             step = pow(2, scale)
+
+             total += gradient_loss(prediction[:, ::step, ::step], target[:, ::step, ::step],
+                                    mask[:, ::step, ::step], reduction=self.__reduction)
+
+         return total
+
+
+ class MidasLoss(nn.Module):
+     def __init__(self, alpha=0.1, scales=4, reduction='image-based', inverse_depth=True, shrink_mask=False):
+         super().__init__()
+
+         self.__ssi_mae_loss = SSIMAE()
+         self.__gradient_matching_term = GradientMatchingTerm(scales=scales, reduction=reduction)
+         self.__alpha = alpha
+         self.inverse_depth = inverse_depth
+         self.shrink_mask = shrink_mask
+
+     # decrease valid region via min-pooling
+     @torch.no_grad()
+     def erode_mask(self, mask, max_pool_size=4):
+         mask_float = mask.float()
+         h, w = mask_float.shape[2], mask_float.shape[3]
+         mask_float = 1 - mask_float
+         mask_float = torch.nn.functional.max_pool2d(mask_float, kernel_size=max_pool_size)
+         mask_float = torch.nn.functional.interpolate(mask_float, (h, w), mode='nearest')
+         # only if a 4x4 region is all valid then we make that valid
+         mask_valid = mask_float == 0
+         return mask_valid
+
+     def forward(self, prediction_raw, target_raw, mask_raw):
+         if self.shrink_mask:
+             mask = self.erode_mask(mask_raw)
+         else:
+             mask = mask_raw > 0.5
+         ssi_loss = self.__ssi_mae_loss(prediction_raw, target_raw, mask)
+         if self.__alpha <= 0:
+             return ssi_loss
+
+         if self.inverse_depth:
+             prediction = 1 / (prediction_raw.squeeze(1) + 1e-6)
+             target = 1 / (target_raw.squeeze(1) + 1e-6)
+         else:
+             prediction = prediction_raw.squeeze(1)
+             target = target_raw.squeeze(1)
+         # gradient loss
+         scale, shift = compute_scale_and_shift(prediction, target, mask.squeeze(1))
+         prediction_ssi = scale.view(-1, 1, 1) * prediction + shift.view(-1, 1, 1)
+         reg_loss = self.__gradient_matching_term(prediction_ssi, target, mask.squeeze(1))
+         total = ssi_loss + self.__alpha * reg_loss
+
+         return total
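
For reference, a minimal sketch (not part of the commit) checking the closed-form least-squares alignment in compute_scale_and_shift: when the target is an affine transform of the prediction, the recovered per-image scale and shift match it on the valid region.

import torch
from model.depth.midas_loss import compute_scale_and_shift

prediction = torch.rand(2, 8, 8)
target = 2.0 * prediction + 1.0               # known scale and shift
mask = torch.ones_like(prediction)            # every pixel valid

scale, shift = compute_scale_and_shift(prediction, target, mask)
print(scale, shift)                           # approximately [2.0, 2.0] and [1.0, 1.0]
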
model/depth/vit.py ADDED
@@ -0,0 +1,492 @@
1
+ # modified from https://github.com/isl-org/DPT
2
+ import torch
3
+ import torch.nn as nn
4
+ import timm
5
+ import types
6
+ import math
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class Slice(nn.Module):
11
+ def __init__(self, start_index=1):
12
+ super(Slice, self).__init__()
13
+ self.start_index = start_index
14
+
15
+ def forward(self, x):
16
+ return x[:, self.start_index :]
17
+
18
+
19
+ class AddReadout(nn.Module):
20
+ def __init__(self, start_index=1):
21
+ super(AddReadout, self).__init__()
22
+ self.start_index = start_index
23
+
24
+ def forward(self, x):
25
+ if self.start_index == 2:
26
+ readout = (x[:, 0] + x[:, 1]) / 2
27
+ else:
28
+ readout = x[:, 0]
29
+ return x[:, self.start_index :] + readout.unsqueeze(1)
30
+
31
+
32
+ class ProjectReadout(nn.Module):
33
+ def __init__(self, in_features, start_index=1):
34
+ super(ProjectReadout, self).__init__()
35
+ self.start_index = start_index
36
+
37
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
38
+
39
+ def forward(self, x):
40
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
41
+ features = torch.cat((x[:, self.start_index :], readout), -1)
42
+
43
+ return self.project(features)
44
+
45
+
46
+ class Transpose(nn.Module):
47
+ def __init__(self, dim0, dim1):
48
+ super(Transpose, self).__init__()
49
+ self.dim0 = dim0
50
+ self.dim1 = dim1
51
+
52
+ def forward(self, x):
53
+ x = x.transpose(self.dim0, self.dim1).contiguous()
54
+ return x
55
+
56
+
57
+ def forward_vit(pretrained, x):
58
+ b, c, h, w = x.shape
59
+
60
+ glob = pretrained.model.forward_flex(x)
61
+
62
+ layer_1 = pretrained.activations["1"]
63
+ layer_2 = pretrained.activations["2"]
64
+ layer_3 = pretrained.activations["3"]
65
+ layer_4 = pretrained.activations["4"]
66
+
67
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
68
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
69
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
70
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
71
+
72
+ unflatten = nn.Sequential(
73
+ nn.Unflatten(
74
+ 2,
75
+ torch.Size(
76
+ [
77
+ h // pretrained.model.patch_size[1],
78
+ w // pretrained.model.patch_size[0],
79
+ ]
80
+ ),
81
+ )
82
+ )
83
+
84
+ if layer_1.ndim == 3:
85
+ layer_1 = unflatten(layer_1)
86
+ if layer_2.ndim == 3:
87
+ layer_2 = unflatten(layer_2)
88
+ if layer_3.ndim == 3:
89
+ layer_3 = unflatten(layer_3)
90
+ if layer_4.ndim == 3:
91
+ layer_4 = unflatten(layer_4)
92
+
93
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
94
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
95
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
96
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
97
+
98
+ return layer_1, layer_2, layer_3, layer_4
99
+
100
+
101
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
102
+ posemb_tok, posemb_grid = (
103
+ posemb[:, : self.start_index],
104
+ posemb[0, self.start_index :],
105
+ )
106
+
107
+ gs_old = int(math.sqrt(len(posemb_grid)))
108
+
109
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2).contiguous()
110
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear", align_corners=False)
111
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1).contiguous()
112
+
113
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
114
+
115
+ return posemb
116
+
117
+
118
+ def forward_flex(self, x):
119
+ b, c, h, w = x.shape
120
+
121
+ pos_embed = self._resize_pos_embed(
122
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
123
+ )
124
+
125
+ B = x.shape[0]
126
+
127
+ if hasattr(self.patch_embed, "backbone"):
128
+ x = self.patch_embed.backbone(x)
129
+ if isinstance(x, (list, tuple)):
130
+ x = x[-1] # last feature if backbone outputs list/tuple of features
131
+
132
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2).contiguous()
133
+
134
+ if getattr(self, "dist_token", None) is not None:
135
+ cls_tokens = self.cls_token.expand(
136
+ B, -1, -1
137
+ ) # stole cls_tokens impl from Phil Wang, thanks
138
+ dist_token = self.dist_token.expand(B, -1, -1)
139
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
140
+ else:
141
+ cls_tokens = self.cls_token.expand(
142
+ B, -1, -1
143
+ ) # stole cls_tokens impl from Phil Wang, thanks
144
+ x = torch.cat((cls_tokens, x), dim=1)
145
+
146
+ x = x + pos_embed
147
+ x = self.pos_drop(x)
148
+
149
+ for blk in self.blocks:
150
+ x = blk(x)
151
+
152
+ x = self.norm(x)
153
+
154
+ return x
155
+
156
+
157
+ activations = {}
158
+
159
+
160
+ def get_activation(name):
161
+ def hook(model, input, output):
162
+ activations[name] = output
163
+
164
+ return hook
165
+
166
+
167
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
168
+ if use_readout == "ignore":
169
+ readout_oper = [Slice(start_index)] * len(features)
170
+ elif use_readout == "add":
171
+ readout_oper = [AddReadout(start_index)] * len(features)
172
+ elif use_readout == "project":
173
+ readout_oper = [
174
+ ProjectReadout(vit_features, start_index) for out_feat in features
175
+ ]
176
+ else:
177
+ assert (
178
+ False
179
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
180
+
181
+ return readout_oper
182
+
183
+
184
+ def _make_vit_b16_backbone(
185
+ model,
186
+ features=[96, 192, 384, 768],
187
+ size=[384, 384],
188
+ hooks=[2, 5, 8, 11],
189
+ vit_features=768,
190
+ use_readout="ignore",
191
+ start_index=1,
192
+ ):
193
+ pretrained = nn.Module()
194
+
195
+ pretrained.model = model
196
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
197
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
198
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
199
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
200
+
201
+ pretrained.activations = activations
202
+
203
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
204
+
205
+ # 32, 48, 136, 384
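+ # the four act_postprocess heads turn the token sequence back into CNN-style feature
+ # maps: a 1x1 conv sets the channel count and a (transposed) conv rescales the 1/16
+ # patch grid to strides 4, 8, 16 and 32 for the downstream decoder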
206
+ pretrained.act_postprocess1 = nn.Sequential(
207
+ readout_oper[0],
208
+ Transpose(1, 2),
209
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
210
+ nn.Conv2d(
211
+ in_channels=vit_features,
212
+ out_channels=features[0],
213
+ kernel_size=1,
214
+ stride=1,
215
+ padding=0,
216
+ ),
217
+ nn.ConvTranspose2d(
218
+ in_channels=features[0],
219
+ out_channels=features[0],
220
+ kernel_size=4,
221
+ stride=4,
222
+ padding=0,
223
+ bias=True,
224
+ dilation=1,
225
+ groups=1,
226
+ ),
227
+ )
228
+
229
+ pretrained.act_postprocess2 = nn.Sequential(
230
+ readout_oper[1],
231
+ Transpose(1, 2),
232
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
233
+ nn.Conv2d(
234
+ in_channels=vit_features,
235
+ out_channels=features[1],
236
+ kernel_size=1,
237
+ stride=1,
238
+ padding=0,
239
+ ),
240
+ nn.ConvTranspose2d(
241
+ in_channels=features[1],
242
+ out_channels=features[1],
243
+ kernel_size=2,
244
+ stride=2,
245
+ padding=0,
246
+ bias=True,
247
+ dilation=1,
248
+ groups=1,
249
+ ),
250
+ )
251
+
252
+ pretrained.act_postprocess3 = nn.Sequential(
253
+ readout_oper[2],
254
+ Transpose(1, 2),
255
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
256
+ nn.Conv2d(
257
+ in_channels=vit_features,
258
+ out_channels=features[2],
259
+ kernel_size=1,
260
+ stride=1,
261
+ padding=0,
262
+ ),
263
+ )
264
+
265
+ pretrained.act_postprocess4 = nn.Sequential(
266
+ readout_oper[3],
267
+ Transpose(1, 2),
268
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
269
+ nn.Conv2d(
270
+ in_channels=vit_features,
271
+ out_channels=features[3],
272
+ kernel_size=1,
273
+ stride=1,
274
+ padding=0,
275
+ ),
276
+ nn.Conv2d(
277
+ in_channels=features[3],
278
+ out_channels=features[3],
279
+ kernel_size=3,
280
+ stride=2,
281
+ padding=1,
282
+ ),
283
+ )
284
+
285
+ pretrained.model.start_index = start_index
286
+ pretrained.model.patch_size = [16, 16]
287
+
288
+ # We inject this function into the VisionTransformer instances so that
289
+ # we can use it with interpolated position embeddings without modifying the library source.
290
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
291
+ pretrained.model._resize_pos_embed = types.MethodType(
292
+ _resize_pos_embed, pretrained.model
293
+ )
294
+
295
+ return pretrained
296
+
297
+
298
+ def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
299
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
300
+
301
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
302
+ return _make_vit_b16_backbone(
303
+ model,
304
+ features=[256, 512, 1024, 1024],
305
+ hooks=hooks,
306
+ vit_features=1024,
307
+ use_readout=use_readout,
308
+ )
309
+
310
+
311
+ def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
312
+ model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
313
+
314
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
315
+ return _make_vit_b16_backbone(
316
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
317
+ )
318
+
319
+
320
+ def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
321
+ model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
322
+
323
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
324
+ return _make_vit_b16_backbone(
325
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
326
+ )
327
+
328
+
329
+ def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
330
+ model = timm.create_model(
331
+ "vit_deit_base_distilled_patch16_384", pretrained=pretrained
332
+ )
333
+
334
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
335
+ return _make_vit_b16_backbone(
336
+ model,
337
+ features=[96, 192, 384, 768],
338
+ hooks=hooks,
339
+ use_readout=use_readout,
340
+ start_index=2,
341
+ )
342
+
343
+
344
+ def _make_vit_b_rn50_backbone(
345
+ model,
346
+ features=[256, 512, 768, 768],
347
+ size=[384, 384],
348
+ hooks=[0, 1, 8, 11],
349
+ vit_features=768,
350
+ use_vit_only=False,
351
+ use_readout="ignore",
352
+ start_index=1,
353
+ ):
354
+ pretrained = nn.Module()
355
+
356
+ pretrained.model = model
357
+
358
+ if use_vit_only:
359
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
360
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
361
+ else:
362
+ pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
363
+ get_activation("1")
364
+ )
365
+ pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
366
+ get_activation("2")
367
+ )
368
+
369
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
370
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
371
+
372
+ pretrained.activations = activations
373
+
374
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
375
+
376
+ if use_vit_only:
377
+ pretrained.act_postprocess1 = nn.Sequential(
378
+ readout_oper[0],
379
+ Transpose(1, 2),
380
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
381
+ nn.Conv2d(
382
+ in_channels=vit_features,
383
+ out_channels=features[0],
384
+ kernel_size=1,
385
+ stride=1,
386
+ padding=0,
387
+ ),
388
+ nn.ConvTranspose2d(
389
+ in_channels=features[0],
390
+ out_channels=features[0],
391
+ kernel_size=4,
392
+ stride=4,
393
+ padding=0,
394
+ bias=True,
395
+ dilation=1,
396
+ groups=1,
397
+ ),
398
+ )
399
+
400
+ pretrained.act_postprocess2 = nn.Sequential(
401
+ readout_oper[1],
402
+ Transpose(1, 2),
403
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
404
+ nn.Conv2d(
405
+ in_channels=vit_features,
406
+ out_channels=features[1],
407
+ kernel_size=1,
408
+ stride=1,
409
+ padding=0,
410
+ ),
411
+ nn.ConvTranspose2d(
412
+ in_channels=features[1],
413
+ out_channels=features[1],
414
+ kernel_size=2,
415
+ stride=2,
416
+ padding=0,
417
+ bias=True,
418
+ dilation=1,
419
+ groups=1,
420
+ ),
421
+ )
422
+ else:
423
+ pretrained.act_postprocess1 = nn.Sequential(
424
+ nn.Identity(), nn.Identity(), nn.Identity()
425
+ )
426
+ pretrained.act_postprocess2 = nn.Sequential(
427
+ nn.Identity(), nn.Identity(), nn.Identity()
428
+ )
429
+
430
+ pretrained.act_postprocess3 = nn.Sequential(
431
+ readout_oper[2],
432
+ Transpose(1, 2),
433
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
434
+ nn.Conv2d(
435
+ in_channels=vit_features,
436
+ out_channels=features[2],
437
+ kernel_size=1,
438
+ stride=1,
439
+ padding=0,
440
+ ),
441
+ )
442
+
443
+ pretrained.act_postprocess4 = nn.Sequential(
444
+ readout_oper[3],
445
+ Transpose(1, 2),
446
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
447
+ nn.Conv2d(
448
+ in_channels=vit_features,
449
+ out_channels=features[3],
450
+ kernel_size=1,
451
+ stride=1,
452
+ padding=0,
453
+ ),
454
+ nn.Conv2d(
455
+ in_channels=features[3],
456
+ out_channels=features[3],
457
+ kernel_size=3,
458
+ stride=2,
459
+ padding=1,
460
+ ),
461
+ )
462
+
463
+ pretrained.model.start_index = start_index
464
+ pretrained.model.patch_size = [16, 16]
465
+
466
+ # We inject this function into the VisionTransformer instances so that
467
+ # we can use it with interpolated position embeddings without modifying the library source.
468
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
469
+
470
+ # We inject this function into the VisionTransformer instances so that
471
+ # we can use it with interpolated position embeddings without modifying the library source.
472
+ pretrained.model._resize_pos_embed = types.MethodType(
473
+ _resize_pos_embed, pretrained.model
474
+ )
475
+
476
+ return pretrained
477
+
478
+
479
+ def _make_pretrained_vitb_rn50_384(
480
+ pretrained, use_readout="ignore", hooks=None, use_vit_only=False
481
+ ):
482
+ model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
483
+
484
+ hooks = [0, 1, 8, 11] if hooks is None else hooks
485
+ return _make_vit_b_rn50_backbone(
486
+ model,
487
+ features=[256, 512, 768, 768],
488
+ size=[384, 384],
489
+ hooks=hooks,
490
+ use_vit_only=use_vit_only,
491
+ use_readout=use_readout,
492
+ )
model/depth_engine.py ADDED
@@ -0,0 +1,445 @@
1
+ import numpy as np
2
+ import os, time, datetime
3
+ import torch
4
+ import torch.utils.tensorboard
5
+ import importlib
6
+ import shutil
7
+ import utils.util as util
8
+ import utils.util_vis as util_vis
9
+
10
+ from torch.nn.parallel import DistributedDataParallel as DDP
11
+ from utils.util import print_eval, setup, cleanup
12
+ from utils.util import EasyDict as edict
13
+ from utils.eval_depth import DepthMetric
14
+ from copy import deepcopy
15
+ from model.compute_graph import graph_depth
16
+
17
+ # ============================ main engine for training and evaluation ============================
18
+
19
+ class Runner():
20
+
21
+ def __init__(self, opt):
22
+ super().__init__()
23
+ if os.path.isdir(opt.output_path) and not opt.resume and opt.device == 0:
24
+ for filename in os.listdir(opt.output_path):
25
+ if "tfevents" in filename: os.remove(os.path.join(opt.output_path, filename))
26
+ if "html" in filename: os.remove(os.path.join(opt.output_path, filename))
27
+ if "vis" in filename: shutil.rmtree(os.path.join(opt.output_path, filename))
28
+ if "dump" in filename: shutil.rmtree(os.path.join(opt.output_path, filename))
29
+ if "embedding" in filename: shutil.rmtree(os.path.join(opt.output_path, filename))
30
+ if opt.device == 0:
31
+ os.makedirs(opt.output_path,exist_ok=True)
32
+ setup(opt.device, opt.world_size, opt.port)
33
+ opt.batch_size = opt.batch_size // opt.world_size
34
+
35
+ def get_viz_data(self, opt):
36
+ # get data for visualization
37
+ viz_data_list = []
38
+ sample_range = len(self.viz_loader)
39
+ viz_interval = sample_range // opt.eval.n_vis
40
+ for i in range(sample_range):
41
+ current_batch = next(self.viz_loader_iter)
42
+ if i % viz_interval != 0: continue
43
+ viz_data_list.append(current_batch)
44
+ return viz_data_list
45
+
46
+ def load_dataset(self, opt, eval_split="test"):
47
+ data_train = importlib.import_module('data.{}'.format(opt.data.dataset_train))
48
+ data_test = importlib.import_module('data.{}'.format(opt.data.dataset_test))
49
+ if opt.device == 0: print("loading training data...")
50
+ self.batch_order = []
51
+ self.train_data = data_train.Dataset(opt, split="train", load_3D=False)
52
+ self.train_loader = self.train_data.setup_loader(opt, shuffle=True, use_ddp=True, drop_last=True)
53
+ self.num_batches = len(self.train_loader)
54
+ if opt.device == 0: print("loading test data...")
55
+ self.test_data = data_test.Dataset(opt, split=eval_split, load_3D=False)
56
+ self.test_loader = self.test_data.setup_loader(opt, shuffle=False, use_ddp=True, drop_last=True, batch_size=opt.eval.batch_size)
57
+ self.num_batches_test = len(self.test_loader)
58
+ if len(self.test_loader.sampler) * opt.world_size < len(self.test_data):
59
+ self.aux_test_dataset = torch.utils.data.Subset(self.test_data,
60
+ range(len(self.test_loader.sampler) * opt.world_size, len(self.test_data)))
61
+ self.aux_test_loader = torch.utils.data.DataLoader(
62
+ self.aux_test_dataset, batch_size=opt.eval.batch_size, shuffle=False, drop_last=False,
63
+ num_workers=opt.data.num_workers)
64
+ if opt.device == 0:
65
+ print("creating data for visualization...")
66
+ self.viz_loader = self.test_data.setup_loader(opt, shuffle=False, use_ddp=False, drop_last=False, batch_size=1)
67
+ self.viz_loader_iter = iter(self.viz_loader)
68
+ self.viz_data = self.get_viz_data(opt)
69
+
70
+ def build_networks(self, opt):
71
+ if opt.device == 0: print("building networks...")
72
+ self.graph = DDP(graph_depth.Graph(opt).to(opt.device), device_ids=[opt.device], find_unused_parameters=True)
73
+ self.depth_metric = DepthMetric(thresholds=opt.eval.d_thresholds, depth_cap=opt.eval.depth_cap)
74
+
75
+ # =================================================== set up training =========================================================
76
+
77
+ def setup_optimizer(self, opt):
78
+ if opt.device == 0: print("setting up optimizers...")
79
+ param_nodecay = []
80
+ param_decay = []
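+ # biases and 1-D parameters (e.g. normalization scales) are exempt from weight decay,
+ # everything else uses opt.optim.weight_decay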
81
+ for name, param in self.graph.named_parameters():
82
+ # skip fixed params that do not require gradients
83
+ if not param.requires_grad:
84
+ continue
85
+ if param.ndim <= 1 or name.endswith(".bias"):
86
+ # print("{} -> finetune_param_nodecay".format(name))
87
+ param_nodecay.append(param)
88
+ else:
89
+ param_decay.append(param)
90
+ # print("{} -> finetune_param_decay".format(name))
91
+ # create the optim dictionary
92
+ optim_dict = [
93
+ {'params': param_nodecay, 'lr': opt.optim.lr, 'weight_decay': 0.},
94
+ {'params': param_decay, 'lr': opt.optim.lr, 'weight_decay': opt.optim.weight_decay}
95
+ ]
96
+
97
+ self.optim = torch.optim.AdamW(optim_dict, betas=(0.9, 0.95))
98
+ if opt.optim.sched:
99
+ self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.optim, opt.max_epoch)
100
+ if opt.optim.amp:
101
+ self.scaler = torch.cuda.amp.GradScaler()
102
+
103
+ def restore_checkpoint(self, opt, best=False, evaluate=False):
104
+ epoch_start, iter_start = None, None
105
+ if opt.resume:
106
+ if opt.device == 0: print("resuming from previous checkpoint...")
107
+ epoch_start, iter_start, best_val, best_ep = util.restore_checkpoint(opt, self, resume=opt.resume, best=best, evaluate=evaluate)
108
+ self.best_val = best_val
109
+ self.best_ep = best_ep
110
+ elif opt.load is not None:
111
+ if opt.device == 0: print("loading weights from checkpoint {}...".format(opt.load))
112
+ epoch_start, iter_start, best_val, best_ep = util.restore_checkpoint(opt, self, load_name=opt.load)
113
+ else:
114
+ if opt.device == 0: print("initializing weights from scratch...")
115
+ self.epoch_start = epoch_start or 0
116
+ self.iter_start = iter_start or 0
117
+
118
+ def setup_visualizer(self, opt, test=False):
119
+ if opt.device == 0:
120
+ print("setting up visualizers...")
121
+ if opt.tb:
122
+ self.tb = torch.utils.tensorboard.SummaryWriter(log_dir=opt.output_path, flush_secs=10)
123
+
124
+ def train(self, opt):
125
+ # before training
126
+ torch.cuda.set_device(opt.device)
127
+ torch.cuda.empty_cache()
128
+ if opt.device == 0: print("TRAINING START")
129
+ self.train_metric_logger = util.MetricLogger(delimiter=" ")
130
+ self.train_metric_logger.add_meter('lr', util.SmoothedValue(window_size=1, fmt='{value:.6f}'))
131
+ self.iter_skip = self.iter_start % len(self.train_loader)
132
+ self.it = self.iter_start
133
+ self.skip_dis = False
134
+ if not opt.resume:
135
+ self.best_val = np.inf
136
+ self.best_ep = 1
137
+ # training
138
+ if self.iter_start == 0 and not opt.debug: self.evaluate(opt, ep=0, training=True)
139
+ # if opt.device == 0: self.save_checkpoint(opt, ep=0, it=0, best_val=self.best_val, best_ep=self.best_ep)
140
+ self.ep = self.epoch_start
141
+ for self.ep in range(self.epoch_start, opt.max_epoch):
142
+ self.train_epoch(opt)
143
+ # after training
144
+ if opt.device == 0: self.save_checkpoint(opt, ep=self.ep, it=self.it, best_val=self.best_val, best_ep=self.best_ep)
145
+ if opt.tb and opt.device == 0:
146
+ self.tb.flush()
147
+ self.tb.close()
148
+ if opt.device == 0:
149
+ print("TRAINING DONE")
150
+ print("Best val: %.4f @ epoch %d" % (self.best_val, self.best_ep))
151
+ cleanup()
152
+
153
+ def train_epoch(self, opt):
154
+ # before train epoch
155
+ self.train_loader.sampler.set_epoch(self.ep)
156
+ if opt.device == 0:
157
+ print("training epoch {}".format(self.ep+1))
158
+ batch_progress = range(self.num_batches)
159
+ self.graph.train()
160
+ # train epoch
161
+ loader = iter(self.train_loader)
162
+
163
+ for batch_id in batch_progress:
164
+ # if resuming from previous checkpoint, skip until the last iteration number is reached
165
+ if self.iter_skip>0:
166
+ self.iter_skip -= 1
167
+ continue
168
+ batch = next(loader)
169
+ # train iteration
170
+ var = edict(batch)
171
+ opt.H, opt.W = opt.image_size
172
+ var = util.move_to_device(var, opt.device)
173
+ loss = self.train_iteration(opt, var, batch_progress)
174
+
175
+ # after train epoch
176
+ lr = self.sched.get_last_lr()[0] if opt.optim.sched else opt.optim.lr
177
+ if opt.optim.sched: self.sched.step()
178
+ if (self.ep + 1) % opt.freq.eval == 0:
179
+ if opt.device == 0: print("validating epoch {}".format(self.ep+1))
180
+ current_val = self.evaluate(opt, ep=self.ep+1, training=True)
181
+ if current_val < self.best_val and opt.device == 0:
182
+ self.best_val = current_val
183
+ self.best_ep = self.ep + 1
184
+ self.save_checkpoint(opt, ep=self.ep, it=self.it, best_val=self.best_val, best_ep=self.best_ep, best=True, latest=True)
185
+
186
+ def train_iteration(self, opt, var, loader):
187
+ # before train iteration
188
+ torch.distributed.barrier()
189
+
190
+ # train iteration
191
+ with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=opt.optim.amp):
192
+ var, loss = self.graph.forward(opt, var, training=True, get_loss=True)
193
+ loss = self.summarize_loss(opt, var, loss)
194
+ loss_scaled = loss.all / opt.optim.accum
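+ # gradient accumulation: each iteration contributes 1/accum of the loss, and the
+ # optimizer (and GradScaler, if AMP is on) only steps every opt.optim.accum iterations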
195
+
196
+ # backward
197
+ if opt.optim.amp:
198
+ self.scaler.scale(loss_scaled).backward()
199
+ # skip update if accumulating gradient
200
+ if (self.it + 1) % opt.optim.accum == 0:
201
+ self.scaler.unscale_(self.optim)
202
+ # gradient clipping
203
+ if opt.optim.clip_norm:
204
+ norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), opt.optim.clip_norm)
205
+ if opt.debug: print("Grad norm: {}".format(norm))
206
+ self.scaler.step(self.optim)
207
+ self.scaler.update()
208
+ self.optim.zero_grad()
209
+ else:
210
+ loss_scaled.backward()
211
+ if (self.it + 1) % opt.optim.accum == 0:
212
+ if opt.optim.clip_norm:
213
+ norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), opt.optim.clip_norm)
214
+ if opt.debug: print("Grad norm: {}".format(norm))
215
+ self.optim.step()
216
+ self.optim.zero_grad()
217
+
218
+ # after train iteration
219
+ lr = self.sched.get_last_lr()[0] if opt.optim.sched else opt.optim.lr
220
+ self.train_metric_logger.update(lr=lr)
221
+ self.train_metric_logger.update(loss=loss.all)
222
+ if opt.device == 0:
223
+ if (self.it) % opt.freq.vis == 0 and not opt.debug:
224
+ self.visualize(opt, var, step=self.it, split="train")
225
+ if (self.it+1) % opt.freq.ckpt_latest == 0 and not opt.debug:
226
+ self.save_checkpoint(opt, ep=self.ep, it=self.it+1, best_val=self.best_val, best_ep=self.best_ep, latest=True)
227
+ if (self.it) % opt.freq.scalar == 0 and not opt.debug:
228
+ self.log_scalars(opt, var, loss, step=self.it, split="train")
229
+ if (self.it) % (opt.freq.save_vis * (self.it//10000*10+1)) == 0 and not opt.debug:
230
+ self.vis_train_iter(opt)
231
+ if (self.it) % opt.freq.print == 0:
232
+ print('[{}] '.format(datetime.datetime.now().time()), end='')
233
+ print(f'Train Iter {self.it}/{self.num_batches*opt.max_epoch}: {self.train_metric_logger}')
234
+ self.it += 1
235
+ return loss
236
+
237
+ @torch.no_grad()
238
+ def vis_train_iter(self, opt):
239
+ self.graph.eval()
240
+ for i in range(len(self.viz_data)):
241
+ var_viz = edict(deepcopy(self.viz_data[i]))
242
+ var_viz = util.move_to_device(var_viz, opt.device)
243
+ var_viz = self.graph.module(opt, var_viz, training=False, get_loss=False)
244
+ vis_folder = "vis_log/iter_{}".format(self.it)
245
+ os.makedirs("{}/{}".format(opt.output_path, vis_folder), exist_ok=True)
246
+ util_vis.dump_images(opt, var_viz.idx, "image_input", var_viz.rgb_input_map, masks=None, from_range=(0, 1), folder=vis_folder)
247
+ util_vis.dump_images(opt, var_viz.idx, "mask_input", var_viz.mask_input_map, folder=vis_folder)
248
+ util_vis.dump_depths(opt, var_viz.idx, "depth_est", var_viz.depth_pred, var_viz.mask_input_map, rescale=True, folder=vis_folder)
249
+ util_vis.dump_depths(opt, var_viz.idx, "depth_input", var_viz.depth_input_map, var_viz.mask_input_map, rescale=True, folder=vis_folder)
250
+ if 'seen_points_pred' in var_viz and 'seen_points_gt' in var_viz:
251
+ util_vis.dump_pointclouds_compare(opt, var_viz.idx, "seen_surface", var_viz.seen_points_pred, var_viz.seen_points_gt, folder=vis_folder)
252
+ self.graph.train()
253
+
254
+ def summarize_loss(self, opt, var, loss, non_act_loss_key=[]):
255
+ loss_all = 0.
256
+ assert("all" not in loss)
257
+ # weigh losses
258
+ for key in loss:
259
+ assert(key in opt.loss_weight)
260
+ if opt.loss_weight[key] is not None:
261
+ assert not torch.isinf(loss[key].mean()), "loss {} is Inf".format(key)
262
+ assert not torch.isnan(loss[key].mean()), "loss {} is NaN".format(key)
263
+ loss_all += float(opt.loss_weight[key])*loss[key].mean() if key not in non_act_loss_key else 0.0*loss[key].mean()
264
+ loss.update(all=loss_all)
265
+ return loss
266
+
267
+ # =================================================== set up evaluation =========================================================
268
+
269
+ @torch.no_grad()
270
+ def evaluate(self, opt, ep, training=False):
271
+ self.graph.eval()
272
+ loss_eval = edict()
273
+
274
+ # metric dictionary
275
+ metric_eval = {}
276
+ for metric_key in self.depth_metric.metric_keys:
277
+ metric_eval[metric_key] = []
278
+ metric_avg = {}
279
+ eval_metric_logger = util.MetricLogger(delimiter=" ")
280
+
281
+ # dataloader on the test set
282
+ with torch.cuda.device(opt.device):
283
+ for it, batch in enumerate(self.test_loader):
284
+
285
+ # run inference with the model
286
+ var = edict(batch)
287
+ var = self.evaluate_batch(opt, var, ep, it, single_gpu=False)
288
+
289
+ # record foreground mae for evaluation
290
+ sample_metrics, var.depth_pred_aligned = self.depth_metric.compute_metrics(
291
+ var.depth_pred, var.depth_input_map, var.mask_eroded if 'mask_eroded' in var else var.mask_input_map)
292
+ var.rmse = sample_metrics['rmse']
293
+ curr_metrics = {}
294
+ for metric_key in metric_eval:
295
+ metric_eval[metric_key].append(sample_metrics[metric_key])
296
+ curr_metrics[metric_key] = sample_metrics[metric_key].mean()
297
+ eval_metric_logger.update(**curr_metrics)
298
+ # eval_metric_logger.update(metric_key=sample_metrics[metric_key].mean())
299
+
300
+ # accumulate the scores
301
+ if opt.device == 0 and it % opt.freq.print_eval == 0:
302
+ print('[{}] '.format(datetime.datetime.now().time()), end='')
303
+ print(f'Eval Iter {it}/{len(self.test_loader)} @ EP {ep}: {eval_metric_logger}')
304
+
305
+ # dump the result if in eval mode
306
+ if not training:
307
+ self.dump_results(opt, var, ep, write_new=(it == 0))
308
+
309
+ # save the visualization
310
+ if it == 0 and training and opt.device == 0:
311
+ print("visualizing and saving results...")
312
+ for i in range(len(self.viz_data)):
313
+ var_viz = edict(deepcopy(self.viz_data[i]))
314
+ var_viz = self.evaluate_batch(opt, var_viz, ep, it, single_gpu=True)
315
+ self.visualize(opt, var_viz, step=ep, split="eval")
316
+ self.dump_results(opt, var_viz, ep, train=True)
317
+
318
+ # collect the eval results into tensors
319
+ for metric_key in metric_eval:
320
+ metric_eval[metric_key] = torch.cat(metric_eval[metric_key], dim=0)
321
+
322
+ if opt.world_size > 1:
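+ # all_gather requires equally sized tensors on every rank, which is why the test loader
+ # uses drop_last=True; leftover samples are evaluated separately via aux_test_loader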
323
+ metric_gather_dict = {}
324
+ # empty tensors for gathering
325
+ for metric_key in metric_eval:
326
+ metric_gather_dict[metric_key] = [torch.zeros_like(metric_eval[metric_key]).to(opt.device) for _ in range(opt.world_size)]
327
+
328
+ # gather the metrics
329
+ torch.distributed.barrier()
330
+ for metric_key in metric_eval:
331
+ torch.distributed.all_gather(metric_gather_dict[metric_key], metric_eval[metric_key])
332
+ metric_gather_dict[metric_key] = torch.cat(metric_gather_dict[metric_key], dim=0)
333
+ else:
334
+ metric_gather_dict = metric_eval
335
+
336
+ # handle last batch, if any
337
+ if len(self.test_loader.sampler) * opt.world_size < len(self.test_data):
338
+ for metric_key in metric_eval:
339
+ metric_gather_dict[metric_key] = [metric_gather_dict[metric_key]]
340
+ for batch in self.aux_test_loader:
341
+ # run inference with the model
342
+ var = edict(batch)
343
+ var = self.evaluate_batch(opt, var, ep, it, single_gpu=False)
344
+
345
+ # record MAE for evaluation
346
+ sample_metrics, var.depth_pred_aligned = self.depth_metric.compute_metrics(
347
+ var.depth_pred, var.depth_input_map, var.mask_eroded if 'mask_eroded' in var else var.mask_input_map)
348
+ var.rmse = sample_metrics['rmse']
349
+ for metric_key in metric_eval:
350
+ metric_gather_dict[metric_key].append(sample_metrics[metric_key])
351
+
352
+ # dump the result if in eval mode
353
+ if not training and opt.device == 0:
354
+ self.dump_results(opt, var, ep, write_new=(it == 0))
355
+
356
+ for metric_key in metric_eval:
357
+ metric_gather_dict[metric_key] = torch.cat(metric_gather_dict[metric_key], dim=0)
358
+
359
+ assert metric_gather_dict['l1_err'].shape[0] == len(self.test_data)
360
+ # compute the mean of the metrics
361
+ for metric_key in metric_eval:
362
+ metric_avg[metric_key] = metric_gather_dict[metric_key].mean()
363
+
364
+ # printout and save the metrics
365
+ if opt.device == 0:
366
+ # print eval info
367
+ print_eval(opt, depth_metrics=metric_avg)
368
+ val_metric = metric_avg['l1_err']
369
+
370
+ if training:
371
+ # log/visualize results to tb/vis
372
+ self.log_scalars(opt, var, loss_eval, metric=metric_avg, step=ep, split="eval")
373
+
374
+ if not training:
375
+ # write to file
376
+ metrics_file = os.path.join(opt.output_path, 'best_val.txt')
377
+ with open(metrics_file, "w") as outfile:
378
+ for metric_key in metric_avg:
379
+ outfile.write('{}: {:.6f}\n'.format(metric_key, metric_avg[metric_key].item()))
380
+
381
+ return val_metric.item()
382
+ return float('inf')
383
+
384
+ def evaluate_batch(self, opt, var, ep=None, it=None, single_gpu=False):
385
+ var = util.move_to_device(var, opt.device)
386
+ if single_gpu:
387
+ var = self.graph.module(opt, var, training=False, get_loss=False)
388
+ else:
389
+ var = self.graph(opt, var, training=False, get_loss=False)
390
+ return var
391
+
392
+ @torch.no_grad()
393
+ def log_scalars(self, opt, var, loss, metric=None, step=0, split="train"):
394
+ if split=="train":
395
+ sample_metrics, _ = self.depth_metric.compute_metrics(
396
+ var.depth_pred, var.depth_input_map, var.mask_eroded if 'mask_eroded' in var else var.mask_input_map)
397
+ metric = dict(L1_ERR=sample_metrics['l1_err'].mean().item())
398
+ for key, value in loss.items():
399
+ if key=="all": continue
400
+ self.tb.add_scalar("{0}/loss_{1}".format(split, key), value.mean(), step)
401
+ if metric is not None:
402
+ for key, value in metric.items():
403
+ self.tb.add_scalar("{0}/{1}".format(split, key), value, step)
404
+
405
+ @torch.no_grad()
406
+ def visualize(self, opt, var, step=0, split="train"):
407
+ pass
408
+
409
+ @torch.no_grad()
410
+ def dump_results(self, opt, var, ep, write_new=False, train=False):
411
+ # create the dir
412
+ current_folder = "dump" if not train else "vis_{}".format(ep)
413
+ os.makedirs("{}/{}/".format(opt.output_path, current_folder), exist_ok=True)
414
+
415
+ # save the results
416
+ util_vis.dump_images(opt, var.idx, "image_input", var.rgb_input_map, masks=None, from_range=(0, 1), folder=current_folder)
417
+ util_vis.dump_images(opt, var.idx, "mask_input", var.mask_input_map, folder=current_folder)
418
+ util_vis.dump_depths(opt, var.idx, "depth_pred", var.depth_pred, var.mask_input_map, rescale=True, folder=current_folder)
419
+ util_vis.dump_depths(opt, var.idx, "depth_input", var.depth_input_map, var.mask_input_map, rescale=True, folder=current_folder)
420
+ if 'seen_points_pred' in var and 'seen_points_gt' in var:
421
+ util_vis.dump_pointclouds_compare(opt, var.idx, "seen_surface", var.seen_points_pred, var.seen_points_gt, folder=current_folder)
422
+
423
+ if "depth_pred_aligned" in var:
424
+ # get the max and min for the depth map
425
+ batch_size = var.depth_input_map.shape[0]
426
+ mask = var.mask_eroded if 'mask_eroded' in var else var.mask_input_map
427
+ masked_depth_far_bg = var.depth_input_map * mask + (1 - mask) * 1000
428
+ depth_min_gt = masked_depth_far_bg.view(batch_size, -1).min(dim=1)[0]
429
+ masked_depth_invalid_bg = var.depth_input_map * mask + (1 - mask) * 0
430
+ depth_max_gt = masked_depth_invalid_bg.view(batch_size, -1).max(dim=1)[0]
431
+ depth_vis_pred = (var.depth_pred_aligned - depth_min_gt.view(batch_size, 1, 1, 1)) / (depth_max_gt - depth_min_gt).view(batch_size, 1, 1, 1)
432
+ depth_vis_pred = depth_vis_pred * mask + (1 - mask)
433
+ depth_vis_gt = (var.depth_input_map - depth_min_gt.view(batch_size, 1, 1, 1)) / (depth_max_gt - depth_min_gt).view(batch_size, 1, 1, 1)
434
+ depth_vis_gt = depth_vis_gt * mask + (1 - mask)
435
+ util_vis.dump_depths(opt, var.idx, "depth_gt_aligned", depth_vis_gt.clamp(max=1, min=0), None, rescale=False, folder=current_folder)
436
+ util_vis.dump_depths(opt, var.idx, "depth_pred_aligned", depth_vis_pred.clamp(max=1, min=0), None, rescale=False, folder=current_folder)
437
+ if "mask_eroded" in var and "rmse" in var:
438
+ util_vis.dump_images(opt, var.idx, "image_eroded", var.rgb_input_map, masks=var.mask_eroded, metrics=var.rmse, from_range=(0, 1), folder=current_folder)
439
+
440
+ def save_checkpoint(self, opt, ep=0, it=0, best_val=np.inf, best_ep=1, latest=False, best=False):
441
+ util.save_checkpoint(opt, self, ep=ep, it=it, best_val=best_val, best_ep=best_ep, latest=latest, best=best)
442
+ if not latest:
443
+ print("checkpoint saved: ({0}) {1}, epoch {2} (iteration {3})".format(opt.group, opt.name, ep, it))
444
+ if best:
445
+ print("Saving the current model as the best...")
model/shape/implicit.py ADDED
@@ -0,0 +1,288 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from functools import partial
6
+ from utils.layers import get_embedder
7
+ from utils.layers import LayerScale
8
+ from timm.models.vision_transformer import Mlp, DropPath
9
+ from utils.pos_embed import get_2d_sincos_pos_embed
10
+
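+ # attention used by the implicit decoder: each query point attends to the latent tokens
+ # and to itself, latent tokens attend only among themselves, and points never attend to
+ # other points, so every query location is decoded independently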
11
+ class ImplFuncAttention(nn.Module):
12
+ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., last_layer=False):
13
+ super().__init__()
14
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
15
+ self.num_heads = num_heads
16
+ head_dim = dim // num_heads
17
+ self.scale = head_dim ** -0.5
18
+
19
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
20
+ self.attn_drop = nn.Dropout(attn_drop)
21
+ self.proj = nn.Linear(dim, dim)
22
+ self.proj_drop = nn.Dropout(proj_drop)
23
+ self.last_layer = last_layer
24
+
25
+ def forward(self, x, N_points):
26
+
27
+ B, N, C = x.shape
28
+ N_latent = N - N_points
29
+ # [3, B, num_heads, N, C/num_heads]
30
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
31
+ # [B, num_heads, N, C/num_heads]
32
+ q, k, v = qkv.unbind(0)
33
+ # [B, num_heads, N_latent, C/num_heads]
34
+ q_latent, k_latent, v_latent = q[:, :, :-N_points], k[:, :, :-N_points], v[:, :, :-N_points]
35
+ # [B, num_heads, N_points, C/num_heads]
36
+ q_points, k_points, v_points = q[:, :, -N_points:], k[:, :, -N_points:], v[:, :, -N_points:]
37
+
38
+ # attention weight for each point, it's only connected to the latent and itself
39
+ # [B, num_heads, N_points, N_latent+1]
40
+ # get the cross attention, [B, num_heads, N_points, N_latent]
41
+ attn_cross = (q_points @ k_latent.transpose(-2, -1)) * self.scale
42
+ # get the attention to self feature, [B, num_heads, N_points, 1]
43
+ attn_self = torch.sum(q_points * k_points, dim=-1, keepdim=True) * self.scale
44
+ # get the normalized attention, [B, num_heads, N_points, N_latent+1]
45
+ attn_joint = torch.cat([attn_cross, attn_self], dim=-1)
46
+ attn_joint = attn_joint.softmax(dim=-1)
47
+ attn_joint = self.attn_drop(attn_joint)
48
+
49
+ # break it down to weigh and sum the values
50
+ # [B, num_heads, N_points, N_latent] @ [B, num_heads, N_latent, C/num_heads]
51
+ # -> [B, num_heads, N_points, C/num_heads] -> [B, N_points, C]
52
+ sum_cross = (attn_joint[:, :, :, :N_latent] @ v_latent).transpose(1, 2).reshape(B, N_points, C)
53
+ # [B, num_heads, N_points, 1] * [B, num_heads, N_points, C/num_heads]
54
+ # -> [B, num_heads, N_points, C/num_heads] -> [B, N_points, C]
55
+ sum_self = (attn_joint[:, :, :, N_latent:] * v_points).transpose(1, 2).reshape(B, N_points, C)
56
+ # [B, N_points, C]
57
+ output_points = sum_cross + sum_self
58
+
59
+ if self.last_layer:
60
+ output = self.proj(output_points)
61
+ output = self.proj_drop(output)
62
+ # [B, N_points, C], [B, N_points, N_latent]
63
+ return output, attn_joint[..., :-1].mean(dim=1)
64
+
65
+ # attention weight for the latent vec, it's not connected to the points
66
+ # [B, num_heads, N_latent, N_latent]
67
+ attn_latent = (q_latent @ k_latent.transpose(-2, -1)) * self.scale
68
+ attn_latent = attn_latent.softmax(dim=-1)
69
+ attn_latent = self.attn_drop(attn_latent)
70
+ # get the output latent, [B, N_latent, C]
71
+ output_latent = (attn_latent @ v_latent).transpose(1, 2).reshape(B, N_latent, C)
72
+
73
+ # concatenate the output and return
74
+ output = torch.cat([output_latent, output_points], dim=1)
75
+ output = self.proj(output)
76
+ output = self.proj_drop(output)
77
+
78
+ # [B, N, C], [B, N_points, N_latent+1]
79
+ return output, attn_joint[..., :-1].mean(dim=1)
80
+
81
+ class ImplFuncBlock(nn.Module):
82
+
83
+ def __init__(
84
+ self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., init_values=None,
85
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, last_layer=False):
86
+ super().__init__()
87
+ self.last_layer = last_layer
88
+ self.norm1 = norm_layer(dim)
89
+ self.attn = ImplFuncAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, last_layer=last_layer)
90
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
91
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
92
+
93
+ self.norm2 = norm_layer(dim)
94
+ mlp_hidden_dim = int(dim * mlp_ratio)
95
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
96
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
97
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
98
+
99
+ def forward(self, x, unseen_size):
100
+ if self.last_layer:
101
+ attn_out, attn_vis = self.attn(self.norm1(x), unseen_size)
102
+ output = x[:, -unseen_size:] + self.drop_path1(self.ls1(attn_out))
103
+ output = output + self.drop_path2(self.ls2(self.mlp(self.norm2(output))))
104
+ return output, attn_vis
105
+ else:
106
+ attn_out, attn_vis = self.attn(self.norm1(x), unseen_size)
107
+ x = x + self.drop_path1(self.ls1(attn_out))
108
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
109
+ return x, attn_vis
110
+
111
+ class LinearProj3D(nn.Module):
112
+ """
113
+ Linear projection of 3D point into embedding space
114
+ """
115
+ def __init__(self, embed_dim, posenc_res=0):
116
+ super().__init__()
117
+ self.embed_dim = embed_dim
118
+
119
+ # define positional embedder
120
+ self.embed_fn = None
121
+ input_ch = 3
122
+ if posenc_res > 0:
123
+ self.embed_fn, input_ch = get_embedder(posenc_res, input_dims=3)
124
+
125
+ # linear proj layer
126
+ self.proj = nn.Linear(input_ch, embed_dim)
127
+
128
+ def forward(self, points_3D):
129
+ if self.embed_fn is not None:
130
+ points_3D = self.embed_fn(points_3D)
131
+ return self.proj(points_3D)
132
+
133
+ class MLPBlocks(nn.Module):
134
+ def __init__(self, num_hidden_layers, n_channels, latent_dim,
135
+ skip_in=[], posenc_res=0):
136
+ super().__init__()
137
+
138
+ # projection to the same number of channels
139
+ self.dims = [3 + latent_dim] + [n_channels] * num_hidden_layers + [1]
140
+ self.num_layers = len(self.dims)
141
+ self.skip_in = skip_in
142
+
143
+ # define positional embedder
144
+ self.embed_fn = None
145
+ if posenc_res > 0:
146
+ embed_fn, input_ch = get_embedder(posenc_res, input_dims=3)
147
+ self.embed_fn = embed_fn
148
+ self.dims[0] += (input_ch - 3)
149
+
150
+ self.layers = nn.ModuleList([])
151
+
152
+ for l in range(0, self.num_layers - 1):
153
+ out_dim = self.dims[l + 1]
154
+ if l in self.skip_in:
155
+ in_dim = self.dims[l] + self.dims[0]
156
+ else:
157
+ in_dim = self.dims[l]
158
+
159
+ lin = nn.Linear(in_dim, out_dim)
160
+ self.layers.append(lin)
161
+
162
+ # register for param init
163
+ self.posenc_res = posenc_res
164
+
165
+ # activation
166
+ self.softplus = nn.Softplus(beta=100)
167
+
168
+ def forward(self, points, proj_latent):
169
+
170
+ # positional encoding
171
+ if self.embed_fn is not None:
172
+ points = self.embed_fn(points)
173
+
174
+ # forward by layer
175
+ # [B, N, posenc+C]
176
+ inputs = torch.cat([points, proj_latent], dim=-1)
177
+ x = inputs
178
+ for l in range(0, self.num_layers - 1):
179
+ if l in self.skip_in:
180
+ x = torch.cat([x, inputs], -1) / np.sqrt(2)
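+ # rescale after the skip concatenation so the activation variance stays roughly constant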
181
+ x = self.layers[l](x)
182
+ if l < self.num_layers - 2:
183
+ x = self.softplus(x)
184
+ return x
185
+
186
+ class Implicit(nn.Module):
187
+ """
188
+ Implicit function conditioned on depth encodings
189
+ """
190
+ def __init__(self,
191
+ num_patches, latent_dim=768, semantic=False, n_channels=512,
192
+ n_blocks_attn=2, n_layers_mlp=6, num_heads=16, posenc_3D=0,
193
+ mlp_ratio=4., norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1,
194
+ skip_in=[], pos_perlayer=True):
195
+ super().__init__()
196
+ self.num_patches = num_patches
197
+ self.pos_perlayer = pos_perlayer
198
+ self.semantic = semantic
199
+
200
+ # projection to the same number of channels, no posenc
201
+ self.point_proj = LinearProj3D(n_channels)
202
+ self.latent_proj = nn.Linear(latent_dim, n_channels, bias=True)
203
+
204
+ # positional embedding for the depth latent codes
205
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, n_channels), requires_grad=False) # fixed sin-cos embedding
206
+
207
+ # multi-head attention blocks
208
+ self.blocks_attn = nn.ModuleList([
209
+ ImplFuncBlock(
210
+ n_channels, num_heads, mlp_ratio,
211
+ qkv_bias=True, norm_layer=norm_layer, drop_path=drop_path
212
+ ) for _ in range(n_blocks_attn-1)])
213
+ self.blocks_attn.append(
214
+ ImplFuncBlock(
215
+ n_channels, num_heads, mlp_ratio,
216
+ qkv_bias=True, norm_layer=norm_layer, drop_path=drop_path, last_layer=True
217
+ )
218
+ )
219
+ self.norm = norm_layer(n_channels)
220
+
221
+ self.impl_mlp = None
222
+ # define the impl MLP
223
+ if n_layers_mlp > 0:
224
+ self.impl_mlp = MLPBlocks(n_layers_mlp, n_channels, n_channels,
225
+ skip_in=skip_in, posenc_res=posenc_3D)
226
+ else:
227
+ # occ and color prediction
228
+ self.pred_head = nn.Linear(n_channels, 1, bias=True)
229
+
230
+ self.initialize_weights()
231
+
232
+ def initialize_weights(self):
233
+
234
+ # initialize the positional embedding for the depth latent codes
235
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.num_patches**.5), cls_token=True)
236
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
237
+
238
+ # initialize nn.Linear and nn.LayerNorm
239
+ self.apply(self._init_weights)
240
+
241
+ def _init_weights(self, m):
242
+ if isinstance(m, nn.Linear):
243
+ # we use xavier_uniform following official JAX ViT:
244
+ torch.nn.init.xavier_uniform_(m.weight)
245
+ if isinstance(m, nn.Linear) and m.bias is not None:
246
+ nn.init.constant_(m.bias, 0)
247
+ elif isinstance(m, nn.LayerNorm):
248
+ nn.init.constant_(m.bias, 0)
249
+ nn.init.constant_(m.weight, 1.0)
250
+
251
+ def forward(self, latent_depth, latent_semantic, points_3D):
252
+ # concatenate latent codes if semantic is used
253
+ latent = torch.cat([latent_depth, latent_semantic], dim=-1) if self.semantic else latent_depth
254
+
255
+ # project latent code and add posenc
256
+ # [B, 1+n_patches, C]
257
+ latent = self.latent_proj(latent)
258
+ N_latent = latent.shape[1]
259
+
260
+ # project query points
261
+ # [B, n_points, C_dec]
262
+ points_feat = self.point_proj(points_3D)
263
+
264
+ # concat point feat with latent
265
+ # [B, 1+n_patches+n_points, C_dec]
266
+ output = torch.cat([latent, points_feat], dim=1)
267
+
268
+ # apply multi-head attention blocks
269
+ attn_vis = []
270
+ for l, blk in enumerate(self.blocks_attn):
271
+ if self.pos_perlayer or l == 0:
272
+ output[:, :N_latent] = output[:, :N_latent] + self.pos_embed
273
+ output, attn = blk(output, points_feat.shape[1])
274
+ attn_vis.append(attn)
275
+ output = self.norm(output)
276
+ # average of attention weights across layers, [B, N_points, N_latent+1]
277
+ attn_vis = torch.stack(attn_vis, dim=-1).mean(dim=-1)
278
+
279
+ if self.impl_mlp:
280
+ # apply mlp blocks
281
+ output = self.impl_mlp(points_3D, output)
282
+ else:
283
+ # predictor projection
284
+ # [B, n_points, 1]
285
+ output = self.pred_head(output)
286
+
287
+ # return the occ logit of shape [B, n_points] and the attention weights if needed
288
+ return output.squeeze(-1), attn_vis
model/shape/rgb_enc.py ADDED
@@ -0,0 +1,137 @@
1
+ # This source code is written based on https://github.com/facebookresearch/MCC
2
+ # The original code base is licensed under the license found in the LICENSE file in the root directory.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torchvision
7
+
8
+ from functools import partial
9
+ from timm.models.vision_transformer import Block, PatchEmbed
10
+ from utils.pos_embed import get_2d_sincos_pos_embed
11
+ from utils.layers import Bottleneck_Conv
12
+
13
+ class RGBEncAtt(nn.Module):
14
+ """
15
+ RGB encoder based on transformer.
16
+ """
17
+ def __init__(self,
18
+ img_size=224, embed_dim=768, n_blocks=12, num_heads=12, win_size=16,
19
+ mlp_ratio=4., norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1):
20
+ super().__init__()
21
+
22
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
23
+ self.rgb_embed = PatchEmbed(img_size, win_size, 3, embed_dim)
24
+
25
+ num_patches = self.rgb_embed.num_patches
26
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False)
27
+
28
+ self.blocks = nn.ModuleList([
29
+ Block(
30
+ embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
31
+ drop_path=drop_path
32
+ ) for _ in range(n_blocks)])
33
+
34
+ self.norm = norm_layer(embed_dim)
35
+
36
+ self.initialize_weights()
37
+
38
+ def initialize_weights(self):
39
+ # initialize the pos enc with fixed cos-sin pattern
40
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.rgb_embed.num_patches**.5), cls_token=True)
41
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
42
+
43
+ # initialize rgb patch_embed like nn.Linear (instead of nn.Conv2d)
44
+ w = self.rgb_embed.proj.weight.data
45
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
46
+
47
+ # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
48
+ torch.nn.init.normal_(self.cls_token, std=.02)
49
+
50
+ # initialize nn.Linear and nn.LayerNorm
51
+ self.apply(self._init_weights)
52
+
53
+ def _init_weights(self, m):
54
+ if isinstance(m, nn.Linear):
55
+ # we use xavier_uniform following official JAX ViT:
56
+ torch.nn.init.xavier_uniform_(m.weight)
57
+ if isinstance(m, nn.Linear) and m.bias is not None:
58
+ nn.init.constant_(m.bias, 0)
59
+ elif isinstance(m, nn.LayerNorm):
60
+ nn.init.constant_(m.bias, 0)
61
+ nn.init.constant_(m.weight, 1.0)
62
+
63
+ def forward(self, rgb_obj):
64
+
65
+ # [B, H/ws*W/ws, C]
66
+ rgb_embedding = self.rgb_embed(rgb_obj)
67
+ rgb_embedding = rgb_embedding + self.pos_embed[:, 1:, :]
68
+
69
+ # append cls token
70
+ # [1, 1, C]
71
+ cls_token = self.cls_token + self.pos_embed[:, :1, :]
72
+ # [B, 1, C]
73
+ cls_tokens = cls_token.expand(rgb_embedding.shape[0], -1, -1)
74
+
75
+ # [B, H/ws*W/ws+1, C]
76
+ rgb_embedding = torch.cat((cls_tokens, rgb_embedding), dim=1)
77
+
78
+ # apply Transformer blocks
79
+ for blk in self.blocks:
80
+ rgb_embedding = blk(rgb_embedding)
81
+ rgb_embedding = self.norm(rgb_embedding)
82
+
83
+ # [B, H/ws*W/ws+1, C]
84
+ return rgb_embedding
85
+
86
+ class RGBEncRes(nn.Module):
87
+ """
88
+ RGB encoder based on resnet.
89
+ """
90
+ def __init__(self, opt):
91
+ super().__init__()
92
+
93
+ self.encoder = torchvision.models.resnet50(pretrained=True)
94
+ self.encoder.fc = nn.Sequential(
95
+ Bottleneck_Conv(2048),
96
+ Bottleneck_Conv(2048),
97
+ nn.Linear(2048, opt.arch.latent_dim)
98
+ )
99
+
100
+ # define hooks
101
+ self.rgb_feature = None
102
+ def feature_hook(model, input, output):
103
+ self.rgb_feature = output
104
+
105
+ # attach hooks
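+ # win_size 16 taps layer3 features (stride 16, 1024 channels); win_size 32 taps layer4
+ # (stride 32, 2048 channels), so the local feature grid matches the window layout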
106
+ if (opt.arch.win_size) == 16:
107
+ self.encoder.layer3.register_forward_hook(feature_hook)
108
+ self.rgb_feat_proj = nn.Sequential(
109
+ Bottleneck_Conv(1024),
110
+ Bottleneck_Conv(1024),
111
+ nn.Conv2d(1024, opt.arch.latent_dim, 1)
112
+ )
113
+ elif (opt.arch.win_size) == 32:
114
+ self.encoder.layer4.register_forward_hook(feature_hook)
115
+ self.rgb_feat_proj = nn.Sequential(
116
+ Bottleneck_Conv(2048),
117
+ Bottleneck_Conv(2048),
118
+ nn.Conv2d(2048, opt.arch.latent_dim, 1)
119
+ )
120
+ else:
121
+ print('Make sure win_size is 16 or 32 when using resnet backbone!')
122
+ raise NotImplementedError
123
+
124
+ def forward(self, rgb_obj):
125
+ batch_size = rgb_obj.shape[0]
126
+ assert len(rgb_obj.shape) == 4
127
+
128
+ # [B, 1, C]
129
+ global_feat = self.encoder(rgb_obj).unsqueeze(1)
130
+ # [B, C, H/ws*W/ws]
131
+ local_feat = self.rgb_feat_proj(self.rgb_feature).view(batch_size, global_feat.shape[-1], -1)
132
+ # [B, H/ws*W/ws, C]
133
+ local_feat = local_feat.permute(0, 2, 1).contiguous()
134
+ # [B, 1+H/ws*W/ws, C]
135
+ rgb_embedding = torch.cat([global_feat, local_feat], dim=1)
136
+
137
+ return rgb_embedding
model/shape/seen_coord_enc.py ADDED
@@ -0,0 +1,195 @@
1
+ # This source code is written based on https://github.com/facebookresearch/MCC
2
+ # The original code base is licensed under the license found in the LICENSE file in the root directory.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torchvision
7
+
8
+ from functools import partial
9
+ from timm.models.vision_transformer import Block
10
+ from utils.pos_embed import get_2d_sincos_pos_embed
11
+ from utils.layers import Bottleneck_Conv
12
+
13
+ class CoordEmb(nn.Module):
14
+ """
15
+ Encode the seen coordinate map to a lower resolution feature map
16
+ Achieved with a window-wise attention block by dividing the coord map into windows
+ Each window is separately encoded into a single CLS token with self-attention and posenc
18
+ """
19
+ def __init__(self, embed_dim, win_size=8, num_heads=8):
20
+ super().__init__()
21
+ self.embed_dim = embed_dim
22
+ self.win_size = win_size
23
+
24
+ self.two_d_pos_embed = nn.Parameter(
25
+ torch.zeros(1, self.win_size*self.win_size + 1, embed_dim), requires_grad=False) # fixed sin-cos embedding
26
+
27
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
28
+
29
+ self.pos_embed = nn.Linear(3, embed_dim)
30
+
31
+ self.blocks = nn.ModuleList([
32
+ # each block is a residual block with layernorm -> attention -> layernorm -> mlp
33
+ Block(embed_dim, num_heads=num_heads, mlp_ratio=2.0, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
34
+ for _ in range(1)
35
+ ])
36
+
37
+ self.invalid_coord_token = nn.Parameter(torch.zeros(embed_dim,))
38
+
39
+ self.initialize_weights()
40
+
41
+ def initialize_weights(self):
42
+ torch.nn.init.normal_(self.cls_token, std=.02)
43
+
44
+ two_d_pos_embed = get_2d_sincos_pos_embed(self.two_d_pos_embed.shape[-1], self.win_size, cls_token=True)
45
+ self.two_d_pos_embed.data.copy_(torch.from_numpy(two_d_pos_embed).float().unsqueeze(0))
46
+
47
+ torch.nn.init.normal_(self.invalid_coord_token, std=.02)
48
+
49
+ def forward(self, coord_obj, mask_obj):
50
+ # [B, H, W, C]
51
+ emb = self.pos_embed(coord_obj)
52
+
53
+ emb[~mask_obj] = 0.0
54
+ emb[~mask_obj] += self.invalid_coord_token
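+ # pixels outside the mask have no valid coordinate, so their embedding is replaced by a
+ # single learned "invalid" token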
55
+
56
+ B, H, W, C = emb.shape
57
+ # [B, H/ws, ws, W/ws, ws, C]
58
+ emb = emb.view(B, H // self.win_size, self.win_size, W // self.win_size, self.win_size, C)
59
+ # [B * H/ws * W/ws, 64, C]
60
+ emb = emb.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, self.win_size * self.win_size, C)
61
+
62
+ # [B * H/ws * W/ws, 64, C], add posenc that is local to each patch
63
+ emb = emb + self.two_d_pos_embed[:, 1:, :]
64
+ # [1, 1, C]
65
+ cls_token = self.cls_token + self.two_d_pos_embed[:, :1, :]
66
+
67
+ # [B * H/ws * W/ws, 1, C]
68
+ cls_tokens = cls_token.expand(emb.shape[0], -1, -1)
69
+ # [B * H/ws * W/ws, 65, C]
70
+ emb = torch.cat((cls_tokens, emb), dim=1)
71
+
72
+ # transformer (single block) that handles each window separately
+ # reasoning is done within each window, since windows are folded into the batch dimension
74
+ for _, blk in enumerate(self.blocks):
75
+ emb = blk(emb)
76
+
77
+ # return the cls token of each window, [B, H/ws*W/ws, C]
78
+ return emb[:, 0].view(B, (H // self.win_size) * (W // self.win_size), -1)
79
+
80
+ class CoordEncAtt(nn.Module):
81
+ """
82
+ Seen surface encoder based on transformer.
83
+ """
84
+ def __init__(self,
85
+ embed_dim=768, n_blocks=12, num_heads=12, win_size=8,
86
+ mlp_ratio=4., norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1):
87
+ super().__init__()
88
+
89
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
90
+ self.coord_embed = CoordEmb(embed_dim, win_size, num_heads)
91
+
92
+ self.blocks = nn.ModuleList([
93
+ Block(
94
+ embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
95
+ drop_path=drop_path
96
+ ) for _ in range(n_blocks)])
97
+
98
+ self.norm = norm_layer(embed_dim)
99
+
100
+ self.initialize_weights()
101
+
102
+ def initialize_weights(self):
103
+ # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
104
+ torch.nn.init.normal_(self.cls_token, std=.02)
105
+
106
+ # initialize nn.Linear and nn.LayerNorm
107
+ self.apply(self._init_weights)
108
+
109
+ def _init_weights(self, m):
110
+ if isinstance(m, nn.Linear):
111
+ # we use xavier_uniform following official JAX ViT:
112
+ torch.nn.init.xavier_uniform_(m.weight)
113
+ if isinstance(m, nn.Linear) and m.bias is not None:
114
+ nn.init.constant_(m.bias, 0)
115
+ elif isinstance(m, nn.LayerNorm):
116
+ nn.init.constant_(m.bias, 0)
117
+ nn.init.constant_(m.weight, 1.0)
118
+
119
+ def forward(self, coord_obj, mask_obj):
120
+
121
+ # [B, H/ws*W/ws, C]
122
+ coord_embedding = self.coord_embed(coord_obj, mask_obj)
123
+
124
+ # append cls token
125
+ # [1, 1, C]
126
+ cls_token = self.cls_token
127
+ # [B, 1, C]
128
+ cls_tokens = cls_token.expand(coord_embedding.shape[0], -1, -1)
129
+
130
+ # [B, H/ws*W/ws+1, C]
131
+ coord_embedding = torch.cat((cls_tokens, coord_embedding), dim=1)
132
+
133
+ # apply Transformer blocks
134
+ for blk in self.blocks:
135
+ coord_embedding = blk(coord_embedding)
136
+ coord_embedding = self.norm(coord_embedding)
137
+
138
+ # [B, H/ws*W/ws+1, C]
139
+ return coord_embedding
140
+
141
+ class CoordEncRes(nn.Module):
142
+ """
143
+ Seen surface encoder based on resnet.
144
+ """
145
+ def __init__(self, opt):
146
+ super().__init__()
147
+
148
+ self.encoder = torchvision.models.resnet50(pretrained=True)
149
+ self.encoder.fc = nn.Sequential(
150
+ Bottleneck_Conv(2048),
151
+ Bottleneck_Conv(2048),
152
+ nn.Linear(2048, opt.arch.latent_dim)
153
+ )
154
+
155
+ # define hooks
156
+ self.seen_feature = None
157
+ def feature_hook(model, input, output):
158
+ self.seen_feature = output
159
+
160
+ # attach hooks
161
+ assert opt.arch.depth.dsp == 1
162
+ if (opt.arch.win_size) == 16:
163
+ self.encoder.layer3.register_forward_hook(feature_hook)
164
+ self.depth_feat_proj = nn.Sequential(
165
+ Bottleneck_Conv(1024),
166
+ Bottleneck_Conv(1024),
167
+ nn.Conv2d(1024, opt.arch.latent_dim, 1)
168
+ )
169
+ elif (opt.arch.win_size) == 32:
170
+ self.encoder.layer4.register_forward_hook(feature_hook)
171
+ self.depth_feat_proj = nn.Sequential(
172
+ Bottleneck_Conv(2048),
173
+ Bottleneck_Conv(2048),
174
+ nn.Conv2d(2048, opt.arch.latent_dim, 1)
175
+ )
176
+ else:
177
+ print('Make sure win_size is 16 or 32 when using resnet backbone!')
178
+ raise NotImplementedError
179
+
180
+ def forward(self, coord_obj, mask_obj):
181
+ batch_size = coord_obj.shape[0]
182
+ assert len(coord_obj.shape) == len(mask_obj.shape) == 4
183
+ mask_obj = mask_obj.float()
184
+ coord_obj = coord_obj * mask_obj
185
+
186
+ # [B, 1, C]
187
+ global_feat = self.encoder(coord_obj).unsqueeze(1)
188
+ # [B, C, H/ws*W/ws]
189
+ local_feat = self.depth_feat_proj(self.seen_feature).view(batch_size, global_feat.shape[-1], -1)
190
+ # [B, H/ws*W/ws, C]
191
+ local_feat = local_feat.permute(0, 2, 1).contiguous()
192
+ # [B, 1+H/ws*W/ws, C]
193
+ seen_embedding = torch.cat([global_feat, local_feat], dim=1)
194
+
195
+ return seen_embedding
model/shape_engine.py ADDED
@@ -0,0 +1,598 @@
1
+ import numpy as np
2
+ import os, time, datetime
3
+ import torch
4
+ import torch.utils.tensorboard
5
+ import torch.profiler
6
+ import importlib
7
+ import shutil
8
+ import utils.util as util
9
+ import utils.util_vis as util_vis
10
+ import utils.eval_3D as eval_3D
11
+
12
+ from torch.nn.parallel import DistributedDataParallel as DDP
13
+ from utils.util import print_eval, setup, cleanup
14
+ from utils.util import EasyDict as edict
15
+ from copy import deepcopy
16
+ from model.compute_graph import graph_shape
17
+
18
+ # ============================ main engine for training and evaluation ============================
19
+
20
+ class Runner():
21
+
22
+ def __init__(self, opt):
23
+ super().__init__()
24
+ if os.path.isdir(opt.output_path) and opt.resume == False and opt.device == 0:
25
+ for filename in os.listdir(opt.output_path):
26
+ if "tfevents" in filename: os.remove(os.path.join(opt.output_path, filename))
27
+ if "html" in filename: os.remove(os.path.join(opt.output_path, filename))
28
+ if "vis" in filename: shutil.rmtree(os.path.join(opt.output_path, filename))
29
+ if "embedding" in filename: shutil.rmtree(os.path.join(opt.output_path, filename))
30
+ if opt.device == 0:
31
+ os.makedirs(opt.output_path,exist_ok=True)
32
+ setup(opt.device, opt.world_size, opt.port)
33
+ opt.batch_size = opt.batch_size // opt.world_size
34
+
35
+ def get_viz_data(self, opt):
36
+ # get data for visualization
37
+ viz_data_list = []
38
+ sample_range = len(self.viz_loader)
39
+ viz_interval = sample_range // opt.eval.n_vis
40
+ for i in range(sample_range):
41
+ current_batch = next(self.viz_loader_iter)
42
+ if i % viz_interval != 0: continue
43
+ viz_data_list.append(current_batch)
44
+ return viz_data_list
45
+
46
+ def load_dataset(self, opt, eval_split="test"):
47
+ data_train = importlib.import_module('data.{}'.format(opt.data.dataset_train))
48
+ data_test = importlib.import_module('data.{}'.format(opt.data.dataset_test))
49
+ if opt.device == 0: print("loading training data...")
50
+ self.train_data = data_train.Dataset(opt, split="train")
51
+ self.train_loader = self.train_data.setup_loader(opt, shuffle=True, use_ddp=True, drop_last=True)
52
+ self.num_batches = len(self.train_loader)
53
+ if opt.device == 0: print("loading test data...")
54
+ self.test_data = data_test.Dataset(opt, split=eval_split)
55
+ self.test_loader = self.test_data.setup_loader(opt, shuffle=False, use_ddp=True, drop_last=True, batch_size=opt.eval.batch_size)
56
+ self.num_batches_test = len(self.test_loader)
57
+ if len(self.test_loader.sampler) * opt.world_size < len(self.test_data):
58
+ self.aux_test_dataset = torch.utils.data.Subset(self.test_data,
59
+ range(len(self.test_loader.sampler) * opt.world_size, len(self.test_data)))
60
+ self.aux_test_loader = torch.utils.data.DataLoader(
61
+ self.aux_test_dataset, batch_size=opt.eval.batch_size, shuffle=False, drop_last=False,
62
+ num_workers=opt.data.num_workers)
63
+ if opt.device == 0:
64
+ print("creating data for visualization...")
65
+ self.viz_loader = self.test_data.setup_loader(opt, shuffle=False, use_ddp=False, drop_last=False, batch_size=1)
66
+ self.viz_loader_iter = iter(self.viz_loader)
67
+ self.viz_data = self.get_viz_data(opt)
68
+
69
+ def build_networks(self, opt):
70
+ if opt.device == 0: print("building networks...")
71
+ self.graph = DDP(graph_shape.Graph(opt).to(opt.device), device_ids=[opt.device], find_unused_parameters=(not opt.optim.fix_dpt or not opt.optim.fix_clip))
72
+
73
+ # =================================================== set up training =========================================================
74
+
75
+ def setup_optimizer(self, opt):
76
+ if opt.device == 0: print("setting up optimizers...")
77
+ if opt.optim.fix_dpt:
78
+ # when the dpt depth branch is frozen, every trainable param starts from scratch
79
+ scratch_param_decay = []
80
+ scratch_param_nodecay = []
81
+ # loop over all params
82
+ for name, param in self.graph.named_parameters():
83
+ # skip and fixed params
84
+ if not param.requires_grad or 'dpt_depth' in name or 'intr_' in name:
85
+ continue
86
+ # do not add wd on bias or low-dim params
87
+ if param.ndim <= 1 or name.endswith(".bias"):
88
+ scratch_param_nodecay.append(param)
89
+ # print("{} -> scratch_param_nodecay".format(name))
90
+ else:
91
+ scratch_param_decay.append(param)
92
+ # print("{} -> scratch_param_decay".format(name))
93
+ # create the optim dictionary
94
+ optim_dict = [
95
+ {'params': scratch_param_nodecay, 'lr': opt.optim.lr, 'weight_decay': 0.},
96
+ {'params': scratch_param_decay, 'lr': opt.optim.lr, 'weight_decay': opt.optim.weight_decay}
97
+ ]
98
+ else:
99
+ # when dpt is trained as well, its params go into the finetune list
100
+ finetune_param_nodecay = []
101
+ scratch_param_nodecay = []
102
+ finetune_param_decay = []
103
+ scratch_param_decay = []
104
+ for name, param in self.graph.named_parameters():
105
+ # skip and fixed params
106
+ if not param.requires_grad:
107
+ continue
108
+ # put dpt params into finetune list
109
+ if 'dpt_depth' in name or 'intr_' in name:
110
+ if param.ndim <= 1 or name.endswith(".bias"):
111
+ # print("{} -> finetune_param_nodecay".format(name))
112
+ finetune_param_nodecay.append(param)
113
+ else:
114
+ finetune_param_decay.append(param)
115
+ # print("{} -> finetune_param_decay".format(name))
116
+ # all other params go to scratch list
117
+ else:
118
+ if param.ndim <= 1 or name.endswith(".bias"):
119
+ scratch_param_nodecay.append(param)
120
+ # print("{} -> scratch_param_nodecay".format(name))
121
+ else:
122
+ scratch_param_decay.append(param)
123
+ # print("{} -> scratch_param_decay".format(name))
124
+ # create the optim dictionary
125
+ optim_dict = [
126
+ {'params': finetune_param_nodecay, 'lr': opt.optim.lr_ft, 'weight_decay': 0.},
127
+ {'params': finetune_param_decay, 'lr': opt.optim.lr_ft, 'weight_decay': opt.optim.weight_decay},
128
+ {'params': scratch_param_nodecay, 'lr': opt.optim.lr, 'weight_decay': 0.},
129
+ {'params': scratch_param_decay, 'lr': opt.optim.lr, 'weight_decay': opt.optim.weight_decay}
130
+ ]
131
+
132
+ self.optim = torch.optim.AdamW(optim_dict, betas=(0.9, 0.95))
133
+ if opt.optim.sched:
134
+ self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.optim, opt.max_epoch)
135
+ if opt.optim.amp:
136
+ self.scaler = torch.cuda.amp.GradScaler()
137
+
138
+ def restore_checkpoint(self, opt, best=False, evaluate=False):
139
+ epoch_start, iter_start = None, None
140
+ if opt.resume:
141
+ if opt.device == 0: print("resuming from previous checkpoint...")
142
+ epoch_start, iter_start, best_val, best_ep = util.restore_checkpoint(opt, self, resume=opt.resume, best=best, evaluate=evaluate)
143
+ self.best_val = best_val
144
+ self.best_ep = best_ep
145
+ elif opt.load is not None:
146
+ if opt.device == 0: print("loading weights from checkpoint {}...".format(opt.load))
147
+ epoch_start, iter_start, best_val, best_ep = util.restore_checkpoint(opt, self, load_name=opt.load)
148
+ else:
149
+ if opt.device == 0: print("initializing weights from scratch...")
150
+ self.epoch_start = epoch_start or 0
151
+ self.iter_start = iter_start or 0
152
+
153
+ def setup_visualizer(self, opt, test=False):
154
+ if opt.device == 0:
155
+ print("setting up visualizers...")
156
+ if opt.tb:
157
+ if test == False:
158
+ self.tb = torch.utils.tensorboard.SummaryWriter(log_dir=opt.output_path, flush_secs=10)
159
+ else:
160
+ embedding_folder = os.path.join(opt.output_path, 'embedding')
161
+ os.makedirs(embedding_folder, exist_ok=True)
162
+ self.tb = torch.utils.tensorboard.SummaryWriter(log_dir=embedding_folder, flush_secs=10)
163
+
164
+ def train(self, opt):
165
+ # before training
166
+ torch.cuda.set_device(opt.device)
167
+ torch.cuda.empty_cache()
168
+ if opt.device == 0: print("TRAINING START")
169
+ self.train_metric_logger = util.MetricLogger(delimiter=" ")
170
+ self.train_metric_logger.add_meter('lr', util.SmoothedValue(window_size=1, fmt='{value:.6f}'))
171
+ self.iter_skip = self.iter_start % len(self.train_loader)
172
+ self.it = self.iter_start
173
+ self.skip_dis = False
174
+ if not opt.resume:
175
+ self.best_val = np.inf
176
+ self.best_ep = 1
177
+ # training
178
+ if self.iter_start == 0 and not opt.debug: self.evaluate(opt, ep=0, training=True)
179
+ for self.ep in range(self.epoch_start, opt.max_epoch):
180
+ self.train_epoch(opt)
181
+ # after training
182
+ if opt.device == 0: self.save_checkpoint(opt, ep=self.ep, it=self.it, best_val=self.best_val, best_ep=self.best_ep)
183
+ if opt.tb and opt.device == 0:
184
+ self.tb.flush()
185
+ self.tb.close()
186
+ if opt.device == 0:
187
+ print("TRAINING DONE")
188
+ print("Best CD: %.4f @ epoch %d" % (self.best_val, self.best_ep))
189
+ cleanup()
190
+
191
+ def train_epoch(self, opt):
192
+ # before train epoch
193
+ self.train_loader.sampler.set_epoch(self.ep)
194
+ if opt.device == 0:
195
+ print("training epoch {}".format(self.ep+1))
196
+ batch_progress = range(self.num_batches)
197
+ self.graph.train()
198
+ # train epoch
199
+ loader = iter(self.train_loader)
200
+
201
+ if opt.debug and opt.profile:
202
+ with torch.profiler.profile(
203
+ schedule=torch.profiler.schedule(wait=3, warmup=3, active=5, repeat=2),
204
+ on_trace_ready=torch.profiler.tensorboard_trace_handler('debug/profiler_log'),
205
+ record_shapes=True,
206
+ profile_memory=True,
207
+ with_stack=False
208
+ ) as prof:
209
+ for batch_id in batch_progress:
210
+ if batch_id >= (1 + 1 + 3) * 2:
211
+ # exit the program after 2 iterations of the warmup, active, and repeat steps
212
+ exit()
213
+
214
+ # if resuming from previous checkpoint, skip until the last iteration number is reached
215
+ if self.iter_skip>0:
216
+ self.iter_skip -= 1
217
+ continue
218
+ batch = next(loader)
219
+ # train iteration
220
+ var = edict(batch)
221
+ opt.H, opt.W = opt.image_size
222
+ var = util.move_to_device(var, opt.device)
223
+ loss = self.train_iteration(opt, var, batch_progress)
224
+ prof.step()
225
+ else:
226
+ for batch_id in batch_progress:
227
+ # if resuming from previous checkpoint, skip until the last iteration number is reached
228
+ if self.iter_skip>0:
229
+ self.iter_skip -= 1
230
+ continue
231
+ batch = next(loader)
232
+ # train iteration
233
+ var = edict(batch)
234
+ opt.H, opt.W = opt.image_size
235
+ var = util.move_to_device(var, opt.device)
236
+ loss = self.train_iteration(opt, var, batch_progress)
237
+
238
+ # after train epoch
239
+ if opt.optim.sched: self.sched.step()
240
+ if (self.ep + 1) % opt.freq.eval == 0:
241
+ if opt.device == 0: print("validating epoch {}".format(self.ep+1))
242
+ current_val = self.evaluate(opt, ep=self.ep+1, training=True)
243
+ if current_val < self.best_val and opt.device == 0:
244
+ self.best_val = current_val
245
+ self.best_ep = self.ep + 1
246
+ self.save_checkpoint(opt, ep=self.ep, it=self.it, best_val=self.best_val, best_ep=self.best_ep, best=True, latest=True)
247
+
248
+ def train_iteration(self, opt, var, batch_progress):
249
+ # before train iteration
250
+ torch.distributed.barrier()
251
+ # train iteration
252
+ with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=opt.optim.amp):
253
+ var, loss = self.graph.forward(opt, var, training=True, get_loss=True)
254
+ loss = self.summarize_loss(opt, var, loss)
255
+ loss_scaled = loss.all / opt.optim.accum
256
+
257
+ # backward
258
+ if opt.optim.amp:
259
+ self.scaler.scale(loss_scaled).backward()
260
+ # skip update if accumulating gradient
261
+ if (self.it + 1) % opt.optim.accum == 0:
262
+ self.scaler.unscale_(self.optim)
263
+ # gradient clipping
264
+ if opt.optim.clip_norm:
265
+ norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), opt.optim.clip_norm)
266
+ if opt.debug: print("Grad norm: {}".format(norm))
267
+ self.scaler.step(self.optim)
268
+ self.scaler.update()
269
+ self.optim.zero_grad()
270
+ else:
271
+ loss_scaled.backward()
272
+ if (self.it + 1) % opt.optim.accum == 0:
273
+ if opt.optim.clip_norm:
274
+ norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), opt.optim.clip_norm)
275
+ if opt.debug: print("Grad norm: {}".format(norm))
276
+ self.optim.step()
277
+ self.optim.zero_grad()
278
+
279
+ # after train iteration
280
+ lr = self.sched.get_last_lr()[0] if opt.optim.sched else opt.optim.lr
281
+ self.train_metric_logger.update(lr=lr)
282
+ self.train_metric_logger.update(loss=loss.all)
283
+ if opt.device == 0:
284
+ self.graph.eval()
285
+ # if (self.it) % opt.freq.vis == 0: self.visualize(opt, var, step=self.it, split="train")
286
+ if (self.it) % opt.freq.ckpt_latest == 0 and not opt.debug:
287
+ self.save_checkpoint(opt, ep=self.ep, it=self.it, best_val=self.best_val, best_ep=self.best_ep, latest=True)
288
+ if (self.it) % opt.freq.scalar == 0 and not opt.debug:
289
+ self.log_scalars(opt, var, loss, step=self.it, split="train")
290
+ if (self.it) % (opt.freq.save_vis * (self.it//10000*10+1)) == 0 and not opt.debug:
291
+ self.vis_train_iter(opt)
292
+ if (self.it) % opt.freq.print == 0:
293
+ print('[{}] '.format(datetime.datetime.now().time()), end='')
294
+ print(f'Train Iter {self.it}/{self.num_batches*opt.max_epoch}: {self.train_metric_logger}')
295
+ self.graph.train()
296
+ self.it += 1
297
+ return loss
298
+
299
+ @torch.no_grad()
300
+ def vis_train_iter(self, opt):
301
+ for i in range(len(self.viz_data)):
302
+ var_viz = edict(deepcopy(self.viz_data[i]))
303
+ var_viz = util.move_to_device(var_viz, opt.device)
304
+ var_viz = self.graph.module(opt, var_viz, training=False, get_loss=False)
305
+ eval_3D.eval_metrics(opt, var_viz, self.graph.module.impl_network, vis_only=True)
306
+ vis_folder = "vis_log/iter_{}".format(self.it)
307
+ os.makedirs("{}/{}".format(opt.output_path, vis_folder), exist_ok=True)
308
+ util_vis.dump_images(opt, var_viz.idx, "image_input", var_viz.rgb_input_map, masks=None, from_range=(0, 1), folder=vis_folder)
309
+ util_vis.dump_images(opt, var_viz.idx, "mask_input", var_viz.mask_input_map, folder=vis_folder)
310
+ util_vis.dump_meshes_viz(opt, var_viz.idx, "mesh_viz", var_viz.mesh_pred, folder=vis_folder)
311
+ if 'depth_pred' in var_viz:
312
+ util_vis.dump_depths(opt, var_viz.idx, "depth_est", var_viz.depth_pred, var_viz.mask_input_map, rescale=True, folder=vis_folder)
313
+ if 'depth_input_map' in var_viz:
314
+ util_vis.dump_depths(opt, var_viz.idx, "depth_input", var_viz.depth_input_map, var_viz.mask_input_map, rescale=True, folder=vis_folder)
315
+ if 'attn_vis' in var_viz:
316
+ util_vis.dump_attentions(opt, var_viz.idx, "attn", var_viz.attn_vis, folder=vis_folder)
317
+ if 'gt_surf_points' in var_viz and 'seen_points' in var_viz:
318
+ util_vis.dump_pointclouds_compare(opt, var_viz.idx, "seen_surface", var_viz.seen_points, var_viz.gt_surf_points, folder=vis_folder)
319
+
320
+ def summarize_loss(self, opt, var, loss, non_act_loss_key=[]):
321
+ loss_all = 0.
322
+ assert("all" not in loss)
323
+ # weigh losses
324
+ for key in loss:
325
+ assert(key in opt.loss_weight)
326
+ if opt.loss_weight[key] is not None:
327
+ assert not torch.isinf(loss[key].mean()), "loss {} is Inf".format(key)
328
+ assert not torch.isnan(loss[key].mean()), "loss {} is NaN".format(key)
329
+ loss_all += float(opt.loss_weight[key])*loss[key].mean() if key not in non_act_loss_key else 0.0*loss[key].mean()
330
+ loss.update(all=loss_all)
331
+ return loss
332
+
333
+ # =================================================== set up evaluation =========================================================
334
+
335
+ @torch.no_grad()
336
+ def evaluate(self, opt, ep, training=False):
337
+ self.graph.eval()
338
+
339
+ # lists for metrics
340
+ cd_accs = []
341
+ cd_comps = []
342
+ f_scores = []
343
+ cat_indices = []
344
+ loss_eval = edict()
345
+ metric_eval = dict(dist_acc=0., dist_cov=0.)
346
+ eval_metric_logger = util.MetricLogger(delimiter=" ")
347
+
348
+ # result file on the fly
349
+ if not training:
350
+ assert opt.device == 0
351
+ full_results_file = open(os.path.join(opt.output_path, '{}_full_results.txt'.format(opt.data.dataset_test)), 'w')
352
+ full_results_file.write("IND, CD, ACC, COMP, ")
353
+ full_results_file.write(", ".join(["F-score@{:.2f}".format(opt.eval.f_thresholds[i]*100) for i in range(len(opt.eval.f_thresholds))]))
354
+
355
+ # dataloader on the test set
356
+ with torch.cuda.device(opt.device):
357
+ for it, batch in enumerate(self.test_loader):
358
+
359
+ # run inference with the model
360
+ var = edict(batch)
361
+ var = self.evaluate_batch(opt, var, ep, it, single_gpu=False)
362
+
363
+ # record CD for evaluation
364
+ dist_acc, dist_cov = eval_3D.eval_metrics(opt, var, self.graph.module.impl_network)
365
+
366
+ # accumulate the scores
367
+ cd_accs.append(var.cd_acc)
368
+ cd_comps.append(var.cd_comp)
369
+ f_scores.append(var.f_score)
370
+ cat_indices.append(var.category_label)
371
+ eval_metric_logger.update(ACC=dist_acc)
372
+ eval_metric_logger.update(COMP=dist_cov)
373
+ eval_metric_logger.update(CD=(dist_acc+dist_cov) / 2)
374
+
375
+ if opt.device == 0 and it % opt.freq.print_eval == 0:
376
+ print('[{}] '.format(datetime.datetime.now().time()), end='')
377
+ print(f'Eval Iter {it}/{len(self.test_loader)} @ EP {ep}: {eval_metric_logger}')
378
+
379
+ # write to file
380
+ if not training:
381
+ full_results_file.write("\n")
382
+ full_results_file.write("{:d}".format(var.idx.item()))
383
+ full_results_file.write("\t{:.4f}".format((var.cd_acc.item() + var.cd_comp.item()) / 2))
384
+ full_results_file.write("\t{:.4f}".format(var.cd_acc.item()))
385
+ full_results_file.write("\t{:.4f}".format(var.cd_comp.item()))
386
+ full_results_file.write("\t" + "\t".join(["{:.4f}".format(var.f_score[0][i].item()) for i in range(len(opt.eval.f_thresholds))]))
387
+ full_results_file.flush()
388
+
389
+ # dump the result if in eval mode
390
+ if not training:
391
+ self.dump_results(opt, var, ep, write_new=(it == 0))
392
+
393
+ # save the predicted mesh for vis data if in train mode
394
+ if it == 0 and training and opt.device == 0:
395
+ print("visualizing and saving results...")
396
+ for i in range(len(self.viz_data)):
397
+ var_viz = edict(deepcopy(self.viz_data[i]))
398
+ var_viz = self.evaluate_batch(opt, var_viz, ep, it, single_gpu=True)
399
+ eval_3D.eval_metrics(opt, var_viz, self.graph.module.impl_network, vis_only=True)
400
+ # self.visualize(opt, var_viz, step=ep, split="eval")
401
+ self.dump_results(opt, var_viz, ep, train=True)
402
+ # write html that organizes the results
403
+ util_vis.create_gif_html(os.path.join(opt.output_path, "vis_{}".format(ep)),
404
+ os.path.join(opt.output_path, "results_ep{}.html".format(ep)),
405
+ skip_every=1)
406
+
407
+ # collect the eval results into tensors
408
+ cd_accs = torch.cat(cd_accs, dim=0)
409
+ cd_comps = torch.cat(cd_comps, dim=0)
410
+ f_scores = torch.cat(f_scores, dim=0)
411
+ cat_indices = torch.cat(cat_indices, dim=0)
412
+
413
+ if opt.world_size > 1:
414
+ # empty tensors for gathering
415
+ cd_accs_all = [torch.zeros_like(cd_accs).to(opt.device) for _ in range(opt.world_size)]
416
+ cd_comps_all = [torch.zeros_like(cd_comps).to(opt.device) for _ in range(opt.world_size)]
417
+ f_scores_all = [torch.zeros_like(f_scores).to(opt.device) for _ in range(opt.world_size)]
418
+ cat_indices_all = [torch.zeros_like(cat_indices).long().to(opt.device) for _ in range(opt.world_size)]
419
+
420
+ # gather the metrics
421
+ torch.distributed.barrier()
422
+ torch.distributed.all_gather(cd_accs_all, cd_accs)
423
+ torch.distributed.all_gather(cd_comps_all, cd_comps)
424
+ torch.distributed.all_gather(f_scores_all, f_scores)
425
+ torch.distributed.all_gather(cat_indices_all, cat_indices)
426
+ cd_accs_all = torch.cat(cd_accs_all, dim=0)
427
+ cd_comps_all = torch.cat(cd_comps_all, dim=0)
428
+ f_scores_all = torch.cat(f_scores_all, dim=0)
429
+ cat_indices_all = torch.cat(cat_indices_all, dim=0)
430
+ else:
431
+ cd_accs_all = cd_accs
432
+ cd_comps_all = cd_comps
433
+ f_scores_all = f_scores
434
+ cat_indices_all = cat_indices
435
+ # handle last batch, if any
436
+ if len(self.test_loader.sampler) * opt.world_size < len(self.test_data):
437
+ cd_accs_all = [cd_accs_all]
438
+ cd_comps_all = [cd_comps_all]
439
+ f_scores_all = [f_scores_all]
440
+ cat_indices_all = [cat_indices_all]
441
+ for batch in self.aux_test_loader:
442
+ # run inference with the model
443
+ var = edict(batch)
444
+ var = self.evaluate_batch(opt, var, ep, it, single_gpu=False)
445
+
446
+ # record CD for evaluation
447
+ dist_acc, dist_cov = eval_3D.eval_metrics(opt, var, self.graph.module.impl_network)
448
+ # accumulate the scores
449
+ cd_accs_all.append(var.cd_acc)
450
+ cd_comps_all.append(var.cd_comp)
451
+ f_scores_all.append(var.f_score)
452
+ cat_indices_all.append(var.category_label)
453
+
454
+ # dump the result if in eval mode
455
+ if not training and opt.device == 0:
456
+ self.dump_results(opt, var, ep, write_new=(it == 0))
457
+
458
+ cd_accs_all = torch.cat(cd_accs_all, dim=0)
459
+ cd_comps_all = torch.cat(cd_comps_all, dim=0)
460
+ f_scores_all = torch.cat(f_scores_all, dim=0)
461
+ cat_indices_all = torch.cat(cat_indices_all, dim=0)
462
+
463
+ assert cd_accs_all.shape[0] == len(self.test_data)
464
+ if not training:
465
+ full_results_file.close()
466
+ # printout and save the metrics
467
+ if opt.device == 0:
468
+ metric_eval["dist_acc"] = cd_accs_all.mean()
469
+ metric_eval["dist_cov"] = cd_comps_all.mean()
470
+
471
+ # print eval info
472
+ print_eval(opt, loss=None, chamfer=(metric_eval["dist_acc"],
473
+ metric_eval["dist_cov"]))
474
+ val_metric = (metric_eval["dist_acc"] + metric_eval["dist_cov"]) / 2
475
+
476
+ if training:
477
+ # log/visualize results to tb/vis
478
+ self.log_scalars(opt, var, loss_eval, metric=metric_eval, step=ep, split="eval")
479
+
480
+ if not training:
481
+ # save the per-cat evaluation metrics if on shapenet
482
+ per_cat_cd_file = os.path.join(opt.output_path, 'cd_cat.txt')
483
+ with open(per_cat_cd_file, "w") as outfile:
484
+ outfile.write("CD Acc Comp Count Cat\n")
485
+ for i in range(opt.data.num_classes_test):
486
+ if (cat_indices_all==i).sum() == 0:
487
+ continue
488
+ acc_i = cd_accs_all[cat_indices_all==i].mean().item()
489
+ comp_i = cd_comps_all[cat_indices_all==i].mean().item()
490
+ counts_cat = torch.sum(cat_indices_all==i)
491
+ cd_i = (acc_i + comp_i) / 2
492
+ outfile.write("%.4f %.4f %.4f %5d %s\n" % (cd_i, acc_i, comp_i, counts_cat, self.test_data.label2cat[i]))
493
+
494
+ # print f_scores
495
+ f_scores_avg = f_scores_all.mean(dim=0)
496
+ print('##############################')
497
+ for i in range(len(opt.eval.f_thresholds)):
498
+ print('F-score @ %.2f: %.4f' % (opt.eval.f_thresholds[i]*100, f_scores_avg[i].item()))
499
+ print('##############################')
500
+
501
+ # write to file
502
+ result_file = os.path.join(opt.output_path, 'quantitative_{}.txt'.format(opt.data.dataset_test))
503
+ with open(result_file, "w") as outfile:
504
+ outfile.write('CD Acc Comp \n')
505
+ outfile.write('%.4f %.4f %.4f\n' % (val_metric, metric_eval["dist_acc"], metric_eval["dist_cov"]))
506
+ for i in range(len(opt.eval.f_thresholds)):
507
+ outfile.write('F-score @ %.2f: %.4f\n' % (opt.eval.f_thresholds[i]*100, f_scores_avg[i].item()))
508
+
509
+ # write html that organizes the results
510
+ util_vis.create_gif_html(os.path.join(opt.output_path, "dump_{}".format(opt.data.dataset_test)),
511
+ os.path.join(opt.output_path, "results_test.html"), skip_every=10)
512
+
513
+ # torch.cuda.empty_cache()
514
+ return val_metric.item()
515
+ return float('inf')
516
+
517
+ def evaluate_batch(self, opt, var, ep=None, it=None, single_gpu=False):
518
+ var = util.move_to_device(var, opt.device)
519
+ if single_gpu:
520
+ var = self.graph.module(opt, var, training=False, get_loss=False)
521
+ else:
522
+ var = self.graph(opt, var, training=False, get_loss=False)
523
+ return var
524
+
525
+ @torch.no_grad()
526
+ def log_scalars(self, opt, var, loss, metric=None, step=0, split="train"):
527
+ if split=="train":
528
+ dist_acc, dist_cov = eval_3D.eval_metrics(opt, var, self.graph.module.impl_network)
529
+ metric = dict(dist_acc=dist_acc, dist_cov=dist_cov)
530
+ for key, value in loss.items():
531
+ if key=="all": continue
532
+ self.tb.add_scalar("{0}/loss_{1}".format(split, key), value.mean(), step)
533
+ if metric is not None:
534
+ for key, value in metric.items():
535
+ self.tb.add_scalar("{0}/{1}".format(split, key), value, step)
536
+ # log the attention average values
537
+ if 'attn_geo_avg' in var:
538
+ self.tb.add_scalar("{0}/attn_geo_avg".format(split), var.attn_geo_avg, step)
539
+ if 'attn_geo_seen' in var:
540
+ self.tb.add_scalar("{0}/attn_geo_seen".format(split), var.attn_geo_seen, step)
541
+ if 'attn_geo_occl' in var:
542
+ self.tb.add_scalar("{0}/attn_geo_occl".format(split), var.attn_geo_occl, step)
543
+ if 'attn_geo_bg' in var:
544
+ self.tb.add_scalar("{0}/attn_geo_bg".format(split), var.attn_geo_bg, step)
545
+
546
+ @torch.no_grad()
547
+ def visualize(self, opt, var, step=0, split="train"):
548
+ if 'pose_input' in var:
549
+ pose_input = var.pose_input
550
+ elif 'pose_gt' in var:
551
+ pose_input = var.pose_gt
552
+ else:
553
+ pose_input = None
554
+ util_vis.tb_image(opt, self.tb, step, split, "image_input_map", var.rgb_input_map, masks=None, from_range=(0, 1), poses=pose_input)
555
+ util_vis.tb_image(opt, self.tb, step, split, "image_input_map_est", var.rgb_input_map, masks=None, from_range=(0, 1),
556
+ poses=var.pose_pred if 'pose_pred' in var else var.pose)
557
+ util_vis.tb_image(opt, self.tb, step, split, "mask_input_map", var.mask_input_map)
558
+ if 'depth_pred' in var:
559
+ util_vis.tb_image(opt, self.tb, step, split, "depth_est_map", var.depth_pred)
560
+ if 'depth_input_map' in var:
561
+ util_vis.tb_image(opt, self.tb, step, split, "depth_input_map", var.depth_input_map)
562
+
563
+ @torch.no_grad()
564
+ def dump_results(self, opt, var, ep, write_new=False, train=False):
565
+ # create the dir
566
+ current_folder = "dump_{}".format(opt.data.dataset_test) if train == False else "vis_{}".format(ep)
567
+ os.makedirs("{}/{}/".format(opt.output_path, current_folder), exist_ok=True)
568
+
569
+ # save the results
570
+ if 'pose_input' in var:
571
+ pose_input = var.pose_input
572
+ elif 'pose_gt' in var:
573
+ pose_input = var.pose_gt
574
+ else:
575
+ pose_input = None
576
+ util_vis.dump_images(opt, var.idx, "image_input", var.rgb_input_map, masks=None, from_range=(0, 1), poses=pose_input, folder=current_folder)
577
+ util_vis.dump_images(opt, var.idx, "mask_input", var.mask_input_map, folder=current_folder)
578
+ util_vis.dump_meshes(opt, var.idx, "mesh", var.mesh_pred, folder=current_folder)
579
+ util_vis.dump_meshes_viz(opt, var.idx, "mesh_viz", var.mesh_pred, folder=current_folder) # image frames + gifs
580
+ if 'depth_pred' in var:
581
+ util_vis.dump_depths(opt, var.idx, "depth_est", var.depth_pred, var.mask_input_map, rescale=True, folder=current_folder)
582
+ if 'depth_input_map' in var:
583
+ util_vis.dump_depths(opt, var.idx, "depth_input", var.depth_input_map, var.mask_input_map, rescale=True, folder=current_folder)
584
+ if 'gt_surf_points' in var and 'seen_points' in var:
585
+ util_vis.dump_pointclouds_compare(opt, var.idx, "seen_surface", var.seen_points, var.gt_surf_points, folder=current_folder)
586
+ if 'attn_vis' in var:
587
+ util_vis.dump_attentions(opt, var.idx, "attn", var.attn_vis, folder=current_folder)
588
+ if 'attn_pc' in var:
589
+ util_vis.dump_pointclouds(opt, var.idx, "attn_pc", var.attn_pc["points"], var.attn_pc["colors"], folder=current_folder)
590
+ if 'dpc' in var:
591
+ util_vis.dump_pointclouds_compare(opt, var.idx, "pointclouds_comp", var.dpc_pred, var.dpc.points, folder=current_folder)
592
+
593
+ def save_checkpoint(self, opt, ep=0, it=0, best_val=np.inf, best_ep=1, latest=False, best=False):
594
+ util.save_checkpoint(opt, self, ep=ep, it=it, best_val=best_val, best_ep=best_ep, latest=latest, best=best)
595
+ if not latest:
596
+ print("checkpoint saved: ({0}) {1}, epoch {2} (iteration {3})".format(opt.group, opt.name, ep, it))
597
+ if best:
598
+ print("Saving the current model as the best...")
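As a quick illustration of the parameter grouping in setup_optimizer above, this toy sketch (a small stand-in module, not the actual graph) splits parameters the same way: biases and 1-D tensors get no weight decay, everything else gets the configured decay. The lr and weight_decay values mirror options/shape.yaml but are otherwise placeholders.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.LayerNorm(32), nn.Linear(32, 8))

decay, no_decay = [], []
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    # biases and low-dimensional params (e.g. LayerNorm weights) are exempt from weight decay
    if param.ndim <= 1 or name.endswith(".bias"):
        no_decay.append(param)
    else:
        decay.append(param)

optim = torch.optim.AdamW(
    [{"params": no_decay, "lr": 3e-5, "weight_decay": 0.0},
     {"params": decay, "lr": 3e-5, "weight_decay": 0.05}],
    betas=(0.9, 0.95),
)
print(len(no_decay), len(decay))   # 4 no-decay tensors (3 biases + LayerNorm weight), 2 decay matrices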
options/depth.yaml ADDED
@@ -0,0 +1,72 @@
1
+ group: depth
2
+ name: depth_est
3
+ load:
4
+
5
+ batch_size: 44
6
+ debug: false
7
+ image_size: [224,224]
8
+ gpu: 0
9
+ max_epoch: 15
10
+ output_root: output
11
+ resume: false
12
+ seed: 0
13
+ yaml:
14
+
15
+ arch:
16
+ depth:
17
+ pretrained: model/depth/pretrained_weights/omnidata_dpt_depth_v2.ckpt
18
+
19
+ eval:
20
+ batch_size: 44
21
+ n_vis: 50
22
+ depth_cap:
23
+ d_thresholds: [1.02,1.05,1.1,1.2]
24
+
25
+ data:
26
+ num_classes_test: 15
27
+ max_img_cat:
28
+ dataset_train: synthetic
29
+ dataset_test: synthetic
30
+ num_workers: 6
31
+ bgcolor: 1
32
+ pix3d:
33
+ cat:
34
+ ocrtoc:
35
+ cat:
36
+ erode_mask: 10
37
+ synthetic:
38
+ subset: objaverse_LVIS,ShapeNet55
39
+ percentage: 1
40
+ train_sub:
41
+ val_sub:
42
+
43
+ training:
44
+ n_sdf_points: 4096
45
+ depth_loss:
46
+ grad_reg: 0.1
47
+ depth_inv: true
48
+ mask_shrink: false
49
+
50
+ loss_weight:
51
+ depth: 1
52
+ intr: 10
53
+
54
+ optim:
55
+ lr: 3.e-5
56
+ weight_decay: 0.05
57
+ clip_norm:
58
+ amp: false
59
+ accum: 1
60
+ sched: false
61
+
62
+ tb:
63
+ num_images: [4,8]
64
+
65
+ freq:
66
+ print: 200
67
+ print_eval: 100
68
+ scalar: 1000 # iterations
69
+ vis: 1000 # iterations
70
+ save_vis: 1000
71
+ ckpt_latest: 1000 # iterations
72
+ eval: 1
options/shape.yaml ADDED
@@ -0,0 +1,110 @@
1
+ group: shape
2
+ name: shape_recon
3
+ load:
4
+
5
+ batch_size: 28
6
+ debug: false
7
+ profile: false
8
+ image_size: [224,224]
9
+ gpu: 0
10
+ max_epoch: 15
11
+ output_root: output
12
+ resume: false
13
+ seed: 0
14
+ yaml:
15
+
16
+ pretrain:
17
+ depth: weights/depth.ckpt
18
+
19
+ arch:
20
+ # general
21
+ num_heads: 8
22
+ latent_dim: 256
23
+ win_size: 16
24
+ # depth
25
+ depth:
26
+ encoder: resnet
27
+ n_blocks: 12
28
+ dsp: 2
29
+ pretrained: model/depth/pretrained_weights/omnidata_dpt_depth_v2.ckpt
30
+ # rgb
31
+ rgb:
32
+ encoder:
33
+ n_blocks: 12
34
+ # implicit
35
+ impl:
36
+ n_channels: 256
37
+ # attention-related
38
+ att_blocks: 2
39
+ mlp_ratio: 4.
40
+ posenc_perlayer: false
41
+ # mlp-related
42
+ mlp_layers: 8
43
+ posenc_3D: 0
44
+ skip_in: [2,4,6]
45
+
46
+ eval:
47
+ batch_size: 2
48
+ brute_force: false
49
+ n_vis: 50
50
+ vox_res: 64
51
+ num_points: 10000
52
+ range: [-1.5,1.5]
53
+ icp: false
54
+ f_thresholds: [0.005, 0.01, 0.02, 0.05, 0.1, 0.2]
55
+
56
+ data:
57
+ num_classes_test: 15
58
+ max_img_cat:
59
+ dataset_train: synthetic
60
+ dataset_test: synthetic
61
+ num_workers: 6
62
+ bgcolor: 1
63
+ pix3d:
64
+ cat:
65
+ ocrtoc:
66
+ cat:
67
+ erode_mask:
68
+ synthetic:
69
+ subset: objaverse_LVIS,ShapeNet55
70
+ percentage: 1
71
+ train_sub:
72
+ val_sub:
73
+
74
+ training:
75
+ n_sdf_points: 4096
76
+ shape_loss:
77
+ impt_weight: 1
78
+ impt_thres: 0.01
79
+ depth_loss:
80
+ grad_reg: 0.1
81
+ depth_inv: true
82
+ mask_shrink: false
83
+
84
+ loss_weight:
85
+ shape: 1
86
+ depth:
87
+ intr:
88
+
89
+ optim:
90
+ lr: 3.e-5
91
+ lr_ft: 1.e-5
92
+ weight_decay: 0.05
93
+ fix_dpt: false
94
+ fix_clip: true
95
+ clip_norm:
96
+ amp: false
97
+ accum: 1
98
+ sched: false
99
+
100
+ tb:
101
+ num_images: [4,8]
102
+
103
+ freq:
104
+ print: 200
105
+ print_eval: 100
106
+ scalar: 1000 # iterations
107
+ vis: 1000 # iterations
108
+ save_vis: 1000
109
+ ckpt_latest: 1000 # iterations
110
+ eval: 1
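The two YAML files above are parsed by utils/options.py (added further below). As a rough sketch of how such a config becomes attribute-style options, the snippet below loads a fragment with PyYAML and wraps it in a minimal dict subclass; the real EasyDict in utils.util may differ in details. Note that with PyYAML's default resolver, '3e-5' would load as a string; the '3.e-5' form used in the configs loads as a float.

import yaml

class AttrDict(dict):
    # minimal attribute-access dict, standing in for utils.util.EasyDict
    __getattr__ = dict.__getitem__
    def __init__(self, d):
        super().__init__({k: AttrDict(v) if isinstance(v, dict) else v for k, v in d.items()})

cfg_text = (
    "optim:\n"
    "  lr: 3.e-5\n"
    "  weight_decay: 0.05\n"
    "arch:\n"
    "  win_size: 16\n"
    "  latent_dim: 256\n"
)
opt = AttrDict(yaml.safe_load(cfg_text))
print(opt.optim.lr, opt.arch.win_size)   # 3e-05 16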
requirements.txt ADDED
@@ -0,0 +1,95 @@
1
+ absl-py==2.0.0
2
+ anyio==4.2.0
3
+ attrs==23.2.0
4
+ cachetools==5.3.2
5
+ certifi==2023.11.17
6
+ chardet==5.2.0
7
+ charset-normalizer==3.3.2
8
+ colorlog==6.8.0
9
+ contourpy==1.2.0
10
+ cycler==0.12.1
11
+ docopt==0.6.2
12
+ embreex==2.17.7.post4
13
+ exceptiongroup==1.2.0
14
+ filelock==3.13.1
15
+ fonttools==4.47.2
16
+ freetype-py==2.4.0
17
+ fsspec==2023.12.2
18
+ google-auth==2.26.2
19
+ google-auth-oauthlib==1.2.0
20
+ grpcio==1.60.0
21
+ h11==0.14.0
22
+ httpcore==1.0.2
23
+ httpx==0.26.0
24
+ huggingface-hub==0.20.2
25
+ idna==3.6
26
+ imageio==2.33.1
27
+ Jinja2==3.1.3
28
+ jsonschema==4.20.0
29
+ jsonschema-specifications==2023.12.1
30
+ kiwisolver==1.4.5
31
+ lxml==5.1.0
32
+ mapbox-earcut==1.0.1
33
+ Markdown==3.5.2
34
+ MarkupSafe==2.1.3
35
+ matplotlib==3.8.2
36
+ mpmath==1.3.0
37
+ networkx==3.2.1
38
+ ninja==1.11.1.1
39
+ numpy==1.26.3
40
+ nvidia-cublas-cu12==12.1.3.1
41
+ nvidia-cuda-cupti-cu12==12.1.105
42
+ nvidia-cuda-nvrtc-cu12==12.1.105
43
+ nvidia-cuda-runtime-cu12==12.1.105
44
+ nvidia-cudnn-cu12==8.9.2.26
45
+ nvidia-cufft-cu12==11.0.2.54
46
+ nvidia-curand-cu12==10.3.2.106
47
+ nvidia-cusolver-cu12==11.4.5.107
48
+ nvidia-cusparse-cu12==12.1.0.106
49
+ nvidia-nccl-cu12==2.18.1
50
+ nvidia-nvjitlink-cu12==12.3.101
51
+ nvidia-nvtx-cu12==12.1.105
52
+ oauthlib==3.2.2
53
+ opencv-python==4.9.0.80
54
+ packaging==23.2
55
+ pillow==10.2.0
56
+ pipreqs==0.4.13
57
+ protobuf==4.23.4
58
+ pyasn1==0.5.1
59
+ pyasn1-modules==0.3.0
60
+ pycollada==0.8
61
+ pyglet==2.0.10
62
+ PyMCubes==0.1.4
63
+ PyOpenGL==3.1.0
64
+ pyparsing==3.1.1
65
+ pyrender==0.1.45
66
+ python-dateutil==2.8.2
67
+ PyYAML==6.0.1
68
+ referencing==0.32.1
69
+ requests==2.31.0
70
+ requests-oauthlib==1.3.1
71
+ rpds-py==0.16.2
72
+ rsa==4.9
73
+ Rtree==1.1.0
74
+ safetensors==0.4.1
75
+ scipy==1.11.4
76
+ shapely==2.0.2
77
+ six==1.16.0
78
+ sniffio==1.3.0
79
+ svg.path==6.3
80
+ sympy==1.12
81
+ tensorboard==2.15.1
82
+ tensorboard-data-server==0.7.2
83
+ timm==0.9.12
84
+ torch==2.1.2
85
+ torchvision==0.16.2
86
+ tqdm==4.66.1
87
+ trimesh==4.0.9
88
+ triton==2.1.0
89
+ typing_extensions==4.9.0
90
+ urllib3==2.1.0
91
+ vhacdx==0.0.5
92
+ Werkzeug==3.0.1
93
+ xxhash==3.4.1
94
+ yarg==0.1.9
95
+ rembg
utils/camera.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # partially from https://github.com/chenhsuanlin/signed-distance-SRN
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ class Pose():
7
+ # a pose class with util methods
8
+ def __call__(self, R=None, t=None):
9
+ assert(R is not None or t is not None)
10
+ if R is None:
11
+ if not isinstance(t, torch.Tensor): t = torch.tensor(t)
12
+ R = torch.eye(3, device=t.device).repeat(*t.shape[:-1], 1, 1)
13
+ elif t is None:
14
+ if not isinstance(R, torch.Tensor): R = torch.tensor(R)
15
+ t = torch.zeros(R.shape[:-1], device=R.device)
16
+ else:
17
+ if not isinstance(R, torch.Tensor): R = torch.tensor(R)
18
+ if not isinstance(t, torch.Tensor): t = torch.tensor(t)
19
+ assert(R.shape[:-1]==t.shape and R.shape[-2:]==(3, 3))
20
+ R = R.float()
21
+ t = t.float()
22
+ pose = torch.cat([R, t[..., None]], dim=-1) # [..., 3, 4]
23
+ assert(pose.shape[-2:]==(3, 4))
24
+ return pose
25
+
26
+ def invert(self, pose, use_inverse=False):
27
+ R, t = pose[..., :3], pose[..., 3:]
28
+ R_inv = R.inverse() if use_inverse else R.transpose(-1, -2)
29
+ t_inv = (-R_inv@t)[..., 0]
30
+ pose_inv = self(R=R_inv, t=t_inv)
31
+ return pose_inv
32
+
33
+ def compose(self, pose_list):
34
+ # pose_new(x) = poseN(...(pose2(pose1(x)))...)
35
+ pose_new = pose_list[0]
36
+ for pose in pose_list[1:]:
37
+ pose_new = self.compose_pair(pose_new, pose)
38
+ return pose_new
39
+
40
+ def compose_pair(self, pose_a, pose_b):
41
+ # pose_new(x) = pose_b(pose_a(x))
42
+ R_a, t_a = pose_a[..., :3], pose_a[..., 3:]
43
+ R_b, t_b = pose_b[..., :3], pose_b[..., 3:]
44
+ R_new = R_b@R_a
45
+ t_new = (R_b@t_a+t_b)[..., 0]
46
+ pose_new = self(R=R_new, t=t_new)
47
+ return pose_new
48
+
49
+ pose = Pose()
50
+
51
+ # unit sphere normalization
52
+ def valid_norm_fac(seen_points, mask):
53
+ '''
54
+ seen_points: [B, H*W, 3]
55
+ mask: [B, 1, H, W], boolean
56
+ '''
57
+ # get valid points
58
+ batch_size = seen_points.shape[0]
59
+ # [B, H*W]
60
+ mask = mask.view(batch_size, seen_points.shape[1])
61
+
62
+ # get mean and variance by sample
63
+ means, max_dists = [], []
64
+ for b in range(batch_size):
65
+ # [N_valid, 3]
66
+ seen_points_valid = seen_points[b][mask[b]]
67
+ # [3]
68
+ xyz_mean = torch.mean(seen_points_valid, dim=0)
69
+ seen_points_valid_zmean = seen_points_valid - xyz_mean
70
+ # scalar
71
+ max_dist = torch.max(seen_points_valid_zmean.norm(dim=1))
72
+ means.append(xyz_mean)
73
+ max_dists.append(max_dist)
74
+ # [B, 3]
75
+ means = torch.stack(means, dim=0)
76
+ # [B]
77
+ max_dists = torch.stack(max_dists, dim=0)
78
+ return means, max_dists
79
+
80
+ def get_pixel_grid(opt, H, W):
81
+ y_range = torch.arange(H, dtype=torch.float32).to(opt.device)
82
+ x_range = torch.arange(W, dtype=torch.float32).to(opt.device)
83
+ Y, X = torch.meshgrid(y_range, x_range, indexing='ij')
84
+ Z = torch.ones_like(Y)
85
+ xyz_grid = torch.stack([X, Y, Z],dim=-1).view(-1,3)
86
+ return xyz_grid
87
+
88
+ def unproj_depth(opt, depth, intr):
89
+ '''
90
+ depth: [B, 1, H, W]
91
+ intr: [B, 3, 3]
92
+ '''
93
+ batch_size, _, H, W = depth.shape
94
+ assert opt.H == H == W
95
+ depth = depth.squeeze(1)
96
+
97
+ # [B, 3, 3]
98
+ K_inv = torch.linalg.inv(intr).float()
99
+ # [1, H*W,3]
100
+ pixel_grid = get_pixel_grid(opt, H, W).unsqueeze(0)
101
+ # [B, H*W,3]
102
+ pixel_grid = pixel_grid.repeat(batch_size, 1, 1)
103
+ # [B, 3, H*W]
104
+ ray_dirs = K_inv @ pixel_grid.permute(0, 2, 1).contiguous()
105
+ # [B, H*W, 3], in camera coordinates
106
+ seen_points = ray_dirs.permute(0, 2, 1).contiguous() * depth.view(batch_size, H*W, 1)
107
+
108
+ return seen_points
109
+
110
+ def to_hom(X):
111
+ '''
112
+ X: [B, N, 3]
113
+ Returns:
114
+ X_hom: [B, N, 4]
115
+ '''
116
+ X_hom = torch.cat([X, torch.ones_like(X[..., :1])], dim=-1)
117
+ return X_hom
118
+
119
+ def world2cam(X_world, pose):
120
+ '''
121
+ X_world: [B, N, 3]
122
+ pose: [B, 3, 4]
123
+ Returns:
124
+ X_cam: [B, N, 3]
125
+ '''
126
+ X_hom = to_hom(X_world)
127
+ X_cam = X_hom @ pose.transpose(-1, -2)
128
+ return X_cam
129
+
130
+ def cam2img(X_cam, cam_intr):
131
+ '''
132
+ X_cam: [B, N, 3]
133
+ cam_intr: [B, 3, 3]
134
+ Returns:
135
+ X_img: [B, N, 3]
136
+ '''
137
+ X_img = X_cam @ cam_intr.transpose(-1, -2)
138
+ return X_img
139
+
140
+ def proj_points(opt, points, intr, pose):
141
+ '''
142
+ points: [B, N, 3]
143
+ intr: [B, 3, 3]
144
+ pose: [B, 3, 4]
145
+ '''
146
+ # [B, N, 3]
147
+ points_cam = world2cam(points, pose)
148
+ # [B, N]
149
+ depth = points_cam[..., 2]
150
+ # [B, N, 3]
151
+ points_img = cam2img(points_cam, intr)
152
+ # [B, N, 2]
153
+ points_2D = points_img[..., :2] / points_img[..., 2:]
154
+ return points_2D, depth
155
+
156
+ def azim_to_rotation_matrix(azim, representation='angle'):
157
+ """Azim is angle with vector +X, rotated in XZ plane"""
158
+ if representation == 'rad':
159
+ # [B, ]
160
+ cos, sin = torch.cos(azim), torch.sin(azim)
161
+ elif representation == 'angle':
162
+ # [B, ]
163
+ azim = azim * np.pi / 180
164
+ cos, sin = torch.cos(azim), torch.sin(azim)
165
+ elif representation == 'trig':
166
+ # [B, 2]
167
+ cos, sin = azim[:, 0], azim[:, 1]
168
+ R = torch.eye(3, device=azim.device)[None].repeat(len(azim), 1, 1)
169
+ zeros = torch.zeros(len(azim), device=azim.device)
170
+ R[:, 0, :] = torch.stack([cos, zeros, sin], dim=-1)
171
+ R[:, 2, :] = torch.stack([-sin, zeros, cos], dim=-1)
172
+ return R
173
+
174
+ def elev_to_rotation_matrix(elev, representation='angle'):
175
+ """Angle with vector +Z in YZ plane"""
176
+ if representation == 'rad':
177
+ # [B, ]
178
+ cos, sin = torch.cos(elev), torch.sin(elev)
179
+ elif representation == 'angle':
180
+ # [B, ]
181
+ elev = elev * np.pi / 180
182
+ cos, sin = torch.cos(elev), torch.sin(elev)
183
+ elif representation == 'trig':
184
+ # [B, 2]
185
+ cos, sin = elev[:, 0], elev[:, 1]
186
+ R = torch.eye(3, device=elev.device)[None].repeat(len(elev), 1, 1)
187
+ R[:, 1, 1:] = torch.stack([cos, -sin], dim=-1)
188
+ R[:, 2, 1:] = torch.stack([sin, cos], dim=-1)
189
+ return R
190
+
191
+ def roll_to_rotation_matrix(roll, representation='angle'):
192
+ """Angle with vector +X in XY plane"""
193
+ if representation == 'rad':
194
+ # [B, ]
195
+ cos, sin = torch.cos(roll), torch.sin(roll)
196
+ elif representation == 'angle':
197
+ # [B, ]
198
+ roll = roll * np.pi / 180
199
+ cos, sin = torch.cos(roll), torch.sin(roll)
200
+ elif representation == 'trig':
201
+ # [B, 2]
202
+ cos, sin = roll[:, 0], roll[:, 1]
203
+ R = torch.eye(3, device=roll.device)[None].repeat(len(roll), 1, 1)
204
+ R[:, 0, :2] = torch.stack([cos, sin], dim=-1)
205
+ R[:, 1, :2] = torch.stack([-sin, cos], dim=-1)
206
+ return R
207
+
208
+ def get_rotation_sphere(azim_sample=4, elev_sample=4, roll_sample=4, scales=[1.0], device='cuda'):
209
+ rotations = []
210
+ azim_range = [0, 360]
211
+ elev_range = [0, 360]
212
+ roll_range = [0, 360]
213
+ azims = np.linspace(azim_range[0], azim_range[1], num=azim_sample, endpoint=False)
214
+ elevs = np.linspace(elev_range[0], elev_range[1], num=elev_sample, endpoint=False)
215
+ rolls = np.linspace(roll_range[0], roll_range[1], num=roll_sample, endpoint=False)
216
+ for scale in scales:
217
+ for azim in azims:
218
+ for elev in elevs:
219
+ for roll in rolls:
220
+ Ry = azim_to_rotation_matrix(torch.tensor([azim]))
221
+ Rx = elev_to_rotation_matrix(torch.tensor([elev]))
222
+ Rz = roll_to_rotation_matrix(torch.tensor([roll]))
223
+ R_permute = torch.tensor([
224
+ [-1, 0, 0],
225
+ [0, 0, -1],
226
+ [0, -1, 0]
227
+ ]).float().to(Ry.device).unsqueeze(0).expand_as(Ry)
228
+ R = scale * Rz@Rx@Ry@R_permute
229
+ rotations.append(R.to(device).float())
230
+ return torch.cat(rotations, dim=0)
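A small numerical sanity check of the unprojection used in unproj_depth above: a pixel (u, v) with depth d maps to camera coordinates K^{-1} [u, v, 1]^T * d, i.e. ((u - cx) / f * d, (v - cy) / f * d, d). The intrinsics and pixel values below are arbitrary.

import torch

W = H = 224
f, cx, cy = 300.0, W / 2, H / 2
K = torch.tensor([[f, 0.0, cx],
                  [0.0, f, cy],
                  [0.0, 0.0, 1.0]])

# one pixel at (u, v) = (168, 56) observed at depth 2.0
pixel_hom = torch.tensor([168.0, 56.0, 1.0])
depth = 2.0
point_cam = torch.linalg.inv(K) @ pixel_hom * depth

expected = torch.tensor([(168 - cx) / f * depth, (56 - cy) / f * depth, depth])
print(point_cam, expected)   # identical up to float precision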
utils/eval_3D.py ADDED
@@ -0,0 +1,133 @@
1
+ import numpy as np
2
+ import torch
3
+ import threading
4
+ import mcubes
5
+ import trimesh
6
+ from utils.util_vis import show_att_on_image
7
+ from utils.camera import get_rotation_sphere
8
+
9
+ @torch.no_grad()
10
+ def get_dense_3D_grid(opt, var, N=None):
11
+ batch_size = len(var.idx)
12
+ N = N or opt.eval.vox_res
13
+ # grid range given by opt.eval.range, e.g. [-1.5, 1.5] in options/shape.yaml
14
+ range_min, range_max = opt.eval.range
15
+ grid = torch.linspace(range_min, range_max, N+1, device=opt.device)
16
+ points_3D = torch.stack(torch.meshgrid(grid, grid, grid, indexing='ij'), dim=-1) # [N, N, N, 3]
17
+ # note: each axis actually has N+1 samples, so the grid is [N+1, N+1, N+1, 3]
18
+ points_3D = points_3D.repeat(batch_size, 1, 1, 1, 1) # [B, N, N, N, 3]
19
+ return points_3D
20
+
21
+ @torch.no_grad()
22
+ def compute_level_grid(opt, impl_network, latent_depth, latent_semantic, points_3D, images, vis_attn=False):
23
+ # needed for amp
24
+ latent_depth = latent_depth.to(torch.float32) if latent_depth is not None else None
25
+ latent_semantic = latent_semantic.to(torch.float32) if latent_semantic is not None else None
26
+
27
+ # process points in sliced way
28
+ batch_size = points_3D.shape[0]
29
+ N = points_3D.shape[1]
30
+ assert N == points_3D.shape[2] == points_3D.shape[3]
31
+ assert points_3D.shape[4] == 3
32
+
33
+ points_3D = points_3D.view(batch_size, N, N*N, 3)
34
+ occ = []
35
+ attn = []
36
+ for i in range(N):
37
+ # [B, N*N, 3]
38
+ points_slice = points_3D[:, i]
39
+ # [B, N*N, 3] -> [B, N*N], [B, N*N, 1+feat_res**2]
40
+ occ_slice, attn_slice = impl_network(latent_depth, latent_semantic, points_slice)
41
+ occ.append(occ_slice)
42
+ attn.append(attn_slice.detach())
43
+ # [B, N, N*N] -> [B, N, N, N]
44
+ occ = torch.stack(occ, dim=1).view(batch_size, N, N, N)
45
+ occ = torch.sigmoid(occ)
46
+ if vis_attn:
47
+ N_global = 1
48
+ feat_res = opt.H // opt.arch.win_size
49
+ attn = torch.stack(attn, dim=1).view(batch_size, N, N, N, N_global+feat_res**2)
50
+ # average along Z, [B, N, N, N_global+feat_res**2]
51
+ attn = torch.mean(attn, dim=3)
52
+ # [B, N, N, N_global] -> [B, N, N, 1]
53
+ attn_global = attn[:, :, :, :N_global].sum(dim=-1, keepdim=True)
54
+ # [B, N, N, feat_res, feat_res]
55
+ attn_local = attn[:, :, :, N_global:].view(batch_size, N, N, feat_res, feat_res)
56
+ # [B, N, N, feat_res, feat_res]
57
+ attn_vis = attn_global.unsqueeze(-1) + attn_local
58
+ # list of frame lists
59
+ images_vis = []
60
+ for b in range(batch_size):
61
+ images_vis_sample = []
62
+ for row in range(0, N, 8):
63
+ if row % 16 == 0:
64
+ col_range = range(0, N//8*8+1, 8)
65
+ else:
66
+ col_range = range(N//8*8, -1, -8)
67
+ for col in col_range:
68
+ # [feat_res, feat_res], x is col
69
+ attn_curr = attn_vis[b, col, row]
70
+ attn_curr = torch.nn.functional.interpolate(
71
+ attn_curr.unsqueeze(0).unsqueeze(0), size=(opt.H, opt.W),
72
+ mode='bilinear', align_corners=False
73
+ ).squeeze(0).squeeze(0).cpu().numpy()
74
+ attn_curr /= attn_curr.max()
75
+ # [feat_res, feat_res, 3]
76
+ image_curr = images[b].permute(1, 2, 0).cpu().numpy()
77
+ # merge the image and the attention
78
+ images_vis_sample.append(show_att_on_image(image_curr, attn_curr))
79
+ images_vis.append(images_vis_sample)
80
+ return occ, images_vis if vis_attn else None
81
+
82
+ @torch.no_grad()
83
+ def standardize_pc(pc):
84
+ assert len(pc.shape) == 3
85
+ pc_mean = pc.mean(dim=1, keepdim=True)
86
+ pc_zmean = pc - pc_mean
87
+ origin_distance = (pc_zmean**2).sum(dim=2, keepdim=True).sqrt()
88
+ scale = torch.sqrt(torch.sum(origin_distance**2, dim=1, keepdim=True) / pc.shape[1])
89
+ pc_standardized = pc_zmean / (scale * 2)
90
+ return pc_standardized
91
+
92
+ @torch.no_grad()
93
+ def normalize_pc(pc):
94
+ assert len(pc.shape) == 3
95
+ pc_mean = pc.mean(dim=1, keepdim=True)
96
+ pc_zmean = pc - pc_mean
97
+ length_x = pc_zmean[:, :, 0].max(dim=-1)[0] - pc_zmean[:, :, 0].min(dim=-1)[0]
98
+ length_y = pc_zmean[:, :, 1].max(dim=-1)[0] - pc_zmean[:, :, 1].min(dim=-1)[0]
99
+ length_max = torch.stack([length_x, length_y], dim=-1).max(dim=-1)[0].unsqueeze(-1).unsqueeze(-1)
100
+ pc_normalized = pc_zmean / (length_max + 1.e-7)
101
+ return pc_normalized
102
+
103
+ def convert_to_explicit(opt, level_grids, isoval=0., to_pointcloud=False):
104
+ N = len(level_grids)
105
+ meshes = [None]*N
106
+ pointclouds = [None]*N if to_pointcloud else None
107
+ threads = [threading.Thread(target=convert_to_explicit_worker,
108
+ args=(opt, i, level_grids[i], isoval, meshes),
109
+ kwargs=dict(pointclouds=pointclouds),
110
+ daemon=False) for i in range(N)]
111
+ for t in threads: t.start()
112
+ for t in threads: t.join()
113
+ if to_pointcloud:
114
+ pointclouds = np.stack(pointclouds, axis=0)
115
+ return meshes, pointclouds
116
+ else: return meshes
117
+
118
+ def convert_to_explicit_worker(opt, i, level_vox_i, isoval, meshes, pointclouds=None):
119
+ # use marching cubes to convert implicit surface to mesh
120
+ vertices, faces = mcubes.marching_cubes(level_vox_i, isovalue=isoval)
121
+ assert(level_vox_i.shape[0]==level_vox_i.shape[1]==level_vox_i.shape[2])
122
+ S = level_vox_i.shape[0]
123
+ range_min, range_max = opt.eval.range
124
+ # marching cubes treat every cube as unit length
125
+ vertices = vertices/S*(range_max-range_min)+range_min
126
+ mesh = trimesh.Trimesh(vertices, faces)
127
+ meshes[i] = mesh
128
+ if pointclouds is not None:
129
+ # randomly sample on mesh to get uniform dense point cloud
130
+ if len(mesh.triangles)!=0:
131
+ points = mesh.sample(opt.eval.num_points)
132
+ else: points = np.zeros([opt.eval.num_points, 3])
133
+ pointclouds[i] = points
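A self-contained sketch of the marching-cubes path in convert_to_explicit_worker above, run on an analytic sphere instead of the network's occupancy grid; mcubes (PyMCubes) and trimesh are the packages pinned in requirements.txt.

import numpy as np
import mcubes
import trimesh

# occupancy-like level grid for a unit sphere on a [-1.5, 1.5]^3 lattice
N = 64
range_min, range_max = -1.5, 1.5
grid = np.linspace(range_min, range_max, N)
X, Y, Z = np.meshgrid(grid, grid, grid, indexing="ij")
level = 1.0 - np.sqrt(X**2 + Y**2 + Z**2)    # positive inside the sphere, zero on the surface

vertices, faces = mcubes.marching_cubes(level, 0.0)
# marching cubes works in voxel units, so rescale the vertices back to world coordinates
vertices = vertices / N * (range_max - range_min) + range_min

mesh = trimesh.Trimesh(vertices, faces)
points = mesh.sample(10000)                  # uniform surface samples, as in the worker
print(len(mesh.vertices), points.shape)      # a sphere of radius ~1, (10000, 3) samples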
utils/eval_depth.py ADDED
@@ -0,0 +1,110 @@
1
+ # based on https://gist.github.com/ranftlr/45f4c7ddeb1bbb88d606bc600cab6c8d
2
+
3
+ import torch
4
+
5
+ class DepthMetric:
6
+ def __init__(self, thresholds=[1.25, 1.25**2, 1.25**3], depth_cap=None, prediction_type='depth'):
7
+ self.thresholds = thresholds
8
+ self.depth_cap = depth_cap
9
+ self.metric_keys = self.get_metric_keys()
10
+ self.prediction_type = prediction_type
11
+
12
+ def compute_scale_and_shift(self, prediction, target, mask):
13
+ # system matrix: A = [[a_00, a_01], [a_10, a_11]]
14
+ a_00 = torch.sum(mask * prediction * prediction, (1, 2))
15
+ a_01 = torch.sum(mask * prediction, (1, 2))
16
+ a_11 = torch.sum(mask, (1, 2))
17
+
18
+ # right hand side: b = [b_0, b_1]
19
+ b_0 = torch.sum(mask * prediction * target, (1, 2))
20
+ b_1 = torch.sum(mask * target, (1, 2))
21
+
22
+ # solution: x = A^-1 . b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b
23
+ x_0 = torch.zeros_like(b_0)
24
+ x_1 = torch.zeros_like(b_1)
25
+
26
+ det = a_00 * a_11 - a_01 * a_01
27
+ # A needs to be a positive definite matrix.
28
+ valid = det > 0
29
+
30
+ x_0[valid] = (a_11[valid] * b_0[valid] - a_01[valid] * b_1[valid]) / det[valid]
31
+ x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / det[valid]
32
+
33
+ return x_0, x_1
34
+
35
+ def get_metric_keys(self):
36
+ metric_keys = []
37
+ for threshold in self.thresholds:
38
+ metric_keys.append('d>{}'.format(threshold))
39
+ metric_keys.append('rmse')
40
+ metric_keys.append('l1_err')
41
+ metric_keys.append('abs_rel')
42
+ return metric_keys
43
+
44
+ def compute_metrics(self, prediction, target, mask):
45
+ # check inputs
46
+ prediction = prediction.float()
47
+ target = target.float()
48
+ mask = mask.float()
49
+ assert prediction.shape == target.shape == mask.shape
50
+ assert len(prediction.shape) == 4
51
+ assert prediction.shape[1] == 1
52
+ assert prediction.dtype == target.dtype == mask.dtype == torch.float32
53
+
54
+ # process inputs
55
+ prediction = prediction.squeeze(1)
56
+ target = target.squeeze(1)
57
+ mask = (mask.squeeze(1) > 0.5).long()
58
+
59
+ # output dict
60
+ metrics = {}
61
+
62
+ # get the predicted disparity
63
+ prediction_disparity = torch.zeros_like(prediction)
64
+ if self.prediction_type == 'depth':
65
+ prediction_disparity[mask == 1] = 1.0 / (prediction[mask == 1] + 1.e-6)
66
+ elif self.prediction_type == 'disparity':
67
+ prediction_disparity[mask == 1] = prediction[mask == 1]
68
+ else:
69
+ raise ValueError('Unknown prediction type: {}'.format(self.prediction_type))
70
+
71
+ # transform predicted disparity to align with depth
72
+ target_disparity = torch.zeros_like(target)
73
+ target_disparity[mask == 1] = 1.0 / target[mask == 1]
74
+ scale, shift = self.compute_scale_and_shift(prediction_disparity, target_disparity, mask)
75
+ prediction_aligned = scale.view(-1, 1, 1) * prediction_disparity + shift.view(-1, 1, 1)
76
+
77
+ if self.depth_cap is not None:
78
+ disparity_cap = 1.0 / self.depth_cap
79
+ prediction_aligned[prediction_aligned < disparity_cap] = disparity_cap
80
+
81
+ prediciton_depth = 1.0 / prediction_aligned
82
+
83
+ # delta > threshold, [batch_size, ]
84
+ for threshold in self.thresholds:
85
+ err = torch.zeros_like(prediciton_depth, dtype=torch.float)
86
+ err[mask == 1] = torch.max(
87
+ prediciton_depth[mask == 1] / target[mask == 1],
88
+ target[mask == 1] / prediciton_depth[mask == 1],
89
+ )
90
+ err[mask == 1] = (err[mask == 1] > threshold).float()
91
+ metrics['d>{}'.format(threshold)] = torch.sum(err, (1, 2)) / torch.sum(mask, (1, 2))
92
+
93
+ # rmse, [batch_size, ]
94
+ rmse = torch.zeros_like(prediciton_depth, dtype=torch.float)
95
+ rmse[mask == 1] = (prediciton_depth[mask == 1] - target[mask == 1]) ** 2
96
+ rmse = torch.sum(rmse, (1, 2)) / torch.sum(mask, (1, 2))
97
+ metrics['rmse'] = torch.sqrt(rmse)
98
+
99
+ # l1 error, [batch_size, ]
100
+ l1_err = torch.zeros_like(prediciton_depth, dtype=torch.float)
101
+ l1_err[mask == 1] = torch.abs(prediciton_depth[mask == 1] - target[mask == 1])
102
+ metrics['l1_err'] = torch.sum(l1_err, (1, 2)) / torch.sum(mask, (1, 2))
103
+
104
+ # abs_rel, [batch_size, ]
105
+ abs_rel = torch.zeros_like(prediciton_depth, dtype=torch.float)
106
+ abs_rel[mask == 1] = torch.abs(prediciton_depth[mask == 1] - target[mask == 1]) / target[mask == 1]
107
+ metrics['abs_rel'] = torch.sum(abs_rel, (1, 2)) / torch.sum(mask, (1, 2))
108
+
109
+ return metrics, prediciton_depth.unsqueeze(1)
110
+
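The 2x2 linear system in compute_scale_and_shift above is the closed-form least-squares solution for a scale s and shift t minimizing sum_i m_i (s * p_i + t - d_i)^2. A quick synthetic check of that arithmetic, outside the class:

import torch

# synthetic prediction/target pair related by a known scale and shift
torch.manual_seed(0)
prediction = torch.rand(1, 8, 8)
target = 2.5 * prediction + 0.3
mask = torch.ones_like(prediction)

# normal equations, exactly as in compute_scale_and_shift
a_00 = torch.sum(mask * prediction * prediction, (1, 2))
a_01 = torch.sum(mask * prediction, (1, 2))
a_11 = torch.sum(mask, (1, 2))
b_0 = torch.sum(mask * prediction * target, (1, 2))
b_1 = torch.sum(mask * target, (1, 2))
det = a_00 * a_11 - a_01 * a_01

scale = (a_11 * b_0 - a_01 * b_1) / det
shift = (-a_01 * b_0 + a_00 * b_1) / det
print(scale.item(), shift.item())   # recovers 2.5 and 0.3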
utils/layers.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from functools import partial
5
+ from timm.models.vision_transformer import Block
6
+
7
+ # 3D positional encoding, from https://github.com/bmild/nerf.
8
+ class Embedder:
9
+ def __init__(self, **kwargs):
10
+ self.kwargs = kwargs
11
+ self.create_embedding_fn()
12
+
13
+ def create_embedding_fn(self):
14
+ embed_fns = []
15
+ d = self.kwargs['input_dims']
16
+ out_dim = 0
17
+ if self.kwargs['include_input']:
18
+ embed_fns.append(lambda x: x)
19
+ out_dim += d
20
+
21
+ max_freq = self.kwargs['max_freq_log2']
22
+ N_freqs = self.kwargs['num_freqs']
23
+
24
+ if self.kwargs['log_sampling']:
25
+ freq_bands = 2. ** torch.linspace(0., max_freq, N_freqs)
26
+ else:
27
+ freq_bands = torch.linspace(2.**0., 2.**max_freq, N_freqs)
28
+
29
+ for freq in freq_bands:
30
+ for p_fn in self.kwargs['periodic_fns']:
31
+ embed_fns.append(lambda x, p_fn=p_fn,
32
+ freq=freq: p_fn(x * freq))
33
+ out_dim += d
34
+
35
+ self.embed_fns = embed_fns
36
+ self.out_dim = out_dim
37
+
38
+ def embed(self, inputs):
39
+ return torch.cat([fn(inputs) for fn in self.embed_fns], -1)
40
+
41
+ def get_embedder(posenc_res, input_dims=3):
42
+ embed_kwargs = {
43
+ 'include_input': True,
44
+ 'input_dims': input_dims,
45
+ 'max_freq_log2': posenc_res-1,
46
+ 'num_freqs': posenc_res,
47
+ 'log_sampling': True,
48
+ 'periodic_fns': [torch.sin, torch.cos],
49
+ }
50
+
51
+ embedder_obj = Embedder(**embed_kwargs)
52
+ def embed(x, eo=embedder_obj): return eo.embed(x)
53
+ return embed, embedder_obj.out_dim
54
+
55
+ class LayerScale(nn.Module):
56
+ def __init__(self, dim, init_values=1e-5, inplace=False):
57
+ super().__init__()
58
+ self.inplace = inplace
59
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
60
+
61
+ def forward(self, x):
62
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
63
+
64
+ class Bottleneck_Linear(nn.Module):
65
+ def __init__(self, n_channels):
66
+ super().__init__()
67
+ self.linear1 = nn.Linear(n_channels, n_channels)
68
+ self.norm = nn.LayerNorm(n_channels)
69
+ self.linear2 = nn.Linear(n_channels, n_channels)
70
+ self.gelu = nn.GELU()
71
+
72
+ def forward(self, x):
73
+ x = x + self.linear2(self.gelu(self.linear1(self.norm(x))))
74
+ return x
75
+
76
+ class Bottleneck_Conv(nn.Module):
77
+ def __init__(self, n_channels, kernel_size=1):
78
+ super().__init__()
79
+ self.linear1 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size, padding=kernel_size//2, bias=False)
80
+ self.bn1 = nn.BatchNorm2d(n_channels)
81
+ self.linear2 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size, padding=kernel_size//2, bias=False)
82
+ self.bn2 = nn.BatchNorm2d(n_channels)
83
+ self.relu = nn.ReLU(inplace=True)
84
+
85
+ def forward(self, x):
86
+ assert len(x.shape) in [2, 4]
87
+ input_dims = len(x.shape)
88
+ if input_dims == 2:
89
+ x = x.unsqueeze(-1).unsqueeze(-1)
90
+ residual = x
91
+ out = self.linear1(x)
92
+ out = self.bn1(out)
93
+ out = self.relu(out)
94
+ out = self.linear2(out)
95
+ out = self.bn2(out)
96
+ out += residual
97
+ out = self.relu(out)
98
+ if input_dims == 2:
99
+ out = out.squeeze(-1).squeeze(-1)
100
+ return out
101
+
102
+ class CLIPFusionBlock_Concat(nn.Module):
103
+ """
104
+ Fuse clip and rgb embeddings via concat-proj
105
+ """
106
+ def __init__(self, n_channels=512, n_layers=1, act=True):
107
+ super().__init__()
108
+ proj = [Bottleneck_Linear(2 * n_channels) for _ in range(n_layers)]
109
+ proj.append(nn.Linear(2 * n_channels, n_channels))
110
+ if act: proj.append(nn.GELU())
111
+ self.proj = nn.Sequential(*proj)
112
+
113
+ def forward(self, sem_latent, clip_latent):
114
+ """
115
+ sem_latent: [B, N, C]
116
+ clip_latent: [B, C]
117
+ """
118
+ # [B, N, 2C]
119
+ latent_concat = torch.cat([sem_latent, clip_latent.unsqueeze(1).expand_as(sem_latent)], dim=-1)
120
+ # [B, N, C]
121
+ latent = self.proj(latent_concat)
122
+ return latent
123
+
124
+ class CLIPFusionBlock_Attn(nn.Module):
125
+ """
126
+ Fuse geometric and semantic embeddings via multi-layer MHA blocks
127
+ """
128
+ def __init__(self, n_channels=512, n_layers=1, act=True):
129
+ super().__init__()
130
+ self.attn_blocks = nn.ModuleList(
131
+ [Block(
132
+ n_channels, 8, 4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1
133
+ ) for _ in range(n_layers)]
134
+ )
135
+ if act: self.attn_blocks.append(nn.GELU())
136
+
137
+ def forward(self, sem_latent, clip_latent):
138
+ """
139
+ sem_latent: [B, N, C]
140
+ clip_latent: [B, C]
141
+ """
142
+ # [B, 1+N, C], clip first
143
+ latent = torch.cat([clip_latent.unsqueeze(1), sem_latent], dim=1)
144
+ for attn_block in self.attn_blocks:
145
+ latent = attn_block(latent)
146
+ # [B, N, C]
147
+ return latent[:, 1:, :]
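A minimal usage sketch of the positional encoding above (it assumes the `get_embedder` definition in this file is in scope; tensor sizes are illustrative): `get_embedder` returns the embedding function together with its output width, which is `input_dims * (1 + 2 * num_freqs)` when `include_input` is set.

import torch

embed_fn, out_dim = get_embedder(posenc_res=6, input_dims=3)
points = torch.rand(1024, 3)     # 3D query points
features = embed_fn(points)      # [1024, out_dim]
# raw xyz (3) + sin/cos at 6 frequencies per dim (2 * 6 * 3) = 39
assert out_dim == 39 and features.shape == (1024, 39)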
utils/loss.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as torch_F
4
+
5
+ from copy import deepcopy
6
+ from model.depth.midas_loss import MidasLoss
7
+
8
+ class Loss(nn.Module):
9
+
10
+ def __init__(self, opt):
11
+ super().__init__()
12
+ self.opt = deepcopy(opt)
13
+ self.occ_loss = nn.BCEWithLogitsLoss(reduction='none')
14
+ self.midas_loss = MidasLoss(alpha=opt.training.depth_loss.grad_reg,
15
+ inverse_depth=opt.training.depth_loss.depth_inv,
16
+ shrink_mask=opt.training.depth_loss.mask_shrink)
17
+
18
+ def shape_loss(self, pred_occ_raw, gt_sdf):
19
+ assert len(pred_occ_raw.shape) == 2
20
+ assert len(gt_sdf.shape) == 2
21
+ # [B, N]
22
+ gt_occ = (gt_sdf < 0).float()
23
+ loss = self.occ_loss(pred_occ_raw, gt_occ)
24
+ weight_mask = torch.ones_like(loss)
25
+ thres = self.opt.training.shape_loss.impt_thres
26
+ weight_mask[torch.abs(gt_sdf) < thres] = weight_mask[torch.abs(gt_sdf) < thres] * self.opt.training.shape_loss.impt_weight
27
+ loss = loss * weight_mask
28
+ return loss.mean()
29
+
30
+ def depth_loss(self, pred_depth, gt_depth, mask):
31
+ assert len(pred_depth.shape) == len(gt_depth.shape) == len(mask.shape) == 4
32
+ assert pred_depth.shape[1] == gt_depth.shape[1] == mask.shape[1] == 1
33
+ loss = self.midas_loss(pred_depth, gt_depth, mask)
34
+ return loss
35
+
36
+ def intr_loss(self, seen_pred, seen_gt, mask):
37
+ assert len(seen_pred.shape) == len(seen_gt.shape) == 3
38
+ assert len(mask.shape) == 2
39
+ # [B, HW]
40
+ distance = torch.sum((seen_pred - seen_gt)**2, dim=-1)
41
+ loss = (distance * mask).sum() / (mask.sum() + 1.e-8)
42
+ return loss
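The near-surface weighting in `shape_loss` above is the key detail: occupancy is supervised with BCE everywhere, but query points whose ground-truth SDF magnitude falls below a threshold get a larger weight. A self-contained sketch of the same idea (the threshold and weight values here are placeholders, not the repo's configured ones):

import torch
import torch.nn as nn

def shape_loss_sketch(pred_occ_raw, gt_sdf, impt_thres=0.05, impt_weight=5.0):
    gt_occ = (gt_sdf < 0).float()                       # inside the surface => occupied
    loss = nn.BCEWithLogitsLoss(reduction='none')(pred_occ_raw, gt_occ)
    weight = torch.ones_like(loss)
    weight[gt_sdf.abs() < impt_thres] = impt_weight     # emphasize near-surface points
    return (loss * weight).mean()

pred_logits = torch.randn(2, 4096)       # [B, N] raw occupancy logits
gt_sdf = torch.randn(2, 4096) * 0.1      # [B, N] signed distances
print(shape_loss_sketch(pred_logits, gt_sdf))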
utils/options.py ADDED
@@ -0,0 +1,127 @@
1
+ import numpy as np
2
+ import os, sys, time
3
+ import torch
4
+ import random
5
+ import string
6
+ import yaml
7
+ import utils.util as util
8
+ import time
9
+
10
+ from utils.util import EasyDict as edict
11
+
12
+ # torch.backends.cudnn.enabled = False
13
+ # torch.backends.cudnn.benchmark = False
14
+ # torch.backends.cudnn.deterministic = True
15
+
16
+ def parse_arguments(args):
17
+ # parse from command line (syntax: --key1.key2.key3=value)
18
+ opt_cmd = {}
19
+ for arg in args:
20
+ assert(arg.startswith("--"))
21
+ if "=" not in arg[2:]: # --key means key=True, --key! means key=False
22
+ key_str, value = (arg[2:-1], "false") if arg[-1]=="!" else (arg[2:], "true")
23
+ else:
24
+ key_str, value = arg[2:].split("=")
25
+ keys_sub = key_str.split(".")
26
+ opt_sub = opt_cmd
27
+ for k in keys_sub[:-1]:
28
+ if k not in opt_sub: opt_sub[k] = {}
29
+ opt_sub = opt_sub[k]
30
+ # if opt_cmd['key1']['key2']['key3'] already exist for key1.key2.key3, print key3 as error msg
31
+ assert keys_sub[-1] not in opt_sub, keys_sub[-1]
32
+ opt_sub[keys_sub[-1]] = yaml.safe_load(value)
33
+ opt_cmd = edict(opt_cmd)
34
+ return opt_cmd
35
+
36
+ def set(opt_cmd={}, verbose=True, safe_check=True):
37
+ print("setting configurations...")
38
+ fname = opt_cmd.yaml # load from yaml file
39
+ opt_base = load_options(fname)
40
+ # override with command line arguments
41
+ opt = override_options(opt_base, opt_cmd, key_stack=[], safe_check=safe_check)
42
+ process_options(opt)
43
+ if verbose:
44
+ def print_options(opt, level=0):
45
+ for key, value in sorted(opt.items()):
46
+ if isinstance(value, (dict, edict)):
47
+ print(" "*level+"* "+key+":")
48
+ print_options(value, level+1)
49
+ else:
50
+ print(" "*level+"* "+key+":", value)
51
+ print_options(opt)
52
+ return opt
53
+
54
+ def load_options(fname):
55
+ with open(fname) as file:
56
+ opt = edict(yaml.safe_load(file))
57
+ if "_parent_" in opt:
58
+ # load parent yaml file(s) as base options
59
+ parent_fnames = opt.pop("_parent_")
60
+ if type(parent_fnames) is str:
61
+ parent_fnames = [parent_fnames]
62
+ for parent_fname in parent_fnames:
63
+ opt_parent = load_options(parent_fname)
64
+ opt_parent = override_options(opt_parent, opt, key_stack=[])
65
+ opt = opt_parent
66
+ print("loading {}...".format(fname))
67
+ return opt
68
+
69
+ def override_options(opt, opt_over, key_stack=None, safe_check=False):
70
+ for key, value in opt_over.items():
71
+ if isinstance(value, dict):
72
+ # parse child options (until leaf nodes are reached)
73
+ opt[key] = override_options(opt.get(key, dict()), value, key_stack=key_stack+[key], safe_check=safe_check)
74
+ else:
75
+ # ensure command line argument to override is also in yaml file
76
+ if safe_check and key not in opt:
77
+ add_new = None
78
+ while add_new not in ["y", "n"]:
79
+ key_str = ".".join(key_stack+[key])
80
+ add_new = input("\"{}\" not found in original opt, add? (y/n) ".format(key_str))
81
+ if add_new=="n":
82
+ print("safe exiting...")
83
+ exit()
84
+ opt[key] = value
85
+ return opt
86
+
87
+ def process_options(opt):
88
+ # set seed
89
+ if opt.seed is not None:
90
+ random.seed(opt.seed)
91
+ np.random.seed(opt.seed)
92
+ torch.manual_seed(opt.seed)
93
+ torch.cuda.manual_seed_all(opt.seed)
94
+ else:
95
+ # create random string as run ID
96
+ randkey = "".join(random.choice(string.ascii_uppercase) for _ in range(4))
97
+ opt.name += "_{}".format(randkey)
98
+ # other default options
99
+ opt.output_path = "{0}/{1}/{2}".format(opt.output_root, opt.group, opt.name)
100
+ os.makedirs(opt.output_path, exist_ok=True)
101
+ opt.H, opt.W = opt.image_size
102
+ if opt.freq.eval is None:
103
+ opt.freq.eval = max(opt.max_epoch // 20, 1)
104
+ if 'loss_weight' in opt:
105
+ opt.get_depth = False
106
+ opt.get_normal = False
107
+
108
+ def save_options_file(opt):
109
+ opt_fname = "{}/options.yaml".format(opt.output_path)
110
+ if os.path.isfile(opt_fname):
111
+ with open(opt_fname) as file:
112
+ opt_old = yaml.safe_load(file)
113
+ if opt!=opt_old:
114
+ # prompt if options are not identical
115
+ opt_new_fname = "{}/options_temp.yaml".format(opt.output_path)
116
+ with open(opt_new_fname, "w") as file:
117
+ yaml.safe_dump(util.to_dict(opt), file, default_flow_style=False, indent=4)
118
+ print("existing options file found (different from current one)...")
119
+ os.system("diff {} {}".format(opt_fname, opt_new_fname))
120
+ os.system("rm {}".format(opt_new_fname))
121
+ if not opt.debug:
122
+ print("please cancel within 10 seconds if you do not want to override...")
123
+ time.sleep(10)
124
+ else: print("existing options file found (identical)")
125
+ else: print("(creating new options file...)")
126
+ with open(opt_fname, "w") as file:
127
+ yaml.safe_dump(util.to_dict(opt), file, default_flow_style=False, indent=4)
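The `--key1.key2.key3=value` override syntax handled by `parse_arguments` is easiest to see with a small example (a usage sketch; the yaml path is a placeholder, and values pass through `yaml.safe_load`, so they keep their YAML types):

import utils.options as options

opt_cmd = options.parse_arguments([
    "--yaml=some_config.yaml",                 # placeholder config path
    "--batch_size=32",                         # parsed as int
    "--training.depth_loss.grad_reg=0.5",      # nested keys via dots
    "--debug",                                 # bare --key means True
    "--tb!",                                   # trailing ! means False
])
print(opt_cmd.batch_size)                      # 32
print(opt_cmd.training.depth_loss.grad_reg)    # 0.5
print(opt_cmd.debug, opt_cmd.tb)               # True False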
utils/pos_embed.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is copied from https://github.com/facebookresearch/MCC
5
+ # The original code base is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ # --------------------------------------------------------
8
+ # Position embedding utils
9
+ # --------------------------------------------------------
10
+
11
+ import numpy as np
12
+
13
+ import torch
14
+
15
+ # --------------------------------------------------------
16
+ # 2D sine-cosine position embedding
17
+ # References:
18
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
19
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
20
+ # --------------------------------------------------------
21
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
22
+ """
23
+ grid_size: int of the grid height and width
24
+ return:
25
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
26
+ """
27
+ grid_h = np.arange(grid_size, dtype=np.float32)
28
+ grid_w = np.arange(grid_size, dtype=np.float32)
29
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
30
+ grid = np.stack(grid, axis=0)
31
+
32
+ grid = grid.reshape([2, 1, grid_size, grid_size])
33
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
34
+ if cls_token:
35
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
36
+ return pos_embed
37
+
38
+
39
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
40
+ assert embed_dim % 2 == 0
41
+
42
+ # use half of dimensions to encode grid_h
43
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
44
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
45
+
46
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
47
+ return emb
48
+
49
+
50
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
51
+ """
52
+ embed_dim: output dimension for each position
53
+ pos: a list of positions to be encoded: size (M,)
54
+ out: (M, D)
55
+ """
56
+ assert embed_dim % 2 == 0
57
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
58
+ omega /= embed_dim / 2.
59
+ omega = 1. / 10000**omega # (D/2,)
60
+
61
+ pos = pos.reshape(-1) # (M,)
62
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
63
+
64
+ emb_sin = np.sin(out) # (M, D/2)
65
+ emb_cos = np.cos(out) # (M, D/2)
66
+
67
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
68
+ return emb
69
+
70
+ def get_1d_sincos_pos_embed_from_grid_torch(embed_dim, pos):
71
+ """
72
+ embed_dim: output dimension for each position
73
+ pos: a list of positions to be encoded: size (M,)
74
+ out: (M, D)
75
+ """
76
+ assert embed_dim % 2 == 0
77
+ omega = torch.arange(embed_dim // 2, device=pos.device).float()
78
+ omega /= embed_dim / 2.
79
+ omega = 1. / 10000**omega # (D/2,)
80
+
81
+ pos = pos.reshape(-1) # (M,)
82
+ out = torch.einsum('m,d->md', pos, omega) # (M, D/2), outer product
83
+
84
+ emb_sin = torch.sin(out) # (M, D/2)
85
+ emb_cos = torch.cos(out) # (M, D/2)
86
+
87
+ emb = torch.cat([emb_sin, emb_cos], axis=1) # (M, D)
88
+ return emb
89
+
90
+
91
+
92
+ # --------------------------------------------------------
93
+ # Interpolate position embeddings for high-resolution
94
+ # References:
95
+ # DeiT: https://github.com/facebookresearch/deit
96
+ # --------------------------------------------------------
97
+ def interpolate_pos_embed(model, checkpoint_model):
98
+ if 'pos_embed' in checkpoint_model:
99
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
100
+ embedding_size = pos_embed_checkpoint.shape[-1]
101
+ num_patches = model.patch_embed.num_patches
102
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
103
+ # height (== width) for the checkpoint position embedding
104
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
105
+ # height (== width) for the new position embedding
106
+ new_size = int(num_patches ** 0.5)
107
+ # class_token and dist_token are kept unchanged
108
+ if orig_size != new_size:
109
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
110
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
111
+ # only the position tokens are interpolated
112
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
113
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
114
+ pos_tokens = torch.nn.functional.interpolate(
115
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
116
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
117
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
118
+ checkpoint_model['pos_embed'] = new_pos_embed
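As a quick shape check on the 2D sine-cosine table above (a sketch; it assumes this file is importable as `utils.pos_embed`): for a ViT-style patch grid with a class token, the embedding has one row per token, and the class-token row is all zeros.

import numpy as np
from utils.pos_embed import get_2d_sincos_pos_embed

embed_dim, grid_size = 768, 16
pos_embed = get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=True)
assert pos_embed.shape == (1 + grid_size * grid_size, embed_dim)
assert np.allclose(pos_embed[0], 0.0)   # zero row prepended for the cls token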
utils/util.py ADDED
@@ -0,0 +1,413 @@
1
+ import os, sys, time
2
+ import shutil
3
+ import datetime
4
+ import torch
5
+ import torch.nn.functional as torch_F
6
+ import socket
7
+ import contextlib
8
+ import socket
9
+ import torch.distributed as dist
10
+ from collections import defaultdict, deque
11
+
12
+ class SmoothedValue(object):
13
+ """Track a series of values and provide access to smoothed values over a
14
+ window or the global series average.
15
+ """
16
+
17
+ def __init__(self, window_size=20, fmt=None):
18
+ if fmt is None:
19
+ fmt = "{median:.4f} ({global_avg:.4f})"
20
+ self.deque = deque(maxlen=window_size)
21
+ self.total = 0.0
22
+ self.count = 0
23
+ self.fmt = fmt
24
+
25
+ def update(self, value, n=1):
26
+ self.deque.append(value)
27
+ self.count += n
28
+ self.total += value * n
29
+
30
+ @property
31
+ def median(self):
32
+ d = torch.tensor(list(self.deque))
33
+ return d.median().item()
34
+
35
+ @property
36
+ def avg(self):
37
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
38
+ return d.mean().item()
39
+
40
+ @property
41
+ def global_avg(self):
42
+ return self.total / self.count
43
+
44
+ @property
45
+ def max(self):
46
+ return max(self.deque)
47
+
48
+ @property
49
+ def value(self):
50
+ return self.deque[-1]
51
+
52
+ def __str__(self):
53
+ return self.fmt.format(
54
+ median=self.median,
55
+ avg=self.avg,
56
+ global_avg=self.global_avg,
57
+ max=self.max,
58
+ value=self.value)
59
+
60
+
61
+ class MetricLogger(object):
62
+ def __init__(self, delimiter="\t"):
63
+ self.meters = defaultdict(SmoothedValue)
64
+ self.delimiter = delimiter
65
+
66
+ def update(self, **kwargs):
67
+ for k, v in kwargs.items():
68
+ if v is None:
69
+ continue
70
+ if isinstance(v, torch.Tensor):
71
+ v = v.item()
72
+ assert isinstance(v, (float, int))
73
+ self.meters[k].update(v)
74
+
75
+ def __getattr__(self, attr):
76
+ if attr in self.meters:
77
+ return self.meters[attr]
78
+ if attr in self.__dict__:
79
+ return self.__dict__[attr]
80
+ raise AttributeError("'{}' object has no attribute '{}'".format(
81
+ type(self).__name__, attr))
82
+
83
+ def __str__(self):
84
+ loss_str = []
85
+ for name, meter in self.meters.items():
86
+ loss_str.append(
87
+ "{}: {}".format(name, str(meter))
88
+ )
89
+ return self.delimiter.join(loss_str)
90
+
91
+ def add_meter(self, name, meter):
92
+ self.meters[name] = meter
93
+
94
+ def log_every(self, iterable, print_freq, header=None):
95
+ i = 0
96
+ if not header:
97
+ header = ''
98
+ start_time = time.time()
99
+ end = time.time()
100
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
101
+ data_time = SmoothedValue(fmt='{avg:.4f}')
102
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
103
+ log_msg = [
104
+ header,
105
+ '[{0' + space_fmt + '}/{1}]',
106
+ 'eta: {eta}',
107
+ '{meters}',
108
+ 'time: {time}',
109
+ 'data: {data}'
110
+ ]
111
+ if torch.cuda.is_available():
112
+ log_msg.append('max mem: {memory:.0f}')
113
+ log_msg = self.delimiter.join(log_msg)
114
+ MB = 1024.0 * 1024.0
115
+ for obj in iterable:
116
+ data_time.update(time.time() - end)
117
+ yield obj
118
+ iter_time.update(time.time() - end)
119
+ if i % print_freq == 0 or i == len(iterable) - 1:
120
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
121
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
122
+ if torch.cuda.is_available():
123
+ print(log_msg.format(
124
+ i, len(iterable), eta=eta_string,
125
+ meters=str(self),
126
+ time=str(iter_time), data=str(data_time),
127
+ memory=torch.cuda.max_memory_allocated() / MB))
128
+ else:
129
+ print(log_msg.format(
130
+ i, len(iterable), eta=eta_string,
131
+ meters=str(self),
132
+ time=str(iter_time), data=str(data_time)))
133
+ i += 1
134
+ end = time.time()
135
+ total_time = time.time() - start_time
136
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
137
+ print('{} Total time: {} ({:.4f} s / it)'.format(
138
+ header, total_time_str, total_time / len(iterable)))
139
+
140
+ def print_eval(opt, loss=None, chamfer=None, depth_metrics=None):
141
+ message = "[eval] "
142
+ if loss is not None: message += "loss:{}".format("{:.3e}".format(loss.all))
143
+ if chamfer is not None:
144
+ message += " chamfer:{}|{}|{}".format("{:.4f}".format(chamfer[0]),
145
+ "{:.4f}".format(chamfer[1]),
146
+ "{:.4f}".format((chamfer[0]+chamfer[1])/2))
147
+ if depth_metrics is not None:
148
+ for k, v in depth_metrics.items():
149
+ message += "{}:{}, ".format(k, "{:.4f}".format(v))
150
+ message = message[:-2]
151
+ print(message)
152
+
153
+ def update_timer(opt, timer, ep, it_per_ep):
154
+ momentum = 0.99
155
+ timer.elapsed = time.time()-timer.start
156
+ timer.it = timer.it_end-timer.it_start
157
+ # compute speed with moving average
158
+ timer.it_mean = timer.it_mean*momentum+timer.it*(1-momentum) if timer.it_mean is not None else timer.it
159
+ timer.arrival = timer.it_mean*it_per_ep*(opt.max_epoch-ep)
160
+
161
+ # move tensors to device (dicts/lists are mutated in place; tensors and namedtuples are returned moved)
162
+ def move_to_device(X, device):
163
+ if isinstance(X, dict):
164
+ for k, v in X.items():
165
+ X[k] = move_to_device(v, device)
166
+ elif isinstance(X, list):
167
+ for i, e in enumerate(X):
168
+ X[i] = move_to_device(e, device)
169
+ elif isinstance(X, tuple) and hasattr(X, "_fields"): # collections.namedtuple
170
+ dd = X._asdict()
171
+ dd = move_to_device(dd, device)
172
+ return type(X)(**dd)
173
+ elif isinstance(X, torch.Tensor):
174
+ return X.to(device=device, non_blocking=True)
175
+ return X
176
+
177
+ # detach tensors
178
+ def detach_tensors(X):
179
+ if isinstance(X, dict):
180
+ for k, v in X.items():
181
+ X[k] = detach_tensors(v)
182
+ elif isinstance(X, list):
183
+ for i, e in enumerate(X):
184
+ X[i] = detach_tensors(e)
185
+ elif isinstance(X, tuple) and hasattr(X, "_fields"): # collections.namedtuple
186
+ dd = X._asdict()
187
+ dd = detach_tensors(dd)
188
+ return type(X)(**dd)
189
+ elif isinstance(X, torch.Tensor):
190
+ return X.detach()
191
+ return X
192
+
193
+ # this recursion seems to only work for the outer loop when dict_type is not dict
194
+ def to_dict(D, dict_type=dict):
195
+ D = dict_type(D)
196
+ for k, v in D.items():
197
+ if isinstance(v, dict):
198
+ D[k] = to_dict(v, dict_type)
199
+ return D
200
+
201
+ def get_child_state_dict(state_dict, key):
202
+ out_dict = {}
203
+ for k, v in state_dict.items():
204
+ if k.startswith("module."):
205
+ param_name = k[7:]
206
+ else:
207
+ param_name = k
208
+ if param_name.startswith("{}.".format(key)):
209
+ out_dict[".".join(param_name.split(".")[1:])] = v
210
+ return out_dict
211
+
212
+ def resume_checkpoint(opt, model, best):
213
+ load_name = "{0}/best.ckpt".format(opt.output_path) if best else "{0}/latest.ckpt".format(opt.output_path)
214
+ checkpoint = torch.load(load_name, map_location=torch.device(opt.device))
215
+ model.graph.module.load_state_dict(checkpoint["graph"], strict=True)
216
+ # load the training stats
217
+ for key in model.__dict__:
218
+ if key.split("_")[0] in ["optim", "sched", "scaler"] and key in checkpoint:
219
+ if opt.device == 0: print("restoring {}...".format(key))
220
+ getattr(model, key).load_state_dict(checkpoint[key])
221
+ # also need to record ep, it, best_val if we are returning
222
+ ep, it = checkpoint["epoch"], checkpoint["iter"]
223
+ best_val, best_ep = checkpoint["best_val"], checkpoint["best_ep"] if "best_ep" in checkpoint else 0
224
+ print("resuming from epoch {0} (iteration {1})".format(ep, it))
225
+
226
+ return ep, it, best_val, best_ep
227
+
228
+ def load_checkpoint(opt, model, load_name):
229
+ # load_name has to be given
230
+ checkpoint = torch.load(load_name, map_location=torch.device(opt.device))
231
+ # load individual (possibly partial) children modules
232
+ for name, child in model.graph.module.named_children():
233
+ child_state_dict = get_child_state_dict(checkpoint["graph"], name)
234
+ if child_state_dict:
235
+ if opt.device == 0: print("restoring {}...".format(name))
236
+ child.load_state_dict(child_state_dict, strict=True)
237
+ else:
238
+ if opt.device == 0: print("skipping {}...".format(name))
239
+ return None, None, None, None
240
+
241
+ def restore_checkpoint(opt, model, load_name=None, resume=False, best=False, evaluate=False):
242
+ # we cannot load and resume at the same time
243
+ assert not (load_name is not None and resume)
244
+ # when resuming we want everything to be the same
245
+ if resume:
246
+ ep, it, best_val, best_ep = resume_checkpoint(opt, model, best)
247
+ # loading is more flexible, since it can restore only a subset of the model's modules
248
+ else:
249
+ ep, it, best_val, best_ep = load_checkpoint(opt, model, load_name)
250
+ return ep, it, best_val, best_ep
251
+
252
+ def save_checkpoint(opt, model, ep, it, best_val, best_ep, latest=False, best=False, children=None):
253
+ os.makedirs("{0}/checkpoint".format(opt.output_path), exist_ok=True)
254
+ if isinstance(model.graph, torch.nn.DataParallel) or isinstance(model.graph, torch.nn.parallel.DistributedDataParallel):
255
+ graph = model.graph.module
256
+ else:
257
+ graph = model.graph
258
+ if children is not None:
259
+ graph_state_dict = { k: v for k, v in graph.state_dict().items() if k.startswith(children) }
260
+ else: graph_state_dict = graph.state_dict()
261
+ checkpoint = dict(
262
+ epoch=ep,
263
+ iter=it,
264
+ best_val=best_val,
265
+ best_ep=best_ep,
266
+ graph=graph_state_dict,
267
+ )
268
+ for key in model.__dict__:
269
+ if key.split("_")[0] in ["optim", "sched", "scaler"]:
270
+ checkpoint.update({key: getattr(model, key).state_dict()})
271
+ torch.save(checkpoint, "{0}/latest.ckpt".format(opt.output_path))
272
+ if best:
273
+ shutil.copy("{0}/latest.ckpt".format(opt.output_path),
274
+ "{0}/best.ckpt".format(opt.output_path))
275
+ if not latest:
276
+ shutil.copy("{0}/latest.ckpt".format(opt.output_path),
277
+ "{0}/checkpoint/ep{1}.ckpt".format(opt.output_path, ep))
278
+
279
+ def check_socket_open(hostname, port):
280
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
281
+ is_open = False
282
+ try:
283
+ s.bind((hostname, port))
284
+ except socket.error:
285
+ is_open = True
286
+ finally:
287
+ s.close()
288
+ return is_open
289
+
290
+ def get_layer_dims(layers):
291
+ # return a list of tuples (k_in, k_out)
292
+ return list(zip(layers[:-1], layers[1:]))
293
+
294
+ @contextlib.contextmanager
295
+ def suppress(stdout=False, stderr=False):
296
+ with open(os.devnull, "w") as devnull:
297
+ if stdout: old_stdout, sys.stdout = sys.stdout, devnull
298
+ if stderr: old_stderr, sys.stderr = sys.stderr, devnull
299
+ try: yield
300
+ finally:
301
+ if stdout: sys.stdout = old_stdout
302
+ if stderr: sys.stderr = old_stderr
303
+
304
+ def toggle_grad(model, requires_grad):
305
+ for p in model.parameters():
306
+ p.requires_grad_(requires_grad)
307
+
308
+ def compute_grad2(d_outs, x_in):
309
+ d_outs = [d_outs] if not isinstance(d_outs, list) else d_outs
310
+ reg = 0
311
+ for d_out in d_outs:
312
+ batch_size = x_in.size(0)
313
+ grad_dout = torch.autograd.grad(
314
+ outputs=d_out.sum(), inputs=x_in,
315
+ create_graph=True, retain_graph=True, only_inputs=True
316
+ )[0]
317
+ grad_dout2 = grad_dout.pow(2)
318
+ assert(grad_dout2.size() == x_in.size())
319
+ reg += grad_dout2.view(batch_size, -1).sum(1)
320
+ return reg / len(d_outs)
321
+
322
+ # import matplotlib.pyplot as plt
323
+ def interpolate_depth(depth_input, mask_input, size, bg_depth=20):
324
+ assert len(depth_input.shape) == len(mask_input.shape) == 4
325
+ mask = (mask_input > 0.5).float()
326
+ depth_valid = depth_input * mask
327
+ depth_valid = torch_F.interpolate(depth_valid, size, mode='bilinear', align_corners=False)
328
+ mask = torch_F.interpolate(mask, size, mode='bilinear', align_corners=False)
329
+ depth_out = depth_valid / (mask + 1.e-6)
330
+ mask_binary = (mask > 0.5).float()
331
+ depth_out = depth_out * mask_binary + bg_depth * (1 - mask_binary)
332
+ return depth_out, mask_binary
333
+
334
+ # import matplotlib.pyplot as plt
335
+ # import torchvision
336
+ def interpolate_coordmap(coord_map, mask_input, size, bg_coord=0):
337
+ assert len(coord_map.shape) == len(mask_input.shape) == 4
338
+ mask = (mask_input > 0.5).float()
339
+ coord_valid = coord_map * mask
340
+ coord_valid = torch_F.interpolate(coord_valid, size, mode='bilinear', align_corners=False)
341
+ mask = torch_F.interpolate(mask, size, mode='bilinear', align_corners=False)
342
+ coord_out = coord_valid / (mask + 1.e-6)
343
+ mask_binary = (mask > 0.5).float()
344
+ coord_out = coord_out * mask_binary + bg_coord * (1 - mask_binary)
345
+ return coord_out, mask_binary
346
+
347
+ def cleanup():
348
+ dist.destroy_process_group()
349
+
350
+ def is_port_in_use(port):
351
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
352
+ return s.connect_ex(('localhost', port)) == 0
353
+
354
+ def setup(rank, world_size, port_no):
355
+ full_address = 'tcp://127.0.0.1:' + str(port_no)
356
+ dist.init_process_group("nccl", init_method=full_address, rank=rank, world_size=world_size)
357
+
358
+ def print_grad(grad, prefix=''):
359
+ print("{} --- Grad Abs Mean, Grad Max, Grad Min: {:.5f} | {:.5f} | {:.5f}".format(prefix, grad.abs().mean().item(), grad.max().item(), grad.min().item()))
360
+
361
+ class AverageMeter(object):
362
+ """Computes and stores the average and current value"""
363
+ def __init__(self):
364
+ self.reset()
365
+
366
+ def reset(self):
367
+ self.val = 0
368
+ self.avg = 0
369
+ self.sum = 0
370
+ self.count = 0
371
+
372
+ def update(self, val, n=1):
373
+ self.val = val
374
+ self.sum += val * n
375
+ self.count += n
376
+ self.avg = self.sum / self.count
377
+
378
+ class EasyDict(dict):
379
+ def __init__(self, d=None, **kwargs):
380
+ if d is None:
381
+ d = {}
382
+ else:
383
+ d = dict(d)
384
+ if kwargs:
385
+ d.update(**kwargs)
386
+ for k, v in d.items():
387
+ setattr(self, k, v)
388
+ # Class attributes
389
+ for k in self.__class__.__dict__.keys():
390
+ if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):
391
+ setattr(self, k, getattr(self, k))
392
+
393
+ def __setattr__(self, name, value):
394
+ if isinstance(value, (list, tuple)):
395
+ value = [self.__class__(x)
396
+ if isinstance(x, dict) else x for x in value]
397
+ elif isinstance(value, dict) and not isinstance(value, self.__class__):
398
+ value = self.__class__(value)
399
+ super(EasyDict, self).__setattr__(name, value)
400
+ super(EasyDict, self).__setitem__(name, value)
401
+
402
+ __setitem__ = __setattr__
403
+
404
+ def update(self, e=None, **f):
405
+ d = e or dict()
406
+ d.update(f)
407
+ for k in d:
408
+ setattr(self, k, d[k])
409
+
410
+ def pop(self, k, d=None):
411
+ delattr(self, k)
412
+ return super(EasyDict, self).pop(k, d)
413
+
utils/util_vis.py ADDED
@@ -0,0 +1,511 @@
1
+ import numpy as np
2
+ import os
3
+ import torch
4
+ import torchvision
5
+ import torchvision.transforms.functional as torchvision_F
6
+ import matplotlib.pyplot as plt
7
+ import PIL
8
+ import PIL.ImageDraw
9
+ from PIL import Image, ImageFont
10
+ import trimesh
11
+ import pyrender
12
+ import cv2
13
+ import copy
14
+ import base64
15
+ import io
16
+ import imageio
17
+
18
+ os.environ['PYOPENGL_PLATFORM'] = 'egl'
19
+ @torch.no_grad()
20
+ def tb_image(opt, tb, step, group, name, images, masks=None, num_vis=None, from_range=(0, 1), poses=None, cmap="gray", depth=False):
21
+ if not depth:
22
+ images = preprocess_vis_image(opt, images, masks=masks, from_range=from_range, cmap=cmap) # [B, 3, H, W]
23
+ else:
24
+ masks = (masks > 0.5).float()
25
+ images = images * masks + (1 - masks) * ((images * masks).max())
26
+ images = (1 - images).detach().cpu()
27
+ num_H, num_W = num_vis or opt.tb.num_images
28
+ images = images[:num_H*num_W]
29
+ if poses is not None:
30
+ # poses: [B, 3, 4]
31
+ # rots: [max(B, num_images), 3, 3]
32
+ rots = poses[:num_H*num_W, ..., :3]
33
+ images = torch.stack([draw_pose(opt, image, rot, size=20, width=2) for image, rot in zip(images, rots)], dim=0)
34
+ image_grid = torchvision.utils.make_grid(images[:, :3], nrow=num_W, pad_value=1.)
35
+ if images.shape[1]==4:
36
+ mask_grid = torchvision.utils.make_grid(images[:, 3:], nrow=num_W, pad_value=1.)[:1]
37
+ image_grid = torch.cat([image_grid, mask_grid], dim=0)
38
+ tag = "{0}/{1}".format(group, name)
39
+ tb.add_image(tag, image_grid, step)
40
+
41
+ def preprocess_vis_image(opt, images, masks=None, from_range=(0, 1), cmap="gray"):
42
+ min, max = from_range
43
+ images = (images-min)/(max-min)
44
+ if masks is not None:
45
+ # then the mask is directly the transparency channel of png
46
+ images = torch.cat([images, masks], dim=1)
47
+ images = images.clamp(min=0, max=1).cpu()
48
+ if images.shape[1]==1:
49
+ images = get_heatmap(opt, images[:, 0].cpu(), cmap=cmap)
50
+ return images
51
+
52
+ def preprocess_depth_image(opt, depth, mask=None, max_depth=1000):
53
+ if mask is not None: depth = depth * mask + (1 - mask) * max_depth # min of this will lead to the minimum of the masked regions
54
+ depth = depth - depth.min()
55
+
56
+ if mask is not None: depth = depth * mask # max of this will lead to the maximum of the masked regions
57
+ depth = depth / depth.max()
58
+ return depth
59
+
60
+ def dump_images(opt, idx, name, images, masks=None, from_range=(0, 1), poses=None, metrics=None, cmap="gray", folder='dump'):
61
+ images = preprocess_vis_image(opt, images, masks=masks, from_range=from_range, cmap=cmap) # [B, 3, H, W]
62
+ if poses is not None:
63
+ rots = poses[..., :3]
64
+ images = torch.stack([draw_pose(opt, image, rot, size=20, width=2) for image, rot in zip(images, rots)], dim=0)
65
+ if metrics is not None:
66
+ images = torch.stack([draw_metric(opt, image, metric.item()) for image, metric in zip(images, metrics)], dim=0)
67
+ images = images.cpu().permute(0, 2, 3, 1).contiguous().numpy() # [B, H, W, 3]
68
+ for i, img in zip(idx, images):
69
+ fname = "{}/{}/{}_{}.png".format(opt.output_path, folder, i, name)
70
+ img = Image.fromarray((img*255).astype(np.uint8))
71
+ img.save(fname)
72
+
73
+ def dump_depths(opt, idx, name, depths, masks=None, rescale=False, folder='dump'):
74
+ if rescale:
75
+ masks = (masks > 0.5).float()
76
+ depths = depths * masks + (1 - masks) * ((depths * masks).max())
77
+ depths = (1 - depths).detach().cpu()
78
+ for i, depth in zip(idx, depths):
79
+ fname = "{}/{}/{}_{}.png".format(opt.output_path, folder, i, name)
80
+ plt.imsave(fname, depth.squeeze(), cmap='viridis')
81
+
82
+ # img_list is a list of length n_views, where each view is a image tensor of [B, 3, H, W]
83
+ def dump_gifs(opt, idx, name, imgs_list, from_range=(0, 1), folder='dump', cmap="gray"):
84
+ for i in range(len(imgs_list)):
85
+ imgs_list[i] = preprocess_vis_image(opt, imgs_list[i], from_range=from_range, cmap=cmap)
86
+ for i in range(len(idx)):
87
+ img_list_np = [imgs[i].cpu().permute(1, 2, 0).contiguous().numpy() for imgs in imgs_list] # list of [H, W, 3], each item is a view of ith sample
88
+ img_list_pil = [Image.fromarray((img*255).astype(np.uint8)).convert('RGB') for img in img_list_np]
89
+ fname = "{}/{}/{}_{}.gif".format(opt.output_path, folder, idx[i], name)
90
+ img_list_pil[0].save(fname, format='GIF', append_images=img_list_pil[1:], save_all=True, duration=100, loop=0)
91
+
92
+ # img_list is a list of length n_views, where each view is a image tensor of [B, 3, H, W]
93
+ def dump_attentions(opt, idx, name, attn_vis, folder='dump'):
94
+ for i in range(len(idx)):
95
+ img_list_pil = [Image.fromarray((img*255).astype(np.uint8)).convert('RGB') for img in attn_vis[i]]
96
+ fname = "{}/{}/{}_{}.gif".format(opt.output_path, folder, idx[i], name)
97
+ img_list_pil[0].save(fname, format='GIF', append_images=img_list_pil[1:], save_all=True, duration=50, loop=0)
98
+
99
+ def get_heatmap(opt, gray, cmap): # [N, H, W]
100
+ color = plt.get_cmap(cmap)(gray.numpy())
101
+ color = torch.from_numpy(color[..., :3]).permute(0, 3, 1, 2).contiguous().float() # [N, 3, H, W]
102
+ return color
103
+
104
+ def dump_meshes(opt, idx, name, meshes, folder='dump'):
105
+ for i, mesh in zip(idx, meshes):
106
+ fname = "{}/{}/{}_{}.ply".format(opt.output_path, folder, i, name)
107
+ try:
108
+ mesh.export(fname)
109
+ except:
110
+ print('Mesh is empty!')
111
+
112
+ def dump_meshes_viz(opt, idx, name, meshes, save_frames=True, folder='dump'):
113
+ for i, mesh in zip(idx, meshes):
114
+ mesh = copy.deepcopy(mesh)
115
+ R = trimesh.transformations.rotation_matrix(np.radians(180), [0,0,1])
116
+ mesh.apply_transform(R)
117
+ R = trimesh.transformations.rotation_matrix(np.radians(180), [0,1,0])
118
+ mesh.apply_transform(R)
119
+ # our marching cubes outputs inverted normals for some reason so this is necessary
120
+ trimesh.repair.fix_inversion(mesh)
121
+
122
+ fname = "{}/{}/{}_{}".format(opt.output_path, folder, i, name)
123
+ try:
124
+ mesh = scale_to_unit_cube(mesh)
125
+ visualize_mesh(mesh, fname, write_frames=save_frames)
126
+ except:
127
+ pass
128
+
129
+ def dump_seen_surface(opt, idx, obj_name, img_name, seen_projs, folder='dump'):
130
+ # seen_proj: [B, H, W, 3]
131
+ for i, seen_proj in zip(idx, seen_projs):
132
+ out_folder = "{}/{}".format(opt.output_path, folder)
133
+ img_fname = "{}_{}.png".format(i, img_name)
134
+ create_seen_surface(i, img_fname, seen_proj, out_folder, obj_name)
135
+
136
+ # https://github.com/princeton-vl/oasis/blob/master/utils/vis_mesh.py
137
+ def create_seen_surface(sample_ID, img_path, XYZ, output_folder, obj_name, connect_thres=0.005):
138
+ height, width = XYZ.shape[:2]
139
+ XYZ_to_idx = {}
140
+ idx = 1
141
+ with open("{}/{}_{}.mtl".format(output_folder, sample_ID, obj_name), "w") as f:
142
+ f.write("newmtl material_0\n")
143
+ f.write("Ka 0.200000 0.200000 0.200000\n")
144
+ f.write("Kd 0.752941 0.752941 0.752941\n")
145
+ f.write("Ks 1.000000 1.000000 1.000000\n")
146
+ f.write("Tr 1.000000\n")
147
+ f.write("illum 2\n")
148
+ f.write("Ns 0.000000\n")
149
+ f.write("map_Ka %s\n" % img_path)
150
+ f.write("map_Kd %s\n" % img_path)
151
+
152
+ with open("{}/{}_{}.obj".format(output_folder, sample_ID, obj_name), "w") as f:
153
+ f.write("mtllib {}_{}.mtl\n".format(sample_ID, obj_name))
154
+ for y in range(height):
155
+ for x in range(width):
156
+ if XYZ[y][x][2] > 0:
157
+ XYZ_to_idx[(y, x)] = idx
158
+ idx += 1
159
+ f.write("v %.4f %.4f %.4f\n" % (XYZ[y][x][0], XYZ[y][x][1], XYZ[y][x][2]))
160
+ f.write("vt %.8f %.8f\n" % ( float(x) / float(width), 1.0 - float(y) / float(height)))
161
+ f.write("usemtl material_0\n")
162
+ for y in range(height-1):
163
+ for x in range(width-1):
164
+ if XYZ[y][x][2] > 0 and XYZ[y][x+1][2] > 0 and XYZ[y+1][x][2] > 0:
165
+ # if close enough, connect vertices to form a face
166
+ if torch.norm(XYZ[y][x] - XYZ[y][x+1]).item() < connect_thres and torch.norm(XYZ[y][x] - XYZ[y+1][x]).item() < connect_thres:
167
+ f.write("f %d/%d %d/%d %d/%d\n" % (XYZ_to_idx[(y, x)], XYZ_to_idx[(y, x)], XYZ_to_idx[(y, x+1)], XYZ_to_idx[(y, x+1)], XYZ_to_idx[(y+1, x)], XYZ_to_idx[(y+1, x)]))
168
+ if XYZ[y][x+1][2] > 0 and XYZ[y+1][x+1][2] > 0 and XYZ[y+1][x][2] > 0:
169
+ if torch.norm(XYZ[y][x+1] - XYZ[y+1][x+1]).item() < connect_thres and torch.norm(XYZ[y][x+1] - XYZ[y+1][x]).item() < connect_thres:
170
+ f.write("f %d/%d %d/%d %d/%d\n" % (XYZ_to_idx[(y, x+1)], XYZ_to_idx[(y, x+1)], XYZ_to_idx[(y+1, x+1)], XYZ_to_idx[(y+1, x+1)], XYZ_to_idx[(y+1, x)], XYZ_to_idx[(y+1, x)]))
171
+
172
+ def dump_pointclouds_compare(opt, idx, name, preds, gts, folder='dump'):
173
+ for i in range(len(idx)):
174
+ pred = preds[i].cpu().numpy() # [N1, 3]
175
+ gt = gts[i].cpu().numpy() # [N2, 3]
176
+ color_pred = np.zeros(pred.shape).astype(np.uint8)
177
+ color_pred[:, 0] = 255
178
+ color_gt = np.zeros(gt.shape).astype(np.uint8)
179
+ color_gt[:, 1] = 255
180
+ pc_vertices = np.vstack([pred, gt])
181
+ colors = np.vstack([color_pred, color_gt])
182
+ pc_color = trimesh.points.PointCloud(vertices=pc_vertices, colors=colors)
183
+ fname = "{}/{}/{}_{}.ply".format(opt.output_path, folder, idx[i], name)
184
+ pc_color.export(fname)
185
+
186
+ def dump_pointclouds(opt, idx, name, pcs, colors, folder='dump', colormap='jet'):
187
+ for i, pc, color in zip(idx, pcs, colors):
188
+ pc = pc.cpu().numpy() # [N, 3]
189
+ color = color.cpu().numpy() # [N, 3] or [N, 1]
190
+ # convert scalar color to rgb with colormap
191
+ if color.shape[1] == 1:
192
+ # single channel color in numpy between [0, 1] to rgb
193
+ color = plt.get_cmap(colormap)(color[:, 0])
194
+ color = (color * 255).astype(np.uint8)
195
+ pc_color = trimesh.points.PointCloud(vertices=pc, colors=color)
196
+ fname = "{}/{}/{}_{}.ply".format(opt.output_path, folder, i, name)
197
+ pc_color.export(fname)
198
+
199
+ @torch.no_grad()
200
+ def vis_pointcloud(opt, vis, step, split, pred, GT=None):
201
+ win_name = "{0}/{1}".format(opt.group, opt.name)
202
+ pred, GT = pred.cpu().numpy(), GT.cpu().numpy()
203
+ for i in range(opt.visdom.num_samples):
204
+ # prediction
205
+ data = [dict(
206
+ type="scatter3d",
207
+ x=[float(n) for n in points[i, :opt.visdom.num_points, 0]],
208
+ y=[float(n) for n in points[i, :opt.visdom.num_points, 1]],
209
+ z=[float(n) for n in points[i, :opt.visdom.num_points, 2]],
210
+ mode="markers",
211
+ marker=dict(
212
+ color=color,
213
+ size=1,
214
+ ),
215
+ ) for points, color in zip([pred, GT], ["blue", "magenta"])]
216
+ vis._send(dict(
217
+ data=data,
218
+ win="{0} #{1}".format(split, i),
219
+ eid="{0}/{1}".format(opt.group, opt.name),
220
+ layout=dict(
221
+ title="{0} #{1} ({2})".format(split, i, step),
222
+ autosize=True,
223
+ margin=dict(l=30, r=30, b=30, t=30, ),
224
+ showlegend=False,
225
+ yaxis=dict(
226
+ scaleanchor="x",
227
+ scaleratio=1,
228
+ )
229
+ ),
230
+ opts=dict(title="{0} #{1} ({2})".format(win_name, i, step), ),
231
+ ))
232
+
233
+ @torch.no_grad()
234
+ def draw_pose(opt, image, rot_mtrx, size=15, width=1):
235
+ # rot_mtrx: [3, 4]
236
+ mode = "RGBA" if image.shape[0]==4 else "RGB"
237
+ image_pil = torchvision_F.to_pil_image(image.cpu()).convert("RGBA")
238
+ draw_pil = PIL.Image.new("RGBA", image_pil.size, (0, 0, 0, 0))
239
+ draw = PIL.ImageDraw.Draw(draw_pil)
240
+ center = (size, size)
241
+ # first column of rotation matrix is the rotated vector of [1, 0, 0]'
242
+ # second column of rotation matrix is the rotated vector of [0, 1, 0]'
243
+ # third column of rotation matrix is the rotated vector of [0, 0, 1]'
244
+ # taking the first two elements of each column projects that axis onto the 2D image plane for visualization
245
+ endpoint = [(size+size*p[0], size+size*p[1]) for p in rot_mtrx.t()]
246
+ draw.line([center, endpoint[0]], fill=(255, 0, 0), width=width)
247
+ draw.line([center, endpoint[1]], fill=(0, 255, 0), width=width)
248
+ draw.line([center, endpoint[2]], fill=(0, 0, 255), width=width)
249
+ image_pil.alpha_composite(draw_pil)
250
+ image_drawn = torchvision_F.to_tensor(image_pil.convert(mode))
251
+ return image_drawn
252
+
253
+ @torch.no_grad()
254
+ def draw_metric(opt, image, metric):
255
+ mode = "RGBA" if image.shape[0]==4 else "RGB"
256
+ image_pil = torchvision_F.to_pil_image(image.cpu()).convert("RGBA")
257
+ draw_pil = PIL.Image.new("RGBA", image_pil.size, (0, 0, 0, 0))
258
+ draw = PIL.ImageDraw.Draw(draw_pil)
259
+ font = ImageFont.truetype("DejaVuSans.ttf", 24)
260
+ position = (image_pil.size[0] - 80, image_pil.size[1] - 35)
261
+ draw.text(position, '{:.3f}'.format(metric), fill="red", font=font)
262
+ image_pil.alpha_composite(draw_pil)
263
+ image_drawn = torchvision_F.to_tensor(image_pil.convert(mode))
264
+ return image_drawn
265
+
266
+ @torch.no_grad()
267
+ def show_att_on_image(img, mask):
268
+ """
269
+ Convert the grayscale attention into heatmap on the image.
270
+ Parameters
271
+ ----------
272
+ img: np.array, [H, W, 3]
273
+ Original colored image in [0, 1].
274
+ mask: np.array, [H, W]
275
+ Attention map in [0, 1].
276
+ Returns
277
+ ----------
278
+ np image with attention applied.
279
+ """
280
+ # check the validity
281
+ assert np.max(img) <= 1
282
+ assert np.max(mask) <= 1
283
+
284
+ # generate heatmap and normalize into [0, 1]
285
+ heatmap = cv2.cvtColor(cv2.applyColorMap(np.uint8(255*mask), cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
286
+ heatmap = np.float32(heatmap) / 255
287
+
288
+ # add heatmap onto the image
289
+ merged = heatmap + np.float32(img)
290
+
291
+ # re-scale the image
292
+ merged = merged / np.max(merged)
293
+ return merged
294
+
295
+ def look_at(camera_position, camera_target, up_vector):
296
+ vector = camera_position - camera_target
297
+ vector = vector / np.linalg.norm(vector)
298
+
299
+ vector2 = np.cross(up_vector, vector)
300
+ vector2 = vector2 / np.linalg.norm(vector2)
301
+
302
+ vector3 = np.cross(vector, vector2)
303
+ return np.array([
304
+ [vector2[0], vector3[0], vector[0], 0.0],
305
+ [vector2[1], vector3[1], vector[1], 0.0],
306
+ [vector2[2], vector3[2], vector[2], 0.0],
307
+ [-np.dot(vector2, camera_position), -np.dot(vector3, camera_position), np.dot(vector, camera_position), 1.0]
308
+ ])
309
+
310
+ def scale_to_unit_cube(mesh):
311
+ if isinstance(mesh, trimesh.Scene):
312
+ mesh = mesh.dump().sum()
313
+
314
+ vertices = mesh.vertices - mesh.bounding_box.centroid
315
+ vertices *= 2 / np.max(mesh.bounding_box.extents)
316
+ vertices *= 0.5
317
+
318
+ return trimesh.Trimesh(vertices=vertices, faces=mesh.faces)
319
+
320
+ def get_positions_and_rotations(n_frames=180, r=1.5):
321
+ '''
322
+ n_frames: how many frames
323
+ r: how far should the camera be
324
+ '''
325
+ # test case 1
326
+ n_frame_full_circ = n_frames // 3 # frames for a full circle
327
+ n_frame_half_circ = n_frames // 6 # frames for a half circle
328
+
329
+ # full circle in horizontal axes going from 1 to -1 height axis
330
+ pos1 = [np.array([r*np.cos(theta), elev, r*np.sin(theta)])
331
+ for theta, elev in zip(np.linspace(0.5*np.pi,2.5*np.pi, n_frame_full_circ), np.linspace(1,-1,n_frame_full_circ))]
332
+ # half circle in horizontal axes at fixed -1 height
333
+ pos2 = [np.array([r*np.cos(theta), -1, r*np.sin(theta)])
334
+ for theta in np.linspace(2.5*np.pi,3.5*np.pi, n_frame_half_circ)]
335
+ # full circle in horizontal axes going from -1 to 1 height axis
336
+ pos3 = [np.array([r*np.cos(theta), elev, r*np.sin(theta)])
337
+ for theta, elev in zip(np.linspace(3.5*np.pi,5.5*np.pi, n_frame_full_circ), np.linspace(-1,1,n_frame_full_circ))]
338
+ # half circle in horizontal axes at fixed 1 height
339
+ pos4 = [np.array([r*np.cos(theta), 1, r*np.sin(theta)])
340
+ for theta in np.linspace(3.5*np.pi,4.5*np.pi, n_frame_half_circ)]
341
+
342
+ pos = pos1 + pos2 + pos3 + pos4
343
+ target = np.array([0.0, 0.0, 0.0])
344
+ up = np.array([0.0, 1.0, 0.0])
345
+ rot = [look_at(x, target, up) for x in pos]
346
+ return pos, rot
347
+
348
+ def visualize_mesh(mesh, output_path, resolution=(200,200), write_gif=True, write_frames=True, time_per_frame=80, n_frames=180):
349
+ '''
350
+ mesh: Trimesh mesh object
351
+ output_path: absolute path, ".gif" will get added if write_gif, and this will be used as dirname if write_frames is true
352
+ time_per_frame: how many milliseconds to wait for each frame
353
+ n_frames: how many frames in total
354
+ '''
355
+
356
+ # set material
357
+ mat = pyrender.MetallicRoughnessMaterial(
358
+ metallicFactor=0.8,
359
+ roughnessFactor=1.0,
360
+ alphaMode='OPAQUE',
361
+ baseColorFactor=(0.5, 0.5, 0.8, 1.0),
362
+ )
363
+ # define and add scene elements
364
+ mesh = pyrender.Mesh.from_trimesh(mesh, material=mat)
365
+ camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, aspectRatio=1.0)
366
+ light = pyrender.SpotLight(color=np.ones(3), intensity=15.0,
367
+ innerConeAngle=np.pi/4.0,
368
+ outerConeAngle=np.pi/4.0)
369
+
370
+ scene = pyrender.Scene()
371
+ obj = scene.add(mesh)
372
+ cam = scene.add(camera)
373
+ light = scene.add(light)
374
+
375
+ positions, rotations = get_positions_and_rotations(n_frames=n_frames)
376
+
377
+ r = pyrender.OffscreenRenderer(*resolution)
378
+
379
+ # move the camera and generate images
380
+ count = 0
381
+ image_list = []
382
+ for pos, rot in zip(positions, rotations):
383
+
384
+ pose = np.eye(4)
385
+ pose[:3, 3] = pos
386
+ pose[:3,:3] = rot[:3,:3]
387
+
388
+ scene.set_pose(cam, pose)
389
+ scene.set_pose(light, pose)
390
+
391
+ color, depth = r.render(scene)
392
+
393
+ img = Image.fromarray(color, mode="RGB")
394
+ image_list.append(img)
395
+
396
+ # save to file
397
+ if write_gif:
398
+ image_list[0].save(f"{output_path}.gif", format='GIF', append_images=image_list[1:], save_all=True, duration=time_per_frame, loop=0)
399
+
400
+ if write_frames:
401
+ if not os.path.exists(output_path):
402
+ os.makedirs(output_path)
403
+
404
+ for i, img in enumerate(image_list):
405
+ img.save(os.path.join(output_path, f"{i:04d}.jpg"))
406
+
407
+ def get_base64_encoded_image(image_path):
408
+ """
409
+ Returns the base64-encoded image at the given path.
410
+
411
+ Args:
412
+ image_path (str): The path to the image file.
413
+
414
+ Returns:
415
+ str: The base64-encoded image.
416
+ """
417
+ with open(image_path, "rb") as f:
418
+ img = Image.open(f)
419
+ if img.mode == 'RGBA':
420
+ img = img.convert('RGB')
421
+ # Resize the image to reduce its file size
422
+ img.thumbnail((200, 200))
423
+ buffer = io.BytesIO()
424
+ # Convert the image to JPEG format to reduce its file size
425
+ img.save(buffer, format="JPEG", quality=80)
426
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
427
+
428
+ def get_base64_encoded_gif(gif_path):
429
+ """
430
+ Returns the base64-encoded GIF at the given path.
431
+
432
+ Args:
433
+ gif_path (str): The path to the GIF file.
434
+
435
+ Returns:
436
+ str: The base64-encoded GIF.
437
+ """
438
+ with open(gif_path, "rb") as f:
439
+ frames = imageio.mimread(f)
440
+ # Reduce the number of frames to reduce the file size
441
+ frames = frames[::4]
442
+ buffer = io.BytesIO()
443
+ # compress each image frame to reduce the file size
444
+ frames = [frame[::2, ::2] for frame in frames]
445
+ # Convert the GIF to a subrectangle format to reduce the file size
446
+ imageio.mimsave(buffer, frames, format="GIF", fps=10, subrectangles=True)
447
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
448
+
449
+ def create_gif_html(folder_path, html_file, skip_every=10):
450
+ """
451
+ Creates an HTML file with a grid of sample visualizations.
452
+
453
+ Args:
454
+ folder_path (str): The path to the folder containing the sample visualizations.
455
+ html_file (str): The name of the HTML file to create.
456
+ """
457
+ # convert path to absolute path
458
+ folder_path = os.path.abspath(folder_path)
459
+
460
+ # Get a list of all the sample IDs
461
+ ids = []
462
+ count = 0
463
+ all_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split("_")[0]))
464
+ for filename in all_files:
465
+ if filename.endswith("_image_input.png"):
466
+ if count % skip_every == 0:
467
+ ids.append(filename.split("_")[0])
468
+ count += 1
469
+
470
+ # Write the HTML file
471
+ with open(html_file, "w") as f:
472
+ # Write the HTML header and CSS style
473
+ f.write("<html>\n")
474
+ f.write("<head>\n")
475
+ f.write("<style>\n")
476
+ f.write(".sample-container {\n")
477
+ f.write(" display: inline-block;\n")
478
+ f.write(" margin: 10px;\n")
479
+ f.write(" width: 350px;\n")
480
+ f.write(" height: 150px;\n")
481
+ f.write(" text-align: center;\n")
482
+ f.write("}\n")
483
+ f.write(".sample-container:nth-child(6n+1) {\n")
484
+ f.write(" clear: left;\n")
485
+ f.write("}\n")
486
+ f.write(".image-container, .gif-container {\n")
487
+ f.write(" display: inline-block;\n")
488
+ f.write(" margin: 10px;\n")
489
+ f.write(" width: 90px;\n")
490
+ f.write(" height: 90px;\n")
491
+ f.write(" object-fit: cover;\n")
492
+ f.write("}\n")
493
+ f.write("</style>\n")
494
+ f.write("</head>\n")
495
+ f.write("<body>\n")
496
+
497
+ # Write the sample visualizations to the HTML file
498
+ for sample_id in ids:
499
+ try:
500
+ f.write("<div class=\"sample-container\">\n")
501
+ f.write(f"<div class=\"sample-id\"><p>{sample_id}</p></div>\n")
502
+ f.write(f"<div class=\"image-container\"><img src=\"data:image/png;base64,{get_base64_encoded_image(os.path.join(folder_path, sample_id + '_image_input.png'))}\" width=\"90\" height=\"90\"></div>\n")
503
+ f.write(f"<div class=\"image-container\"><img src=\"data:image/png;base64,{get_base64_encoded_image(os.path.join(folder_path, sample_id + '_depth_est.png'))}\" width=\"90\" height=\"90\"></div>\n")
504
+ f.write(f"<div class=\"gif-container\"><img src=\"data:image/gif;base64,{get_base64_encoded_gif(os.path.join(folder_path, sample_id + '_mesh_viz.gif'))}\" width=\"90\" height=\"90\"></div>\n")
505
+ f.write("</div>\n")
506
+ except:
507
+ pass
508
+
509
+ # Write the HTML footer
510
+ f.write("</body>\n")
511
+ f.write("</html>\n")
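A usage sketch for the turntable renderer above (the paths and import location are assumptions; `visualize_mesh` needs pyrender with a working EGL offscreen context):

import trimesh
from utils.util_vis import scale_to_unit_cube, visualize_mesh   # assumed import path

mesh = trimesh.load("dump/0_mesh.ply")   # placeholder path to a reconstructed mesh
mesh = scale_to_unit_cube(mesh)          # recenter and fit into a half-unit cube
visualize_mesh(mesh, "dump/0_mesh_viz", resolution=(200, 200),
               write_gif=True, write_frames=False)
# writes dump/0_mesh_viz.gif with the camera orbiting the object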
weights/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ # Ignore everything in this directory
2
+ *
3
+ # Except this file
4
+ !.gitignore