# -*- coding: utf-8 -*- # # @File: gancraft.py # @Author: Haozhe Xie # @Date: 2023-04-12 19:53:21 # @Last Modified by: Haozhe Xie # @Last Modified at: 2024-03-03 11:15:36 # @Email: root@haozhexie.com # @Ref: https://github.com/FrozenBurning/SceneDreamer import numpy as np import torch import torch.nn.functional as F import citydreamer.extensions.grid_encoder class GanCraftGenerator(torch.nn.Module): def __init__(self, cfg): super(GanCraftGenerator, self).__init__() self.cfg = cfg self.render_net = RenderMLP(cfg) self.denoiser = RenderCNN(cfg) if cfg.NETWORK.GANCRAFT.ENCODER == "GLOBAL": self.encoder = GlobalEncoder(cfg) elif cfg.NETWORK.GANCRAFT.ENCODER == "LOCAL": self.encoder = LocalEncoder(cfg) else: self.encoder = None if ( not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): raise ValueError( "Either POS_EMD_INCUDE_CORDS or POS_EMD_INCUDE_FEATURES should be True." ) if cfg.NETWORK.GANCRAFT.POS_EMD == "HASH_GRID": grid_encoder_in_dim = 3 if cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS else 0 if ( cfg.NETWORK.GANCRAFT.ENCODER in ["GLOBAL", "LOCAL"] and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): grid_encoder_in_dim += cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM self.pos_encoder = citydreamer.extensions.grid_encoder.GridEncoder( in_channels=grid_encoder_in_dim, n_levels=cfg.NETWORK.GANCRAFT.HASH_GRID_N_LEVELS, lvl_channels=cfg.NETWORK.GANCRAFT.HASH_GRID_LEVEL_DIM, desired_resolution=cfg.NETWORK.GANCRAFT.HASH_GRID_RESOLUTION, ) elif cfg.NETWORK.GANCRAFT.POS_EMD == "SIN_COS": self.pos_encoder = SinCosEncoder(cfg) def forward( self, hf_seg, voxel_id, depth2, raydirs, cam_origin, building_stats=None, z=None, deterministic=False, ): r"""GANcraft Generator forward. Args: hf_seg (N x (1 + M) x H' x W' tensor) : height field + seg map, where M is the number of classes. voxel_id (N x H x W x max_samples x 1 tensor): IDs of intersected tensors along each ray. depth2 (N x H x W x 2 x max_samples x 1 tensor): Depths of entrance and exit points for each ray-voxel intersection. raydirs (N x H x W x 1 x 3 tensor): The direction of each ray. cam_origin (N x 3 tensor): Camera origins. building_stats (N x 5 tensor): The dy, dx, h, w, ID of the target building. (Only used in building mode) z (N x STYLE_DIM tensor): The style vector. deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling. Returns: fake_images (N x 3 x H x W tensor): fake images """ bs, device = hf_seg.size(0), hf_seg.device if z is None and self.cfg.NETWORK.GANCRAFT.STYLE_DIM is not None: z = torch.randn( bs, self.cfg.NETWORK.GANCRAFT.STYLE_DIM, dtype=torch.float32, device=device, ) features = None if self.encoder is not None: features = self.encoder(hf_seg) net_out = self._forward_perpix( features, voxel_id, depth2, raydirs, cam_origin, z, building_stats, deterministic, ) fake_images = self._forward_global(net_out, z) return fake_images def _forward_perpix( self, features, voxel_id, depth2, raydirs, cam_origin, z, building_stats=None, deterministic=False, ): r"""Sample points along rays, forwarding the per-point MLP and aggregate pixel features Args: features (N x C1 tensor): Local features determined by the current pixel. voxel_id (N x H x W x M x 1 tensor): Voxel ids from ray-voxel intersection test. M: num intersected voxels depth2 (N x H x W x 2 x M x 1 tensor): Depths of entrance and exit points for each ray-voxel intersection. raydirs (N x H x W x 1 x 3 tensor): The direction of each ray. cam_origin (N x 3 tensor): Camera origins. z (N x C3 tensor): Intermediate style vectors. building_stats (N x 4 tensor): The dy, dx, h, w of the target building. (Only used in building mode) deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling. """ # Generate sky_mask; PE transform on ray direction. with torch.no_grad(): # sky_only_mask: when True, ray hits nothing but sky sky_only_mask = voxel_id[:, :, :, [0], :] == 0 with torch.no_grad(): normalized_cord, new_dists, new_idx = self._get_sampled_coordinates( self.cfg.NETWORK.GANCRAFT.N_SAMPLE_POINTS_PER_RAY, depth2, raydirs, cam_origin, building_stats, deterministic, ) # Generate per-sample segmentation label seg_map_bev = torch.gather(voxel_id, -2, new_idx) # print(seg_map_bev.size()) # torch.Size([N, H, W, n_samples + 1, 1]) # In Building Mode, the one more channel is used for building roofs n_classes = ( self.cfg.NETWORK.GANCRAFT.N_CLASSES + 1 if self.cfg.NETWORK.GANCRAFT.BUILDING_MODE else self.cfg.NETWORK.GANCRAFT.N_CLASSES ) seg_map_bev_onehot = torch.zeros( [ seg_map_bev.size(0), seg_map_bev.size(1), seg_map_bev.size(2), seg_map_bev.size(3), n_classes, ], dtype=torch.float, device=voxel_id.device, ) # print(seg_map_bev_onehot.size()) # torch.Size([N, H, W, n_samples + 1, 1]) seg_map_bev_onehot.scatter_(-1, seg_map_bev.long(), 1.0) net_out_s, net_out_c = self._forward_perpix_sub( features, normalized_cord, z, seg_map_bev_onehot ) # Blending weights = self._volum_rendering_relu( net_out_s, new_dists * self.cfg.NETWORK.GANCRAFT.DIST_SCALE, dim=-2 ) # If a ray exclusively hits the sky (no intersection with the voxels), set its weight to zero. weights = weights * torch.logical_not(sky_only_mask).float() # print(weights.size()) # torch.Size([N, H, W, n_samples + 1, 1]) rgbs = torch.clamp(net_out_c, -1, 1) + 1 net_out = torch.sum(weights * rgbs, dim=-2, keepdim=True) net_out = net_out.squeeze(-2) net_out = net_out - 1 return net_out def _get_sampled_coordinates( self, n_samples, depth2, raydirs, cam_origin, building_stats=None, deterministic=False, ): # Random sample points along the ray rand_depth, new_dists, new_idx = self._sample_depth_batched( depth2, n_samples + 1, deterministic=deterministic, use_box_boundaries=False, sample_depth=3, ) nan_mask = torch.isnan(rand_depth) inf_mask = torch.isinf(rand_depth) rand_depth[nan_mask | inf_mask] = 0.0 world_coord = raydirs * rand_depth + cam_origin[:, None, None, None, :] # assert worldcoord2.shape[-1] == 3 if self.cfg.NETWORK.GANCRAFT.BUILDING_MODE: assert building_stats is not None # Make the building object-centric building_stats = building_stats[:, None, None, None, :].repeat( 1, world_coord.size(1), world_coord.size(2), world_coord.size(3), 1 ) world_coord[..., 0] -= ( building_stats[..., 0] + self.cfg.NETWORK.GANCRAFT.CENTER_OFFSET ) world_coord[..., 1] -= ( building_stats[..., 1] + self.cfg.NETWORK.GANCRAFT.CENTER_OFFSET ) # TODO: Fix non-building rays zero_rd_mask = raydirs.repeat(1, 1, 1, n_samples, 1) world_coord[zero_rd_mask == 0] = 0 normalized_cord = self._get_normalized_coordinates(world_coord) return normalized_cord, new_dists, new_idx def _get_normalized_coordinates(self, world_coord): delimeter = torch.tensor( self.cfg.NETWORK.GANCRAFT.NORMALIZE_DELIMETER, device=world_coord.device ) normalized_cord = world_coord / delimeter * 2 - 1 # TODO: Temporary fix normalized_cord[normalized_cord > 1] = 1 normalized_cord[normalized_cord < -1] = -1 # assert (normalized_cord <= 1).all() # assert (normalized_cord >= -1).all() # print(delimeter, torch.min(normalized_cord), torch.max(normalized_cord)) # print(normalized_cord.size()) # torch.Size([1, 192, 192, 24, 3]) return normalized_cord def _sample_depth_batched( self, depth2, n_samples, deterministic=False, use_box_boundaries=True, sample_depth=3, ): r"""Make best effort to sample points within the same distance for every ray. Exception: When there is not enough voxel. Args: depth2 (N x H x W x 2 x M x 1 tensor): - N: Batch. - H, W: Height, Width. - 2: Entrance / exit depth for each intersected box. - M: Number of intersected boxes along the ray. - 1: One extra dim for consistent tensor dims. depth2 can include NaNs. deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling. use_box_boundaries (bool): Whether to add the entrance / exit points into the sample. sample_depth (float): Truncate the ray when it travels further than sample_depth inside voxels. """ bs = depth2.size(0) dim0 = depth2.size(1) dim1 = depth2.size(2) dists = depth2[:, :, :, 1] - depth2[:, :, :, 0] dists[torch.isnan(dists)] = 0 # print(dists.size()) # torch.Size([N, H, W, M, 1]) accu_depth = torch.cumsum(dists, dim=-2) # print(accu_depth.size()) # torch.Size([N, H, W, M, 1]) total_depth = accu_depth[..., [-1], :] # print(total_depth.size()) # torch.Size([N, H, W, 1, 1]) total_depth = torch.clamp(total_depth, None, sample_depth) # Ignore out of range box boundaries. Fill with random samples. if use_box_boundaries: boundary_samples = accu_depth.clone().detach() boundary_samples_filler = torch.rand_like(boundary_samples) * total_depth bad_mask = (accu_depth > sample_depth) | (dists == 0) boundary_samples[bad_mask] = boundary_samples_filler[bad_mask] rand_shape = [bs, dim0, dim1, n_samples, 1] if deterministic: rand_samples = torch.empty( rand_shape, dtype=total_depth.dtype, device=total_depth.device ) rand_samples[..., :, 0] = torch.linspace(0, 1, n_samples + 2)[1:-1] else: rand_samples = torch.rand( rand_shape, dtype=total_depth.dtype, device=total_depth.device ) # Stratified sampling as in NeRF rand_samples = rand_samples / n_samples rand_samples[..., :, 0] += torch.linspace( 0, 1, n_samples + 1, device=rand_samples.device )[:-1] rand_samples = rand_samples * total_depth # print(rand_samples.size()) # torch.Size([N, H, W, n_samples, 1]) # Can also include boundaries if use_box_boundaries: rand_samples = torch.cat( [ rand_samples, boundary_samples, torch.zeros( [bs, dim0, dim1, 1, 1], dtype=total_depth.dtype, device=total_depth.device, ), ], dim=-2, ) rand_samples, _ = torch.sort(rand_samples, dim=-2, descending=False) midpoints = (rand_samples[..., 1:, :] + rand_samples[..., :-1, :]) / 2 # print(midpoints.size()) # torch.Size([N, H, W, n_samples, 1]) new_dists = rand_samples[..., 1:, :] - rand_samples[..., :-1, :] # Scatter the random samples back # print(midpoints.unsqueeze(-3).size()) # torch.Size([N, H, W, 1, n_samples, 1]) # print(accu_depth.unsqueeze(-2).size()) # torch.Size([N, H, W, M, 1, 1]) idx = torch.sum(midpoints.unsqueeze(-3) > accu_depth.unsqueeze(-2), dim=-3) # print(idx.shape, idx.max(), idx.min()) # torch.Size([N, H, W, n_samples, 1]) max 5, min 0 depth_deltas = ( depth2[:, :, :, 0, 1:, :] - depth2[:, :, :, 1, :-1, :] ) # There might be NaNs! # print(depth_deltas.size()) # torch.Size([N, H, W, M, M - 1, 1]) depth_deltas = torch.cumsum(depth_deltas, dim=-2) depth_deltas = torch.cat( [depth2[:, :, :, 0, [0], :], depth_deltas + depth2[:, :, :, 0, [0], :]], dim=-2, ) heads = torch.gather(depth_deltas, -2, idx) # print(heads.size()) # torch.Size([N, H, W, M, 1]) # print(torch.any(torch.isnan(heads))) rand_depth = heads + midpoints # print(rand_depth.size()) # torch.Size([N, H, W, M, n_samples, 1]) return rand_depth, new_dists, idx def _volum_rendering_relu(self, sigma, dists, dim=2): free_energy = F.relu(sigma) * dists a = 1 - torch.exp(-free_energy.float()) # probability of it is not empty here b = torch.exp( -self._cumsum_exclusive(free_energy, dim=dim) ) # probability of everything is empty up to now return a * b # probability of the ray hits something here def _cumsum_exclusive(self, tensor, dim): cumsum = torch.cumsum(tensor, dim) cumsum = torch.roll(cumsum, 1, dim) cumsum.index_fill_( dim, torch.tensor([0], dtype=torch.long, device=tensor.device), 0 ) return cumsum def _forward_perpix_sub(self, features, normalized_cord, z, seg_map_bev_onehot): r"""Forwarding the MLP. Args: features (N x C1 x ...? tensor): Local features determined by the current pixel. normalized_coord (N x H x W x L x 3 tensor): 3D world coordinates of sampled points. L is number of samples; N is batch size, always 1. z (N x C3 tensor): Intermediate style vectors. seg_map_bev_onehot (N x H x W x L x C4): One-hot segmentation maps. Returns: net_out_s (N x H x W x L x 1 tensor): Opacities. net_out_c (N x H x W x L x C5 tensor): Color embeddings. """ feature_in = torch.empty( normalized_cord.size(0), normalized_cord.size(1), normalized_cord.size(2), normalized_cord.size(3), 0, device=normalized_cord.device, ) if self.cfg.NETWORK.GANCRAFT.ENCODER == "GLOBAL": # print(features.size()) # torch.Size([N, ENCODER_OUT_DIM]) feature_in = features[:, None, None, None, :].repeat( 1, normalized_cord.size(1), normalized_cord.size(2), normalized_cord.size(3), 1, ) elif self.cfg.NETWORK.GANCRAFT.ENCODER == "LOCAL": # print(features.size()) # torch.Size([N, ENCODER_OUT_DIM - 1, H, W]) # print(world_coord.size()) # torch.Size([N, H, W, L, 3]) # NOTE: grid specifies the sampling pixel locations normalized by the input spatial # dimensions. Therefore, it should have most values in the range of [-1, 1]. grid = normalized_cord.permute(0, 3, 1, 2, 4).reshape( -1, normalized_cord.size(1), normalized_cord.size(2), 3 ) # print(grid.size()) # torch.Size([N * L, H, W, 3]) feature_in = F.grid_sample( features.repeat(grid.size(0), 1, 1, 1), grid[..., [1, 0]], align_corners=False, ) # print(feature_in.size()) # torch.Size([N * L, ENCODER_OUT_DIM - 1, H, W]) feature_in = feature_in.reshape( normalized_cord.size(0), normalized_cord.size(3), feature_in.size(1), feature_in.size(2), feature_in.size(3), ).permute(0, 3, 4, 1, 2) # print(feature_in.size()) # torch.Size([N, H, W, L, ENCODER_OUT_DIM - 1]) feature_in = torch.cat([feature_in, normalized_cord[..., [2]]], dim=-1) # print(feature_in.size()) # torch.Size([N, H, W, L, ENCODER_OUT_DIM]) if self.cfg.NETWORK.GANCRAFT.POS_EMD in ["HASH_GRID", "SIN_COS"]: if ( self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): feature_in = self.pos_encoder( torch.cat([normalized_cord, feature_in], dim=-1) ) elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS: feature_in = torch.cat( [self.pos_encoder(normalized_cord), feature_in], dim=-1 ) elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES: # Ignore normalized_cord here to make it decoupled with coordinates feature_in = torch.cat([self.pos_encoder(feature_in)], dim=-1) else: if ( self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): feature_in = torch.cat([normalized_cord, feature_in], dim=-1) elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS: feature_in = normalized_cord elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES: feature_in = feature_in net_out_s, net_out_c = self.render_net(feature_in, z, seg_map_bev_onehot) return net_out_s, net_out_c def _forward_global(self, net_out, z): r"""Forward the CNN Args: net_out (N x C5 x H x W tensor): Intermediate feature maps. z (N x C3 tensor): Intermediate style vectors. Returns: fake_images (N x 3 x H x W tensor): Output image. """ fake_images = net_out.permute(0, 3, 1, 2).contiguous() if self.denoiser is not None: fake_images = self.denoiser(fake_images, z) fake_images = torch.tanh(fake_images) return fake_images class GlobalEncoder(torch.nn.Module): def __init__(self, cfg): super(GlobalEncoder, self).__init__() n_classes = cfg.NETWORK.GANCRAFT.N_CLASSES self.hf_conv = torch.nn.Conv2d(1, 8, kernel_size=3, stride=2, padding=1) self.seg_conv = torch.nn.Conv2d( n_classes, 8, kernel_size=3, stride=2, padding=1, ) conv_blocks = [] cur_hidden_channels = 16 for _ in range(1, cfg.NETWORK.GANCRAFT.GLOBAL_ENCODER_N_BLOCKS): conv_blocks.append( SRTConvBlock(in_channels=cur_hidden_channels, out_channels=None) ) cur_hidden_channels *= 2 self.conv_blocks = torch.nn.Sequential(*conv_blocks) self.fc1 = torch.nn.Linear(cur_hidden_channels, 16) self.fc2 = torch.nn.Linear(16, cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM) self.act = torch.nn.LeakyReLU(0.2) def forward(self, hf_seg): hf = self.act(self.hf_conv(hf_seg[:, [0]])) seg = self.act(self.seg_conv(hf_seg[:, 1:])) out = torch.cat([hf, seg], dim=1) for layer in self.conv_blocks: out = self.act(layer(out)) out = out.permute(0, 2, 3, 1) out = torch.mean(out.reshape(out.shape[0], -1, out.shape[-1]), dim=1) cond = self.act(self.fc1(out)) cond = torch.tanh(self.fc2(cond)) return cond class LocalEncoder(torch.nn.Module): def __init__(self, cfg): super(LocalEncoder, self).__init__() n_classes = cfg.NETWORK.GANCRAFT.N_CLASSES self.hf_conv = torch.nn.Conv2d(1, 32, kernel_size=7, stride=2, padding=3) self.seg_conv = torch.nn.Conv2d( n_classes, 32, kernel_size=7, stride=2, padding=3 ) if cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM == "BATCH_NORM": self.bn1 = torch.nn.BatchNorm2d(64) elif cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM == "GROUP_NORM": self.bn1 = torch.nn.GroupNorm(32, 64) else: raise ValueError( "Unknown normalization: %s" % cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM ) self.conv2 = ResConvBlock(64, 128, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM) self.conv3 = ResConvBlock(128, 256, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM) self.conv4 = ResConvBlock(256, 512, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM) self.dconv5 = torch.nn.ConvTranspose2d( 512, 128, kernel_size=4, stride=2, padding=1 ) self.dconv6 = torch.nn.ConvTranspose2d( 128, 32, kernel_size=4, stride=2, padding=1 ) self.dconv7 = torch.nn.Conv2d( 32, cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM - 1, kernel_size=1 ) def forward(self, hf_seg): hf = self.hf_conv(hf_seg[:, [0]]) seg = self.seg_conv(hf_seg[:, 1:]) out = F.relu(self.bn1(torch.cat([hf, seg], dim=1)), inplace=True) # print(out.size()) # torch.Size([N, 64, H/2, W/2]) out = F.avg_pool2d(self.conv2(out), 2, stride=2) # print(out.size()) # torch.Size([N, 128, H/4, W/4]) out = self.conv3(out) # print(out.size()) # torch.Size([N, 256, H/4, W/4]) out = self.conv4(out) # print(out.size()) # torch.Size([N, 512, H/4, W/4]) out = self.dconv5(out) # print(out.size()) # torch.Size([N, 128, H/2, W/2]) out = self.dconv6(out) # print(out.size()) # torch.Size([N, 32, H, W]) out = self.dconv7(out) # print(out.size()) # torch.Size([N, OUT_DIM - 1, H, W]) return torch.tanh(out) class SinCosEncoder(torch.nn.Module): def __init__(self, cfg): super(SinCosEncoder, self).__init__() self.freq_bands = 2.0 ** torch.linspace( 0, cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS - 1, steps=cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS, ) def forward(self, features): cord_sin = torch.cat( [torch.sin(features * fb) for fb in self.freq_bands], dim=-1 ) cord_cos = torch.cat( [torch.cos(features * fb) for fb in self.freq_bands], dim=-1 ) return torch.cat([cord_sin, cord_cos], dim=-1) class RenderMLP(torch.nn.Module): r"""MLP with affine modulation.""" def __init__(self, cfg): super(RenderMLP, self).__init__() in_dim = 0 f_dim = ( cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM if cfg.NETWORK.GANCRAFT.ENCODER in ["GLOBAL", "LOCAL"] else 0 ) if cfg.NETWORK.GANCRAFT.POS_EMD == "HASH_GRID": in_dim = ( cfg.NETWORK.GANCRAFT.HASH_GRID_N_LEVELS * cfg.NETWORK.GANCRAFT.HASH_GRID_LEVEL_DIM ) in_dim += ( f_dim if cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES else 0 ) elif cfg.NETWORK.GANCRAFT.POS_EMD == "SIN_COS": if ( cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): in_dim = (3 + f_dim) * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2 elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS: in_dim = 3 * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2 + f_dim elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES: in_dim = f_dim * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2 else: if ( cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES ): in_dim = 3 + f_dim elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS: in_dim = 3 elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES: in_dim = f_dim self.fc_m_a = torch.nn.Linear( ( cfg.NETWORK.GANCRAFT.N_CLASSES + 1 if cfg.NETWORK.GANCRAFT.BUILDING_MODE else cfg.NETWORK.GANCRAFT.N_CLASSES ), cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, bias=False, ) self.fc_1 = torch.nn.Linear( in_dim, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) self.fc_2 = ( ModLinear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.STYLE_DIM, bias=False, mod_bias=True, output_mode=True, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) ) self.fc_3 = ( ModLinear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.STYLE_DIM, bias=False, mod_bias=True, output_mode=True, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) ) self.fc_4 = ( ModLinear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.STYLE_DIM, bias=False, mod_bias=True, output_mode=True, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) ) self.fc_sigma = ( torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_SIGMA, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_SIGMA, ) ) self.fc_5 = ( ModLinear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.STYLE_DIM, bias=False, mod_bias=True, output_mode=True, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) ) self.fc_6 = ( ModLinear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.STYLE_DIM, bias=False, mod_bias=True, output_mode=True, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) ) self.fc_out_c = ( torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR, ) if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None else torch.nn.Linear( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR, ) ) self.act = torch.nn.LeakyReLU(negative_slope=0.2) def forward(self, x, z, m): r"""Forward network Args: x (N x H x W x M x in_channels tensor): Projected features. z (N x cfg.NETWORK.GANCRAFT.STYLE_DIM tensor): Style codes. m (N x H x W x M x mask_channels tensor): One-hot segmentation maps. """ # b, h, w, n, _ = x.size() if z is not None: z = z[:, None, None, None, :] f = self.fc_1(x) f = f + self.fc_m_a(m) # Common MLP f = self.act(f) f = self.act(self.fc_2(f, z)) if z is not None else self.act(self.fc_2(f)) f = self.act(self.fc_3(f, z)) if z is not None else self.act(self.fc_3(f)) f = self.act(self.fc_4(f, z)) if z is not None else self.act(self.fc_4(f)) # Sigma MLP sigma = self.fc_sigma(f) if z is not None else self.act(self.fc_sigma(f)) # Color MLP f = self.act(self.fc_5(f, z)) if z is not None else self.act(self.fc_5(f)) f = self.act(self.fc_6(f, z)) if z is not None else self.act(self.fc_6(f)) c = self.fc_out_c(f) return sigma, c class RenderCNN(torch.nn.Module): r"""CNN converting intermediate feature map to final image.""" def __init__(self, cfg): super(RenderCNN, self).__init__() if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None: self.fc_z_cond = torch.nn.Linear( cfg.NETWORK.GANCRAFT.STYLE_DIM, 2 * 2 * cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, ) self.conv1 = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 1, stride=1, padding=0, ) self.conv2a = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, stride=1, padding=1, ) self.conv2b = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, stride=1, padding=1, bias=False, ) self.conv3a = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, stride=1, padding=1, ) self.conv3b = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, stride=1, padding=1, bias=False, ) self.conv4a = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 1, stride=1, padding=0, ) self.conv4b = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 1, stride=1, padding=0, ) self.conv4 = torch.nn.Conv2d( cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, 1, stride=1, padding=0 ) self.act = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True) def modulate(self, x, w, b): w = w[..., None, None] b = b[..., None, None] return x * (w + 1) + b def forward(self, x, z): r"""Forward network. Args: x (N x in_channels x H x W tensor): Intermediate feature map z (N x style_dim tensor): Style codes. """ if z is not None: z = self.fc_z_cond(z) adapt = torch.chunk(z, 2 * 2, dim=-1) y = self.act(self.conv1(x)) y = y + self.conv2b(self.act(self.conv2a(y))) if z is not None: y = self.act(self.modulate(y, adapt[0], adapt[1])) else: y = self.act(y) y = y + self.conv3b(self.act(self.conv3a(y))) if z is not None: y = self.act(self.modulate(y, adapt[2], adapt[3])) else: y = self.act(y) y = y + self.conv4b(self.act(self.conv4a(y))) y = self.act(y) y = self.conv4(y) return y class SRTConvBlock(torch.nn.Module): def __init__(self, in_channels, hidden_channels=None, out_channels=None): super(SRTConvBlock, self).__init__() if hidden_channels is None: hidden_channels = in_channels if out_channels is None: out_channels = 2 * hidden_channels self.layers = torch.nn.Sequential( torch.nn.Conv2d( in_channels, hidden_channels, stride=1, kernel_size=3, padding=1, bias=False, ), torch.nn.ReLU(), torch.nn.Conv2d( hidden_channels, out_channels, stride=2, kernel_size=3, padding=1, bias=False, ), torch.nn.ReLU(), ) def forward(self, x): return self.layers(x) class ResConvBlock(torch.nn.Module): def __init__(self, in_channels, out_channels, norm, bias=False): super(ResConvBlock, self).__init__() # conv3x3(in_planes, int(out_planes / 2)) self.conv1 = torch.nn.Conv2d( in_channels, out_channels // 2, kernel_size=3, stride=1, padding=1, bias=bias, ) # conv3x3(int(out_planes / 2), int(out_planes / 4)) self.conv2 = torch.nn.Conv2d( out_channels // 2, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=bias, ) # conv3x3(int(out_planes / 4), int(out_planes / 4)) self.conv3 = torch.nn.Conv2d( out_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=bias, ) if norm == "BATCH_NORM": self.bn1 = torch.nn.BatchNorm2d(in_channels) self.bn2 = torch.nn.BatchNorm2d(out_channels // 2) self.bn3 = torch.nn.BatchNorm2d(out_channels // 4) self.bn4 = torch.nn.BatchNorm2d(in_channels) elif norm == "GROUP_NORM": self.bn1 = torch.nn.GroupNorm(32, in_channels) self.bn2 = torch.nn.GroupNorm(32, out_channels // 2) self.bn3 = torch.nn.GroupNorm(32, out_channels // 4) self.bn4 = torch.nn.GroupNorm(32, in_channels) if in_channels != out_channels: self.downsample = torch.nn.Sequential( self.bn4, torch.nn.ReLU(True), torch.nn.Conv2d( in_channels, out_channels, kernel_size=1, stride=1, bias=False ), ) else: self.downsample = None def forward(self, x): residual = x # print(residual.size()) # torch.Size([N, 64, H, W]) out1 = self.bn1(x) out1 = F.relu(out1, True) out1 = self.conv1(out1) # print(out1.size()) # torch.Size([N, 64, H, W]) out2 = self.bn2(out1) out2 = F.relu(out2, True) out2 = self.conv2(out2) # print(out2.size()) # torch.Size([N, 32, H, W]) out3 = self.bn3(out2) out3 = F.relu(out3, True) out3 = self.conv3(out3) # print(out3.size()) # torch.Size([N, 32, H, W]) out3 = torch.cat((out1, out2, out3), dim=1) # print(out3.size()) # torch.Size([N, 128, H, W]) if self.downsample is not None: residual = self.downsample(residual) # print(residual.size()) # torch.Size([N, 128, H, W]) out3 += residual return out3 class ModLinear(torch.nn.Module): r"""Linear layer with affine modulation (Based on StyleGAN2 mod demod). Equivalent to affine modulation following linear, but faster when the same modulation parameters are shared across multiple inputs. Args: in_features (int): Number of input features. out_features (int): Number of output features. style_features (int): Number of style features. bias (bool): Apply additive bias before the activation function? mod_bias (bool): Whether to modulate bias. output_mode (bool): If True, modulate output instead of input. weight_gain (float): Initialization gain """ def __init__( self, in_features, out_features, style_features, bias=True, mod_bias=True, output_mode=False, weight_gain=1, bias_init=0, ): super(ModLinear, self).__init__() weight_gain = weight_gain / np.sqrt(in_features) self.weight = torch.nn.Parameter( torch.randn([out_features, in_features]) * weight_gain ) self.bias = ( torch.nn.Parameter(torch.full([out_features], np.float32(bias_init))) if bias else None ) self.weight_alpha = torch.nn.Parameter( torch.randn([in_features, style_features]) / np.sqrt(style_features) ) self.bias_alpha = torch.nn.Parameter( torch.full([in_features], 1, dtype=torch.float) ) # init to 1 self.weight_beta = None self.bias_beta = None self.mod_bias = mod_bias self.output_mode = output_mode if mod_bias: if output_mode: mod_bias_dims = out_features else: mod_bias_dims = in_features self.weight_beta = torch.nn.Parameter( torch.randn([mod_bias_dims, style_features]) / np.sqrt(style_features) ) self.bias_beta = torch.nn.Parameter( torch.full([mod_bias_dims], 0, dtype=torch.float) ) @staticmethod def _linear_f(x, w, b): w = w.to(x.dtype) x_shape = x.shape x = x.reshape(-1, x_shape[-1]) if b is not None: b = b.to(x.dtype) x = torch.addmm(b.unsqueeze(0), x, w.t()) else: x = x.matmul(w.t()) x = x.reshape(*x_shape[:-1], -1) return x # x: B, ... , Cin # z: B, 1, 1, , Cz def forward(self, x, z): x_shape = x.shape z_shape = z.shape x = x.reshape(x_shape[0], -1, x_shape[-1]) z = z.reshape(z_shape[0], 1, z_shape[-1]) alpha = self._linear_f(z, self.weight_alpha, self.bias_alpha) # [B, ..., I] w = self.weight.to(x.dtype) # [O I] w = w.unsqueeze(0) * alpha # [1 O I] * [B 1 I] = [B O I] if self.mod_bias: beta = self._linear_f(z, self.weight_beta, self.bias_beta) # [B, ..., I] if not self.output_mode: x = x + beta b = self.bias if b is not None: b = b.to(x.dtype)[None, None, :] if self.mod_bias and self.output_mode: if b is None: b = beta else: b = b + beta # [B ? I] @ [B I O] = [B ? O] if b is not None: x = torch.baddbmm(b, x, w.transpose(1, 2)) else: x = x.bmm(w.transpose(1, 2)) x = x.reshape(*x_shape[:-1], x.shape[-1]) return x class GanCraftDiscriminator(torch.nn.Module): def __init__(self, cfg): super(GanCraftDiscriminator, self).__init__() # bottom-up pathway # down_conv2d_block = Conv2dBlock, stride=2, kernel=3, padding=1, weight_norm=spectral # self.enc1 = down_conv2d_block(num_input_channels, num_filters) # 3 self.enc1 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 3, # RGB cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=2, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.enc2 = down_conv2d_block(1 * num_filters, 2 * num_filters) # 7 self.enc2 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 1 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=2, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.enc3 = down_conv2d_block(2 * num_filters, 4 * num_filters) # 15 self.enc3 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=2, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.enc4 = down_conv2d_block(4 * num_filters, 8 * num_filters) # 31 self.enc4 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=2, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.enc5 = down_conv2d_block(8 * num_filters, 8 * num_filters) # 63 self.enc5 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=2, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # top-down pathway # latent_conv2d_block = Conv2dBlock, stride=1, kernel=1, weight_norm=spectral # self.lat2 = latent_conv2d_block(2 * num_filters, 4 * num_filters) self.lat2 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=1, kernel_size=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.lat3 = latent_conv2d_block(4 * num_filters, 4 * num_filters) self.lat3 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=1, kernel_size=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.lat4 = latent_conv2d_block(8 * num_filters, 4 * num_filters) self.lat4 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=1, kernel_size=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.lat5 = latent_conv2d_block(8 * num_filters, 4 * num_filters) self.lat5 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=1, kernel_size=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # upsampling self.upsample2x = torch.nn.Upsample( scale_factor=2, mode="bilinear", align_corners=False ) # final layers # stride1_conv2d_block = Conv2dBlock, stride=1, kernel=3, padding=1, weight_norm=spectral # self.final2 = stride1_conv2d_block(4 * num_filters, 2 * num_filters) self.final2 = torch.nn.Sequential( torch.nn.utils.spectral_norm( torch.nn.Conv2d( 4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, 2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, stride=1, kernel_size=3, padding=1, bias=True, ) ), torch.nn.LeakyReLU(0.2), ) # self.output = Conv2dBlock(num_filters * 2, num_labels + 1, kernel_size=1) self.output = torch.nn.Sequential( torch.nn.Conv2d( 2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE, cfg.NETWORK.GANCRAFT.N_CLASSES + 1, stride=1, kernel_size=1, bias=True, ), torch.nn.LeakyReLU(0.2), ) self.interpolator = self._smooth_interp @staticmethod def _smooth_interp(x, size): r"""Smooth interpolation of segmentation maps. Args: x (4D tensor): Segmentation maps. size(2D list): Target size (H, W). """ x = F.interpolate(x, size=size, mode="area") onehot_idx = torch.argmax(x, dim=-3, keepdims=True) x.fill_(0.0) x.scatter_(1, onehot_idx, 1.0) return x def _single_forward(self, images, seg_maps): # bottom-up pathway feat11 = self.enc1(images) feat12 = self.enc2(feat11) feat13 = self.enc3(feat12) feat14 = self.enc4(feat13) feat15 = self.enc5(feat14) # top-down pathway and lateral connections feat25 = self.lat5(feat15) feat24 = self.upsample2x(feat25) + self.lat4(feat14) feat23 = self.upsample2x(feat24) + self.lat3(feat13) feat22 = self.upsample2x(feat23) + self.lat2(feat12) # final prediction layers feat32 = self.final2(feat22) label_map = self.interpolator(seg_maps, size=feat32.size()[2:]) pred = self.output(feat32) # N, num_labels + 1, H//4, W//4 return {"pred": pred, "label": label_map} def forward(self, images, seg_maps, masks): # print(seg_maps.size()) # torch.Size([1, 7, H, W]) # print(masks.size()) # torch.Size([1, 1, H, W]) seg_maps = seg_maps * masks return self._single_forward(images * masks, seg_maps)