DmitrMakeev committed
Commit
98c5805
1 Parent(s): 2e9004e

Upload 9 files

models/__init__.py ADDED
File without changes
models/anchor_gen.py ADDED
@@ -0,0 +1,107 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.autograd import Function
5
+ from models import basic, clusterkit
6
+ import pdb
7
+
8
+ class AnchorAnalysis:
9
+ def __init__(self, mode, colorLabeler):
10
+ ## anchor generating mode: 1.random; 2.clustering
11
+ self.mode = mode
12
+ self.colorLabeler = colorLabeler
13
+
14
+ def _detect_correlation(self, data_tensors, color_probs, hint_masks, thres=0.1):
15
+ N,C,H,W = data_tensors.shape
16
+ ## (N,C,HW)
17
+ data_vecs = data_tensors.flatten(2)
18
+ prob_vecs = color_probs.flatten(2)
19
+ mask_vecs = hint_masks.flatten(2)
20
+ #anchor_data = torch.masked_select(data_vecs, mask_vecs.bool()).view(N,C,-1)
21
+ #anchor_prob = torch.masked_select(prob_vecs, mask_vecs.bool()).view(N,313,-1)
22
+ #_,_,K = anchor_data.shape
23
+ anchor_mask = torch.matmul(mask_vecs.permute(0,2,1), mask_vecs)
24
+ cosine_sim = True
25
+ ## non-similarity matrix
26
+ if cosine_sim:
27
+ norm_data = F.normalize(data_vecs, p=2, dim=1)
28
+ ## (N,HW,HW) = (N,HW,C) X (N,C,HW)
29
+ corr_matrix = torch.matmul(norm_data.permute(0,2,1), norm_data)
30
+ ## remapping: [-1.0,1.0] to [0.0,1.0], and convert into dis-similarity
31
+ dist_matrix = 1.0 - 0.5*(corr_matrix + 1.0)
32
+ else:
33
+ ## (N,HW,HW) = (N,HW,C) X (N,C,HW)
34
+ XtX = torch.matmul(data_vecs.permute(0,2,1), data_vecs)
35
+ diag_vec = torch.diagonal(XtX, dim1=-2, dim2=-1)
36
+ A = diag_vec.unsqueeze(1).repeat(1,H*W,1)
37
+ At = diag_vec.unsqueeze(2).repeat(1,1,H*W)
38
+ dist_matrix = A - 2*XtX + At
39
+ #dist_matrix = dist_matrix + 1e7*torch.eye(K).to(data_tensors.device).repeat(N,1,1)
40
+ ## for debug use
41
+ K = 8
42
+ anchor_adj_matrix = torch.masked_select(dist_matrix, anchor_mask.bool()).view(N,K,K)
43
+ ## detect connected nodes
44
+ adj_matrix = torch.where((dist_matrix < thres) & (anchor_mask > 0), torch.ones_like(dist_matrix), torch.zeros_like(dist_matrix))
45
+ adj_matrix = torch.matmul(adj_matrix, adj_matrix)
46
+ adj_matrix = adj_matrix / (1e-7+adj_matrix)
47
+ ## merge nodes
48
+ ## (N,K,C) = (N,K,K) X (N,K,C)
49
+ anchor_prob = torch.matmul(adj_matrix, prob_vecs.permute(0,2,1)) / torch.sum(adj_matrix, dim=2, keepdim=True)
50
+ updated_prob_vecs = anchor_prob.permute(0,2,1) * mask_vecs + (1-mask_vecs) * prob_vecs
51
+ color_probs = updated_prob_vecs.view(N,313,H,W)
52
+ return color_probs, anchor_adj_matrix
53
+
54
+ def _sample_anchor_colors(self, pred_prob, hint_mask, T=0):
55
+ N,C,H,W = pred_prob.shape
56
+ topk = 10
57
+ assert T < topk
58
+ sorted_probs, batch_indexs = torch.sort(pred_prob, dim=1, descending=True)
59
+ ## (N,topk,H,W,1)
60
+ topk_probs = torch.softmax(sorted_probs[:,:topk,:,:], dim=1).unsqueeze(4)
61
+ topk_indexs = batch_indexs[:,:topk,:,:]
62
+ topk_ABs = torch.stack([self.colorLabeler.q_to_ab.index_select(0, q_i.flatten()).reshape(topk,H,W,2)
63
+ for q_i in topk_indexs])
64
+ ## (N,topk,H,W,2)
65
+ topk_ABs = topk_ABs / 110.0
66
+ ## choose the most distinctive 3 colors for each anchor
67
+ if T == 0:
68
+ sampled_ABs = topk_ABs[:,0,:,:,:]
69
+ elif T == 1:
70
+ sampled_AB0 = topk_ABs[:,[0],:,:,:]
71
+ internal_diff = torch.norm(topk_ABs-sampled_AB0, p=2, dim=4, keepdim=True)
72
+ _, batch_indexs = torch.sort(internal_diff, dim=1, descending=True)
73
+ ## (N,1,H,W,2)
74
+ selected_index = batch_indexs[:,[0],:,:,:].expand([-1,-1,-1,-1,2])
75
+ sampled_ABs = torch.gather(topk_ABs, 1, selected_index)
76
+ sampled_ABs = sampled_ABs.squeeze(1)
77
+ else:
78
+ sampled_AB0 = topk_ABs[:,[0],:,:,:]
79
+ internal_diff = torch.norm(topk_ABs-sampled_AB0, p=2, dim=4, keepdim=True)
80
+ _, batch_indexs = torch.sort(internal_diff, dim=1, descending=True)
81
+ selected_index = batch_indexs[:,[0],:,:,:].expand([-1,-1,-1,-1,2])
82
+ sampled_AB1 = torch.gather(topk_ABs, 1, selected_index)
83
+ internal_diff2 = torch.norm(topk_ABs-sampled_AB1, p=2, dim=4, keepdim=True)
84
+ _, batch_indexs = torch.sort(internal_diff+internal_diff2, dim=1, descending=True)
85
+ ## (N,1,H,W,2)
86
+ selected_index = batch_indexs[:,[T-2],:,:,:].expand([-1,-1,-1,-1,2])
87
+ sampled_ABs = torch.gather(topk_ABs, 1, selected_index)
88
+ sampled_ABs = sampled_ABs.squeeze(1)
89
+
90
+ return sampled_ABs.permute(0,3,1,2)
91
+
92
+ def __call__(self, data_tensors, n_anchors, spixel_sizes, use_sklearn_kmeans=False):
93
+ N,C,H,W = data_tensors.shape
94
+ if self.mode == 'clustering':
95
+ ## clusters map: (N,K,H,W)
96
+ cluster_mask = clusterkit.batch_kmeans_pytorch(data_tensors, n_anchors, 'euclidean', use_sklearn_kmeans)
97
+ #noises = torch.rand(N,1,H,W).to(cluster_mask.device)
98
+ perturb_factors = spixel_sizes
99
+ cluster_prob = cluster_mask + perturb_factors * 0.01
100
+ hint_mask_layers = F.one_hot(torch.argmax(cluster_prob.flatten(2), dim=-1), num_classes=H*W).float()
101
+ hint_mask = torch.sum(hint_mask_layers, dim=1, keepdim=True).view(N,1,H,W)
102
+ else:
103
+ #print('----------hello, random!')
104
+ cluster_mask = torch.zeros(N,n_anchors,H,W).to(data_tensors.device)
105
+ binary_mask = basic.get_random_mask(N, H, W, minNum=n_anchors, maxNum=n_anchors)
106
+ hint_mask = torch.from_numpy(binary_mask).to(data_tensors.device)
107
+ return hint_mask, cluster_mask
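A minimal usage sketch of AnchorAnalysis in clustering mode (shapes are illustrative assumptions; it presumes CUDA and that the utils package with its CIELAB gamut data is importable, since basic.ColorLabel defaults to device='cuda'):

import torch
from models import basic, anchor_gen

feats = torch.randn(2, 64, 16, 16).cuda()        # pooled feature tokens (N,C,H,W), assumed shape
spixel_sizes = torch.rand(2, 1, 16, 16).cuda()   # relative superpixel sizes, used as a small perturbation
anchors = anchor_gen.AnchorAnalysis(mode='clustering', colorLabeler=basic.ColorLabel())
hint_mask, cluster_mask = anchors(feats, n_anchors=8, spixel_sizes=spixel_sizes)
# hint_mask: (N,1,H,W) binary anchor locations; cluster_mask: (N,8,H,W) cluster layer masks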
models/basic.py ADDED
@@ -0,0 +1,504 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.nn.utils.spectral_norm as spectral_norm
6
+ from torch.autograd import Function
7
+ from utils import util, cielab
8
+ import cv2, math, random
9
+
10
+ def tensor2array(tensors):
11
+ arrays = tensors.detach().to("cpu").numpy()
12
+ return np.transpose(arrays, (0, 2, 3, 1))
13
+
14
+
15
+ def rgb2gray(color_batch):
16
+ #! gray = 0.299*R+0.587*G+0.114*B
17
+ gray_batch = color_batch[:, 0, ...] * 0.299 + color_batch[:, 1, ...] * 0.587 + color_batch[:, 2, ...] * 0.114
18
+ gray_batch = gray_batch.unsqueeze_(1)
19
+ return gray_batch
20
+
21
+
22
+ def getParamsAmount(model):
23
+ params = list(model.parameters())
24
+ count = 0
25
+ for var in params:
26
+ l = 1
27
+ for j in var.size():
28
+ l *= j
29
+ count += l
30
+ return count
31
+
32
+
33
+ def checkAverageGradient(model):
34
+ meanGrad, cnt = 0.0, 0
35
+ for name, parms in model.named_parameters():
36
+ if parms.requires_grad:
37
+ meanGrad += torch.mean(torch.abs(parms.grad))
38
+ cnt += 1
39
+ return meanGrad.item() / cnt
40
+
41
+
42
+ def get_random_mask(N, H, W, minNum, maxNum):
43
+ binary_maps = np.zeros((N, H*W), np.float32)
44
+ for i in range(N):
45
+ locs = random.sample(range(0, H*W), random.randint(minNum,maxNum))
46
+ binary_maps[i, locs] = 1
47
+ return binary_maps.reshape(N,1,H,W)
48
+
49
+
50
+ def io_user_control(hint_mask, spix_colors, output=True):
51
+ cache_dir = '/apdcephfs/private_richardxia'
52
+ if output:
53
+ print('--- data saving')
54
+ mask_imgs = tensor2array(hint_mask) * 2.0 - 1.0
55
+ util.save_images_from_batch(mask_imgs, cache_dir, ['mask.png'], -1)
56
+ fake_gray = torch.zeros_like(spix_colors[:,[0],:,:])
57
+ spix_labs = torch.cat((fake_gray,spix_colors), dim=1)
58
+ spix_imgs = tensor2array(spix_labs)
59
+ util.save_normLabs_from_batch(spix_imgs, cache_dir, ['color.png'], -1)
60
+ return hint_mask, spix_colors
61
+ else:
62
+ print('--- data loading')
63
+ mask_img = cv2.imread(cache_dir+'/mask.png', cv2.IMREAD_GRAYSCALE)
64
+ mask_img = np.expand_dims(mask_img, axis=2) / 255.
65
+ hint_mask = torch.from_numpy(mask_img.transpose((2, 0, 1)))
66
+ hint_mask = hint_mask.unsqueeze(0).cuda()
67
+ bgr_img = cv2.imread(cache_dir+'/color.png', cv2.IMREAD_COLOR)
68
+ rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
69
+ rgb_img = np.array(rgb_img / 255., np.float32)
70
+ lab_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2LAB)
71
+ lab_img = torch.from_numpy(lab_img.transpose((2, 0, 1)))
72
+ ab_chans = lab_img[1:3,:,:] / 110.
73
+ spix_colors = ab_chans.unsqueeze(0).cuda()
74
+ return hint_mask.float(), spix_colors.float()
75
+
76
+
77
+ class Quantize(Function):
78
+ @staticmethod
79
+ def forward(ctx, x):
80
+ ctx.save_for_backward(x)
81
+ y = x.round()
82
+ return y
83
+
84
+ @staticmethod
85
+ def backward(ctx, grad_output):
86
+ """
87
+ In the backward pass we receive a Tensor containing the gradient of the loss
88
+ with respect to the output, and we need to compute the gradient of the loss
89
+ with respect to the input.
90
+ """
91
+ inputX = ctx.saved_tensors
92
+ return grad_output
93
+
94
+
95
+ def mark_color_hints(input_grays, target_ABs, gate_maps, kernel_size=3, base_ABs=None):
96
+ ## to highlight the seeds with 1-pixel margin
97
+ binary_map = torch.where(gate_maps>0.7, torch.ones_like(gate_maps), torch.zeros_like(gate_maps))
98
+ center_mask = dilate_seeds(binary_map, kernel_size=kernel_size)
99
+ margin_mask = dilate_seeds(binary_map, kernel_size=kernel_size+2) - center_mask
100
+ ## drop colors
101
+ dilated_seeds = dilate_seeds(gate_maps, kernel_size=kernel_size+2)
102
+ marked_grays = torch.where(margin_mask > 1e-5, torch.ones_like(gate_maps), input_grays)
103
+ if base_ABs is None:
104
+ marked_ABs = torch.where(center_mask < 1e-5, torch.zeros_like(target_ABs), target_ABs)
105
+ else:
106
+ marked_ABs = torch.where(margin_mask > 1e-5, torch.zeros_like(base_ABs), base_ABs)
107
+ marked_ABs = torch.where(center_mask > 1e-5, target_ABs, marked_ABs)
108
+ return torch.cat((marked_grays,marked_ABs), dim=1)
109
+
110
+ def dilate_seeds(gate_maps, kernel_size=3):
111
+ N,C,H,W = gate_maps.shape
112
+ input_unf = F.unfold(gate_maps, kernel_size, padding=kernel_size//2)
113
+ #! Notice: differentiable? just like max pooling?
114
+ dilated_seeds, _ = torch.max(input_unf, dim=1, keepdim=True)
115
+ output = F.fold(dilated_seeds, output_size=(H,W), kernel_size=1)
116
+ #print('-------', input_unf.shape)
117
+ return output
118
+
119
+
120
+ class RebalanceLoss(Function):
121
+ @staticmethod
122
+ def forward(ctx, data_input, weights):
123
+ ctx.save_for_backward(weights)
124
+ return data_input.clone()
125
+
126
+ @staticmethod
127
+ def backward(ctx, grad_output):
128
+ weights, = ctx.saved_tensors
129
+ # reweigh gradient pixelwise so that rare colors get a chance to
130
+ # contribute
131
+ grad_input = grad_output * weights
132
+ # second return value is None since we are not interested in the
133
+ # gradient with respect to the weights
134
+ return grad_input, None
135
+
136
+
137
+ class GetClassWeights:
138
+ def __init__(self, cielab, lambda_=0.5, device='cuda'):
139
+ prior = torch.from_numpy(cielab.gamut.prior).cuda()
140
+ uniform = torch.zeros_like(prior)
141
+ uniform[prior > 0] = 1 / (prior > 0).sum().type_as(uniform)
142
+ self.weights = 1 / ((1 - lambda_) * prior + lambda_ * uniform)
143
+ self.weights /= torch.sum(prior * self.weights)
144
+
145
+ def __call__(self, ab_actual):
146
+ return self.weights[ab_actual.argmax(dim=1, keepdim=True)]
147
+
148
+
149
+ class ColorLabel:
150
+ def __init__(self, lambda_=0.5, device='cuda'):
151
+ self.cielab = cielab.CIELAB()
152
+ self.q_to_ab = torch.from_numpy(self.cielab.q_to_ab).to(device)
153
+ prior = torch.from_numpy(self.cielab.gamut.prior).to(device)
154
+ uniform = torch.zeros_like(prior)
155
+ uniform[prior>0] = 1 / (prior>0).sum().type_as(uniform)
156
+ self.weights = 1 / ((1-lambda_) * prior + lambda_ * uniform)
157
+ self.weights /= torch.sum(prior * self.weights)
158
+
159
+ def visualize_label(self, step=3):
160
+ height, width = 200, 313*step
161
+ label_lab = np.ones((height,width,3), np.float32)
162
+ for x in range(313):
163
+ ab = self.cielab.q_to_ab[x,:]
164
+ label_lab[:,step*x:step*(x+1),1:] = ab / 110.
165
+ label_lab[:,:,0] = np.zeros((height,width), np.float32)
166
+ return label_lab
167
+
168
+ @staticmethod
169
+ def _gauss_eval(x, mu, sigma):
170
+ norm = 1 / (2 * math.pi * sigma)
171
+ return norm * torch.exp(-torch.sum((x - mu)**2, dim=0) / (2 * sigma**2))
172
+
173
+ def get_classweights(self, batch_gt_indx):
174
+ #return self.weights[batch_gt_q.argmax(dim=1, keepdim=True)]
175
+ return self.weights[batch_gt_indx]
176
+
177
+ def encode_ab2ind(self, batch_ab, neighbours=5, sigma=5.0):
178
+ batch_ab = batch_ab * 110.
179
+ n, _, h, w = batch_ab.shape
180
+ m = n * h * w
181
+ # find nearest neighbours
182
+ ab_ = batch_ab.permute(1, 0, 2, 3).reshape(2, -1) # (2, n*h*w)
183
+ cdist = torch.cdist(self.q_to_ab, ab_.t())
184
+ nns = cdist.argsort(dim=0)[:neighbours, :]
185
+ # gaussian weighting
186
+ nn_gauss = batch_ab.new_zeros(neighbours, m)
187
+ for i in range(neighbours):
188
+ nn_gauss[i, :] = self._gauss_eval(self.q_to_ab[nns[i, :], :].t(), ab_, sigma)
189
+ nn_gauss /= nn_gauss.sum(dim=0, keepdim=True)
190
+ # expand
191
+ bins = self.cielab.gamut.EXPECTED_SIZE
192
+ q = batch_ab.new_zeros(bins, m)
193
+ q[nns, torch.arange(m).repeat(neighbours, 1)] = nn_gauss
194
+ return q.reshape(bins, n, h, w).permute(1, 0, 2, 3)
195
+
196
+ def decode_ind2ab(self, batch_q, T=0.38):
197
+ _, _, h, w = batch_q.shape
198
+ batch_q = F.softmax(batch_q, dim=1)
199
+ if T%1 == 0:
200
+ # take the T-st probable index
201
+ sorted_probs, batch_indexs = torch.sort(batch_q, dim=1, descending=True)
202
+ #print('checking [index]', batch_indexs[:,0:5,5,5])
203
+ #print('checking [probs]', sorted_probs[:,0:5,5,5])
204
+ batch_indexs = batch_indexs[:,T:T+1,:,:]
205
+ #batch_indexs = torch.where(sorted_probs[:,T:T+1,:,:] > 0.25, batch_indexs[:,T:T+1,:,:], batch_indexs[:,0:1,:,:])
206
+ ab = torch.stack([
207
+ self.q_to_ab.index_select(0, q_i.flatten()).reshape(h,w,2).permute(2,0,1)
208
+ for q_i in batch_indexs])
209
+ else:
210
+ batch_q = torch.exp(batch_q / T)
211
+ batch_q /= batch_q.sum(dim=1, keepdim=True)
212
+ a = torch.tensordot(batch_q, self.q_to_ab[:,0], dims=((1,), (0,)))
213
+ a = a.unsqueeze(dim=1)
214
+ b = torch.tensordot(batch_q, self.q_to_ab[:,1], dims=((1,), (0,)))
215
+ b = b.unsqueeze(dim=1)
216
+ ab = torch.cat((a, b), dim=1)
217
+ ab = ab / 110.
218
+ return ab.type(batch_q.dtype)
219
+
220
+
221
+ def init_spixel_grid(img_height, img_width, spixel_size=16):
222
+ # get spixel id for the final assignment
223
+ n_spixl_h = int(np.floor(img_height/spixel_size))
224
+ n_spixl_w = int(np.floor(img_width/spixel_size))
225
+ spixel_height = int(img_height / (1. * n_spixl_h))
226
+ spixel_width = int(img_width / (1. * n_spixl_w))
227
+ spix_values = np.int32(np.arange(0, n_spixl_w * n_spixl_h).reshape((n_spixl_h, n_spixl_w)))
228
+
229
+ def shift9pos(input, h_shift_unit=1, w_shift_unit=1):
230
+ # input is padded to (c, 1+height+1, 1+width+1)
231
+ input_pd = np.pad(input, ((h_shift_unit, h_shift_unit), (w_shift_unit, w_shift_unit)), mode='edge')
232
+ input_pd = np.expand_dims(input_pd, axis=0)
233
+ # assign to ...
234
+ top = input_pd[:, :-2 * h_shift_unit, w_shift_unit:-w_shift_unit]
235
+ bottom = input_pd[:, 2 * h_shift_unit:, w_shift_unit:-w_shift_unit]
236
+ left = input_pd[:, h_shift_unit:-h_shift_unit, :-2 * w_shift_unit]
237
+ right = input_pd[:, h_shift_unit:-h_shift_unit, 2 * w_shift_unit:]
238
+ center = input_pd[:,h_shift_unit:-h_shift_unit,w_shift_unit:-w_shift_unit]
239
+ bottom_right = input_pd[:, 2 * h_shift_unit:, 2 * w_shift_unit:]
240
+ bottom_left = input_pd[:, 2 * h_shift_unit:, :-2 * w_shift_unit]
241
+ top_right = input_pd[:, :-2 * h_shift_unit, 2 * w_shift_unit:]
242
+ top_left = input_pd[:, :-2 * h_shift_unit, :-2 * w_shift_unit]
243
+ shift_tensor = np.concatenate([ top_left, top, top_right,
244
+ left, center, right,
245
+ bottom_left, bottom, bottom_right], axis=0)
246
+ return shift_tensor
247
+
248
+ spix_idx_tensor_ = shift9pos(spix_values)
249
+ spix_idx_tensor = np.repeat(
250
+ np.repeat(spix_idx_tensor_, spixel_height, axis=1), spixel_width, axis=2)
251
+ spixel_id_tensor = torch.from_numpy(spix_idx_tensor).type(torch.float)
252
+
253
+ #! pixel coord feature maps
254
+ all_h_coords = np.arange(0, img_height, 1)
255
+ all_w_coords = np.arange(0, img_width, 1)
256
+ curr_pxl_coord = np.array(np.meshgrid(all_h_coords, all_w_coords, indexing='ij'))
257
+ coord_feat_tensor = np.concatenate([curr_pxl_coord[1:2, :, :], curr_pxl_coord[:1, :, :]])
258
+ coord_feat_tensor = torch.from_numpy(coord_feat_tensor).type(torch.float)
259
+
260
+ return spixel_id_tensor, coord_feat_tensor
261
+
262
+
263
+ def split_spixels(assign_map, spixel_ids):
264
+ N,C,H,W = assign_map.shape
265
+ spixel_id_map = spixel_ids.expand(N,-1,-1,-1)
266
+ assig_max,_ = torch.max(assign_map, dim=1, keepdim=True)
267
+ assignment_ = torch.where(assign_map == assig_max, torch.ones(assign_map.shape).cuda(),torch.zeros(assign_map.shape).cuda())
268
+ ## winner take all
269
+ new_spixl_map_ = spixel_id_map * assignment_
270
+ new_spixl_map = torch.sum(new_spixl_map_,dim=1,keepdim=True).type(torch.int)
271
+ return new_spixl_map
272
+
273
+
274
+ def poolfeat(input, prob, sp_h=2, sp_w=2, need_entry_prob=False):
275
+ def feat_prob_sum(feat_sum, prob_sum, shift_feat):
276
+ feat_sum += shift_feat[:, :-1, :, :]
277
+ prob_sum += shift_feat[:, -1:, :, :]
278
+ return feat_sum, prob_sum
279
+
280
+ b, _, h, w = input.shape
281
+ h_shift_unit = 1
282
+ w_shift_unit = 1
283
+ p2d = (w_shift_unit, w_shift_unit, h_shift_unit, h_shift_unit)
284
+ feat_ = torch.cat([input, torch.ones([b, 1, h, w], device=input.device)], dim=1) # b* (n+1) *h*w
285
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 0, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
286
+ send_to_top_left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, 2 * w_shift_unit:]
287
+ feat_sum = send_to_top_left[:, :-1, :, :].clone()
288
+ prob_sum = send_to_top_left[:, -1:, :, :].clone()
289
+
290
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 1, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
291
+ top = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, w_shift_unit:-w_shift_unit]
292
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, top)
293
+
294
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 2, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
295
+ top_right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, :-2 * w_shift_unit]
296
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, top_right)
297
+
298
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 3, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
299
+ left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, 2 * w_shift_unit:]
300
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, left)
301
+
302
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 4, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
303
+ center = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, w_shift_unit:-w_shift_unit]
304
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, center)
305
+
306
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 5, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
307
+ right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, :-2 * w_shift_unit]
308
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, right)
309
+
310
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 6, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
311
+ bottom_left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, 2 * w_shift_unit:]
312
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom_left)
313
+
314
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 7, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
315
+ bottom = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, w_shift_unit:-w_shift_unit]
316
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom)
317
+
318
+ prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 8, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w)) # b * (n+1) * h* w
319
+ bottom_right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, :-2 * w_shift_unit]
320
+ feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom_right)
321
+ pooled_feat = feat_sum / (prob_sum + 1e-8)
322
+ if need_entry_prob:
323
+ return pooled_feat, prob_sum
324
+ return pooled_feat
325
+
326
+
327
+ def get_spixel_size(affinity_map, sp_h=2, sp_w=2, elem_thres=25):
328
+ N,C,H,W = affinity_map.shape
329
+ device = affinity_map.device
330
+ assign_max,_ = torch.max(affinity_map, dim=1, keepdim=True)
331
+ assign_map = torch.where(affinity_map==assign_max, torch.ones(affinity_map.shape, device=device), torch.zeros(affinity_map.shape, device=device))
332
+ ## one_map = (N,1,H,W)
333
+ _, elem_num_maps = poolfeat(torch.ones(assign_max.shape, device=device), assign_map, sp_h, sp_w, True)
334
+ #all_one_map = torch.ones(elem_num_maps.shape).cuda()
335
+ #empty_mask = torch.where(elem_num_maps < elem_thres/256, all_one_map, 1-all_one_map)
336
+ return elem_num_maps
337
+
338
+
339
+ def upfeat(input, prob, up_h=2, up_w=2):
340
+ # input b*n*H*W downsampled
341
+ # prob b*9*h*w
342
+ b, c, h, w = input.shape
343
+
344
+ h_shift = 1
345
+ w_shift = 1
346
+
347
+ p2d = (w_shift, w_shift, h_shift, h_shift)
348
+ feat_pd = F.pad(input, p2d, mode='constant', value=0)
349
+
350
+ gt_frm_top_left = F.interpolate(feat_pd[:, :, :-2 * h_shift, :-2 * w_shift], size=(h * up_h, w * up_w),mode='nearest')
351
+ feat_sum = gt_frm_top_left * prob.narrow(1,0,1)
352
+
353
+ top = F.interpolate(feat_pd[:, :, :-2 * h_shift, w_shift:-w_shift], size=(h * up_h, w * up_w), mode='nearest')
354
+ feat_sum += top * prob.narrow(1, 1, 1)
355
+
356
+ top_right = F.interpolate(feat_pd[:, :, :-2 * h_shift, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
357
+ feat_sum += top_right * prob.narrow(1,2,1)
358
+
359
+ left = F.interpolate(feat_pd[:, :, h_shift:-w_shift, :-2 * w_shift], size=(h * up_h, w * up_w), mode='nearest')
360
+ feat_sum += left * prob.narrow(1, 3, 1)
361
+
362
+ center = F.interpolate(input, (h * up_h, w * up_w), mode='nearest')
363
+ feat_sum += center * prob.narrow(1, 4, 1)
364
+
365
+ right = F.interpolate(feat_pd[:, :, h_shift:-w_shift, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
366
+ feat_sum += right * prob.narrow(1, 5, 1)
367
+
368
+ bottom_left = F.interpolate(feat_pd[:, :, 2 * h_shift:, :-2 * w_shift], size=(h * up_h, w * up_w), mode='nearest')
369
+ feat_sum += bottom_left * prob.narrow(1, 6, 1)
370
+
371
+ bottom = F.interpolate(feat_pd[:, :, 2 * h_shift:, w_shift:-w_shift], size=(h * up_h, w * up_w), mode='nearest')
372
+ feat_sum += bottom * prob.narrow(1, 7, 1)
373
+
374
+ bottom_right = F.interpolate(feat_pd[:, :, 2 * h_shift:, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
375
+ feat_sum += bottom_right * prob.narrow(1, 8, 1)
376
+
377
+ return feat_sum
378
+
379
+
380
+ def suck_and_spread(self, base_maps, seg_layers):
381
+ N,S,H,W = seg_layers.shape
382
+ base_maps = base_maps.unsqueeze(1)
383
+ seg_layers = seg_layers.unsqueeze(2)
384
+ ## (N,S,C,1,1) = (N,1,C,H,W) * (N,S,1,H,W)
385
+ mean_val_layers = (base_maps * seg_layers).sum(dim=(3,4), keepdim=True) / (1e-5 + seg_layers.sum(dim=(3,4), keepdim=True))
386
+ ## normalized to be sum one
387
+ weight_layers = seg_layers / (1e-5 + torch.sum(seg_layers, dim=1, keepdim=True))
388
+ ## (N,S,C,H,W) = (N,S,C,1,1) * (N,S,1,H,W)
389
+ recon_maps = mean_val_layers * weight_layers
390
+ return recon_maps.sum(dim=1)
391
+
392
+
393
+ #! copy from Richard Zhang [SIGGRAPH2017]
394
+ # RGB grid points maps to Lab range: L[0,100], a[-86.183,98,233], b[-107.857,94.478]
395
+ #------------------------------------------------------------------------------
396
+ def rgb2xyz(rgb): # rgb from [0,1]
397
+ # xyz_from_rgb = np.array([[0.412453, 0.357580, 0.180423],
398
+ # [0.212671, 0.715160, 0.072169],
399
+ # [0.019334, 0.119193, 0.950227]])
400
+ mask = (rgb > .04045).type(torch.FloatTensor)
401
+ if(rgb.is_cuda):
402
+ mask = mask.cuda()
403
+ rgb = (((rgb+.055)/1.055)**2.4)*mask + rgb/12.92*(1-mask)
404
+ x = .412453*rgb[:,0,:,:]+.357580*rgb[:,1,:,:]+.180423*rgb[:,2,:,:]
405
+ y = .212671*rgb[:,0,:,:]+.715160*rgb[:,1,:,:]+.072169*rgb[:,2,:,:]
406
+ z = .019334*rgb[:,0,:,:]+.119193*rgb[:,1,:,:]+.950227*rgb[:,2,:,:]
407
+ out = torch.cat((x[:,None,:,:],y[:,None,:,:],z[:,None,:,:]),dim=1)
408
+ return out
409
+
410
+ def xyz2rgb(xyz):
411
+ # array([[ 3.24048134, -1.53715152, -0.49853633],
412
+ # [-0.96925495, 1.87599 , 0.04155593],
413
+ # [ 0.05564664, -0.20404134, 1.05731107]])
414
+ r = 3.24048134*xyz[:,0,:,:]-1.53715152*xyz[:,1,:,:]-0.49853633*xyz[:,2,:,:]
415
+ g = -0.96925495*xyz[:,0,:,:]+1.87599*xyz[:,1,:,:]+.04155593*xyz[:,2,:,:]
416
+ b = .05564664*xyz[:,0,:,:]-.20404134*xyz[:,1,:,:]+1.05731107*xyz[:,2,:,:]
417
+ rgb = torch.cat((r[:,None,:,:],g[:,None,:,:],b[:,None,:,:]),dim=1)
418
+ #! sometimes reaches a small negative number, which causes NaNs
419
+ rgb = torch.max(rgb,torch.zeros_like(rgb))
420
+ mask = (rgb > .0031308).type(torch.FloatTensor)
421
+ if(rgb.is_cuda):
422
+ mask = mask.cuda()
423
+ rgb = (1.055*(rgb**(1./2.4)) - 0.055)*mask + 12.92*rgb*(1-mask)
424
+ return rgb
425
+
426
+ def xyz2lab(xyz):
427
+ # 0.95047, 1., 1.08883 # white
428
+ sc = torch.Tensor((0.95047, 1., 1.08883))[None,:,None,None]
429
+ if(xyz.is_cuda):
430
+ sc = sc.cuda()
431
+ xyz_scale = xyz/sc
432
+ mask = (xyz_scale > .008856).type(torch.FloatTensor)
433
+ if(xyz_scale.is_cuda):
434
+ mask = mask.cuda()
435
+ xyz_int = xyz_scale**(1/3.)*mask + (7.787*xyz_scale + 16./116.)*(1-mask)
436
+ L = 116.*xyz_int[:,1,:,:]-16.
437
+ a = 500.*(xyz_int[:,0,:,:]-xyz_int[:,1,:,:])
438
+ b = 200.*(xyz_int[:,1,:,:]-xyz_int[:,2,:,:])
439
+ out = torch.cat((L[:,None,:,:],a[:,None,:,:],b[:,None,:,:]),dim=1)
440
+ return out
441
+
442
+ def lab2xyz(lab):
443
+ y_int = (lab[:,0,:,:]+16.)/116.
444
+ x_int = (lab[:,1,:,:]/500.) + y_int
445
+ z_int = y_int - (lab[:,2,:,:]/200.)
446
+ if(z_int.is_cuda):
447
+ z_int = torch.max(torch.Tensor((0,)).cuda(), z_int)
448
+ else:
449
+ z_int = torch.max(torch.Tensor((0,)), z_int)
450
+ out = torch.cat((x_int[:,None,:,:],y_int[:,None,:,:],z_int[:,None,:,:]),dim=1)
451
+ mask = (out > .2068966).type(torch.FloatTensor)
452
+ if(out.is_cuda):
453
+ mask = mask.cuda()
454
+ out = (out**3.)*mask + (out - 16./116.)/7.787*(1-mask)
455
+ sc = torch.Tensor((0.95047, 1., 1.08883))[None,:,None,None]
456
+ sc = sc.to(out.device)
457
+ out = out*sc
458
+ return out
459
+
460
+ def rgb2lab(rgb, l_mean=50, l_norm=50, ab_norm=110):
461
+ #! input rgb: [0,1]
462
+ #! output lab: [-1,1]
463
+ lab = xyz2lab(rgb2xyz(rgb))
464
+ l_rs = (lab[:,[0],:,:]-l_mean) / l_norm
465
+ ab_rs = lab[:,1:,:,:] / ab_norm
466
+ out = torch.cat((l_rs,ab_rs),dim=1)
467
+ return out
468
+
469
+ def lab2rgb(lab_rs, l_mean=50, l_norm=50, ab_norm=110):
470
+ #! input lab: [-1,1]
471
+ #! output rgb: [0,1]
472
+ l_ = lab_rs[:,[0],:,:] * l_norm + l_mean
473
+ ab = lab_rs[:,1:,:,:] * ab_norm
474
+ lab = torch.cat((l_,ab), dim=1)
475
+ out = xyz2rgb(lab2xyz(lab))
476
+ return out
477
+
478
+
479
+ if __name__ == '__main__':
480
+ minL, minA, minB = 999., 999., 999.
481
+ maxL, maxA, maxB = 0., 0., 0.
482
+ for r in range(256):
483
+ print('h',r)
484
+ for g in range(256):
485
+ for b in range(256):
486
+ rgb = np.array([r,g,b], np.float32).reshape(1,1,-1) / 255.0
487
+ #lab_img = cv2.cvtColor(rgb, cv2.COLOR_RGB2LAB)
488
+ rgb = torch.from_numpy(rgb.transpose((2, 0, 1)))
489
+ rgb = rgb.reshape(1,3,1,1)
490
+ lab = rgb2lab(rgb)
491
+ lab[:,[0],:,:] = lab[:,[0],:,:] * 50 + 50
492
+ lab[:,1:,:,:] = lab[:,1:,:,:] * 110
493
+ lab = lab.squeeze()
494
+ lab_float = lab.numpy()
495
+ #print('zhang vs. cv2:', lab_float, lab_img.squeeze())
496
+ minL = min(lab_float[0], minL)
497
+ minA = min(lab_float[1], minA)
498
+ minB = min(lab_float[2], minB)
499
+ maxL = max(lab_float[0], maxL)
500
+ maxA = max(lab_float[1], maxA)
501
+ maxB = max(lab_float[2], maxB)
502
+ print('L:', minL, maxL)
503
+ print('A:', minA, maxA)
504
+ print('B:', minB, maxB)
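As a rough sanity check of the color-space helpers above, a round trip through rgb2lab/lab2rgb should approximately reproduce the input. A small sketch (assuming cv2 and the utils package are importable so models.basic loads):

import torch
from models import basic

rgb = torch.rand(1, 3, 64, 64)        # RGB batch in [0,1]
lab = basic.rgb2lab(rgb)              # L and ab channels normalized to roughly [-1,1]
rgb_back = basic.lab2rgb(lab)
print((rgb - rgb_back).abs().max())   # expected to be small, apart from clipping of out-of-gamut values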
models/clusterkit.py ADDED
@@ -0,0 +1,291 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from functools import partial
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+ import math, random
10
+ #from sklearn.cluster import KMeans, kmeans_plusplus, MeanShift, estimate_bandwidth
11
+
12
+
13
+ def tensor_kmeans_sklearn(data_vecs, n_clusters=7, metric='euclidean', need_layer_masks=False, max_iters=20):
14
+ N,C,H,W = data_vecs.shape
15
+ assert N == 1, 'only supports a single image tensor'
16
+ ## (1,C,H,W) -> (HW,C)
17
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
18
+ ## convert tensor to array
19
+ data_vecs_np = data_vecs.squeeze().detach().to("cpu").numpy()
20
+ km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300)
21
+ pred = km.fit_predict(data_vecs_np)
22
+ cluster_ids_x = torch.from_numpy(km.labels_).to(data_vecs.device)
23
+ id_maps = cluster_ids_x.reshape(1,1,H,W).long()
24
+ if need_layer_masks:
25
+ one_hot_labels = F.one_hot(id_maps.squeeze(1), num_classes=n_clusters).float()
26
+ cluster_mask = one_hot_labels.permute(0,3,1,2)
27
+ return cluster_mask
28
+ return id_maps
29
+
30
+
31
+ def tensor_kmeans_pytorch(data_vecs, n_clusters=7, metric='euclidean', need_layer_masks=False, max_iters=20):
32
+ N,C,H,W = data_vecs.shape
33
+ assert N == 1, 'only supports a single image tensor'
34
+
35
+ ## (1,C,H,W) -> (HW,C)
36
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
37
+ ## cosine | euclidean
38
+ #cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric, device=data_vecs.device)
39
+ cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric,\
40
+ tqdm_flag=False, iter_limit=max_iters, device=data_vecs.device)
41
+ id_maps = cluster_ids_x.reshape(1,1,H,W)
42
+ if need_layer_masks:
43
+ one_hot_labels = F.one_hot(id_maps.squeeze(1), num_classes=n_clusters).float()
44
+ cluster_mask = one_hot_labels.permute(0,3,1,2)
45
+ return cluster_mask
46
+ return id_maps
47
+
48
+
49
+ def batch_kmeans_pytorch(data_vecs, n_clusters=7, metric='euclidean', use_sklearn_kmeans=False):
50
+ N,C,H,W = data_vecs.shape
51
+ sample_list = []
52
+ for idx in range(N):
53
+ if use_sklearn_kmeans:
54
+ cluster_mask = tensor_kmeans_sklearn(data_vecs[idx:idx+1,:,:,:], n_clusters, metric, True)
55
+ else:
56
+ cluster_mask = tensor_kmeans_pytorch(data_vecs[idx:idx+1,:,:,:], n_clusters, metric, True)
57
+ sample_list.append(cluster_mask)
58
+ return torch.cat(sample_list, dim=0)
59
+
60
+
61
+ def get_centroid_candidates(data_vecs, n_clusters=7, metric='euclidean', max_iters=20):
62
+ N,C,H,W = data_vecs.shape
63
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
64
+ cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric,\
65
+ tqdm_flag=False, iter_limit=max_iters, device=data_vecs.device)
66
+ return cluster_centers
67
+
68
+
69
+ def find_distinctive_elements(data_tensor, n_clusters=7, topk=3, metric='euclidean'):
70
+ N,C,H,W = data_tensor.shape
71
+ centroid_list = []
72
+ for idx in range(N):
73
+ cluster_centers = get_centroid_candidates(data_tensor[idx:idx+1,:,:,:], n_clusters, metric)
74
+ centroid_list.append(cluster_centers)
75
+
76
+ batch_centroids = torch.stack(centroid_list, dim=0)
77
+ data_vecs = data_tensor.flatten(2)
78
+ ## distance matrix: (N,K,HW) = (N,K,C) x (N,C,HW)
79
+ AtB = torch.matmul(batch_centroids, data_vecs)
80
+ AtA = torch.matmul(batch_centroids, batch_centroids.permute(0,2,1))
81
+ BtB = torch.matmul(data_vecs.permute(0,2,1), data_vecs)
82
+ diag_A = torch.diagonal(AtA, dim1=-2, dim2=-1)
83
+ diag_B = torch.diagonal(BtB, dim1=-2, dim2=-1)
84
+ A2 = diag_A.unsqueeze(2).repeat(1,1,H*W)
85
+ B2 = diag_B.unsqueeze(1).repeat(1,n_clusters,1)
86
+ distance_map = A2 - 2*AtB + B2
87
+ values, indices = distance_map.topk(topk, dim=2, largest=False, sorted=True)
88
+ cluster_mask = torch.where(distance_map <= values[:,:,topk-1:], torch.ones_like(distance_map), torch.zeros_like(distance_map))
89
+ cluster_mask = cluster_mask.view(N,n_clusters,H,W)
90
+ return cluster_mask
91
+
92
+
93
+ ##---------------------------------------------------------------------------------
94
+ '''
95
+ resource from github: https://github.com/subhadarship/kmeans_pytorch
96
+ '''
97
+ ##---------------------------------------------------------------------------------
98
+
99
+ def initialize(X, num_clusters):
100
+ """
101
+ initialize cluster centers
102
+ :param X: (torch.tensor) matrix
103
+ :param num_clusters: (int) number of clusters
104
+ :return: (np.array) initial state
105
+ """
106
+ np.random.seed(1)
107
+ num_samples = len(X)
108
+ indices = np.random.choice(num_samples, num_clusters, replace=False)
109
+ initial_state = X[indices]
110
+ return initial_state
111
+
112
+
113
+ def kmeans(
114
+ X,
115
+ num_clusters,
116
+ distance='euclidean',
117
+ cluster_centers=[],
118
+ tol=1e-4,
119
+ tqdm_flag=True,
120
+ iter_limit=0,
121
+ device=torch.device('cpu'),
122
+ gamma_for_soft_dtw=0.001
123
+ ):
124
+ """
125
+ perform kmeans
126
+ :param X: (torch.tensor) matrix
127
+ :param num_clusters: (int) number of clusters
128
+ :param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
129
+ :param tol: (float) threshold [default: 0.0001]
130
+ :param device: (torch.device) device [default: cpu]
131
+ :param tqdm_flag: Allows to turn logs on and off
132
+ :param iter_limit: hard limit for max number of iterations
133
+ :param gamma_for_soft_dtw: approaches to (hard) DTW as gamma -> 0
134
+ :return: (torch.tensor, torch.tensor) cluster ids, cluster centers
135
+ """
136
+ if tqdm_flag:
137
+ print(f'running k-means on {device}..')
138
+
139
+ if distance == 'euclidean':
140
+ pairwise_distance_function = partial(pairwise_distance, device=device, tqdm_flag=tqdm_flag)
141
+ elif distance == 'cosine':
142
+ pairwise_distance_function = partial(pairwise_cosine, device=device)
143
+ else:
144
+ raise NotImplementedError
145
+
146
+ # convert to float
147
+ X = X.float()
148
+
149
+ # transfer to device
150
+ X = X.to(device)
151
+
152
+ # initialize
153
+ if type(cluster_centers) == list: # ToDo: make this less annoyingly weird
154
+ initial_state = initialize(X, num_clusters)
155
+ else:
156
+ if tqdm_flag:
157
+ print('resuming')
158
+ # find data point closest to the initial cluster center
159
+ initial_state = cluster_centers
160
+ dis = pairwise_distance_function(X, initial_state)
161
+ choice_points = torch.argmin(dis, dim=0)
162
+ initial_state = X[choice_points]
163
+ initial_state = initial_state.to(device)
164
+
165
+ iteration = 0
166
+ if tqdm_flag:
167
+ tqdm_meter = tqdm(desc='[running kmeans]')
168
+ while True:
169
+
170
+ dis = pairwise_distance_function(X, initial_state)
171
+
172
+ choice_cluster = torch.argmin(dis, dim=1)
173
+
174
+ initial_state_pre = initial_state.clone()
175
+
176
+ for index in range(num_clusters):
177
+ selected = torch.nonzero(choice_cluster == index).squeeze().to(device)
178
+
179
+ selected = torch.index_select(X, 0, selected)
180
+
181
+ # https://github.com/subhadarship/kmeans_pytorch/issues/16
182
+ if selected.shape[0] == 0:
183
+ selected = X[torch.randint(len(X), (1,))]
184
+
185
+ initial_state[index] = selected.mean(dim=0)
186
+
187
+ center_shift = torch.sum(
188
+ torch.sqrt(
189
+ torch.sum((initial_state - initial_state_pre) ** 2, dim=1)
190
+ ))
191
+
192
+ # increment iteration
193
+ iteration = iteration + 1
194
+
195
+ # update tqdm meter
196
+ if tqdm_flag:
197
+ tqdm_meter.set_postfix(
198
+ iteration=f'{iteration}',
199
+ center_shift=f'{center_shift ** 2:0.6f}',
200
+ tol=f'{tol:0.6f}'
201
+ )
202
+ tqdm_meter.update()
203
+ if center_shift ** 2 < tol:
204
+ break
205
+ if iter_limit != 0 and iteration >= iter_limit:
206
+ #print('hello, there!')
207
+ break
208
+
209
+ return choice_cluster.to(device), initial_state.to(device)
210
+
211
+
212
+ def kmeans_predict(
213
+ X,
214
+ cluster_centers,
215
+ distance='euclidean',
216
+ device=torch.device('cpu'),
217
+ gamma_for_soft_dtw=0.001,
218
+ tqdm_flag=True
219
+ ):
220
+ """
221
+ predict using cluster centers
222
+ :param X: (torch.tensor) matrix
223
+ :param cluster_centers: (torch.tensor) cluster centers
224
+ :param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
225
+ :param device: (torch.device) device [default: 'cpu']
226
+ :param gamma_for_soft_dtw: approaches to (hard) DTW as gamma -> 0
227
+ :return: (torch.tensor) cluster ids
228
+ """
229
+ if tqdm_flag:
230
+ print(f'predicting on {device}..')
231
+
232
+ if distance == 'euclidean':
233
+ pairwise_distance_function = partial(pairwise_distance, device=device, tqdm_flag=tqdm_flag)
234
+ elif distance == 'cosine':
235
+ pairwise_distance_function = partial(pairwise_cosine, device=device)
236
+ elif distance == 'soft_dtw':
237
+ sdtw = SoftDTW(use_cuda=device.type == 'cuda', gamma=gamma_for_soft_dtw)
238
+ pairwise_distance_function = partial(pairwise_soft_dtw, sdtw=sdtw, device=device)
239
+ else:
240
+ raise NotImplementedError
241
+
242
+ # convert to float
243
+ X = X.float()
244
+
245
+ # transfer to device
246
+ X = X.to(device)
247
+
248
+ dis = pairwise_distance_function(X, cluster_centers)
249
+ choice_cluster = torch.argmin(dis, dim=1)
250
+
251
+ return choice_cluster.cpu()
252
+
253
+
254
+ def pairwise_distance(data1, data2, device=torch.device('cpu'), tqdm_flag=True):
255
+ if tqdm_flag:
256
+ print(f'device is :{device}')
257
+
258
+ # transfer to device
259
+ data1, data2 = data1.to(device), data2.to(device)
260
+
261
+ # N*1*M
262
+ A = data1.unsqueeze(dim=1)
263
+
264
+ # 1*N*M
265
+ B = data2.unsqueeze(dim=0)
266
+
267
+ dis = (A - B) ** 2.0
268
+ # return N*N matrix for pairwise distance
269
+ dis = dis.sum(dim=-1).squeeze()
270
+ return dis
271
+
272
+
273
+ def pairwise_cosine(data1, data2, device=torch.device('cpu')):
274
+ # transfer to device
275
+ data1, data2 = data1.to(device), data2.to(device)
276
+
277
+ # N*1*M
278
+ A = data1.unsqueeze(dim=1)
279
+
280
+ # 1*N*M
281
+ B = data2.unsqueeze(dim=0)
282
+
283
+ # normalize the points | [0.3, 0.4] -> [0.3/sqrt(0.09 + 0.16), 0.4/sqrt(0.09 + 0.16)] = [0.3/0.5, 0.4/0.5]
284
+ A_normalized = A / A.norm(dim=-1, keepdim=True)
285
+ B_normalized = B / B.norm(dim=-1, keepdim=True)
286
+
287
+ cosine = A_normalized * B_normalized
288
+
289
+ # return N*N matrix for pairwise distance
290
+ cosine_dis = 1 - cosine.sum(dim=-1).squeeze()
291
+ return cosine_dis
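A small sketch of the k-means helpers above on random features. The sklearn path needs the commented-out `from sklearn.cluster import KMeans`; the pure-PyTorch path is used here (shapes are illustrative assumptions):

import torch
from models import clusterkit

feats = torch.randn(4, 64, 16, 16)    # (N,C,H,W) feature maps
masks = clusterkit.batch_kmeans_pytorch(feats, n_clusters=7, metric='euclidean')
print(masks.shape)                    # torch.Size([4, 7, 16, 16]), one cluster layer per channel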
models/loss.py ADDED
@@ -0,0 +1,222 @@
1
+ from __future__ import division
2
+ import os, glob, shutil, math, random, json
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torchvision
7
+ from models import basic
8
+ from utils import util
9
+
10
+ eps = 0.0000001
11
+
12
+ class SPixelLoss:
13
+ def __init__(self, psize=8, mpdist=False, gpu_no=0):
14
+ self.mpdist = mpdist
15
+ self.gpu_no = gpu_no
16
+ self.sp_size = psize
17
+
18
+ def __call__(self, data, epoch_no):
19
+ kernel_size = self.sp_size
20
+ #pos_weight = 0.003
21
+ prob = data['pred_prob']
22
+ labxy_feat = data['target_feat']
23
+ N,C,H,W = labxy_feat.shape
24
+ pooled_labxy = basic.poolfeat(labxy_feat, prob, kernel_size, kernel_size)
25
+ reconstr_feat = basic.upfeat(pooled_labxy, prob, kernel_size, kernel_size)
26
+ loss_map = reconstr_feat[:,:,:,:] - labxy_feat[:,:,:,:]
27
+ featLoss_idx = torch.norm(loss_map[:,:-2,:,:], p=2, dim=1).mean()
28
+ posLoss_idx = torch.norm(loss_map[:,-2:,:,:], p=2, dim=1).mean() / kernel_size
29
+ totalLoss_idx = 10*featLoss_idx + 0.003*posLoss_idx
30
+ return {'totalLoss':totalLoss_idx, 'featLoss':featLoss_idx, 'posLoss':posLoss_idx}
31
+
32
+
33
+ class AnchorColorProbLoss:
34
+ def __init__(self, hint2regress=False, enhanced=False, with_grad=False, mpdist=False, gpu_no=0):
35
+ self.mpdist = mpdist
36
+ self.gpu_no = gpu_no
37
+ self.hint2regress = hint2regress
38
+ self.enhanced = enhanced
39
+ self.with_grad = with_grad
40
+ self.rebalance_gradient = basic.RebalanceLoss.apply
41
+ self.entropy_loss = nn.CrossEntropyLoss(ignore_index=-1)
42
+ if self.enhanced:
43
+ self.VGGLoss = VGG19Loss(gpu_no=gpu_no, is_ddp=mpdist)
44
+
45
+ def _perceptual_loss(self, input_grays, input_colors, pred_colors):
46
+ input_RGBs = basic.lab2rgb(torch.cat([input_grays,input_colors], dim=1))
47
+ pred_RGBs = basic.lab2rgb(torch.cat([input_grays,pred_colors], dim=1))
48
+ ## the output of "lab2rgb" just matches the input of "VGGLoss": [0,1]
49
+ return self.VGGLoss(input_RGBs, pred_RGBs)
50
+
51
+ def _laplace_gradient(self, pred_AB, target_AB):
52
+ N,C,H,W = pred_AB.shape
53
+ kernel = torch.tensor([[1, 1, 1], [1, -8, 1], [1, 1, 1]], device=pred_AB.get_device()).float()
54
+ kernel = kernel.view(1, 1, *kernel.size()).repeat(C,1,1,1)
55
+ grad_pred = F.conv2d(pred_AB, kernel, groups=C)
56
+ grad_trg = F.conv2d(target_AB, kernel, groups=C)
57
+ return l1_loss(grad_trg, grad_pred)
58
+
59
+ def __call__(self, data, epoch_no):
60
+ N,C,H,W = data['target_label'].shape
61
+ pal_probs = self.rebalance_gradient(data['pal_prob'], data['class_weight'])
62
+ #ref_probs = data['ref_prob']
63
+ pal_probs = pal_probs.permute(0,2,3,1).contiguous().view(N*H*W, -1)
64
+ gt_labels = data['target_label'].permute(0,2,3,1).contiguous().view(N*H*W, -1)
65
+ '''
66
+ igored_mask = data['empty_entries'].permute(0,2,3,1).contiguous().view(N*H*W, -1)
67
+ gt_labels[igored_mask] = -1
68
+ gt_labels = gt_probs.squeeze()
69
+ '''
70
+ palLoss_idx = self.entropy_loss(pal_probs, gt_labels.squeeze(dim=1))
71
+ if self.hint2regress:
72
+ ref_probs = data['ref_prob']
73
+ refLoss_idx = 50 * l2_loss(data['spix_color'], ref_probs)
74
+ else:
75
+ ref_probs = self.rebalance_gradient(data['ref_prob'], data['class_weight'])
76
+ ref_probs = ref_probs.permute(0,2,3,1).contiguous().view(N*H*W, -1)
77
+ refLoss_idx = self.entropy_loss(ref_probs, gt_labels.squeeze(dim=1))
78
+ reconLoss_idx = torch.zeros_like(palLoss_idx)
79
+ if self.enhanced:
80
+ scalar = 1.0 if self.hint2regress else 5.0
81
+ reconLoss_idx = scalar * self._perceptual_loss(data['input_gray'], data['pred_color'], data['input_color'])
82
+ if self.with_grad:
83
+ gradient_loss = self._laplace_gradient(data['pred_color'], data['input_color'])
84
+ reconLoss_idx += gradient_loss
85
+ totalLoss_idx = palLoss_idx + refLoss_idx + reconLoss_idx
86
+ #print("loss terms:", palLoss_idx.item(), refLoss_idx.item(), reconLoss_idx.item())
87
+ return {'totalLoss':totalLoss_idx, 'palLoss':palLoss_idx, 'refLoss':refLoss_idx, 'recLoss':reconLoss_idx}
88
+
89
+
90
+ def compute_affinity_pos_loss(prob_in, labxy_feat, pos_weight=0.003, kernel_size=16):
91
+ S = kernel_size
92
+ m = pos_weight
93
+ prob = prob_in.clone()
94
+ N,C,H,W = labxy_feat.shape
95
+ pooled_labxy = basic.poolfeat(labxy_feat, prob, kernel_size, kernel_size)
96
+ reconstr_feat = basic.upfeat(pooled_labxy, prob, kernel_size, kernel_size)
97
+ loss_map = reconstr_feat[:,:,:,:] - labxy_feat[:,:,:,:]
98
+ loss_feat = torch.norm(loss_map[:,:-2,:,:], p=2, dim=1).mean()
99
+ loss_pos = torch.norm(loss_map[:,-2:,:,:], p=2, dim=1).mean() * m / S
100
+ loss_affinity = loss_feat + loss_pos
101
+ return loss_affinity
102
+
103
+
104
+ def l2_loss(y_input, y_target, weight_map=None):
105
+ if weight_map is None:
106
+ return F.mse_loss(y_input, y_target)
107
+ else:
108
+ diff_map = torch.mean(torch.abs(y_input-y_target), dim=1, keepdim=True)
109
+ batch_dev = torch.sum(diff_map*diff_map*weight_map, dim=(1,2,3)) / (eps+torch.sum(weight_map, dim=(1,2,3)))
110
+ return batch_dev.mean()
111
+
112
+
113
+ def l1_loss(y_input, y_target, weight_map=None):
114
+ if weight_map is None:
115
+ return F.l1_loss(y_input, y_target)
116
+ else:
117
+ diff_map = torch.mean(torch.abs(y_input-y_target), dim=1, keepdim=True)
118
+ batch_dev = torch.sum(diff_map*weight_map, dim=(1,2,3)) / (eps+torch.sum(weight_map, dim=(1,2,3)))
119
+ return batch_dev.mean()
120
+
121
+
122
+ def masked_l1_loss(y_input, y_target, outlier_mask):
123
+ one = torch.tensor([1.0]).cuda(y_input.get_device())
124
+ weight_map = torch.where(outlier_mask, one * 0.0, one * 1.0)
125
+ return l1_loss(y_input, y_target, weight_map)
126
+
127
+
128
+ def huber_loss(y_input, y_target, delta=0.01):
129
+ mask = torch.zeros_like(y_input)
130
+ mann = torch.abs(y_input - y_target)
131
+ eucl = 0.5 * (mann**2)
132
+ mask[...] = mann < delta
133
+ loss = eucl * mask / delta + (mann - 0.5 * delta) * (1 - mask)
134
+ return torch.mean(loss)
135
+
136
+
137
+ ## Perceptual loss that uses a pretrained VGG network
138
+ class VGG19Loss(nn.Module):
139
+ def __init__(self, feat_type='liu', gpu_no=0, is_ddp=False, requires_grad=False):
140
+ super(VGG19Loss, self).__init__()
141
+ os.environ['TORCH_HOME'] = '/apdcephfs/share_1290939/richardxia/Saved/Checkpoints/VGG19'
142
+ ## data requirement: (N,C,H,W) in RGB format, [0,1] range, and resolution >= 224x224
143
+ self.mean = [0.485, 0.456, 0.406]
144
+ self.std = [0.229, 0.224, 0.225]
145
+ self.feat_type = feat_type
146
+
147
+ vgg_model = torchvision.models.vgg19(pretrained=True)
148
+ ## AssertionError: DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient
149
+ '''
150
+ if is_ddp:
151
+ vgg_model = vgg_model.cuda(gpu_no)
152
+ vgg_model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(vgg_model)
153
+ vgg_model = torch.nn.parallel.DistributedDataParallel(vgg_model, device_ids=[gpu_no], find_unused_parameters=True)
154
+ else:
155
+ vgg_model = vgg_model.cuda(gpu_no)
156
+ '''
157
+ vgg_model = vgg_model.cuda(gpu_no)
158
+ if self.feat_type == 'liu':
159
+ ## conv1_1, conv2_1, conv3_1, conv4_1, conv5_1
160
+ self.slice1 = nn.Sequential(*list(vgg_model.features)[:2]).eval()
161
+ self.slice2 = nn.Sequential(*list(vgg_model.features)[2:7]).eval()
162
+ self.slice3 = nn.Sequential(*list(vgg_model.features)[7:12]).eval()
163
+ self.slice4 = nn.Sequential(*list(vgg_model.features)[12:21]).eval()
164
+ self.slice5 = nn.Sequential(*list(vgg_model.features)[21:30]).eval()
165
+ self.weights = [1.0/32, 1.0/16, 1.0/8, 1.0/4, 1.0]
166
+ elif self.feat_type == 'lei':
167
+ ## conv1_2, conv2_2, conv3_2, conv4_2, conv5_2
168
+ self.slice1 = nn.Sequential(*list(vgg_model.features)[:4]).eval()
169
+ self.slice2 = nn.Sequential(*list(vgg_model.features)[4:9]).eval()
170
+ self.slice3 = nn.Sequential(*list(vgg_model.features)[9:14]).eval()
171
+ self.slice4 = nn.Sequential(*list(vgg_model.features)[14:23]).eval()
172
+ self.slice5 = nn.Sequential(*list(vgg_model.features)[23:32]).eval()
173
+ self.weights = [1.0/2.6, 1.0/4.8, 1.0/3.7, 1.0/5.6, 10.0/1.5]
174
+ else:
175
+ ## maxpool after conv4_4
176
+ self.featureExactor = nn.Sequential(*list(vgg_model.features)[:28]).eval()
177
+ '''
178
+ for x in range(2):
179
+ self.slice1.add_module(str(x), pretrained_features[x])
180
+ for x in range(2, 7):
181
+ self.slice2.add_module(str(x), pretrained_features[x])
182
+ for x in range(7, 12):
183
+ self.slice3.add_module(str(x), pretrained_features[x])
184
+ for x in range(12, 21):
185
+ self.slice4.add_module(str(x), pretrained_features[x])
186
+ for x in range(21, 30):
187
+ self.slice5.add_module(str(x), pretrained_features[x])
188
+ '''
189
+ self.criterion = nn.L1Loss()
190
+
191
+ ## fixed parameters
192
+ if not requires_grad:
193
+ for param in self.parameters():
194
+ param.requires_grad = False
195
+ self.eval()
196
+ print('[*] VGG19Loss init!')
197
+
198
+ def normalize(self, tensor):
199
+ tensor = tensor.clone()
200
+ mean = torch.as_tensor(self.mean, dtype=torch.float32, device=tensor.device)
201
+ std = torch.as_tensor(self.std, dtype=torch.float32, device=tensor.device)
202
+ tensor.sub_(mean[None, :, None, None]).div_(std[None, :, None, None])
203
+ return tensor
204
+
205
+ def forward(self, x, y):
206
+ norm_x, norm_y = self.normalize(x), self.normalize(y)
207
+ ## feature extract
208
+ if self.feat_type == 'liu' or self.feat_type == 'lei':
209
+ x_relu1, y_relu1 = self.slice1(norm_x), self.slice1(norm_y)
210
+ x_relu2, y_relu2 = self.slice2(x_relu1), self.slice2(y_relu1)
211
+ x_relu3, y_relu3 = self.slice3(x_relu2), self.slice3(y_relu2)
212
+ x_relu4, y_relu4 = self.slice4(x_relu3), self.slice4(y_relu3)
213
+ x_relu5, y_relu5 = self.slice5(x_relu4), self.slice5(y_relu4)
214
+ x_vgg = [x_relu1, x_relu2, x_relu3, x_relu4, x_relu5]
215
+ y_vgg = [y_relu1, y_relu2, y_relu3, y_relu4, y_relu5]
216
+ loss = 0
217
+ for i in range(len(x_vgg)):
218
+ loss += self.weights[i] * self.criterion(x_vgg[i], y_vgg[i].detach())
219
+ else:
220
+ x_vgg, y_vgg = self.featureExactor(norm_x), self.featureExactor(norm_y)
221
+ loss = self.criterion(x_vgg, y_vgg.detach())
222
+ return loss
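For reference, the weighted variants of l1_loss/l2_loss above normalize by the weight map rather than by the pixel count. A tiny sketch with made-up values (assumes the repository root is on PYTHONPATH so utils and torchvision import cleanly):

import torch
from models.loss import l1_loss, huber_loss

pred   = torch.rand(2, 2, 8, 8)
target = torch.rand(2, 2, 8, 8)
weight = (torch.rand(2, 1, 8, 8) > 0.5).float()    # count only the masked pixels
print(l1_loss(pred, target, weight).item())        # weighted mean absolute error
print(huber_loss(pred, target, delta=0.01).item())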
models/model.py ADDED
@@ -0,0 +1,196 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from models.network import HourGlass2, SpixelNet, ColorProbNet
5
+ from models.transformer2d import EncoderLayer, DecoderLayer, TransformerEncoder, TransformerDecoder
6
+ from models.position_encoding import build_position_encoding
7
+ from models import basic, clusterkit, anchor_gen
8
+ from collections import OrderedDict
9
+ from utils import util, cielab
10
+
11
+
12
+ class SpixelSeg(nn.Module):
13
+ def __init__(self, inChannel=1, outChannel=9, batchNorm=True):
14
+ super(SpixelSeg, self).__init__()
15
+ self.net = SpixelNet(inChannel=inChannel, outChannel=outChannel, batchNorm=batchNorm)
16
+
17
+ def get_trainable_params(self, lr=1.0):
18
+ #print('=> [optimizer] finetune backbone with smaller lr')
19
+ params = []
20
+ for name, param in self.named_parameters():
21
+ if 'xxx' in name:
22
+ params.append({'params': param, 'lr': lr})
23
+ else:
24
+ params.append({'params': param})
25
+ return params
26
+
27
+ def forward(self, input_grays):
28
+ pred_probs = self.net(input_grays)
29
+ return pred_probs
30
+
31
+
32
+ class AnchorColorProb(nn.Module):
33
+ def __init__(self, inChannel=1, outChannel=313, sp_size=16, d_model=64, use_dense_pos=True, spix_pos=False, learning_pos=False, \
34
+ random_hint=False, hint2regress=False, enhanced=False, use_mask=False, rank=0, colorLabeler=None):
35
+ super(AnchorColorProb, self).__init__()
36
+ self.sp_size = sp_size
37
+ self.spix_pos = spix_pos
38
+ self.use_token_mask = use_mask
39
+ self.hint2regress = hint2regress
40
+ self.segnet = SpixelSeg(inChannel=1, outChannel=9, batchNorm=True)
41
+ self.repnet = ColorProbNet(inChannel=inChannel, outChannel=64)
42
+ self.enhanced = enhanced
43
+ if self.enhanced:
44
+ self.enhanceNet = HourGlass2(inChannel=64+1, outChannel=2, resNum=3, normLayer=nn.BatchNorm2d)
45
+
46
+ ## transformer architecture
47
+ self.n_vocab = 313
48
+ d_model, dim_feedforward, nhead = d_model, 4*d_model, 8
49
+ dropout, activation = 0.1, "relu"
50
+ n_enc_layers, n_dec_layers = 6, 6
51
+ enc_layer = EncoderLayer(d_model, nhead, dim_feedforward, dropout, activation, use_dense_pos)
52
+ self.wildpath = TransformerEncoder(enc_layer, n_enc_layers, use_dense_pos)
53
+ self.hintpath = TransformerEncoder(enc_layer, n_enc_layers, use_dense_pos)
54
+ if self.spix_pos:
55
+ n_pos_x, n_pos_y = 256, 256
56
+ else:
57
+ n_pos_x, n_pos_y = 256//sp_size, 16//sp_size
58
+ self.pos_enc = build_position_encoding(d_model//2, n_pos_x, n_pos_y, is_learned=False)
59
+
60
+ self.mid_word_prj = nn.Linear(d_model, self.n_vocab, bias=False)
61
+ if self.hint2regress:
62
+ self.trg_word_emb = nn.Linear(d_model+2+1, d_model, bias=False)
63
+ self.trg_word_prj = nn.Linear(d_model, 2, bias=False)
64
+ else:
65
+ self.trg_word_emb = nn.Linear(d_model+self.n_vocab+1, d_model, bias=False)
66
+ self.trg_word_prj = nn.Linear(d_model, self.n_vocab, bias=False)
67
+
68
+ self.colorLabeler = colorLabeler
69
+ anchor_mode = 'random' if random_hint else 'clustering'
70
+ self.anchorGen = anchor_gen.AnchorAnalysis(mode=anchor_mode, colorLabeler=self.colorLabeler)
71
+ self._reset_parameters()
72
+
73
+ def _reset_parameters(self):
74
+ for p in self.parameters():
75
+ if p.dim() > 1:
76
+ nn.init.xavier_uniform_(p)
77
+
78
+ def load_and_froze_weight(self, checkpt_path):
79
+ data_dict = torch.load(checkpt_path, map_location=torch.device('cpu'))
80
+ '''
81
+ for param_tensor in data_dict['state_dict']:
82
+ print(param_tensor,'\t',data_dict['state_dict'][param_tensor].size())
83
+ '''
84
+ self.segnet.load_state_dict(data_dict['state_dict'])
85
+ for name, param in self.segnet.named_parameters():
86
+ param.requires_grad = False
87
+ self.segnet.eval()
88
+
89
+ def set_train(self):
90
+ ## running mode only affects certain modules, e.g. Dropout, BN, etc.
91
+ self.repnet.train()
92
+ self.wildpath.train()
93
+ self.hintpath.train()
94
+ if self.enhanced:
95
+ self.enhanceNet.train()
96
+
97
+ def get_entry_mask(self, mask_tensor):
98
+ if mask_tensor is None:
99
+ return None
100
+ ## flatten (N,1,H,W) to (N,HW)
101
+ return mask_tensor.flatten(1)
102
+
103
+ def forward(self, input_grays, input_colors, n_anchors=8, sampled_T=0):
104
+ '''
105
+ Notice: this function is customized for inference only
106
+ '''
107
+ affinity_map = self.segnet(input_grays)
108
+ pred_feats = self.repnet(input_grays)
109
+ if self.spix_pos:
110
+ full_pos_feats = self.pos_enc(pred_feats)
111
+ proxy_feats = torch.cat([pred_feats, input_colors, full_pos_feats], dim=1)
112
+ pooled_proxy_feats, conf_sum = basic.poolfeat(proxy_feats, affinity_map, self.sp_size, self.sp_size, True)
113
+ feat_tokens = pooled_proxy_feats[:,:64,:,:]
114
+ spix_colors = pooled_proxy_feats[:,64:66,:,:]
115
+ pos_feats = pooled_proxy_feats[:,66:,:,:]
116
+ else:
117
+ proxy_feats = torch.cat([pred_feats, input_colors], dim=1)
118
+ pooled_proxy_feats, conf_sum = basic.poolfeat(proxy_feats, affinity_map, self.sp_size, self.sp_size, True)
119
+ feat_tokens = pooled_proxy_feats[:,:64,:,:]
120
+ spix_colors = pooled_proxy_feats[:,64:,:,:]
121
+ pos_feats = self.pos_enc(feat_tokens)
122
+
123
+ token_labels = torch.max(self.colorLabeler.encode_ab2ind(spix_colors), dim=1, keepdim=True)[1]
124
+ spixel_sizes = basic.get_spixel_size(affinity_map, self.sp_size, self.sp_size)
125
+ all_one_map = torch.ones(spixel_sizes.shape, device=input_grays.device)
126
+ empty_entries = torch.where(spixel_sizes < 25/(self.sp_size**2), all_one_map, 1-all_one_map)
127
+ src_pad_mask = self.get_entry_mask(empty_entries) if self.use_token_mask else None
128
+ trg_pad_mask = src_pad_mask
129
+
130
+ ## parallel prob
131
+ N,C,H,W = feat_tokens.shape
132
+ ## (N,C,H,W) -> (HW,N,C)
133
+ src_pos_seq = pos_feats.flatten(2).permute(2, 0, 1)
134
+ src_seq = feat_tokens.flatten(2).permute(2, 0, 1)
135
+ ## color prob branch
136
+ enc_out, _ = self.wildpath(src_seq, src_pos_seq, src_pad_mask)
137
+ pal_logit = self.mid_word_prj(enc_out)
138
+ pal_logit = pal_logit.permute(1, 2, 0).view(N,self.n_vocab,H,W)
139
+
140
+ ## seed prob branch
141
+ ## mask(N,1,H,W): sample anchors at clustering layers
142
+ color_feat = enc_out.permute(1, 2, 0).view(N,C,H,W)
143
+ hint_mask, cluster_mask = self.anchorGen(color_feat, n_anchors, spixel_sizes, use_sklearn_kmeans=False)
144
+ pred_prob = torch.softmax(pal_logit, dim=1)
145
+ color_feat2 = src_seq.permute(1, 2, 0).view(N,C,H,W)
146
+ #pred_prob, adj_matrix = self.anchorGen._detect_correlation(color_feat, pred_prob, hint_mask, thres=0.1)
147
+ if sampled_T < 0:
148
+ ## GT anchor colors
149
+ sampled_spix_colors = spix_colors
150
+ elif sampled_T > 0:
151
+ top1_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=0)
152
+ top2_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=1)
153
+ top3_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=2)
154
+ ## duplicate meta tensors
155
+ sampled_spix_colors = torch.cat((top1_spix_colors,top2_spix_colors,top3_spix_colors), dim=0)
156
+ N = 3*N
157
+ input_grays = input_grays.expand(N,-1,-1,-1)
158
+ hint_mask = hint_mask.expand(N,-1,-1,-1)
159
+ affinity_map = affinity_map.expand(N,-1,-1,-1)
160
+ src_seq = src_seq.expand(-1, N,-1)
161
+ src_pos_seq = src_pos_seq.expand(-1, N,-1)
162
+ else:
163
+ sampled_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=sampled_T)
164
+ ## debug: controllable
165
+ if False:
166
+ hint_mask, sampled_spix_colors = basic.io_user_control(hint_mask, spix_colors, output=False)
167
+
168
+ sampled_token_labels = torch.max(self.colorLabeler.encode_ab2ind(sampled_spix_colors), dim=1, keepdim=True)[1]
169
+
170
+ ## hint based prediction
171
+ ## (N,C,H,W) -> (HW,N,C)
172
+ mask_seq = hint_mask.flatten(2).permute(2, 0, 1)
173
+ if self.hint2regress:
174
+ spix_colors_ = sampled_spix_colors
175
+ gt_seq = spix_colors_.flatten(2).permute(2, 0, 1)
176
+ hint_seq = self.trg_word_emb(torch.cat([src_seq, mask_seq * gt_seq, mask_seq], dim=2))
177
+ dec_out, _ = self.hintpath(hint_seq, src_pos_seq, src_pad_mask)
178
+ else:
179
+ token_labels_ = sampled_token_labels
180
+ label_map = F.one_hot(token_labels_, num_classes=313).squeeze(1).float()
181
+ label_seq = label_map.permute(0, 3, 1, 2).flatten(2).permute(2, 0, 1)
182
+ hint_seq = self.trg_word_emb(torch.cat([src_seq, mask_seq * label_seq, mask_seq], dim=2))
183
+ dec_out, _ = self.hintpath(hint_seq, src_pos_seq, src_pad_mask)
184
+ ref_logit = self.trg_word_prj(dec_out)
185
+ Ct = 2 if self.hint2regress else self.n_vocab
186
+ ref_logit = ref_logit.permute(1, 2, 0).view(N,Ct,H,W)
187
+
188
+ ## pixelwise enhancement
189
+ pred_colors = None
190
+ if self.enhanced:
191
+ proc_feats = dec_out.permute(1, 2, 0).view(N,64,H,W)
192
+ full_feats = basic.upfeat(proc_feats, affinity_map, self.sp_size, self.sp_size)
193
+ pred_colors = self.enhanceNet(torch.cat((input_grays,full_feats), dim=1))
194
+ pred_colors = torch.tanh(pred_colors)
195
+
196
+ return pal_logit, ref_logit, pred_colors, affinity_map, spix_colors, hint_mask
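For orientation, here is a minimal inference sketch for the forward() above. It assumes a `colorizer` instance that has already been constructed and loaded from a checkpoint (the construction is not part of this diff), a 256x256 grayscale input, and empty ab channels at test time; all names and value ranges outside the forward() signature are illustrative, not part of the repository API.

```python
# Hypothetical usage sketch; `colorizer` and the input normalization are assumptions.
import torch

input_grays = torch.rand(1, 1, 256, 256)      # luminance channel
input_colors = torch.zeros(1, 2, 256, 256)    # no ab information at inference
with torch.no_grad():
    pal_logit, ref_logit, pred_colors, affinity_map, spix_colors, hint_mask = colorizer(
        input_grays, input_colors, n_anchors=8, sampled_T=0)
# pred_colors is the per-pixel ab prediction in [-1, 1] (tanh) when the enhancement head is enabled.
```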
models/network.py ADDED
@@ -0,0 +1,352 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import init
+ import torchvision
+ import torch.nn.utils.spectral_norm as spectral_norm
+ import math
+
+
+ class ConvBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum, normLayer=None):
+         super(ConvBlock, self).__init__()
+         self.inConv = nn.Sequential(
+             nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1),
+             nn.ReLU(inplace=True)
+         )
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         x = self.inConv(x)
+         x = self.conv(x)
+         return x
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, channels, normLayer=None):
+         super(ResidualBlock, self).__init__()
+         layers = []
+         layers.append(nn.Conv2d(channels, channels, kernel_size=3, padding=1))
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         layers.append(nn.ReLU(inplace=True))
+         layers.append(nn.Conv2d(channels, channels, kernel_size=3, padding=1))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         residual = self.conv(x)
+         return F.relu(x + residual, inplace=True)
+
+
+ class ResidualBlockSN(nn.Module):
+     def __init__(self, channels, normLayer=None):
+         super(ResidualBlockSN, self).__init__()
+         layers = []
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         layers.append(nn.LeakyReLU(0.2, True))
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         residual = self.conv(x)
+         return F.leaky_relu(x + residual, 2e-1, inplace=True)
+
+
+ class DownsampleBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(DownsampleBlock, self).__init__()
+         layers = []
+         layers.append(nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1, stride=2))
+         layers.append(nn.ReLU(inplace=True))
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         return self.conv(x)
+
+
+ class UpsampleBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(UpsampleBlock, self).__init__()
+         self.conv1 = nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1, stride=1)
+         self.combine = nn.Conv2d(2 * outChannels, outChannels, kernel_size=3, padding=1)
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv2 = nn.Sequential(*layers)
+
+     def forward(self, x, x0):
+         x = self.conv1(x)
+         x = F.interpolate(x, scale_factor=2, mode='nearest')
+         x = self.combine(torch.cat((x, x0), 1))
+         x = F.relu(x)
+         return self.conv2(x)
+
+
+ class UpsampleBlockSN(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(UpsampleBlockSN, self).__init__()
+         self.conv1 = spectral_norm(nn.Conv2d(inChannels, outChannels, kernel_size=3, stride=1, padding=1))
+         self.shortcut = spectral_norm(nn.Conv2d(outChannels, outChannels, kernel_size=3, stride=1, padding=1))
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(spectral_norm(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1)))
+             layers.append(nn.LeakyReLU(0.2, True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv2 = nn.Sequential(*layers)
+
+     def forward(self, x, x0):
+         x = self.conv1(x)
+         x = F.interpolate(x, scale_factor=2, mode='nearest')
+         x = x + self.shortcut(x0)
+         x = F.leaky_relu(x, 2e-1)
+         return self.conv2(x)
+
+
+ class HourGlass2(nn.Module):
+     def __init__(self, inChannel=3, outChannel=1, resNum=3, normLayer=None):
+         super(HourGlass2, self).__init__()
+         self.inConv = ConvBlock(inChannel, 64, convNum=2, normLayer=normLayer)
+         self.down1 = DownsampleBlock(64, 128, convNum=2, normLayer=normLayer)
+         self.down2 = DownsampleBlock(128, 256, convNum=2, normLayer=normLayer)
+         self.residual = nn.Sequential(*[ResidualBlock(256) for _ in range(resNum)])
+         self.up2 = UpsampleBlock(256, 128, convNum=3, normLayer=normLayer)
+         self.up1 = UpsampleBlock(128, 64, convNum=3, normLayer=normLayer)
+         self.outConv = nn.Conv2d(64, outChannel, kernel_size=3, padding=1)
+
+     def forward(self, x):
+         f1 = self.inConv(x)
+         f2 = self.down1(f1)
+         f3 = self.down2(f2)
+         r3 = self.residual(f3)
+         r2 = self.up2(r3, f2)
+         r1 = self.up1(r2, f1)
+         y = self.outConv(r1)
+         return y
+
+
+ class ColorProbNet(nn.Module):
+     def __init__(self, inChannel=1, outChannel=2, with_SA=False):
+         super(ColorProbNet, self).__init__()
+         BNFunc = nn.BatchNorm2d
+         # conv1: 256
+         conv1_2 = [spectral_norm(nn.Conv2d(inChannel, 64, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv1_2 += [spectral_norm(nn.Conv2d(64, 64, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv1_2 += [BNFunc(64, affine=True)]
+         # conv2: 128
+         conv2_3 = [spectral_norm(nn.Conv2d(64, 128, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [spectral_norm(nn.Conv2d(128, 128, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [spectral_norm(nn.Conv2d(128, 128, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [BNFunc(128, affine=True)]
+         # conv3: 64
+         conv3_3 = [spectral_norm(nn.Conv2d(128, 256, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [spectral_norm(nn.Conv2d(256, 256, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [spectral_norm(nn.Conv2d(256, 256, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [BNFunc(256, affine=True)]
+         # conv4: 32
+         conv4_3 = [spectral_norm(nn.Conv2d(256, 512, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [BNFunc(512, affine=True)]
+         # conv5: 32
+         conv5_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [BNFunc(512, affine=True)]
+         # conv6: 32
+         conv6_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [BNFunc(512, affine=True),]
+         if with_SA:
+             conv6_3 += [Self_Attn(512)]
+         # conv7: 32
+         conv7_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [BNFunc(512, affine=True)]
+         # conv8: 64
+         conv8up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(512, 256, 3, stride=1, padding=1),]
+         conv3short8 = [nn.Conv2d(256, 256, 3, stride=1, padding=1),]
+         conv8_3 = [nn.ReLU(True),]
+         conv8_3 += [nn.Conv2d(256, 256, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv8_3 += [nn.Conv2d(256, 256, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv8_3 += [BNFunc(256, affine=True),]
+         # conv9: 128
+         conv9up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(256, 128, 3, stride=1, padding=1),]
+         conv9_2 = [nn.Conv2d(128, 128, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv9_2 += [BNFunc(128, affine=True)]
+         # conv10: 64
+         conv10up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(128, 64, 3, stride=1, padding=1),]
+         conv10_2 = [nn.ReLU(True),]
+         conv10_2 += [nn.Conv2d(64, outChannel, 3, stride=1, padding=1), nn.ReLU(True),]
+
+         self.conv1_2 = nn.Sequential(*conv1_2)
+         self.conv2_3 = nn.Sequential(*conv2_3)
+         self.conv3_3 = nn.Sequential(*conv3_3)
+         self.conv4_3 = nn.Sequential(*conv4_3)
+         self.conv5_3 = nn.Sequential(*conv5_3)
+         self.conv6_3 = nn.Sequential(*conv6_3)
+         self.conv7_3 = nn.Sequential(*conv7_3)
+         self.conv8up = nn.Sequential(*conv8up)
+         self.conv3short8 = nn.Sequential(*conv3short8)
+         self.conv8_3 = nn.Sequential(*conv8_3)
+         self.conv9up = nn.Sequential(*conv9up)
+         self.conv9_2 = nn.Sequential(*conv9_2)
+         self.conv10up = nn.Sequential(*conv10up)
+         self.conv10_2 = nn.Sequential(*conv10_2)
+         # classification output
+         #self.model_class = nn.Sequential(*[nn.Conv2d(256, 313, kernel_size=1, padding=0, stride=1),])
+
+     def forward(self, input_grays):
+         f1_2 = self.conv1_2(input_grays)
+         f2_3 = self.conv2_3(f1_2)
+         f3_3 = self.conv3_3(f2_3)
+         f4_3 = self.conv4_3(f3_3)
+         f5_3 = self.conv5_3(f4_3)
+         f6_3 = self.conv6_3(f5_3)
+         f7_3 = self.conv7_3(f6_3)
+         f8_up = self.conv8up(f7_3) + self.conv3short8(f3_3)
+         f8_3 = self.conv8_3(f8_up)
+         f9_up = self.conv9up(f8_3)
+         f9_2 = self.conv9_2(f9_up)
+         f10_up = self.conv10up(f9_2)
+         f10_2 = self.conv10_2(f10_up)
+         out_feats = f10_2
+         #out_probs = self.model_class(f8_3)
+         return out_feats
+
+
+
+ def conv(batchNorm, in_planes, out_planes, kernel_size=3, stride=1):
+     if batchNorm:
+         return nn.Sequential(
+             nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=(kernel_size-1)//2, bias=False),
+             nn.BatchNorm2d(out_planes),
+             nn.LeakyReLU(0.1)
+         )
+     else:
+         return nn.Sequential(
+             nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=(kernel_size-1)//2, bias=True),
+             nn.LeakyReLU(0.1)
+         )
+
+
+ def deconv(in_planes, out_planes):
+     return nn.Sequential(
+         nn.ConvTranspose2d(in_planes, out_planes, kernel_size=4, stride=2, padding=1, bias=True),
+         nn.LeakyReLU(0.1)
+     )
+
+ class SpixelNet(nn.Module):
+     def __init__(self, inChannel=3, outChannel=9, batchNorm=True):
+         super(SpixelNet,self).__init__()
+         self.batchNorm = batchNorm
+         self.conv0a = conv(self.batchNorm, inChannel, 16, kernel_size=3)
+         self.conv0b = conv(self.batchNorm, 16, 16, kernel_size=3)
+         self.conv1a = conv(self.batchNorm, 16, 32, kernel_size=3, stride=2)
+         self.conv1b = conv(self.batchNorm, 32, 32, kernel_size=3)
+         self.conv2a = conv(self.batchNorm, 32, 64, kernel_size=3, stride=2)
+         self.conv2b = conv(self.batchNorm, 64, 64, kernel_size=3)
+         self.conv3a = conv(self.batchNorm, 64, 128, kernel_size=3, stride=2)
+         self.conv3b = conv(self.batchNorm, 128, 128, kernel_size=3)
+         self.conv4a = conv(self.batchNorm, 128, 256, kernel_size=3, stride=2)
+         self.conv4b = conv(self.batchNorm, 256, 256, kernel_size=3)
+         self.deconv3 = deconv(256, 128)
+         self.conv3_1 = conv(self.batchNorm, 256, 128)
+         self.deconv2 = deconv(128, 64)
+         self.conv2_1 = conv(self.batchNorm, 128, 64)
+         self.deconv1 = deconv(64, 32)
+         self.conv1_1 = conv(self.batchNorm, 64, 32)
+         self.deconv0 = deconv(32, 16)
+         self.conv0_1 = conv(self.batchNorm, 32, 16)
+         self.pred_mask0 = nn.Conv2d(16, outChannel, kernel_size=3, stride=1, padding=1, bias=True)
+         self.softmax = nn.Softmax(1)
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
+                 init.kaiming_normal_(m.weight, 0.1)
+                 if m.bias is not None:
+                     init.constant_(m.bias, 0)
+             elif isinstance(m, nn.BatchNorm2d):
+                 init.constant_(m.weight, 1)
+                 init.constant_(m.bias, 0)
+
+     def forward(self, x):
+         out1 = self.conv0b(self.conv0a(x)) #5*5
+         out2 = self.conv1b(self.conv1a(out1)) #11*11
+         out3 = self.conv2b(self.conv2a(out2)) #23*23
+         out4 = self.conv3b(self.conv3a(out3)) #47*47
+         out5 = self.conv4b(self.conv4a(out4)) #95*95
+         out_deconv3 = self.deconv3(out5)
+         concat3 = torch.cat((out4, out_deconv3), 1)
+         out_conv3_1 = self.conv3_1(concat3)
+         out_deconv2 = self.deconv2(out_conv3_1)
+         concat2 = torch.cat((out3, out_deconv2), 1)
+         out_conv2_1 = self.conv2_1(concat2)
+         out_deconv1 = self.deconv1(out_conv2_1)
+         concat1 = torch.cat((out2, out_deconv1), 1)
+         out_conv1_1 = self.conv1_1(concat1)
+         out_deconv0 = self.deconv0(out_conv1_1)
+         concat0 = torch.cat((out1, out_deconv0), 1)
+         out_conv0_1 = self.conv0_1(concat0)
+         mask0 = self.pred_mask0(out_conv0_1)
+         prob0 = self.softmax(mask0)
+         return prob0
+
+
+
+ ## VGG architecture, used for the perceptual loss with a pretrained VGG network
+ class VGG19(torch.nn.Module):
+     def __init__(self, requires_grad=False, local_pretrained_path='checkpoints/vgg19.pth'):
+         super().__init__()
+         #vgg_pretrained_features = torchvision.models.vgg19(pretrained=True).features
+         model = torchvision.models.vgg19()
+         model.load_state_dict(torch.load(local_pretrained_path))
+         vgg_pretrained_features = model.features
+
+         self.slice1 = torch.nn.Sequential()
+         self.slice2 = torch.nn.Sequential()
+         self.slice3 = torch.nn.Sequential()
+         self.slice4 = torch.nn.Sequential()
+         self.slice5 = torch.nn.Sequential()
+         for x in range(2):
+             self.slice1.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(2, 7):
+             self.slice2.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(7, 12):
+             self.slice3.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(12, 21):
+             self.slice4.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(21, 30):
+             self.slice5.add_module(str(x), vgg_pretrained_features[x])
+         if not requires_grad:
+             for param in self.parameters():
+                 param.requires_grad = False
+
+     def forward(self, X):
+         h_relu1 = self.slice1(X)
+         h_relu2 = self.slice2(h_relu1)
+         h_relu3 = self.slice3(h_relu2)
+         h_relu4 = self.slice4(h_relu3)
+         h_relu5 = self.slice5(h_relu4)
+         out = [h_relu1, h_relu2, h_relu3, h_relu4, h_relu5]
+         return out
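As a quick sanity check on the two backbones used by the colorization model, the sketch below pushes random inputs through HourGlass2 (the pixelwise enhancement net) and SpixelNet (the superpixel affinity predictor) and prints the output shapes; the batch and image sizes are illustrative.

```python
# Shape smoke test for HourGlass2 and SpixelNet (illustrative sizes only).
import torch
from models.network import HourGlass2, SpixelNet

gray = torch.randn(2, 1, 256, 256)
enhancer = HourGlass2(inChannel=1, outChannel=2, resNum=3)
print(enhancer(gray).shape)            # torch.Size([2, 2, 256, 256])

rgb = torch.randn(2, 3, 256, 256)
spixel_net = SpixelNet(inChannel=3, outChannel=9)
print(spixel_net(rgb).shape)           # torch.Size([2, 9, 256, 256]), softmax over the 9 neighbor assignments
```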
models/position_encoding.py ADDED
@@ -0,0 +1,86 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ """
+ Various positional encodings for the transformer.
+ """
+ import math
+ import torch
+ from torch import nn
+
+
+ class PositionEmbeddingSine(nn.Module):
+     """
+     This is a more standard version of the position embedding, very similar to the one
+     used by the Attention is all you need paper, generalized to work on images.
+     """
+     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+         super().__init__()
+         self.num_pos_feats = num_pos_feats
+         self.temperature = temperature
+         self.normalize = normalize
+         if scale is not None and normalize is False:
+             raise ValueError("normalize should be True if scale is passed")
+         if scale is None:
+             scale = 2 * math.pi
+         self.scale = scale
+
+     def forward(self, token_tensors):
+         ## input: (B,C,H,W)
+         x = token_tensors
+         h, w = x.shape[-2:]
+         identity_map = torch.ones((h,w), device=x.device)
+         y_embed = identity_map.cumsum(0, dtype=torch.float32)
+         x_embed = identity_map.cumsum(1, dtype=torch.float32)
+         if self.normalize:
+             eps = 1e-6
+             y_embed = y_embed / (y_embed[-1:, :] + eps) * self.scale
+             x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale
+
+         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+         pos_x = x_embed[:, :, None] / dim_t
+         pos_y = y_embed[:, :, None] / dim_t
+         pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+         pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+         pos = torch.cat((pos_y, pos_x), dim=2).permute(2, 0, 1)
+         batch_pos = pos.unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+         return batch_pos
+
+
+ class PositionEmbeddingLearned(nn.Module):
+     """
+     Absolute pos embedding, learned.
+     """
+     def __init__(self, n_pos_x=16, n_pos_y=16, num_pos_feats=64):
+         super().__init__()
+         self.row_embed = nn.Embedding(n_pos_y, num_pos_feats)
+         self.col_embed = nn.Embedding(n_pos_x, num_pos_feats)
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         nn.init.uniform_(self.row_embed.weight)
+         nn.init.uniform_(self.col_embed.weight)
+
+     def forward(self, token_tensors):
+         ## input: (B,C,H,W)
+         x = token_tensors
+         h, w = x.shape[-2:]
+         i = torch.arange(w, device=x.device)
+         j = torch.arange(h, device=x.device)
+         x_emb = self.col_embed(i)
+         y_emb = self.row_embed(j)
+         pos = torch.cat([
+             x_emb.unsqueeze(0).repeat(h, 1, 1),
+             y_emb.unsqueeze(1).repeat(1, w, 1),
+         ], dim=-1).permute(2, 0, 1)
+         batch_pos = pos.unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+         return batch_pos
+
+
+ def build_position_encoding(num_pos_feats=64, n_pos_x=16, n_pos_y=16, is_learned=False):
+     if is_learned:
+         position_embedding = PositionEmbeddingLearned(n_pos_x, n_pos_y, num_pos_feats)
+     else:
+         position_embedding = PositionEmbeddingSine(num_pos_feats, normalize=True)
+
+     return position_embedding
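build_position_encoding returns either the fixed sine encoding or the learned embedding; both take a (B,C,H,W) token grid and return a position map whose channel count is 2*num_pos_feats, independent of the input's channel dimension. A short, illustrative shape check:

```python
# Positional encoding shape check (sizes are illustrative).
import torch
from models.position_encoding import build_position_encoding

pos_enc = build_position_encoding(num_pos_feats=32, is_learned=False)
tokens = torch.randn(4, 64, 16, 16)   # (B, C, H, W) superpixel token grid
pos = pos_enc(tokens)
print(pos.shape)                       # torch.Size([4, 64, 16, 16]) -> 2 * num_pos_feats channels
```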
models/transformer2d.py ADDED
@@ -0,0 +1,229 @@
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ import copy, math
+ from models.position_encoding import build_position_encoding
+
+
+ class TransformerEncoder(nn.Module):
+
+     def __init__(self, enc_layer, num_layers, use_dense_pos=False):
+         super().__init__()
+         self.layers = nn.ModuleList([copy.deepcopy(enc_layer) for i in range(num_layers)])
+         self.num_layers = num_layers
+         self.use_dense_pos = use_dense_pos
+
+     def forward(self, src, pos, padding_mask=None):
+         if self.use_dense_pos:
+             ## pos encoding at each MH-Attention block (q,k)
+             output, pos_enc = src, pos
+             for layer in self.layers:
+                 output, att_map = layer(output, pos_enc, padding_mask)
+         else:
+             ## pos encoding at input only (q,k,v)
+             output, pos_enc = src + pos, None
+             for layer in self.layers:
+                 output, att_map = layer(output, pos_enc, padding_mask)
+         return output, att_map
+
+
+ class EncoderLayer(nn.Module):
+
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
+                  use_dense_pos=False):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+
+     def with_pos_embed(self, tensor, pos):
+         return tensor if pos is None else tensor + pos
+
+     def forward(self, src, pos, padding_mask):
+         q = k = self.with_pos_embed(src, pos)
+         src2, attn = self.self_attn(q, k, value=src, key_padding_mask=padding_mask)
+         src = src + self.dropout1(src2)
+         src = self.norm1(src)
+         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+         src = src + self.dropout2(src2)
+         src = self.norm2(src)
+         return src, attn
+
+
+ class TransformerDecoder(nn.Module):
+
+     def __init__(self, dec_layer, num_layers, use_dense_pos=False, return_intermediate=False):
+         super().__init__()
+         self.layers = nn.ModuleList([copy.deepcopy(dec_layer) for i in range(num_layers)])
+         self.num_layers = num_layers
+         self.use_dense_pos = use_dense_pos
+         self.return_intermediate = return_intermediate
+
+     def forward(self, tgt, tgt_pos, memory, memory_pos,
+                 tgt_padding_mask, src_padding_mask, tgt_attn_mask=None):
+         intermediate = []
+         if self.use_dense_pos:
+             ## pos encoding at each MH-Attention block (q,k)
+             output = tgt
+             tgt_pos_enc, memory_pos_enc = tgt_pos, memory_pos
+             for layer in self.layers:
+                 output, att_map = layer(output, tgt_pos_enc, memory, memory_pos_enc,
+                                         tgt_padding_mask, src_padding_mask, tgt_attn_mask)
+                 if self.return_intermediate:
+                     intermediate.append(output)
+         else:
+             ## pos encoding at input only (q,k,v)
+             output = tgt + tgt_pos
+             tgt_pos_enc, memory_pos_enc = None, None
+             for layer in self.layers:
+                 output, att_map = layer(output, tgt_pos_enc, memory, memory_pos_enc,
+                                         tgt_padding_mask, src_padding_mask, tgt_attn_mask)
+                 if self.return_intermediate:
+                     intermediate.append(output)
+
+         if self.return_intermediate:
+             return torch.stack(intermediate)
+         return output, att_map
+
+
+ class DecoderLayer(nn.Module):
+
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
+                  use_dense_pos=False):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.corr_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.dropout3 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+
+     def with_pos_embed(self, tensor, pos):
+         return tensor if pos is None else tensor + pos
+
+     def forward(self, tgt, tgt_pos, memory, memory_pos,
+                 tgt_padding_mask, memory_padding_mask, tgt_attn_mask):
+         q = k = self.with_pos_embed(tgt, tgt_pos)
+         tgt2, attn = self.self_attn(q, k, value=tgt, key_padding_mask=tgt_padding_mask,
+                                     attn_mask=tgt_attn_mask)
+         tgt = tgt + self.dropout1(tgt2)
+         tgt = self.norm1(tgt)
+         tgt2, attn = self.corr_attn(query=self.with_pos_embed(tgt, tgt_pos),
+                                     key=self.with_pos_embed(memory, memory_pos),
+                                     value=memory, key_padding_mask=memory_padding_mask)
+         tgt = tgt + self.dropout2(tgt2)
+         tgt = self.norm2(tgt)
+         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+         tgt = tgt + self.dropout3(tgt2)
+         tgt = self.norm3(tgt)
+         return tgt, attn
+
+
+ def _get_activation_fn(activation):
+     """Return an activation function given a string"""
+     if activation == "relu":
+         return F.relu
+     if activation == "gelu":
+         return F.gelu
+     if activation == "glu":
+         return F.glu
+     raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+
+
+ #-----------------------------------------------------------------------------------
+ '''
+ copied from the implementation of "attention-is-all-you-need-pytorch-master" by Yu-Hsiang Huang
+ '''
+
+ class MultiHeadAttention(nn.Module):
+     ''' Multi-Head Attention module '''
+
+     def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+         super().__init__()
+
+         self.n_head = n_head
+         self.d_k = d_k
+         self.d_v = d_v
+
+         self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
+         self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
+
+         self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
+
+         self.dropout = nn.Dropout(dropout)
+         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+
+
+     def forward(self, q, k, v, mask=None):
+
+         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
+         sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
+
+         residual = q
+
+         # Pass through the pre-attention projection: b x lq x (n*dv)
+         # Separate different heads: b x lq x n x dv
+         q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
+         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
+         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+
+         # Transpose for attention dot product: b x n x lq x dv
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+         if mask is not None:
+             mask = mask.unsqueeze(1)   # For head axis broadcasting.
+
+         q, attn = self.attention(q, k, v, mask=mask)
+
+         # Transpose to move the head dimension back: b x lq x n x dv
+         # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
+         q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
+         q = self.dropout(self.fc(q))
+         q += residual
+
+         q = self.layer_norm(q)
+
+         return q, attn
+
+
+
+ class ScaledDotProductAttention(nn.Module):
+     ''' Scaled Dot-Product Attention '''
+
+     def __init__(self, temperature, attn_dropout=0.1):
+         super().__init__()
+         self.temperature = temperature
+         self.dropout = nn.Dropout(attn_dropout)
+
+     def forward(self, q, k, v, mask=None):
+
+         attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
+
+         if mask is not None:
+             attn = attn.masked_fill(mask == 0, -1e9)
+
+         attn = self.dropout(F.softmax(attn, dim=-1))
+         output = torch.matmul(attn, v)
+
+         return output, attn
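To close, a small smoke test wiring EncoderLayer, TransformerEncoder, and the sine position encoding together, following the (HW, N, C) sequence convention used in the colorization model's forward(); the hyperparameters below are arbitrary and only meant to exercise the shapes.

```python
# Encoder stack smoke test (illustrative hyperparameters, not the repository's training config).
import torch
from models.transformer2d import EncoderLayer, TransformerEncoder
from models.position_encoding import build_position_encoding

d_model, nhead = 64, 8
layer = EncoderLayer(d_model, nhead, dim_feedforward=4 * d_model, dropout=0.1,
                     activation="relu", use_dense_pos=False)
encoder = TransformerEncoder(layer, num_layers=2, use_dense_pos=False)

# a 16x16 token grid with batch size 2, flattened to (HW, N, C)
feat_grid = torch.randn(2, d_model, 16, 16)
pos_grid = build_position_encoding(d_model // 2, 16, 16, is_learned=False)(feat_grid)
src_seq = feat_grid.flatten(2).permute(2, 0, 1)
pos_seq = pos_grid.flatten(2).permute(2, 0, 1)

out, attn = encoder(src_seq, pos_seq, padding_mask=None)
print(out.shape)    # torch.Size([256, 2, 64])
print(attn.shape)   # torch.Size([2, 256, 256]) attention map from the last layer
```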