Commit c85e4eb
ethanNeuralImage committed "Adding in metrics"
Parent(s): 5238ef9
Files changed:
- app.py +53 -15
- metrics/__init__.py +1 -0
- metrics/criteria/__init__.py +0 -0
- metrics/criteria/clip_loss.py +17 -0
- metrics/criteria/id_loss.py +40 -0
- metrics/criteria/parse_related_loss/average_lab_color_loss.py +78 -0
- metrics/criteria/parse_related_loss/bg_loss.py +29 -0
- metrics/criteria/parse_related_loss/model_utils.py +851 -0
- metrics/criteria/parse_related_loss/unet.py +68 -0
- metrics/face_eval.py +103 -0
- metrics/metrics.py +205 -0
- requirements.txt +1 -0
- ris/model.py +0 -5
app.py
CHANGED
@@ -26,6 +26,10 @@ import ris.spherical_kmeans as spherical_kmeans
 from ris.blend import blend_latents
 from ris.model import Generator as RIS_Generator
 
+from metrics import FaceMetric
+from metrics.criteria.clip_loss import CLIPLoss
+import clip
+
 from PIL import Image
 
 opts_args = ['--no_fine_mapper']
@@ -70,6 +74,11 @@ ris_gen = RIS_Generator(1024, 512, 8, channel_multiplier=2).to(device).eval()
 ris_ckpt = torch.load('./pretrained_models/ris/stylegan2-ffhq-config-f.pt', map_location=lambda storage, loc: storage)
 ris_gen.load_state_dict(ris_ckpt['g_ema'], strict=False)
 
+lpips_metric = FaceMetric(metric_type='lpips', device=device)
+ssim_metric = FaceMetric(metric_type='ms-ssim', device=device)
+id_metric = FaceMetric(metric_type='id', device=device)
+clip_hair = FaceMetric(metric_type='cliphair', device=device)
+clip_text = CLIPLoss(hyperstyle_args)
 
 with gr.Blocks() as demo:
     with gr.Row() as row:
@@ -104,14 +113,14 @@ with gr.Blocks() as demo:
             output_hyperstyle_gd = gr.Image(type='pil', label="Hyperstyle Global Directions", visible=False)
             output_hyperstyle_ris = gr.Image(type='pil', label='Hyperstyle RIS', visible=False)
         with gr.Row() as hyperstyle_metrics:
-            output_hypersyle_metrics = gr.Text()
+            output_hypersyle_metrics = gr.Text(label='Hyperstyle Metrics')
         with gr.Row(visible=False) as e4e_images:
             output_e4e_invert = gr.Image(type='pil', label="E4E Inverted", visible=False)
             output_e4e_mapper = gr.Image(type='pil', label="E4E Mapper")
             output_e4e_gd = gr.Image(type='pil', label="E4E Global Directions", visible=False)
             output_e4e_ris = gr.Image(type='pil', label='E4E RIS', visible=False)
-        with gr.Row() as e4e_metrics:
-            output_e4e_metrics = gr.Text()
+        with gr.Row(visible=False) as e4e_metrics:
+            output_e4e_metrics = gr.Text(label='E4E Metrics')
     def n_iter_change(number):
         if number < 0:
             return 0
@@ -124,7 +133,9 @@ with gr.Blocks() as demo:
         hyperstyle_bool = 'Hyperstyle' in bools
         return {
            hyperstyle_images: gr.update(visible=hyperstyle_bool),
+            hyperstyle_metrics: gr.update(visible=hyperstyle_bool),
            e4e_images: gr.update(visible=e4e_bool),
+            e4e_metrics: gr.update(visible=e4e_bool),
            n_hyperstyle_iterations: gr.update(visible=hyperstyle_bool)
         }
     def outp_toggles(bool):
@@ -153,7 +164,7 @@ with gr.Blocks() as demo:
 
     n_hyperstyle_iterations.change(n_iter_change, n_hyperstyle_iterations, n_hyperstyle_iterations)
     mapper_choice.change(mapper_change, mapper_choice, [target_text])
-    inverter_bools.change(inverter_toggles, inverter_bools, [hyperstyle_images, e4e_images, n_hyperstyle_iterations])
+    inverter_bools.change(inverter_toggles, inverter_bools, [hyperstyle_images, hyperstyle_metrics, e4e_images, e4e_metrics, n_hyperstyle_iterations])
     invert_bool.change(outp_toggles, invert_bool, [output_hyperstyle_invert, output_e4e_invert])
     mapper_bool.change(mapper_toggles, mapper_bool, [mapper_opts, output_hyperstyle_mapper, output_e4e_mapper])
     gd_bool.change(gd_toggles, gd_bool, [gd_opts, output_hyperstyle_gd, output_e4e_gd])
@@ -173,6 +184,17 @@ with gr.Blocks() as demo:
             randomize_noise=False, truncation=1, weights_deltas=weight_deltas)
         result_batch = (x_hat, w_hat)
         return result_batch
+    def run_metrics(base_img, edited_img):
+        lpips_score = lpips_metric(base_img, edited_img)[0]
+        ssim_score = ssim_metric(base_img, edited_img)[0]
+        id_score = id_metric(base_img, edited_img)[0]
+
+        return lpips_score, ssim_score, id_score
+    def clip_text_metric(tensor, text):
+        clip_embed = torch.cat([clip.tokenize(text)]).cuda()
+        clip_score = 1-clip_text(tensor.unsqueeze(0), clip_embed).item()
+        return clip_score
+
     def submit(
         src, align_img, inverter_bools, n_iterations, invert_bool,
         mapper_bool, mapper_choice, mapper_alpha,
@@ -188,6 +210,7 @@ with gr.Blocks() as demo:
         mapper = StyleCLIPMapper(mapper_args)
         mapper.eval()
        mapper.to(device)
+        resize_to = (256, 256) if hyperstyle_args.resize_outputs else (hyperstyle_args.output_size, hyperstyle_args.output_size)
         with torch.no_grad():
             output_imgs = []
             if align_img:
@@ -208,7 +231,7 @@ with gr.Blocks() as demo:
             else:
                 ref_input = Image.open(src).convert('RGB')
                 ref_input = im2tensor_transforms(ref_input).to(device)
-
+            hyperstyle_metrics_text = ''
             if 'Hyperstyle' in inverter_bools:
                 hyperstyle_batch, hyperstyle_latents, hyperstyle_deltas, _ = run_inversion(input_img.unsqueeze(0), hyperstyle, hyperstyle_args, return_intermediate_results=False)
                 if invert_bool:
@@ -217,13 +240,19 @@ with gr.Blocks() as demo:
                     invert_hyperstyle = None
                 if mapper_bool:
                     mapped_hyperstyle, _ = map_latent(mapper, hyperstyle_latents, stylespace=False, weight_deltas=hyperstyle_deltas, strength=mapper_alpha)
+                    #clip_score = clip_text_metric(mapped_hyperstyle[0], mapper_args.description)
                     mapped_hyperstyle = tensor2im(mapped_hyperstyle[0])
+                    #lpips_score, ssim_score, id_score = run_metrics(invert_hyperstyle.resize(resize_to), mapped_hyperstyle.resize(resize_to))
+                    #hyperstyle_metrics_text += f'Mapper Metrics:\n\tLPIPS: \t{lpips_score} \n\tSSIM: \t{ssim_score}\n\tID Score: \t{id_score}\n\tCLIP Text Score: \t{clip_score}'
                 else:
                     mapped_hyperstyle = None
 
                 if gd_bool:
-                    gd_hyperstyle = edit_image(_, hyperstyle_latents[0], hyperstyle.decoder, direction_calculator, opts, hyperstyle_deltas)
-
+                    gd_hyperstyle = edit_image(_, hyperstyle_latents[0], hyperstyle.decoder, direction_calculator, opts, hyperstyle_deltas)
+                    #clip_score = clip_text_metric(gd_hyperstyle[0], opts.target_text)
+                    gd_hyperstyle = tensor2im(gd_hyperstyle[0])
+                    #lpips_score, ssim_score, id_score = run_metrics(invert_hyperstyle.resize(resize_to), gd_hyperstyle.resize(resize_to))
+                    #hyperstyle_metrics_text += f'Global Direction Metrics:\n\tLPIPS: \t{lpips_score} \n\tSSIM: \t{ssim_score}\n\tID Score: \t{id_score}\n\tCLIP Text Score: \t{clip_score}'
                 else:
                     gd_hyperstyle = None
 
@@ -237,10 +266,11 @@ with gr.Blocks() as demo:
                 else:
                     ris_hyperstyle=None
 
-                hyperstyle_output = [invert_hyperstyle, mapped_hyperstyle,gd_hyperstyle, ris_hyperstyle]
+                hyperstyle_output = [invert_hyperstyle, mapped_hyperstyle,gd_hyperstyle, ris_hyperstyle, hyperstyle_metrics_text]
             else:
-                hyperstyle_output = [None, None, None, None]
+                hyperstyle_output = [None, None, None, None, hyperstyle_metrics_text]
             output_imgs.extend(hyperstyle_output)
+            e4e_metrics_text = ''
             if 'E4E' in inverter_bools:
                 e4e_batch, e4e_latents = hyperstyle.w_invert(input_img.unsqueeze(0))
                 e4e_deltas = None
@@ -250,13 +280,21 @@ with gr.Blocks() as demo:
                     invert_e4e = None
                 if mapper_bool:
                     mapped_e4e, _ = map_latent(mapper, e4e_latents, stylespace=False, weight_deltas=e4e_deltas, strength=mapper_alpha)
+                    #clip_score = clip_text_metric(mapped_e4e[0], mapper_args.description)
                     mapped_e4e = tensor2im(mapped_e4e[0])
+                    #lpips_score, ssim_score, id_score = run_metrics(invert_e4e, mapped_e4e)
+                    #e4e_metrics_text += f'Mapper Metrics:\n\tLPIPS: \t{lpips_score} \n\tSSIM: \t{ssim_score}\n\tID Score: \t{id_score}\n\tCLIP Text Score: \t{clip_score}'
+
                 else:
                     mapped_e4e = None
 
                 if gd_bool:
-                    gd_e4e = edit_image(_, e4e_latents[0], hyperstyle.decoder, direction_calculator, opts, e4e_deltas)
-
+                    gd_e4e = edit_image(_, e4e_latents[0], hyperstyle.decoder, direction_calculator, opts, e4e_deltas)
+                    clip_score = clip_text_metric(gd_e4e[0], opts.target_text)
+                    gd_e4e = tensor2im(gd_e4e[0])
+                    lpips_score, ssim_score, id_score = run_metrics(invert_e4e, gd_e4e)
+                    e4e_metrics_text += f'Global Direction Metrics:\n\tLPIPS: \t{lpips_score} \n\tSSIM: \t{ssim_score}\n\tID Score: \t{id_score}\n\tCLIP Text Score: \t{clip_score}'
+
                 else:
                     gd_e4e = None
 
@@ -270,9 +308,9 @@ with gr.Blocks() as demo:
                 else:
                     ris_e4e=None
 
-                e4e_output = [invert_e4e, mapped_e4e, gd_e4e, ris_e4e]
+                e4e_output = [invert_e4e, mapped_e4e, gd_e4e, ris_e4e, e4e_metrics_text]
             else:
-                e4e_output = [None, None, None, None]
+                e4e_output = [None, None, None, None, e4e_metrics_text]
             output_imgs.extend(e4e_output)
             return output_imgs
     submit_button.click(
@@ -283,8 +321,8 @@ with gr.Blocks() as demo:
             gd_bool, neutral_text, target_text, alpha, beta,
            ris_bool, ref_img
         ],
-        [output_hyperstyle_invert, output_hyperstyle_mapper, output_hyperstyle_gd, output_hyperstyle_ris,
-         output_e4e_invert, output_e4e_mapper, output_e4e_gd, output_e4e_ris]
+        [output_hyperstyle_invert, output_hyperstyle_mapper, output_hyperstyle_gd, output_hyperstyle_ris, output_hypersyle_metrics,
+         output_e4e_invert, output_e4e_mapper, output_e4e_gd, output_e4e_ris, output_e4e_metrics]
     )
 
 demo.launch()
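Note on the new metrics path: `run_metrics` calls each `FaceMetric` once per image pair and keeps index 0 of the returned tuple, the face-region score. A minimal standalone sketch of one such call, assuming the Space's pretrained parsing checkpoint is in place; the image file names are hypothetical stand-ins for the PIL images app.py passes in:

```python
# Sketch of one run_metrics step: FaceMetric.forward returns a
# (face_score, hair_score) tuple; run_metrics keeps [0], the face component.
from PIL import Image
from metrics import FaceMetric

lpips_metric = FaceMetric(metric_type='lpips', device='cuda')
base = Image.open('inverted.png').convert('RGB')    # hypothetical path
edited = Image.open('mapped.png').convert('RGB')    # must match base's size
face_lpips = lpips_metric(base, edited)[0]          # [1] would be the hair score
```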
metrics/__init__.py
ADDED
@@ -0,0 +1 @@
+from .face_eval import FaceMetric
metrics/criteria/__init__.py
ADDED
File without changes
metrics/criteria/clip_loss.py
ADDED
@@ -0,0 +1,17 @@
+
+import torch
+import clip
+
+
+class CLIPLoss(torch.nn.Module):
+
+    def __init__(self, opts):
+        super(CLIPLoss, self).__init__()
+        self.model, self.preprocess = clip.load("ViT-B/32", device="cuda")
+        self.upsample = torch.nn.Upsample(scale_factor=7)
+        self.avg_pool = torch.nn.AvgPool2d(kernel_size=opts.stylegan_size // 32)
+
+    def forward(self, image, text):
+        image = self.avg_pool(self.upsample(image))
+        similarity = 1 - self.model(image, text)[0] / 100
+        return similarity
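This is the loss that app.py's `clip_text_metric` wraps: the 7x upsample followed by the average pool turns a `stylegan_size` output into the 224-px input CLIP expects (1024 * 7 / 32 = 224). A minimal sketch, assuming a CUDA device and an opts namespace with `stylegan_size=1024`; the prompt and image tensor are stand-ins:

```python
import argparse

import clip
import torch

from metrics.criteria.clip_loss import CLIPLoss

opts = argparse.Namespace(stylegan_size=1024)   # assumed opts shape
loss_fn = CLIPLoss(opts)
image = torch.randn(1, 3, 1024, 1024).cuda()    # stand-in generator output
tokens = torch.cat([clip.tokenize("a face with blond hair")]).cuda()
score = 1 - loss_fn(image, tokens).item()       # mirrors clip_text_metric: higher = closer to the text
```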
metrics/criteria/id_loss.py
ADDED
@@ -0,0 +1,40 @@
+import torch
+from torch import nn
+
+from models.facial_recognition.model_irse import Backbone
+
+
+class IDLoss(nn.Module):
+    def __init__(self, opts):
+        super(IDLoss, self).__init__()
+        print('Loading ResNet ArcFace')
+        self.facenet = Backbone(input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se')
+        self.facenet.load_state_dict(torch.load(opts.ir_se50_weights))
+        self.pool = torch.nn.AdaptiveAvgPool2d((256, 256))
+        self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112))
+        self.facenet.eval()
+        self.facenet.cuda()
+        self.opts = opts
+
+    def extract_feats(self, x):
+        if x.shape[2] != 256:
+            x = self.pool(x)
+        x = x[:, :, 35:223, 32:220]  # Crop interesting region
+        x = self.face_pool(x)
+        x_feats = self.facenet(x)
+        return x_feats
+
+    def forward(self, y_hat, y):
+        n_samples = y.shape[0]
+        y_feats = self.extract_feats(y)  # Otherwise use the feature from there
+        y_hat_feats = self.extract_feats(y_hat)
+        y_feats = y_feats.detach()
+        loss = 0
+        sim_improvement = 0
+        count = 0
+        for i in range(n_samples):
+            diff_target = y_hat_feats[i].dot(y_feats[i])
+            loss += 1 - diff_target
+            count += 1
+
+        return loss / count, sim_improvement / count
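A usage sketch: the loss is one minus the ArcFace embedding dot product, averaged over the batch. This assumes the repo's `models.facial_recognition` package is importable and that `opts.ir_se50_weights` points at the IR-SE50 checkpoint; the path below is hypothetical:

```python
import argparse

import torch

from metrics.criteria.id_loss import IDLoss

opts = argparse.Namespace(ir_se50_weights='./pretrained_models/model_ir_se50.pth')  # hypothetical path
id_loss = IDLoss(opts)
y = torch.randn(2, 3, 256, 256).cuda()       # stand-in source faces
y_hat = torch.randn(2, 3, 256, 256).cuda()   # stand-in edited faces
loss, _ = id_loss(y_hat, y)                  # second value (sim_improvement) is always 0 here
```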
metrics/criteria/parse_related_loss/average_lab_color_loss.py
ADDED
@@ -0,0 +1,78 @@
+import torch
+from torch import nn
+from criteria.parse_related_loss.unet import unet
+
+class AvgLabLoss(nn.Module):
+    def __init__(self, opts):
+        super(AvgLabLoss, self).__init__()
+        self.criterion = nn.L1Loss()
+        self.M = torch.tensor([[0.412453, 0.357580, 0.180423], [0.212671, 0.715160, 0.072169], [0.019334, 0.119193, 0.950227]])
+        print('Loading UNet for AvgLabLoss')
+        self.parsenet = unet()
+        self.parsenet.load_state_dict(torch.load(opts.parsenet_weights))
+        self.parsenet.eval()
+        self.shrink = torch.nn.AdaptiveAvgPool2d((512, 512))
+        self.magnify = torch.nn.AdaptiveAvgPool2d((1024, 1024))
+
+    def gen_hair_mask(self, input_image):
+        labels_predict = self.parsenet(self.shrink(input_image)).detach()
+        mask_512 = (torch.unsqueeze(torch.max(labels_predict, 1)[1], 1)==13).float()
+        mask_1024 = self.magnify(mask_512)
+        return mask_1024
+
+    # cal lab written by liuqk
+    def f(self, input):
+        output = input * 1
+        mask = input > 0.008856
+        output[mask] = torch.pow(input[mask], 1 / 3)
+        output[~mask] = 7.787 * input[~mask] + 0.137931
+        return output
+
+    def rgb2xyz(self, input):
+        assert input.size(1) == 3
+        M_tmp = self.M.to(input.device).unsqueeze(0)
+        M_tmp = M_tmp.repeat(input.size(0), 1, 1)  # BxCxC
+        output = torch.einsum('bnc,bchw->bnhw', M_tmp, input)  # BxCxHxW
+        M_tmp = M_tmp.sum(dim=2, keepdim=True)  # BxCx1
+        M_tmp = M_tmp.unsqueeze(3)  # BxCx1x1
+        return output / M_tmp
+
+    def xyz2lab(self, input):
+        assert input.size(1) == 3
+        output = input * 1
+        xyz_f = self.f(input)
+        # compute l
+        mask = input[:, 1, :, :] > 0.008856
+        output[:, 0, :, :][mask] = 116 * xyz_f[:, 1, :, :][mask] - 16
+        output[:, 0, :, :][~mask] = 903.3 * input[:, 1, :, :][~mask]
+        # compute a
+        output[:, 1, :, :] = 500 * (xyz_f[:, 0, :, :] - xyz_f[:, 1, :, :])
+        # compute b
+        output[:, 2, :, :] = 200 * (xyz_f[:, 1, :, :] - xyz_f[:, 2, :, :])
+        return output
+    def cal_hair_avg(self, input, mask):
+        x = input * mask
+        sum = torch.sum(torch.sum(x, dim=2, keepdim=True), dim=3, keepdim=True)  # [n,3,1,1]
+        mask_sum = torch.sum(torch.sum(mask, dim=2, keepdim=True), dim=3, keepdim=True)  # [n,1,1,1]
+        mask_sum[mask_sum == 0] = 1
+        avg = sum / mask_sum
+        return avg
+
+    def forward(self, fake, real):
+        # the mask is [n,1,h,w]
+        # normalize to 0~1
+        mask_fake = self.gen_hair_mask(fake)
+        mask_real = self.gen_hair_mask(real)
+        fake_RGB = (fake + 1) / 2.0
+        real_RGB = (real + 1) / 2.0
+        # from RGB to Lab by liuqk
+        fake_xyz = self.rgb2xyz(fake_RGB)
+        fake_Lab = self.xyz2lab(fake_xyz)
+        real_xyz = self.rgb2xyz(real_RGB)
+        real_Lab = self.xyz2lab(real_xyz)
+        # cal average value
+        fake_Lab_avg = self.cal_hair_avg(fake_Lab, mask_fake)
+        real_Lab_avg = self.cal_hair_avg(real_Lab, mask_real)
+
+        loss = self.criterion(fake_Lab_avg, real_Lab_avg)
+        return loss
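A usage sketch: hair pixels (parsing class 13 in this model) are masked out of both images, converted RGB to XYZ to Lab, and the L1 distance between the two mean Lab colors is returned. This assumes `opts.parsenet_weights` points at the parsing-UNet checkpoint; the path is hypothetical:

```python
import argparse

import torch

from criteria.parse_related_loss.average_lab_color_loss import AvgLabLoss

opts = argparse.Namespace(parsenet_weights='./pretrained_models/parsenet.pth')  # hypothetical path
avg_lab = AvgLabLoss(opts)
fake = torch.rand(1, 3, 1024, 1024) * 2 - 1   # stand-ins in [-1, 1], as forward expects
real = torch.rand(1, 3, 1024, 1024) * 2 - 1
color_loss = avg_lab(fake, real)              # L1 between mean Lab hair colors
```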
metrics/criteria/parse_related_loss/bg_loss.py
ADDED
@@ -0,0 +1,29 @@
+import torch
+from torch import nn
+from criteria.parse_related_loss.unet import unet
+
+class BackgroundLoss(nn.Module):
+    def __init__(self, opts):
+        super(BackgroundLoss, self).__init__()
+        print('Loading UNet for Background Loss')
+        self.parsenet = unet()
+        self.parsenet.load_state_dict(torch.load(opts.parsenet_weights))
+        self.parsenet.eval()
+        self.bg_mask_l2_loss = torch.nn.MSELoss()
+        self.shrink = torch.nn.AdaptiveAvgPool2d((512, 512))
+        self.magnify = torch.nn.AdaptiveAvgPool2d((1024, 1024))
+
+
+    def gen_bg_mask(self, input_image):
+        labels_predict = self.parsenet(self.shrink(input_image)).detach()
+        mask_512 = (torch.unsqueeze(torch.max(labels_predict, 1)[1], 1)!=13).float()
+        mask_1024 = self.magnify(mask_512)
+        return mask_1024
+
+    def forward(self, x, x_hat):
+        x_bg_mask = self.gen_bg_mask(x)
+        x_hat_bg_mask = self.gen_bg_mask(x_hat)
+        bg_mask = ((x_bg_mask+x_hat_bg_mask)==2).float()
+        loss = self.bg_mask_l2_loss(x * bg_mask, x_hat * bg_mask) / self.bg_mask_l2_loss(bg_mask, torch.zeros_like(bg_mask))
+        return loss
+
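Same pattern for the background term: the mask keeps pixels that both parses label non-hair (class != 13), and the MSE is normalized by the mask's area. A sketch under the same checkpoint assumption as above:

```python
import argparse

import torch

from criteria.parse_related_loss.bg_loss import BackgroundLoss

opts = argparse.Namespace(parsenet_weights='./pretrained_models/parsenet.pth')  # hypothetical path
bg_loss = BackgroundLoss(opts)
x = torch.rand(1, 3, 1024, 1024) * 2 - 1       # stand-in input
x_hat = torch.rand(1, 3, 1024, 1024) * 2 - 1   # stand-in reconstruction
loss = bg_loss(x, x_hat)                       # masked MSE, normalized by the mask fraction
```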
metrics/criteria/parse_related_loss/model_utils.py
ADDED
@@ -0,0 +1,851 @@
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.nn.functional as F
+
+class conv2DBatchNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        n_filters,
+        k_size,
+        stride,
+        padding,
+        bias=True,
+        dilation=1,
+        is_batchnorm=True,
+    ):
+        super(conv2DBatchNorm, self).__init__()
+
+        conv_mod = nn.Conv2d(int(in_channels),
+                             int(n_filters),
+                             kernel_size=k_size,
+                             padding=padding,
+                             stride=stride,
+                             bias=bias,
+                             dilation=dilation,)
+
+        if is_batchnorm:
+            self.cb_unit = nn.Sequential(conv_mod, nn.BatchNorm2d(int(n_filters)))
+        else:
+            self.cb_unit = nn.Sequential(conv_mod)
+
+    def forward(self, inputs):
+        outputs = self.cb_unit(inputs)
+        return outputs
+
+
+class conv2DGroupNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        n_filters,
+        k_size,
+        stride,
+        padding,
+        bias=True,
+        dilation=1,
+        n_groups=16,
+    ):
+        super(conv2DGroupNorm, self).__init__()
+
+        conv_mod = nn.Conv2d(int(in_channels),
+                             int(n_filters),
+                             kernel_size=k_size,
+                             padding=padding,
+                             stride=stride,
+                             bias=bias,
+                             dilation=dilation,)
+
+        self.cg_unit = nn.Sequential(conv_mod,
+                                     nn.GroupNorm(n_groups, int(n_filters)))
+
+    def forward(self, inputs):
+        outputs = self.cg_unit(inputs)
+        return outputs
+
+
+class deconv2DBatchNorm(nn.Module):
+    def __init__(self, in_channels, n_filters, k_size, stride, padding, bias=True):
+        super(deconv2DBatchNorm, self).__init__()
+
+        self.dcb_unit = nn.Sequential(
+            nn.ConvTranspose2d(
+                int(in_channels),
+                int(n_filters),
+                kernel_size=k_size,
+                padding=padding,
+                stride=stride,
+                bias=bias,
+            ),
+            nn.BatchNorm2d(int(n_filters)),
+        )
+
+    def forward(self, inputs):
+        outputs = self.dcb_unit(inputs)
+        return outputs
+
+
+class conv2DBatchNormRelu(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        n_filters,
+        k_size,
+        stride,
+        padding,
+        bias=True,
+        dilation=1,
+        is_batchnorm=True,
+    ):
+        super(conv2DBatchNormRelu, self).__init__()
+
+        conv_mod = nn.Conv2d(int(in_channels),
+                             int(n_filters),
+                             kernel_size=k_size,
+                             padding=padding,
+                             stride=stride,
+                             bias=bias,
+                             dilation=dilation,)
+
+        if is_batchnorm:
+            self.cbr_unit = nn.Sequential(conv_mod,
+                                          nn.BatchNorm2d(int(n_filters)),
+                                          nn.ReLU(inplace=True))
+        else:
+            self.cbr_unit = nn.Sequential(conv_mod, nn.ReLU(inplace=True))
+
+    def forward(self, inputs):
+        outputs = self.cbr_unit(inputs)
+        return outputs
+
+
+class conv2DGroupNormRelu(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        n_filters,
+        k_size,
+        stride,
+        padding,
+        bias=True,
+        dilation=1,
+        n_groups=16,
+    ):
+        super(conv2DGroupNormRelu, self).__init__()
+
+        conv_mod = nn.Conv2d(int(in_channels),
+                             int(n_filters),
+                             kernel_size=k_size,
+                             padding=padding,
+                             stride=stride,
+                             bias=bias,
+                             dilation=dilation,)
+
+        self.cgr_unit = nn.Sequential(conv_mod,
+                                      nn.GroupNorm(n_groups, int(n_filters)),
+                                      nn.ReLU(inplace=True))
+
+    def forward(self, inputs):
+        outputs = self.cgr_unit(inputs)
+        return outputs
+
+
+
+class deconv2DBatchNormRelu(nn.Module):
+    def __init__(self, in_channels, n_filters, k_size, stride, padding, bias=True):
+        super(deconv2DBatchNormRelu, self).__init__()
+
+        self.dcbr_unit = nn.Sequential(
+            nn.ConvTranspose2d(
+                int(in_channels),
+                int(n_filters),
+                kernel_size=k_size,
+                padding=padding,
+                stride=stride,
+                bias=bias,
+            ),
+            nn.BatchNorm2d(int(n_filters)),
+            nn.ReLU(inplace=True),
+        )
+
+    def forward(self, inputs):
+        outputs = self.dcbr_unit(inputs)
+        return outputs
+
+
+class unetConv2(nn.Module):
+    def __init__(self, in_size, out_size, is_batchnorm):
+        super(unetConv2, self).__init__()
+
+        if is_batchnorm:
+            self.conv1 = nn.Sequential(
+                nn.Conv2d(in_size, out_size, 3, 1, 1),
+                nn.BatchNorm2d(out_size),
+                nn.ReLU(),
+            )
+            self.conv2 = nn.Sequential(
+                nn.Conv2d(out_size, out_size, 3, 1, 1),
+                nn.BatchNorm2d(out_size),
+                nn.ReLU(),
+            )
+        else:
+            self.conv1 = nn.Sequential(nn.Conv2d(in_size, out_size, 3, 1, 1), nn.ReLU())
+            self.conv2 = nn.Sequential(
+                nn.Conv2d(out_size, out_size, 3, 1, 1), nn.ReLU()
+            )
+
+    def forward(self, inputs):
+        outputs = self.conv1(inputs)
+        #print (outputs.shape)
+        outputs = self.conv2(outputs)
+        #print (outputs.shape)
+        return outputs
+
+
+class unetUp(nn.Module):
+    def __init__(self, in_size, out_size, is_deconv, is_batchnorm):
+        super(unetUp, self).__init__()
+        self.conv = unetConv2(in_size, out_size, is_batchnorm)
+        if is_deconv:
+            self.up = nn.ConvTranspose2d(in_size, out_size, kernel_size=2, stride=2)
+        else:
+            self.up = nn.UpsamplingBilinear2d(scale_factor=2)
+
+    def forward(self, inputs1, inputs2):
+        outputs2 = self.up(inputs2)
+        offset = outputs2.size()[2] - inputs1.size()[2]
+        padding = 2 * [offset // 2, offset // 2]
+        outputs1 = F.pad(inputs1, padding)
+
+        return self.conv(torch.cat([outputs1, outputs2], 1))
+
+
+class segnetDown2(nn.Module):
+    def __init__(self, in_size, out_size):
+        super(segnetDown2, self).__init__()
+        self.conv1 = conv2DBatchNormRelu(in_size, out_size, 3, 1, 1)
+        self.conv2 = conv2DBatchNormRelu(out_size, out_size, 3, 1, 1)
+        self.maxpool_with_argmax = nn.MaxPool2d(2, 2, return_indices=True)
+
+    def forward(self, inputs):
+        outputs = self.conv1(inputs)
+        outputs = self.conv2(outputs)
+        unpooled_shape = outputs.size()
+        outputs, indices = self.maxpool_with_argmax(outputs)
+        return outputs, indices, unpooled_shape
+
+
+class segnetDown3(nn.Module):
+    def __init__(self, in_size, out_size):
+        super(segnetDown3, self).__init__()
+        self.conv1 = conv2DBatchNormRelu(in_size, out_size, 3, 1, 1)
+        self.conv2 = conv2DBatchNormRelu(out_size, out_size, 3, 1, 1)
+        self.conv3 = conv2DBatchNormRelu(out_size, out_size, 3, 1, 1)
+        self.maxpool_with_argmax = nn.MaxPool2d(2, 2, return_indices=True)
+
+    def forward(self, inputs):
+        outputs = self.conv1(inputs)
+        outputs = self.conv2(outputs)
+        outputs = self.conv3(outputs)
+        unpooled_shape = outputs.size()
+        outputs, indices = self.maxpool_with_argmax(outputs)
+        return outputs, indices, unpooled_shape
+
+
+class segnetUp2(nn.Module):
+    def __init__(self, in_size, out_size):
+        super(segnetUp2, self).__init__()
+        self.unpool = nn.MaxUnpool2d(2, 2)
+        self.conv1 = conv2DBatchNormRelu(in_size, in_size, 3, 1, 1)
+        self.conv2 = conv2DBatchNormRelu(in_size, out_size, 3, 1, 1)
+
+    def forward(self, inputs, indices, output_shape):
+        outputs = self.unpool(input=inputs, indices=indices, output_size=output_shape)
+        outputs = self.conv1(outputs)
+        outputs = self.conv2(outputs)
+        return outputs
+
+
+class segnetUp3(nn.Module):
+    def __init__(self, in_size, out_size):
+        super(segnetUp3, self).__init__()
+        self.unpool = nn.MaxUnpool2d(2, 2)
+        self.conv1 = conv2DBatchNormRelu(in_size, in_size, 3, 1, 1)
+        self.conv2 = conv2DBatchNormRelu(in_size, in_size, 3, 1, 1)
+        self.conv3 = conv2DBatchNormRelu(in_size, out_size, 3, 1, 1)
+
+    def forward(self, inputs, indices, output_shape):
+        outputs = self.unpool(input=inputs, indices=indices, output_size=output_shape)
+        outputs = self.conv1(outputs)
+        outputs = self.conv2(outputs)
+        outputs = self.conv3(outputs)
+        return outputs
+
+
+class residualBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_channels, n_filters, stride=1, downsample=None):
+        super(residualBlock, self).__init__()
+
+        self.convbnrelu1 = conv2DBatchNormRelu(
+            in_channels, n_filters, 3, stride, 1, bias=False
+        )
+        self.convbn2 = conv2DBatchNorm(n_filters, n_filters, 3, 1, 1, bias=False)
+        self.downsample = downsample
+        self.stride = stride
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        residual = x
+
+        out = self.convbnrelu1(x)
+        out = self.convbn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+        return out
+
+
+class residualBottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, in_channels, n_filters, stride=1, downsample=None):
+        super(residualBottleneck, self).__init__()
+        self.convbn1 = nn.Conv2DBatchNorm(in_channels, n_filters, k_size=1, bias=False)
+        self.convbn2 = nn.Conv2DBatchNorm(
+            n_filters, n_filters, k_size=3, padding=1, stride=stride, bias=False
+        )
+        self.convbn3 = nn.Conv2DBatchNorm(
+            n_filters, n_filters * 4, k_size=1, bias=False
+        )
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.convbn1(x)
+        out = self.convbn2(out)
+        out = self.convbn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class linknetUp(nn.Module):
+    def __init__(self, in_channels, n_filters):
+        super(linknetUp, self).__init__()
+
+        # B, 2C, H, W -> B, C/2, H, W
+        self.convbnrelu1 = conv2DBatchNormRelu(
+            in_channels, n_filters / 2, k_size=1, stride=1, padding=1
+        )
+
+        # B, C/2, H, W -> B, C/2, H, W
+        self.deconvbnrelu2 = nn.deconv2DBatchNormRelu(
+            n_filters / 2, n_filters / 2, k_size=3, stride=2, padding=0
+        )
+
+        # B, C/2, H, W -> B, C, H, W
+        self.convbnrelu3 = conv2DBatchNormRelu(
+            n_filters / 2, n_filters, k_size=1, stride=1, padding=1
+        )
+
+    def forward(self, x):
+        x = self.convbnrelu1(x)
+        x = self.deconvbnrelu2(x)
+        x = self.convbnrelu3(x)
+        return x
+
+
+class FRRU(nn.Module):
+    """
+    Full Resolution Residual Unit for FRRN
+    """
+
+    def __init__(self,
+                 prev_channels,
+                 out_channels,
+                 scale,
+                 group_norm=False,
+                 n_groups=None):
+        super(FRRU, self).__init__()
+        self.scale = scale
+        self.prev_channels = prev_channels
+        self.out_channels = out_channels
+        self.group_norm = group_norm
+        self.n_groups = n_groups
+
+
+        if self.group_norm:
+            conv_unit = conv2DGroupNormRelu
+            self.conv1 = conv_unit(
+                prev_channels + 32, out_channels, k_size=3,
+                stride=1, padding=1, bias=False, n_groups=self.n_groups
+            )
+            self.conv2 = conv_unit(
+                out_channels, out_channels, k_size=3,
+                stride=1, padding=1, bias=False, n_groups=self.n_groups
+            )
+
+        else:
+            conv_unit = conv2DBatchNormRelu
+            self.conv1 = conv_unit(prev_channels + 32, out_channels, k_size=3,
+                                   stride=1, padding=1, bias=False,)
+            self.conv2 = conv_unit(out_channels, out_channels, k_size=3,
+                                   stride=1, padding=1, bias=False,)
+
+        self.conv_res = nn.Conv2d(out_channels, 32, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, y, z):
+        x = torch.cat([y, nn.MaxPool2d(self.scale, self.scale)(z)], dim=1)
+        y_prime = self.conv1(x)
+        y_prime = self.conv2(y_prime)
+
+        x = self.conv_res(y_prime)
+        upsample_size = torch.Size([_s * self.scale for _s in y_prime.shape[-2:]])
+        x = F.upsample(x, size=upsample_size, mode="nearest")
+        z_prime = z + x
+
+        return y_prime, z_prime
+
+
+class RU(nn.Module):
+    """
+    Residual Unit for FRRN
+    """
+
+    def __init__(self,
+                 channels,
+                 kernel_size=3,
+                 strides=1,
+                 group_norm=False,
+                 n_groups=None):
+        super(RU, self).__init__()
+        self.group_norm = group_norm
+        self.n_groups = n_groups
+
+        if self.group_norm:
+            self.conv1 = conv2DGroupNormRelu(
+                channels, channels, k_size=kernel_size,
+                stride=strides, padding=1, bias=False,n_groups=self.n_groups)
+            self.conv2 = conv2DGroupNorm(
+                channels, channels, k_size=kernel_size,
+                stride=strides, padding=1, bias=False,n_groups=self.n_groups)
+
+        else:
+            self.conv1 = conv2DBatchNormRelu(
+                channels, channels, k_size=kernel_size, stride=strides, padding=1, bias=False,)
+            self.conv2 = conv2DBatchNorm(
+                channels, channels, k_size=kernel_size, stride=strides, padding=1, bias=False,)
+
+    def forward(self, x):
+        incoming = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        return x + incoming
+
+
+class residualConvUnit(nn.Module):
+    def __init__(self, channels, kernel_size=3):
+        super(residualConvUnit, self).__init__()
+
+        self.residual_conv_unit = nn.Sequential(
+            nn.ReLU(inplace=True),
+            nn.Conv2d(channels, channels, kernel_size=kernel_size),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(channels, channels, kernel_size=kernel_size),
+        )
+
+    def forward(self, x):
+        input = x
+        x = self.residual_conv_unit(x)
+        return x + input
+
+
+class multiResolutionFusion(nn.Module):
+    def __init__(self, channels, up_scale_high, up_scale_low, high_shape, low_shape):
+        super(multiResolutionFusion, self).__init__()
+
+        self.up_scale_high = up_scale_high
+        self.up_scale_low = up_scale_low
+
+        self.conv_high = nn.Conv2d(high_shape[1], channels, kernel_size=3)
+
+        if low_shape is not None:
+            self.conv_low = nn.Conv2d(low_shape[1], channels, kernel_size=3)
+
+    def forward(self, x_high, x_low):
+        high_upsampled = F.upsample(
+            self.conv_high(x_high), scale_factor=self.up_scale_high, mode="bilinear"
+        )
+
+        if x_low is None:
+            return high_upsampled
+
+        low_upsampled = F.upsample(
+            self.conv_low(x_low), scale_factor=self.up_scale_low, mode="bilinear"
+        )
+
+        return low_upsampled + high_upsampled
+
+
+class chainedResidualPooling(nn.Module):
+    def __init__(self, channels, input_shape):
+        super(chainedResidualPooling, self).__init__()
+
+        self.chained_residual_pooling = nn.Sequential(
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(5, 1, 2),
+            nn.Conv2d(input_shape[1], channels, kernel_size=3),
+        )
+
+    def forward(self, x):
+        input = x
+        x = self.chained_residual_pooling(x)
+        return x + input
+
+
+class pyramidPooling(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        pool_sizes,
+        model_name="pspnet",
+        fusion_mode="cat",
+        is_batchnorm=True,
+    ):
+        super(pyramidPooling, self).__init__()
+
+        bias = not is_batchnorm
+
+        self.paths = []
+        for i in range(len(pool_sizes)):
+            self.paths.append(
+                conv2DBatchNormRelu(
+                    in_channels,
+                    int(in_channels / len(pool_sizes)),
+                    1,
+                    1,
+                    0,
+                    bias=bias,
+                    is_batchnorm=is_batchnorm,
+                )
+            )
+
+        self.path_module_list = nn.ModuleList(self.paths)
+        self.pool_sizes = pool_sizes
+        self.model_name = model_name
+        self.fusion_mode = fusion_mode
+
+    def forward(self, x):
+        h, w = x.shape[2:]
+
+        if self.training or self.model_name != "icnet":  # general settings or pspnet
+            k_sizes = []
+            strides = []
+            for pool_size in self.pool_sizes:
+                k_sizes.append((int(h / pool_size), int(w / pool_size)))
+                strides.append((int(h / pool_size), int(w / pool_size)))
+        else:  # eval mode and icnet: pre-trained for 1025 x 2049
+            k_sizes = [(8, 15), (13, 25), (17, 33), (33, 65)]
+            strides = [(5, 10), (10, 20), (16, 32), (33, 65)]
+
+        if self.fusion_mode == "cat":  # pspnet: concat (including x)
+            output_slices = [x]
+
+            for i, (module, pool_size) in enumerate(
+                zip(self.path_module_list, self.pool_sizes)
+            ):
+                out = F.avg_pool2d(x, k_sizes[i], stride=strides[i], padding=0)
+                # out = F.adaptive_avg_pool2d(x, output_size=(pool_size, pool_size))
+                if self.model_name != "icnet":
+                    out = module(out)
+                out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=True)
+                output_slices.append(out)
+
+            return torch.cat(output_slices, dim=1)
+        else:  # icnet: element-wise sum (including x)
+            pp_sum = x
+
+            for i, (module, pool_size) in enumerate(
+                zip(self.path_module_list, self.pool_sizes)
+            ):
+                out = F.avg_pool2d(x, k_sizes[i], stride=strides[i], padding=0)
+                # out = F.adaptive_avg_pool2d(x, output_size=(pool_size, pool_size))
+                if self.model_name != "icnet":
+                    out = module(out)
+                out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=True)
+                pp_sum = pp_sum + out
+
+            return pp_sum
+
+
+class bottleNeckPSP(nn.Module):
+    def __init__(
+        self, in_channels, mid_channels, out_channels, stride, dilation=1, is_batchnorm=True
+    ):
+        super(bottleNeckPSP, self).__init__()
+
+        bias = not is_batchnorm
+
+        self.cbr1 = conv2DBatchNormRelu(
+            in_channels,
+            mid_channels,
+            1,
+            stride=1,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+        if dilation > 1:
+            self.cbr2 = conv2DBatchNormRelu(
+                mid_channels,
+                mid_channels,
+                3,
+                stride=stride,
+                padding=dilation,
+                bias=bias,
+                dilation=dilation,
+                is_batchnorm=is_batchnorm,
+            )
+        else:
+            self.cbr2 = conv2DBatchNormRelu(
+                mid_channels,
+                mid_channels,
+                3,
+                stride=stride,
+                padding=1,
+                bias=bias,
+                dilation=1,
+                is_batchnorm=is_batchnorm,
+            )
+        self.cb3 = conv2DBatchNorm(
+            mid_channels,
+            out_channels,
+            1,
+            stride=1,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+        self.cb4 = conv2DBatchNorm(
+            in_channels,
+            out_channels,
+            1,
+            stride=stride,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+
+    def forward(self, x):
+        conv = self.cb3(self.cbr2(self.cbr1(x)))
+        residual = self.cb4(x)
+        return F.relu(conv + residual, inplace=True)
+
+
+class bottleNeckIdentifyPSP(nn.Module):
+    def __init__(self, in_channels, mid_channels, stride, dilation=1, is_batchnorm=True):
+        super(bottleNeckIdentifyPSP, self).__init__()
+
+        bias = not is_batchnorm
+
+        self.cbr1 = conv2DBatchNormRelu(
+            in_channels,
+            mid_channels,
+            1,
+            stride=1,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+        if dilation > 1:
+            self.cbr2 = conv2DBatchNormRelu(
+                mid_channels,
+                mid_channels,
+                3,
+                stride=1,
+                padding=dilation,
+                bias=bias,
+                dilation=dilation,
+                is_batchnorm=is_batchnorm,
+            )
+        else:
+            self.cbr2 = conv2DBatchNormRelu(
+                mid_channels,
+                mid_channels,
+                3,
+                stride=1,
+                padding=1,
+                bias=bias,
+                dilation=1,
+                is_batchnorm=is_batchnorm,
+            )
+        self.cb3 = conv2DBatchNorm(
+            mid_channels,
+            in_channels,
+            1,
+            stride=1,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+
+    def forward(self, x):
+        residual = x
+        x = self.cb3(self.cbr2(self.cbr1(x)))
+        return F.relu(x + residual, inplace=True)
+
+
+class residualBlockPSP(nn.Module):
+    def __init__(
+        self,
+        n_blocks,
+        in_channels,
+        mid_channels,
+        out_channels,
+        stride,
+        dilation=1,
+        include_range="all",
+        is_batchnorm=True,
+    ):
+        super(residualBlockPSP, self).__init__()
+
+        if dilation > 1:
+            stride = 1
+
+        # residualBlockPSP = convBlockPSP + identityBlockPSPs
+        layers = []
+        if include_range in ["all", "conv"]:
+            layers.append(
+                bottleNeckPSP(
+                    in_channels,
+                    mid_channels,
+                    out_channels,
+                    stride,
+                    dilation,
+                    is_batchnorm=is_batchnorm,
+                )
+            )
+        if include_range in ["all", "identity"]:
+            for i in range(n_blocks - 1):
+                layers.append(
+                    bottleNeckIdentifyPSP(
+                        out_channels, mid_channels, stride, dilation, is_batchnorm=is_batchnorm
+                    )
+                )
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+class cascadeFeatureFusion(nn.Module):
+    def __init__(
+        self, n_classes, low_in_channels, high_in_channels, out_channels, is_batchnorm=True
+    ):
+        super(cascadeFeatureFusion, self).__init__()
+
+        bias = not is_batchnorm
+
+        self.low_dilated_conv_bn = conv2DBatchNorm(
+            low_in_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=2,
+            bias=bias,
+            dilation=2,
+            is_batchnorm=is_batchnorm,
+        )
+        self.low_classifier_conv = nn.Conv2d(
+            int(low_in_channels),
+            int(n_classes),
+            kernel_size=1,
+            padding=0,
+            stride=1,
+            bias=True,
+            dilation=1,
+        )  # Train only
+        self.high_proj_conv_bn = conv2DBatchNorm(
+            high_in_channels,
+            out_channels,
+            1,
+            stride=1,
+            padding=0,
+            bias=bias,
+            is_batchnorm=is_batchnorm,
+        )
+
+    def forward(self, x_low, x_high):
+        x_low_upsampled = F.interpolate(
+            x_low, size=get_interp_size(x_low, z_factor=2), mode="bilinear", align_corners=True
+        )
+
+        low_cls = self.low_classifier_conv(x_low_upsampled)
+
+        low_fm = self.low_dilated_conv_bn(x_low_upsampled)
+        high_fm = self.high_proj_conv_bn(x_high)
+        high_fused_fm = F.relu(low_fm + high_fm, inplace=True)
+
+        return high_fused_fm, low_cls
+
+
+def get_interp_size(input, s_factor=1, z_factor=1):  # for caffe
+    ori_h, ori_w = input.shape[2:]
+
+    # shrink (s_factor >= 1)
+    ori_h = (ori_h - 1) / s_factor + 1
+    ori_w = (ori_w - 1) / s_factor + 1
+
+    # zoom (z_factor >= 1)
+    ori_h = ori_h + (ori_h - 1) * (z_factor - 1)
+    ori_w = ori_w + (ori_w - 1) * (z_factor - 1)
+
+    resize_shape = (int(ori_h), int(ori_w))
+    return resize_shape
+
+
+def interp(input, output_size, mode="bilinear"):
+    n, c, ih, iw = input.shape
+    oh, ow = output_size
+
+    # normalize to [-1, 1]
+    h = torch.arange(0, oh, dtype=torch.float, device='cuda' if input.is_cuda else 'cpu') / (oh - 1) * 2 - 1
+    w = torch.arange(0, ow, dtype=torch.float, device='cuda' if input.is_cuda else 'cpu') / (ow - 1) * 2 - 1
+
+    grid = torch.zeros(oh, ow, 2, dtype=torch.float, device='cuda' if input.is_cuda else 'cpu')
+    grid[:, :, 0] = w.unsqueeze(0).repeat(oh, 1)
+    grid[:, :, 1] = h.unsqueeze(0).repeat(ow, 1).transpose(0, 1)
+    grid = grid.unsqueeze(0).repeat(n, 1, 1, 1)  # grid.shape: [n, oh, ow, 2]
+
+    return F.grid_sample(input, grid, mode=mode)
+
+
+def get_upsampling_weight(in_channels, out_channels, kernel_size):
+    """Make a 2D bilinear kernel suitable for upsampling"""
+    factor = (kernel_size + 1) // 2
+    if kernel_size % 2 == 1:
+        center = factor - 1
+    else:
+        center = factor - 0.5
+    og = np.ogrid[:kernel_size, :kernel_size]
+    filt = (1 - abs(og[0] - center) / factor) * \
+           (1 - abs(og[1] - center) / factor)
+    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size),
+                      dtype=np.float64)
+    weight[range(in_channels), range(out_channels), :, :] = filt
+    return torch.from_numpy(weight).float()
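These helpers largely mirror the pytorch-semseg building blocks and compose like ordinary `nn.Module`s. A quick shape-check sketch (sizes are illustrative; only the UNet pieces are actually used by the losses in this commit):

```python
import torch

from criteria.parse_related_loss.model_utils import conv2DBatchNormRelu, residualBlock

stem = conv2DBatchNormRelu(in_channels=3, n_filters=64, k_size=3, stride=1, padding=1)
block = residualBlock(in_channels=64, n_filters=64)   # stride=1, no downsample needed
out = block(stem(torch.randn(2, 3, 128, 128)))        # -> [2, 64, 128, 128]
```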
metrics/criteria/parse_related_loss/unet.py
ADDED
@@ -0,0 +1,68 @@
+import torch.nn as nn
+from criteria.parse_related_loss.model_utils import *
+
+
+class unet(nn.Module):
+    def __init__(
+        self,
+        feature_scale=4,
+        n_classes=19,
+        is_deconv=True,
+        in_channels=3,
+        is_batchnorm=True,
+    ):
+        super(unet, self).__init__()
+        self.is_deconv = is_deconv
+        self.in_channels = in_channels
+        self.is_batchnorm = is_batchnorm
+        self.feature_scale = feature_scale
+
+        filters = [64, 128, 256, 512, 1024]
+        filters = [int(x / self.feature_scale) for x in filters]
+
+        # downsampling
+        self.conv1 = unetConv2(self.in_channels, filters[0], self.is_batchnorm)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
+
+        self.conv2 = unetConv2(filters[0], filters[1], self.is_batchnorm)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
+
+        self.conv3 = unetConv2(filters[1], filters[2], self.is_batchnorm)
+        self.maxpool3 = nn.MaxPool2d(kernel_size=2)
+
+        self.conv4 = unetConv2(filters[2], filters[3], self.is_batchnorm)
+        self.maxpool4 = nn.MaxPool2d(kernel_size=2)
+
+        self.center = unetConv2(filters[3], filters[4], self.is_batchnorm)
+
+        # upsampling
+        self.up_concat4 = unetUp(filters[4], filters[3], self.is_deconv, self.is_batchnorm)
+        self.up_concat3 = unetUp(filters[3], filters[2], self.is_deconv, self.is_batchnorm)
+        self.up_concat2 = unetUp(filters[2], filters[1], self.is_deconv, self.is_batchnorm)
+        self.up_concat1 = unetUp(filters[1], filters[0], self.is_deconv, self.is_batchnorm)
+
+        # final conv (without any concat)
+        self.final = nn.Conv2d(filters[0], n_classes, 1)
+
+    def forward(self, inputs):
+        conv1 = self.conv1(inputs)
+        maxpool1 = self.maxpool1(conv1)
+
+        conv2 = self.conv2(maxpool1)
+        maxpool2 = self.maxpool2(conv2)
+
+        conv3 = self.conv3(maxpool2)
+        maxpool3 = self.maxpool3(conv3)
+
+        conv4 = self.conv4(maxpool3)
+        maxpool4 = self.maxpool4(conv4)
+
+        center = self.center(maxpool4)
+        up4 = self.up_concat4(conv4, center)
+        up3 = self.up_concat3(conv3, up4)
+        up2 = self.up_concat2(conv2, up3)
+        up1 = self.up_concat1(conv1, up2)
+
+        final = self.final(up1)
+
+        return final
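A shape sketch: with the default `feature_scale=4` the encoder widths become [16, 32, 64, 128, 256], and the net maps RGB to 19 per-pixel class logits, which is how the parse-related losses above derive their masks:

```python
import torch

from criteria.parse_related_loss.unet import unet

net = unet().eval()                            # defaults: feature_scale=4, n_classes=19
with torch.no_grad():
    logits = net(torch.randn(1, 3, 512, 512))  # -> [1, 19, 512, 512]
labels = logits.argmax(dim=1)                  # per-pixel class ids
```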
metrics/face_eval.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from .face_parsing import BiSeNet
import numpy as np
from .metrics import LPIPS, MS_SSIM, IdScore, ClipHair
import torch.nn as nn
import torch
from torchvision import transforms

class FaceSegmentation(nn.Module):
    def __init__(self, n_classes=19, device='cuda', save_pth='./pretrained_models/79999_iter.pth'):
        super(FaceSegmentation, self).__init__()
        self.net = BiSeNet(n_classes=n_classes).to(device)
        self.net.load_state_dict(torch.load(save_pth))
        self.net.eval()
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
        self.device = device

    def get_facemask(self, parsing_anno):
        """
        Returns a binary mask covering the face regions.
        """
        # face_attr = {1: 'skin', 2: 'l_brow', 3: 'r_brow', 4: 'l_eye', 5: 'r_eye', 6: 'eye_glass', 7: 'l_ear', 8: 'r_ear', 10: 'nose', 11: 'mouth', 12: 'u_lip', 13: 'l_lip', 14: 'neck'}
        face_attr = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14], device=self.device)
        face_mask = torch.isin(parsing_anno, face_attr)
        return face_mask.int()

    def get_hairmask(self, parsing_anno):
        """
        Returns a binary mask covering the hair region (class 17).
        """
        hair_mask = parsing_anno == 17
        return hair_mask.int()

    def forward(self, img):
        """
        Returns the normalized image tensor together with binary face and hair masks.
        """
        img = self.transform(img).to(self.device)
        parsing_anno = self.net(img.unsqueeze(0))[0].squeeze(0).argmax(0)
        face_mask = self.get_facemask(parsing_anno).to(self.device)
        hair_mask = self.get_hairmask(parsing_anno).to(self.device)
        return img, face_mask, hair_mask


class FaceMetric(nn.Module):
    def __init__(self, metric_type, eval_face=True, eval_hair=True, device='cuda', seg_save_pth='./pretrained_models/79999_iter.pth'):
        super(FaceMetric, self).__init__()
        if metric_type == 'ms-ssim':
            self.metric = MS_SSIM()
            self.eval_hair = eval_hair
            self.eval_face = eval_face
        elif metric_type == 'lpips':
            self.metric = LPIPS(device=device)
            self.eval_hair = eval_hair
            self.eval_face = eval_face
        elif metric_type == 'id':
            self.metric = IdScore(device=device)
            self.eval_hair = False
            self.eval_face = eval_face
        elif metric_type == 'cliphair':
            self.metric = ClipHair(device=device)
            self.eval_face = False
            self.eval_hair = eval_hair
        else:
            raise NotImplementedError
        self.parser = FaceSegmentation(device=device, save_pth=seg_save_pth)
        self.device = device

    def forward(self, x, y):
        face_score, hair_score = None, None
        x_tensor, x_face_seg, x_hair_seg = self.parser(x)
        y_tensor, y_face_seg, y_hair_seg = self.parser(y)
        if self.eval_hair:
            ## Mask each image by its own hair segmentation
            # (alternatively, the union of the two masks: hair_mask = (x_hair_seg + y_hair_seg) > 0)
            x_hair = x_tensor * x_hair_seg
            y_hair = y_tensor * y_hair_seg

            hair_score = self.metric(x_hair, y_hair).item()
        if self.eval_face:
            ## Mask both images by the intersection of the two face masks
            face_mask = (x_face_seg + y_face_seg) > 1

            x_face = x_tensor * face_mask
            y_face = y_tensor * face_mask

            face_score = self.metric(x_face, y_face).item()

        return face_score, hair_score
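
A hypothetical end-to-end call for context (the image paths are placeholders; it assumes two same-size face crops and the BiSeNet checkpoint at the configured path):

    from PIL import Image

    src = Image.open('source.png')     # original face image
    edit = Image.open('edited.png')    # edited face image at the same resolution

    metric = FaceMetric('ms-ssim', device='cuda')
    face_score, hair_score = metric(src, edit)   # MS-SSIM over the masked face and hair regions
    print(f'face: {face_score:.4f}, hair: {hair_score:.4f}')
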
metrics/metrics.py
ADDED
@@ -0,0 +1,205 @@
import torch
import torch.nn as nn
from torch.nn import Module, Sequential, Conv2d, BatchNorm2d, PReLU, Dropout, Flatten, Linear, BatchNorm1d, MaxPool2d, AdaptiveAvgPool2d, ReLU, Sigmoid
from collections import namedtuple
from pytorch_msssim import ms_ssim
import lpips
import clip
from torchvision import transforms

class LPIPS(nn.Module):
    def __init__(self, net='alex', device='cuda'):
        super(LPIPS, self).__init__()
        self.lpips = lpips.LPIPS(net=net).to(device)

    def forward(self, x, y):
        # LPIPS is a distance, so report 1 - distance: higher means more similar
        return 1 - self.lpips(x, y).squeeze()


class MS_SSIM(nn.Module):
    def __init__(self, avg=False):
        super(MS_SSIM, self).__init__()
        self.ssim = ms_ssim
        self.avg = avg

    def forward(self, x, y):
        ## normalize images from [-1, 1] to [0, 1]
        x = (x + 1) / 2
        y = (y + 1) / 2
        return self.ssim(x.unsqueeze(0), y.unsqueeze(0), data_range=1, size_average=self.avg)


class IdScore(nn.Module):
    def __init__(self, device='cuda'):
        super(IdScore, self).__init__()
        # Pretrained ResNet ArcFace backbone for identity features
        self.facenet = Backbone(input_size=112, num_layers=50, drop_ratio=0.6).to(device)
        self.facenet.load_state_dict(torch.load('./pretrained_models/model_ir_se50.pth'))
        self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112))
        self.facenet.eval()
        self.cosine_sim = nn.CosineSimilarity(dim=1)

    def extract_feats(self, x):
        x = self.face_pool(x)
        x_feats = self.facenet(x)
        return x_feats

    def forward(self, y, x):
        x = x.unsqueeze(0)
        y = y.unsqueeze(0)
        x_feats = self.extract_feats(x)
        y_feats = self.extract_feats(y)
        y_feats = y_feats.detach()

        cosine_sim = self.cosine_sim(y_feats, x_feats)

        return cosine_sim

class ClipHair(nn.Module):
    def __init__(self, device='cuda'):
        super(ClipHair, self).__init__()
        self.model, self.preprocessing = clip.load("ViT-B/32", device=device)
        self.cosine_sim = nn.CosineSimilarity(dim=1)
        self.device = device

    def extract_feats(self, x):
        x = transforms.ToPILImage()(x.squeeze())
        x = self.preprocessing(x).unsqueeze(0).to(self.device)
        x = self.model.encode_image(x)
        return x

    def forward(self, y, x):
        x = x.unsqueeze(0)
        y = y.unsqueeze(0)
        x_feats = self.extract_feats(x)
        y_feats = self.extract_feats(y)
        y_feats = y_feats.detach()

        cosine_sim = self.cosine_sim(x_feats, y_feats)

        return cosine_sim


class bottleneck_IR_SE(Module):
    def __init__(self, in_channel, depth, stride):
        super(bottleneck_IR_SE, self).__init__()
        if in_channel == depth:
            self.shortcut_layer = MaxPool2d(1, stride)
        else:
            self.shortcut_layer = Sequential(
                Conv2d(in_channel, depth, (1, 1), stride, bias=False),
                BatchNorm2d(depth)
            )
        self.res_layer = Sequential(
            BatchNorm2d(in_channel),
            Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
            PReLU(depth),
            Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
            BatchNorm2d(depth),
            SEModule(depth, 16)
        )

    def forward(self, x):
        shortcut = self.shortcut_layer(x)
        res = self.res_layer(x)
        return res + shortcut


class Backbone(Module):
    def __init__(self, input_size, num_layers, drop_ratio=0.4, affine=True):
        super(Backbone, self).__init__()
        assert input_size in [112, 224], "input_size should be 112 or 224"
        assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152"
        blocks = get_blocks(num_layers)

        self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False),
                                      BatchNorm2d(64),
                                      PReLU(64))
        if input_size == 112:
            self.output_layer = Sequential(BatchNorm2d(512),
                                           Dropout(drop_ratio),
                                           Flatten(),
                                           Linear(512 * 7 * 7, 512),
                                           BatchNorm1d(512, affine=affine))
        else:
            self.output_layer = Sequential(BatchNorm2d(512),
                                           Dropout(drop_ratio),
                                           Flatten(),
                                           Linear(512 * 14 * 14, 512),
                                           BatchNorm1d(512, affine=affine))

        modules = []
        for block in blocks:
            for bottleneck in block:
                modules.append(bottleneck_IR_SE(bottleneck.in_channel,
                                                bottleneck.depth,
                                                bottleneck.stride))
        self.body = Sequential(*modules)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.body(x)
        x = self.output_layer(x)
        return l2_norm(x)

def get_blocks(num_layers):
    if num_layers == 50:
        blocks = [
            get_block(in_channel=64, depth=64, num_units=3),
            get_block(in_channel=64, depth=128, num_units=4),
            get_block(in_channel=128, depth=256, num_units=14),
            get_block(in_channel=256, depth=512, num_units=3)
        ]
    elif num_layers == 100:
        blocks = [
            get_block(in_channel=64, depth=64, num_units=3),
            get_block(in_channel=64, depth=128, num_units=13),
            get_block(in_channel=128, depth=256, num_units=30),
            get_block(in_channel=256, depth=512, num_units=3)
        ]
    elif num_layers == 152:
        blocks = [
            get_block(in_channel=64, depth=64, num_units=3),
            get_block(in_channel=64, depth=128, num_units=8),
            get_block(in_channel=128, depth=256, num_units=36),
            get_block(in_channel=256, depth=512, num_units=3)
        ]
    else:
        raise ValueError("Invalid number of layers: {}. Must be one of [50, 100, 152]".format(num_layers))
    return blocks

class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
    """ A named tuple describing a ResNet block. """


def get_block(in_channel, depth, num_units, stride=2):
    return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]

def l2_norm(input, axis=1):
    norm = torch.norm(input, 2, axis, True)
    output = torch.div(input, norm)
    return output

class SEModule(Module):
    def __init__(self, channels, reduction):
        super(SEModule, self).__init__()
        self.avg_pool = AdaptiveAvgPool2d(1)
        self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0, bias=False)
        self.relu = ReLU(inplace=True)
        self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0, bias=False)
        self.sigmoid = Sigmoid()

    def forward(self, x):
        module_input = x
        x = self.avg_pool(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return module_input * x
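
And a minimal sketch of using the raw metrics directly (the tensor shapes and the [-1, 1] input range are assumptions matching the normalization in MS_SSIM above; the lpips package may download its backbone weights on first use):

    import torch

    x = torch.rand(3, 256, 256) * 2 - 1    # dummy images in [-1, 1]
    y = torch.rand(3, 256, 256) * 2 - 1

    ssim = MS_SSIM(avg=True)
    print('MS-SSIM:', ssim(x, y).item())    # in [0, 1]; higher means more similar

    lp = LPIPS(device='cpu')
    print('LPIPS similarity:', lp(x.unsqueeze(0), y.unsqueeze(0)).item())
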
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
 torch
 torchvision
+cudatoolkit
 dlib
 pillow
 numpy
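
One caveat worth flagging: cudatoolkit is normally distributed through conda channels rather than PyPI, so a plain pip install against this requirements.txt may fail to resolve it; pip-based setups usually rely on the CUDA runtime bundled with the torch wheels instead.
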
ris/model.py
CHANGED
@@ -508,12 +508,7 @@ class Generator(nn.Module):
         output.append(self.to_rgb1.get_latent(latent[:, 1]))
 
         i = 1
-        # print("Get latent dimensions:")
         for conv1, conv2, to_rgb in zip(self.convs[::2], self.convs[1::2], self.to_rgbs):
-            # print(f'{i}: {conv1.get_latent(latent[:, i]).shape}')
-            # print(f'{i+1}: {conv2.get_latent(latent[:, i+1]).shape}')
-            # print(f'{i+2}: {to_rgb.get_latent(latent[:, i+2]).shape}')
-            # print("")
             output.append(conv1.get_latent(latent[:, i]))
             output.append(conv2.get_latent(latent[:, i+1]))
             output.append(to_rgb.get_latent(latent[:, i+2]))