Your Name committed on
Commit
2f85de4
1 Parent(s): cd0130f
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. Dockerfile +30 -0
  2. app.py +267 -0
  3. models/__init__.py +63 -0
  4. models/__pycache__/__init__.cpython-37.pyc +0 -0
  5. models/__pycache__/__init__.cpython-39.pyc +0 -0
  6. models/__pycache__/bev3d_generator.cpython-37.pyc +0 -0
  7. models/__pycache__/bev3d_generator.cpython-39.pyc +0 -0
  8. models/__pycache__/eg3d_discriminator.cpython-37.pyc +0 -0
  9. models/__pycache__/eg3d_discriminator.cpython-39.pyc +0 -0
  10. models/__pycache__/eg3d_generator.cpython-37.pyc +0 -0
  11. models/__pycache__/eg3d_generator.cpython-39.pyc +0 -0
  12. models/__pycache__/eg3d_generator_fv.cpython-37.pyc +0 -0
  13. models/__pycache__/eg3d_generator_fv.cpython-39.pyc +0 -0
  14. models/__pycache__/ghfeat_encoder.cpython-37.pyc +0 -0
  15. models/__pycache__/ghfeat_encoder.cpython-39.pyc +0 -0
  16. models/__pycache__/inception_model.cpython-37.pyc +0 -0
  17. models/__pycache__/inception_model.cpython-39.pyc +0 -0
  18. models/__pycache__/perceptual_model.cpython-37.pyc +0 -0
  19. models/__pycache__/perceptual_model.cpython-39.pyc +0 -0
  20. models/__pycache__/pggan_discriminator.cpython-37.pyc +0 -0
  21. models/__pycache__/pggan_discriminator.cpython-39.pyc +0 -0
  22. models/__pycache__/pggan_generator.cpython-37.pyc +0 -0
  23. models/__pycache__/pggan_generator.cpython-39.pyc +0 -0
  24. models/__pycache__/pigan_discriminator.cpython-37.pyc +0 -0
  25. models/__pycache__/pigan_discriminator.cpython-39.pyc +0 -0
  26. models/__pycache__/pigan_generator.cpython-37.pyc +0 -0
  27. models/__pycache__/pigan_generator.cpython-39.pyc +0 -0
  28. models/__pycache__/sgbev3d_generator.cpython-37.pyc +0 -0
  29. models/__pycache__/sgbev3d_generator.cpython-39.pyc +0 -0
  30. models/__pycache__/stylegan2_discriminator.cpython-37.pyc +0 -0
  31. models/__pycache__/stylegan2_discriminator.cpython-39.pyc +0 -0
  32. models/__pycache__/stylegan2_generator.cpython-37.pyc +0 -0
  33. models/__pycache__/stylegan2_generator.cpython-39.pyc +0 -0
  34. models/__pycache__/stylegan3_generator.cpython-37.pyc +0 -0
  35. models/__pycache__/stylegan3_generator.cpython-39.pyc +0 -0
  36. models/__pycache__/stylegan_discriminator.cpython-37.pyc +0 -0
  37. models/__pycache__/stylegan_discriminator.cpython-39.pyc +0 -0
  38. models/__pycache__/stylegan_generator.cpython-37.pyc +0 -0
  39. models/__pycache__/stylegan_generator.cpython-39.pyc +0 -0
  40. models/__pycache__/volumegan_discriminator.cpython-37.pyc +0 -0
  41. models/__pycache__/volumegan_discriminator.cpython-39.pyc +0 -0
  42. models/__pycache__/volumegan_generator.cpython-37.pyc +0 -0
  43. models/__pycache__/volumegan_generator.cpython-39.pyc +0 -0
  44. models/bev3d_generator.py +301 -0
  45. models/eg3d_discriminator.py +243 -0
  46. models/eg3d_generator.py +315 -0
  47. models/eg3d_generator_fv.py +320 -0
  48. models/ghfeat_encoder.py +563 -0
  49. models/inception_model.py +562 -0
  50. models/perceptual_model.py +519 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM nvidia/cuda:11.1.0-devel-ubuntu22.04
+
+ ENV CUDA_HOME=/usr/local/cuda
+ ENV PATH=${CUDA_HOME}/bin:/home/${USER_NAME}/.local/bin:${PATH}
+ ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+ ENV LIBRARY_PATH=${CUDA_HOME}/lib64/stubs:${LIBRARY_PATH}
+
+ # apt install by root user
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+ build-essential \
+ curl \
+ git \
+ python-is-python3 \
+ python3.7-dev \
+ python3-pip \
+ wget \
+ && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY . .
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,267 @@
+ import gradio as gr
+ from models import build_model
+ from PIL import Image
+ import numpy as np
+ import torchvision
+ import ninja
+ import torch
+ from tqdm import trange
+ import imageio
+
+ checkpoint = '/mnt/petrelfs/zhangqihang/data/berfscene_clevr.pth'
+ state = torch.load(checkpoint, map_location='cpu')
+ G = build_model(**state['model_kwargs_init']['generator_smooth'])
+ o0, o1 = G.load_state_dict(state['models']['generator_smooth'], strict=False)
+ G.eval().cuda()
+ G.backbone.synthesis.input.x_offset =0
+ G.backbone.synthesis.input.y_offset =0
+ G_kwargs= dict(noise_mode='const',
+ fused_modulate=False,
+ impl='cuda',
+ fp16_res=None)
+
+ def trans(x, y, z, length):
+ w = h = length
+ x = 0.5 * w - 128 + 256 - (x/9 + .5) * 256
+ y = 0.5 * h - 128 + (y/9 + .5) * 256
+ z = z / 9 * 256
+ return x, y, z
+ def get_bev_from_objs(objs, length=256, scale = 6):
+ h, w = length, length *scale
+ nc = 14
+ canvas = np.zeros([h, w, nc])
+ xx = np.ones([h,w]).cumsum(0)
+ yy = np.ones([h,w]).cumsum(1)
+
+ for x, y, z, shape, color, material, rot in objs:
+ y, x, z = trans(x, y, z, length)
+
+ feat = [0] * nc
+ feat[0] = 1
+ feat[COLOR_NAME_LIST.index(color) + 1] = 1
+ feat[SHAPE_NAME_LIST.index(shape) + 1 + len(COLOR_NAME_LIST)] = 1
+ feat[MATERIAL_NAME_LIST.index(material) + 1 + len(COLOR_NAME_LIST) + len(SHAPE_NAME_LIST)] = 1
+ feat = np.array(feat)
+ rot_sin = np.sin(rot / 180 * np.pi)
+ rot_cos = np.cos(rot / 180 * np.pi)
+
+ if shape == 'cube':
+ mask = (np.abs(+rot_cos * (xx-x) + rot_sin * (yy-y)) <= z) * \
+ (np.abs(-rot_sin * (xx-x) + rot_cos * (yy-y)) <= z)
+ else:
+ mask = ((xx-x)**2 + (y-yy)**2) ** 0.5 <= z
+ canvas[mask] = feat
+ canvas = np.transpose(canvas, [2, 0, 1]).astype(np.float32)
+ rotate_angle = 0
+ canvas = torchvision.transforms.functional.rotate(torch.tensor(canvas), rotate_angle).numpy()
+ return canvas
+
+ # COLOR_NAME_LIST = ['cyan', 'green', 'purple', 'red', 'yellow', 'gray', 'brown', 'blue']
+ COLOR_NAME_LIST = ['cyan', 'green', 'purple', 'red', 'yellow', 'gray', 'purple', 'blue']
+ SHAPE_NAME_LIST = ['cube', 'sphere', 'cylinder']
+ MATERIAL_NAME_LIST = ['rubber', 'metal']
+
+ xy_lib = dict()
+ xy_lib['B'] = [
+ [-2, -1],
+ [-1, -1],
+ [-2, 0],
+ [-2, 1],
+ [-1, .5],
+ [0, 1],
+ [0, 0],
+ [0, -1],
+ [0, 2],
+ [-1, 2],
+ [-2, 2]
+ ]
+ xy_lib['B'] = [
+ [-2.5, 1.25],
+ [-2, 2],
+ [-2, 0.5],
+ [-2, -0.75],
+ [-1, -1],
+ [-1, 2],
+ [-1, 0],
+ [-1, 2],
+ [0, 1],
+ [0, 0],
+ [0, -1],
+ [0, 2],
+ # [-1, 2],
+
+ ]
+ xy_lib['B'] = [
+ [-2.5, 1.25],
+ [-2, 2],
+ [-2, 0.5],
+ [-2, -1],
+ [-1, -1.25],
+ [-1, 2],
+ [-1, 0],
+ [-1, 2],
+ [0, 1],
+ [0, 0],
+ [0, -1.25],
+ [0, 2],
+ # [-1, 2],
+
+ ]
+ xy_lib['R'] = [
+ [0, -1],
+ [0, 0],
+ [0, 1],
+ [0, 2],
+ [-1, -1],
+ # [-1, 2],
+ [-2, -1],
+ [-2, 0],
+ [-2.25, 2],
+ [-1, 1]
+ ]
+ xy_lib['C'] = [
+ [0, -1],
+ [0, 0],
+ [0, 1],
+ [0, 2],
+ [-1, -1],
+ [-1, 2],
+ [-2, -1],
+ # [-2, .5],
+ [-2, 2],
+ # [-1, .5]
+ ]
+ xy_lib['s'] = [
+ [0, -1],
+ [0, 0],
+ [0, 2],
+ [-1, -1],
+ [-1, 2],
+ [-2, -1],
+ [-2, 1],
+ [-2, 2],
+ [-1, .5]
+ ]
+
+ xy_lib['F'] = [
+ [0, -1],
+ [0, 0],
+ [0, 1],
+ [0, 2],
+ [-1, -1],
+ # [-1, 2],
+ [-2, -1],
+ [-2, .5],
+ # [-2, 2],
+ [-1, .5]
+ ]
+
+ xy_lib['c'] = [
+ [0.8,1],
+ # [-0.8,1],
+ [0,0.1],
+ [0,1.9],
+ ]
+
+ xy_lib['e'] = [
+ [0, -1],
+ [0, 0],
+ [0, 1],
+ [0, 2],
+ [-1, -1],
+ [-1, 2],
+ [-2, -1],
+ [-2, .5],
+ [-2, 2],
+ [-1, .5]
+ ]
+ xy_lib['n'] = [
+ [0,1],
+ [0,-1],
+ [0,0.1],
+ [0,1.9],
+ [-1,0],
+ [-2,1],
+ [-3,-1],
+ [-3,1],
+ [-3,0.1],
+ [-3,1.9],
+ ]
+ offset_x = dict(B=4, R=4, C=4, F=4, c=3, s=4, e=4, n=4.8)
+ s = 'BeRFsCene'
+ objs = []
+ offset = 2
+ for idx, c in enumerate(s):
+ xy = xy_lib[c]
+
+
+ color = np.random.choice(COLOR_NAME_LIST)
+ for i in range(len(xy)):
+ # while 1:
+ # is_ok = 1
+ # x, y =
+
+ # for prev_x, prev_y in zip(xpool, ypool):
+ x, y = xy[i]
+ y *= 1.5
+ y -= 0.5
+ x -= offset
+ z = 0.35
+ # if idx<4:
+ # color = np.random.choice(COLOR_NAME_LIST[:-1])
+ # else:
+ # color = 'blue'
+ shape = 'cube'
+ material = 'rubber'
+ rot = 0
+ objs.append([x, y, z, shape, color, material, rot])
+ offset += offset_x[c]
+ Image.fromarray((255 * .8 - get_bev_from_objs(objs)[0] *.8 * 255).astype(np.uint8))
+
+ batch_size = 1
+ code = torch.randn(1, G.z_dim).cuda()
+ to_pil = torchvision.transforms.ToPILImage()
+ large_bevs = torch.tensor(get_bev_from_objs(objs)).cuda()[None]
+ bevs = large_bevs[..., 0: 0+256]
+ RT = torch.tensor([[ -1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, -0.8660,
+ 10.3923, 0.0000, -0.8660, -0.5000, 6.0000, 0.0000, 0.0000,
+ 0.0000, 1.0000, 262.5000, 0.0000, 32.0000, 0.0000, 262.5000,
+ 32.0000, 0.0000, 0.0000, 1.0000]], device='cuda')
+
+ print('prepare finish', flush=True)
+
+ def inference(name):
+ print('inference', name, flush=True)
+ gen = G(code, RT, bevs)
+ rgb = gen['gen_output']['image'][0] * .5 + .5
+ print('inference', name, flush=True)
+ return np.array(to_pil(rgb))
+
+ # to_pil(rgb).save('tmp.png')
+ # save_path = '/mnt/petrelfs/zhangqihang/code/3d-scene-gen/tmp.png'
+ # return [save_path]
+
+ with gr.Blocks() as demo:
+ gr.HTML(
+ """
+ abc
+ """)
+
+ with gr.Group():
+ with gr.Row():
+ with gr.Column():
+ with gr.Row():
+ with gr.Column():
+ with gr.Row():
+ num_frames = gr.Dropdown(["24 - frames", "32 - frames", "40 - frames", "48 - frames", "56 - frames", "80 - recommended to run on local GPUs", "240 - recommended to run on local GPUs", "600 - recommended to run on local GPUs", "1200 - recommended to run on local GPUs", "10000 - recommended to run on local GPUs"], label="Number of Video Frames", info="For >56 frames use local workstation!", value="24 - frames")
+
+ with gr.Row():
+ with gr.Row():
+ btn = gr.Button("Result")
+
+ gallery = gr.Image(label='img', show_label=True, elem_id="gallery")
+
+ btn.click(fn=inference, inputs=num_frames, outputs=[gallery], postprocess=False)
+
+ demo.queue()
+ demo.launch(server_name='0.0.0.0', server_port=10093, debug=True, show_error=True)
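For readers tracing how the demo builds its conditioning signal: `get_bev_from_objs` rasterizes each object into a 14-channel bird's-eye-view canvas (one occupancy channel plus one-hot color, shape and material channels), and `inference` feeds a 256-pixel-wide crop of that canvas to the generator together with the fixed camera tensor `RT`. Note that `COLOR_NAME_LIST` lists 'purple' twice and drops the 'brown' entry shown in the commented-out original, so 'brown' objects can no longer be encoded and `.index('purple')` always resolves to the first occurrence. Below is a minimal sketch of the canvas contract; it assumes the helper functions and name lists above are copied into a scratch script (importing `app.py` directly would also load the checkpoint and require a GPU).

    # Illustrative sketch of the BEV canvas produced by get_bev_from_objs.
    # Assumes trans, get_bev_from_objs, COLOR_NAME_LIST, SHAPE_NAME_LIST and
    # MATERIAL_NAME_LIST from app.py are available in the current scope.
    import numpy as np

    objs = [
        # x, y, z, shape, color, material, rotation in degrees
        [0.0, 0.0, 0.35, 'cube', 'red', 'rubber', 0],
        [-2.0, 1.0, 0.35, 'sphere', 'blue', 'metal', 0],
    ]
    canvas = get_bev_from_objs(objs, length=256, scale=6)
    print(canvas.shape)          # (14, 256, 1536): 1 occupancy + 8 colors + 3 shapes + 2 materials
    print(np.unique(canvas[0]))  # occupancy channel is 0/1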
models/__init__.py ADDED
@@ -0,0 +1,63 @@
+ # python3.7
+ """Collects all models."""
+
+ from .pggan_generator import PGGANGenerator
+ from .pggan_discriminator import PGGANDiscriminator
+ from .stylegan_generator import StyleGANGenerator
+ from .stylegan_discriminator import StyleGANDiscriminator
+ from .stylegan2_generator import StyleGAN2Generator
+ from .stylegan2_discriminator import StyleGAN2Discriminator
+ from .stylegan3_generator import StyleGAN3Generator
+ from .ghfeat_encoder import GHFeatEncoder
+ from .perceptual_model import PerceptualModel
+ from .inception_model import InceptionModel
+ from .eg3d_generator import EG3DGenerator
+ from .eg3d_discriminator import DualDiscriminator
+ from .pigan_generator import PiGANGenerator
+ from .pigan_discriminator import PiGANDiscriminator
+ from .volumegan_generator import VolumeGANGenerator
+ from .volumegan_discriminator import VolumeGANDiscriminator
+ from .eg3d_generator_fv import EG3DGeneratorFV
+ from .bev3d_generator import BEV3DGenerator
+ from .sgbev3d_generator import SGBEV3DGenerator
+
+ __all__ = ['build_model']
+
+ _MODELS = {
+ 'PGGANGenerator': PGGANGenerator,
+ 'PGGANDiscriminator': PGGANDiscriminator,
+ 'StyleGANGenerator': StyleGANGenerator,
+ 'StyleGANDiscriminator': StyleGANDiscriminator,
+ 'StyleGAN2Generator': StyleGAN2Generator,
+ 'StyleGAN2Discriminator': StyleGAN2Discriminator,
+ 'StyleGAN3Generator': StyleGAN3Generator,
+ 'GHFeatEncoder': GHFeatEncoder,
+ 'PerceptualModel': PerceptualModel.build_model,
+ 'InceptionModel': InceptionModel.build_model,
+ 'EG3DGenerator': EG3DGenerator,
+ 'EG3DDiscriminator': DualDiscriminator,
+ 'PiGANGenerator': PiGANGenerator,
+ 'PiGANDiscriminator': PiGANDiscriminator,
+ 'VolumeGANGenerator': VolumeGANGenerator,
+ 'VolumeGANDiscriminator': VolumeGANDiscriminator,
+ 'EG3DGeneratorFV': EG3DGeneratorFV,
+ 'BEV3DGenerator': BEV3DGenerator,
+ 'SGBEV3DGenerator': SGBEV3DGenerator,
+ }
+
+
+ def build_model(model_type, **kwargs):
+ """Builds a model based on its class type.
+
+ Args:
+ model_type: Class type to which the model belongs, which is case
+ sensitive.
+ **kwargs: Additional arguments to build the model.
+
+ Raises:
+ ValueError: If the `model_type` is not supported.
+ """
+ if model_type not in _MODELS:
+ raise ValueError(f'Invalid model type: `{model_type}`!\n'
+ f'Types allowed: {list(_MODELS)}.')
+ return _MODELS[model_type](**kwargs)
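`build_model` is the factory the demo relies on: `app.py` restores the generator by expanding the `model_kwargs_init` dictionary saved in the checkpoint, which therefore must include the `model_type` key. A minimal sketch of that pattern follows; the checkpoint filename is a placeholder.

    # Illustrative usage of the build_model factory, mirroring app.py.
    import torch
    from models import build_model

    state = torch.load('berfscene_clevr.pth', map_location='cpu')  # placeholder path
    G = build_model(**state['model_kwargs_init']['generator_smooth'])
    G.load_state_dict(state['models']['generator_smooth'], strict=False)
    G.eval()

    # Unknown types fail fast with a ValueError listing the supported names:
    # build_model('NoSuchModel')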
models/__pycache__/__init__.cpython-37.pyc ADDED (binary file, 2.06 kB)
models/__pycache__/__init__.cpython-39.pyc ADDED (binary file, 2.08 kB)
models/__pycache__/bev3d_generator.cpython-37.pyc ADDED (binary file, 6.16 kB)
models/__pycache__/bev3d_generator.cpython-39.pyc ADDED (binary file, 6.07 kB)
models/__pycache__/eg3d_discriminator.cpython-37.pyc ADDED (binary file, 8.01 kB)
models/__pycache__/eg3d_discriminator.cpython-39.pyc ADDED (binary file, 7.73 kB)
models/__pycache__/eg3d_generator.cpython-37.pyc ADDED (binary file, 6.21 kB)
models/__pycache__/eg3d_generator.cpython-39.pyc ADDED (binary file, 6.3 kB)
models/__pycache__/eg3d_generator_fv.cpython-37.pyc ADDED (binary file, 6.35 kB)
models/__pycache__/eg3d_generator_fv.cpython-39.pyc ADDED (binary file, 6.43 kB)
models/__pycache__/ghfeat_encoder.cpython-37.pyc ADDED (binary file, 14.3 kB)
models/__pycache__/ghfeat_encoder.cpython-39.pyc ADDED (binary file, 14.1 kB)
models/__pycache__/inception_model.cpython-37.pyc ADDED (binary file, 16 kB)
models/__pycache__/inception_model.cpython-39.pyc ADDED (binary file, 15.7 kB)
models/__pycache__/perceptual_model.cpython-37.pyc ADDED (binary file, 14.3 kB)
models/__pycache__/perceptual_model.cpython-39.pyc ADDED (binary file, 14 kB)
models/__pycache__/pggan_discriminator.cpython-37.pyc ADDED (binary file, 12 kB)
models/__pycache__/pggan_discriminator.cpython-39.pyc ADDED (binary file, 11.9 kB)
models/__pycache__/pggan_generator.cpython-37.pyc ADDED (binary file, 10.6 kB)
models/__pycache__/pggan_generator.cpython-39.pyc ADDED (binary file, 10.6 kB)
models/__pycache__/pigan_discriminator.cpython-37.pyc ADDED (binary file, 8.32 kB)
models/__pycache__/pigan_discriminator.cpython-39.pyc ADDED (binary file, 8.31 kB)
models/__pycache__/pigan_generator.cpython-37.pyc ADDED (binary file, 12.7 kB)
models/__pycache__/pigan_generator.cpython-39.pyc ADDED (binary file, 12.8 kB)
models/__pycache__/sgbev3d_generator.cpython-37.pyc ADDED (binary file, 7.01 kB)
models/__pycache__/sgbev3d_generator.cpython-39.pyc ADDED (binary file, 7.04 kB)
models/__pycache__/stylegan2_discriminator.cpython-37.pyc ADDED (binary file, 17.7 kB)
models/__pycache__/stylegan2_discriminator.cpython-39.pyc ADDED (binary file, 17.7 kB)
models/__pycache__/stylegan2_generator.cpython-37.pyc ADDED (binary file, 32.9 kB)
models/__pycache__/stylegan2_generator.cpython-39.pyc ADDED (binary file, 32.9 kB)
models/__pycache__/stylegan3_generator.cpython-37.pyc ADDED (binary file, 35.8 kB)
models/__pycache__/stylegan3_generator.cpython-39.pyc ADDED (binary file, 35.7 kB)
models/__pycache__/stylegan_discriminator.cpython-37.pyc ADDED (binary file, 15.9 kB)
models/__pycache__/stylegan_discriminator.cpython-39.pyc ADDED (binary file, 15.9 kB)
models/__pycache__/stylegan_generator.cpython-37.pyc ADDED (binary file, 24.9 kB)
models/__pycache__/stylegan_generator.cpython-39.pyc ADDED (binary file, 24.9 kB)
models/__pycache__/volumegan_discriminator.cpython-37.pyc ADDED (binary file, 17.8 kB)
models/__pycache__/volumegan_discriminator.cpython-39.pyc ADDED (binary file, 17.8 kB)
models/__pycache__/volumegan_generator.cpython-37.pyc ADDED (binary file, 18.2 kB)
models/__pycache__/volumegan_generator.cpython-39.pyc ADDED (binary file, 18.2 kB)
models/bev3d_generator.py ADDED
@@ -0,0 +1,301 @@
1
+ # python3.8
2
+ """Contains the implementation of generator described in BEV3D."""
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from models.utils.official_stylegan2_model_helper import Generator as StyleGAN2Backbone
7
+ from models.utils.official_stylegan2_model_helper import FullyConnectedLayer
8
+ from models.utils.eg3d_superres import SuperresolutionHybrid2X
9
+ from models.utils.eg3d_superres import SuperresolutionHybrid4X
10
+ from models.utils.eg3d_superres import SuperresolutionHybrid4X_conststyle
11
+ from models.utils.eg3d_superres import SuperresolutionHybrid8XDC
12
+ from models.rendering.renderer import Renderer
13
+ from models.rendering.feature_extractor import FeatureExtractor
14
+
15
+ from models.utils.spade import SPADEGenerator
16
+
17
+ class BEV3DGenerator(nn.Module):
18
+
19
+ def __init__(
20
+ self,
21
+ z_dim,
22
+ semantic_nc,
23
+ ngf,
24
+ bev_grid_size,
25
+ aspect_ratio,
26
+ num_upsampling_layers,
27
+ not_use_vae,
28
+ norm_G,
29
+ img_resolution,
30
+ interpolate_sr,
31
+ segmask=False,
32
+ dim_seq='16,8,4,2,1',
33
+ xyz_pe=False,
34
+ hidden_dim=64,
35
+ additional_layer_num=0,
36
+ sr_num_fp16_res=0, # Number of fp16 layers of SR Network.
37
+ rendering_kwargs={}, # Arguments for rendering.
38
+ sr_kwargs={}, # Arguments for SuperResolution Network.
39
+ ):
40
+ super().__init__()
41
+
42
+ self.z_dim = z_dim
43
+ self.interpolate_sr = interpolate_sr
44
+ self.segmask = segmask
45
+
46
+ # Set up the overall renderer.
47
+ self.renderer = Renderer()
48
+
49
+ # Set up the feature extractor.
50
+ self.feature_extractor = FeatureExtractor(ref_mode='bev_plane_clevr', xyz_pe=xyz_pe)
51
+
52
+ # Set up the reference representation generator.
53
+ self.backbone = SPADEGenerator(z_dim=z_dim, semantic_nc=semantic_nc, ngf=ngf, dim_seq=dim_seq, bev_grid_size=bev_grid_size,
54
+ aspect_ratio=aspect_ratio, num_upsampling_layers=num_upsampling_layers,
55
+ not_use_vae=not_use_vae, norm_G=norm_G)
56
+ print('backbone SPADEGenerator set up!')
57
+
58
+ # Set up the post module in the feature extractor.
59
+ self.post_module = None
60
+
61
+ # Set up the post neural renderer.
62
+ self.post_neural_renderer = None
63
+ sr_kwargs_total = dict(
64
+ channels=32,
65
+ img_resolution=img_resolution,
66
+ sr_num_fp16_res=sr_num_fp16_res,
67
+ sr_antialias=rendering_kwargs['sr_antialias'],)
68
+ sr_kwargs_total.update(**sr_kwargs)
69
+ if img_resolution == 128:
70
+ self.post_neural_renderer = SuperresolutionHybrid2X(
71
+ **sr_kwargs_total)
72
+ elif img_resolution == 256:
73
+ self.post_neural_renderer = SuperresolutionHybrid4X_conststyle(
74
+ **sr_kwargs_total)
75
+ elif img_resolution == 512:
76
+ self.post_neural_renderer = SuperresolutionHybrid8XDC(
77
+ **sr_kwargs_total)
78
+ else:
79
+ raise TypeError(f'Unsupported image resolution: {img_resolution}!')
80
+
81
+ # Set up the fully-connected layer head.
82
+ self.fc_head = OSGDecoder(
83
+ 128 if xyz_pe else 64 , {
84
+ 'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1),
85
+ 'decoder_output_dim': 32
86
+ },
87
+ hidden_dim=hidden_dim,
88
+ additional_layer_num=additional_layer_num
89
+ )
90
+
91
+ # Set up some rendering related arguments.
92
+ self.neural_rendering_resolution = rendering_kwargs.get(
93
+ 'resolution', 64)
94
+ self.rendering_kwargs = rendering_kwargs
95
+
96
+ def synthesis(self,
97
+ z,
98
+ c,
99
+ seg,
100
+ neural_rendering_resolution=None,
101
+ update_emas=False,
102
+ **synthesis_kwargs):
103
+ cam2world_matrix = c[:, :16].view(-1, 4, 4)
104
+ if self.rendering_kwargs.get('random_pose', False):
105
+ cam2world_matrix = None
106
+
107
+ if neural_rendering_resolution is None:
108
+ neural_rendering_resolution = self.neural_rendering_resolution
109
+ else:
110
+ self.neural_rendering_resolution = neural_rendering_resolution
111
+
112
+ xy_planes = self.backbone(z=z, input=seg)
113
+ if self.segmask:
114
+ xy_planes = xy_planes * seg[:, 0, ...][:, None, ...]
115
+
116
+ # import pdb;pdb.set_trace()
117
+
118
+ wp = z # in our case, we do not use wp.
119
+
120
+ rendering_result = self.renderer(
121
+ wp=wp,
122
+ feature_extractor=self.feature_extractor,
123
+ rendering_options=self.rendering_kwargs,
124
+ cam2world_matrix=cam2world_matrix,
125
+ position_encoder=None,
126
+ ref_representation=xy_planes,
127
+ post_module=self.post_module,
128
+ fc_head=self.fc_head)
129
+
130
+ feature_samples = rendering_result['composite_rgb']
131
+ depth_samples = rendering_result['composite_depth']
132
+
133
+ # Reshape to keep consistent with 'raw' neural-rendered image.
134
+ N = wp.shape[0]
135
+ H = W = self.neural_rendering_resolution
136
+ feature_image = feature_samples.permute(0, 2, 1).reshape(
137
+ N, feature_samples.shape[-1], H, W).contiguous()
138
+ depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
139
+
140
+ # Run the post neural renderer to get final image.
141
+ # Here, the post neural renderer is a super-resolution network.
142
+ rgb_image = feature_image[:, :3]
143
+ if self.interpolate_sr:
144
+ sr_image = torch.nn.functional.interpolate(rgb_image, size=(256, 256), mode='bilinear', align_corners=False)
145
+ else:
146
+ sr_image = self.post_neural_renderer(
147
+ rgb_image,
148
+ feature_image,
149
+ # wp,
150
+ noise_mode=self.rendering_kwargs['superresolution_noise_mode'],
151
+ **{
152
+ k: synthesis_kwargs[k]
153
+ for k in synthesis_kwargs.keys() if k != 'noise_mode'
154
+ })
155
+
156
+ return {
157
+ 'image': sr_image,
158
+ 'image_raw': rgb_image,
159
+ 'image_depth': depth_image
160
+ }
161
+
162
+ def sample(self,
163
+ coordinates,
164
+ directions,
165
+ z,
166
+ c,
167
+ seg,
168
+ truncation_psi=1,
169
+ truncation_cutoff=None,
170
+ update_emas=False,
171
+ **synthesis_kwargs):
172
+ # Compute RGB features, density for arbitrary 3D coordinates.
173
+ # Mostly used for extracting shapes.
174
+ cam2world_matrix = c[:, :16].view(-1, 4, 4)
175
+ xy_planes = self.backbone(z=z, input=seg)
176
+ wp = z
177
+ result = self.renderer.get_sigma_rgb(
178
+ wp=wp,
179
+ points=coordinates,
180
+ feature_extractor=self.feature_extractor,
181
+ fc_head=self.fc_head,
182
+ rendering_options=self.rendering_kwargs,
183
+ ref_representation=xy_planes,
184
+ post_module=self.post_module,
185
+ ray_dirs=directions,
186
+ cam_matrix=cam2world_matrix)
187
+
188
+ return result
189
+
190
+ def sample_mixed(self,
191
+ coordinates,
192
+ directions,
193
+ z, c, seg,
194
+ truncation_psi=1,
195
+ truncation_cutoff=None,
196
+ update_emas=False,
197
+ **synthesis_kwargs):
198
+ # Same as function `self.sample()`, but expects latent vectors 'wp'
199
+ # instead of Gaussian noise 'z'.
200
+ cam2world_matrix = c[:, :16].view(-1, 4, 4)
201
+ xy_planes = self.backbone(z=z, input=seg)
202
+ wp = z
203
+ result = self.renderer.get_sigma_rgb(
204
+ wp=wp,
205
+ points=coordinates,
206
+ feature_extractor=self.feature_extractor,
207
+ fc_head=self.fc_head,
208
+ rendering_options=self.rendering_kwargs,
209
+ ref_representation=xy_planes,
210
+ post_module=self.post_module,
211
+ ray_dirs=directions,
212
+ cam_matrix=cam2world_matrix)
213
+
214
+ return result
215
+
216
+ def forward(self,
217
+ z,
218
+ c,
219
+ seg,
220
+ c_swapped=None, # `c_swapped` is swapped pose conditioning.
221
+ style_mixing_prob=0,
222
+ truncation_psi=1,
223
+ truncation_cutoff=None,
224
+ neural_rendering_resolution=None,
225
+ update_emas=False,
226
+ sample_mixed=False,
227
+ coordinates=None,
228
+ **synthesis_kwargs):
229
+
230
+ # Render a batch of generated images.
231
+ c_wp = c.clone()
232
+ if c_swapped is not None:
233
+ c_wp = c_swapped.clone()
234
+
235
+ if not sample_mixed:
236
+ gen_output = self.synthesis(
237
+ z,
238
+ c,
239
+ seg,
240
+ update_emas=update_emas,
241
+ neural_rendering_resolution=neural_rendering_resolution,
242
+ **synthesis_kwargs)
243
+
244
+ return {
245
+ 'wp': z,
246
+ 'gen_output': gen_output,
247
+ }
248
+
249
+ else:
250
+ # Only for density regularization in training process.
251
+ assert coordinates is not None
252
+ sample_sigma = self.sample_mixed(coordinates,
253
+ torch.randn_like(coordinates),
254
+ z, c, seg,
255
+ update_emas=False)['sigma']
256
+
257
+ return {
258
+ 'wp': z,
259
+ 'sample_sigma': sample_sigma
260
+ }
261
+
262
+
263
+ class OSGDecoder(nn.Module):
264
+ """Defines fully-connected layer head in EG3D."""
265
+ def __init__(self, n_features, options, hidden_dim=64, additional_layer_num=0):
266
+ super().__init__()
267
+ self.hidden_dim = hidden_dim
268
+
269
+ lst = []
270
+ lst.append(FullyConnectedLayer(n_features, self.hidden_dim, lr_multiplier=options['decoder_lr_mul']))
271
+ lst.append(nn.Softplus())
272
+ for i in range(additional_layer_num):
273
+ lst.append(FullyConnectedLayer(self.hidden_dim, self.hidden_dim, lr_multiplier=options['decoder_lr_mul']))
274
+ lst.append(nn.Softplus())
275
+ lst.append(FullyConnectedLayer(self.hidden_dim, 1+options['decoder_output_dim'], lr_multiplier=options['decoder_lr_mul']))
276
+ self.net = nn.Sequential(*lst)
277
+
278
+ # self.net = nn.Sequential(
279
+ # FullyConnectedLayer(n_features,
280
+ # self.hidden_dim,
281
+ # lr_multiplier=options['decoder_lr_mul']),
282
+ # nn.Softplus(),
283
+ # FullyConnectedLayer(self.hidden_dim,
284
+ # 1 + options['decoder_output_dim'],
285
+ # lr_multiplier=options['decoder_lr_mul']))
286
+
287
+ def forward(self, point_features, wp=None, dirs=None):
288
+ # Aggregate features
289
+ # point_features.shape: [N, R, K, C].
290
+ # Average across 'X, Y, Z' planes.
291
+
292
+ N, R, K, C = point_features.shape
293
+ x = point_features.reshape(-1, point_features.shape[-1])
294
+ x = self.net(x)
295
+ x = x.view(N, -1, x.shape[-1])
296
+
297
+ # Uses sigmoid clamping from MipNeRF
298
+ rgb = torch.sigmoid(x[..., 1:]) * (1 + 2 * 0.001) - 0.001
299
+ sigma = x[..., 0:1]
300
+
301
+ return {'rgb': rgb, 'sigma': sigma}
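The `OSGDecoder` used by `BEV3DGenerator` takes per-point features of shape `[N, R, K, C]` (batch, rays, samples per ray, channels, with `C = 128` when `xyz_pe` is enabled and 64 otherwise), flattens them through a small Softplus MLP, and splits the output into a sigmoid-clamped 32-channel RGB/feature vector plus an unbounded density. The toy sketch below mirrors that contract; `nn.Linear` stands in for the repo's `FullyConnectedLayer` so the snippet runs without the codebase, and the shapes are illustrative only.

    import torch
    import torch.nn as nn

    N, R, K, C, hidden, out_dim = 2, 4096, 48, 128, 64, 32
    net = nn.Sequential(nn.Linear(C, hidden), nn.Softplus(),
                        nn.Linear(hidden, 1 + out_dim))        # stand-in for the FullyConnectedLayer stack

    point_features = torch.randn(N, R, K, C)                   # [batch, rays, samples, channels]
    x = net(point_features.reshape(-1, C)).view(N, -1, 1 + out_dim)
    rgb = torch.sigmoid(x[..., 1:]) * (1 + 2 * 0.001) - 0.001  # clamped to (-0.001, 1.001), as in forward()
    sigma = x[..., 0:1]                                        # unbounded density
    print(rgb.shape, sigma.shape)                              # [2, 196608, 32] and [2, 196608, 1]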
models/eg3d_discriminator.py ADDED
@@ -0,0 +1,243 @@
1
+ # python 3.7
2
+ """Contains the implementation of discriminator described in EG3D."""
3
+
4
+
5
+ import numpy as np
6
+ import torch
7
+ from third_party.stylegan2_official_ops import upfirdn2d
8
+ from models.utils.official_stylegan2_model_helper import DiscriminatorBlock
9
+ from models.utils.official_stylegan2_model_helper import MappingNetwork
10
+ from models.utils.official_stylegan2_model_helper import DiscriminatorEpilogue
11
+
12
+
13
+ class SingleDiscriminator(torch.nn.Module):
14
+ def __init__(self,
15
+ c_dim, # Conditioning label (C) dimensionality.
16
+ img_resolution, # Input resolution.
17
+ img_channels, # Number of input color channels.
18
+ architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'.
19
+ channel_base = 32768, # Overall multiplier for the number of channels.
20
+ channel_max = 512, # Maximum number of channels in any layer.
21
+ num_fp16_res = 4, # Use FP16 for the N highest resolutions.
22
+ conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping.
23
+ cmap_dim = None, # Dimensionality of mapped conditioning label, None = default.
24
+ sr_upsample_factor = 1, # Ignored for SingleDiscriminator
25
+ block_kwargs = {}, # Arguments for DiscriminatorBlock.
26
+ mapping_kwargs = {}, # Arguments for MappingNetwork.
27
+ epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue.
28
+ ):
29
+ super().__init__()
30
+ self.c_dim = c_dim
31
+ self.img_resolution = img_resolution
32
+ self.img_resolution_log2 = int(np.log2(img_resolution))
33
+ self.img_channels = img_channels
34
+ self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)]
35
+ channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]}
36
+ fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)
37
+
38
+ if cmap_dim is None:
39
+ cmap_dim = channels_dict[4]
40
+ if c_dim == 0:
41
+ cmap_dim = 0
42
+
43
+ common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp)
44
+ cur_layer_idx = 0
45
+ for res in self.block_resolutions:
46
+ in_channels = channels_dict[res] if res < img_resolution else 0
47
+ tmp_channels = channels_dict[res]
48
+ out_channels = channels_dict[res // 2]
49
+ use_fp16 = (res >= fp16_resolution)
50
+ block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res,
51
+ first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs)
52
+ setattr(self, f'b{res}', block)
53
+ cur_layer_idx += block.num_layers
54
+ if c_dim > 0:
55
+ self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs)
56
+ self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs)
57
+
58
+ def forward(self, img, c, update_emas=False, **block_kwargs):
59
+ img = img['image']
60
+
61
+ _ = update_emas # unused
62
+ x = None
63
+ for res in self.block_resolutions:
64
+ block = getattr(self, f'b{res}')
65
+ x, img = block(x, img, **block_kwargs)
66
+
67
+ cmap = None
68
+ if self.c_dim > 0:
69
+ cmap = self.mapping(None, c)
70
+ x = self.b4(x, img, cmap)
71
+ return x
72
+
73
+ def extra_repr(self):
74
+ return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}'
75
+
76
+ #----------------------------------------------------------------------------
77
+
78
+ def filtered_resizing(image_orig_tensor, size, f, filter_mode='antialiased'):
79
+ if filter_mode == 'antialiased':
80
+ ada_filtered_64 = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False)
81
+ elif filter_mode == 'classic':
82
+ ada_filtered_64 = upfirdn2d.upsample2d(image_orig_tensor, f, up=2)
83
+ ada_filtered_64 = torch.nn.functional.interpolate(ada_filtered_64, size=(size * 2 + 2, size * 2 + 2), mode='bilinear', align_corners=False)
84
+ ada_filtered_64 = upfirdn2d.downsample2d(ada_filtered_64, f, down=2, flip_filter=True, padding=-1)
85
+ elif filter_mode == 'none':
86
+ ada_filtered_64 = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False)
87
+ elif type(filter_mode) == float:
88
+ assert 0 < filter_mode < 1
89
+
90
+ filtered = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False)
91
+ aliased = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False)
92
+ ada_filtered_64 = (1 - filter_mode) * aliased + (filter_mode) * filtered
93
+
94
+ return ada_filtered_64
95
+
96
+ #----------------------------------------------------------------------------
97
+
98
+ class DualDiscriminator(torch.nn.Module):
99
+ def __init__(self,
100
+ c_dim, # Conditioning label (C) dimensionality.
101
+ img_resolution, # Input resolution.
102
+ img_channels, # Number of input color channels.
103
+ bev_channels = 0,
104
+ architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'.
105
+ channel_base = 32768, # Overall multiplier for the number of channels.
106
+ channel_max = 512, # Maximum number of channels in any layer.
107
+ num_fp16_res = 4, # Use FP16 for the N highest resolutions.
108
+ conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping.
109
+ cmap_dim = None, # Dimensionality of mapped conditioning label, None = default.
110
+ disc_c_noise = 0, # Corrupt camera parameters with X std dev of noise before disc. pose conditioning.
111
+ block_kwargs = {}, # Arguments for DiscriminatorBlock.
112
+ mapping_kwargs = {}, # Arguments for MappingNetwork.
113
+ epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue.
114
+ ):
115
+ super().__init__()
116
+ img_channels *= 2
117
+
118
+ self.c_dim = c_dim
119
+ self.img_resolution = img_resolution
120
+ self.img_resolution_log2 = int(np.log2(img_resolution))
121
+ self.img_channels = img_channels + bev_channels
122
+ self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)]
123
+ channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]}
124
+ fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)
125
+
126
+ if cmap_dim is None:
127
+ cmap_dim = channels_dict[4]
128
+ if c_dim == 0:
129
+ cmap_dim = 0
130
+
131
+ common_kwargs = dict(img_channels=self.img_channels, architecture=architecture, conv_clamp=conv_clamp)
132
+ cur_layer_idx = 0
133
+ for res in self.block_resolutions:
134
+ in_channels = channels_dict[res] if res < img_resolution else 0
135
+ tmp_channels = channels_dict[res]
136
+ out_channels = channels_dict[res // 2]
137
+ use_fp16 = (res >= fp16_resolution)
138
+ block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res,
139
+ first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs)
140
+ setattr(self, f'b{res}', block)
141
+ cur_layer_idx += block.num_layers
142
+ if c_dim > 0:
143
+ self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs)
144
+ self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs)
145
+ self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1]))
146
+ self.disc_c_noise = disc_c_noise
147
+
148
+ def forward(self, img, c, bev=None, update_emas=False, **block_kwargs):
149
+ image_raw = filtered_resizing(img['image_raw'], size=img['image'].shape[-1], f=self.resample_filter)
150
+ img = torch.cat([img['image'], image_raw], 1)
151
+ if bev is not None:
152
+ img = torch.cat([img, bev], 1)
153
+
154
+ _ = update_emas # unused
155
+ x = None
156
+ for res in self.block_resolutions:
157
+ block = getattr(self, f'b{res}')
158
+ x, img = block(x, img, **block_kwargs)
159
+
160
+ cmap = None
161
+ if self.c_dim > 0:
162
+ if self.disc_c_noise > 0: c += torch.randn_like(c) * c.std(0) * self.disc_c_noise
163
+ cmap = self.mapping(None, c)
164
+ x = self.b4(x, img, cmap)
165
+ return x
166
+
167
+ def extra_repr(self):
168
+ return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}'
169
+
170
+ #----------------------------------------------------------------------------
171
+
172
+ class DummyDualDiscriminator(torch.nn.Module):
173
+ def __init__(self,
174
+ c_dim, # Conditioning label (C) dimensionality.
175
+ img_resolution, # Input resolution.
176
+ img_channels, # Number of input color channels.
177
+ architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'.
178
+ channel_base = 32768, # Overall multiplier for the number of channels.
179
+ channel_max = 512, # Maximum number of channels in any layer.
180
+ num_fp16_res = 4, # Use FP16 for the N highest resolutions.
181
+ conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping.
182
+ cmap_dim = None, # Dimensionality of mapped conditioning label, None = default.
183
+ block_kwargs = {}, # Arguments for DiscriminatorBlock.
184
+ mapping_kwargs = {}, # Arguments for MappingNetwork.
185
+ epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue.
186
+ ):
187
+ super().__init__()
188
+ img_channels *= 2
189
+
190
+ self.c_dim = c_dim
191
+ self.img_resolution = img_resolution
192
+ self.img_resolution_log2 = int(np.log2(img_resolution))
193
+ self.img_channels = img_channels
194
+ self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)]
195
+ channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]}
196
+ fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)
197
+
198
+ if cmap_dim is None:
199
+ cmap_dim = channels_dict[4]
200
+ if c_dim == 0:
201
+ cmap_dim = 0
202
+
203
+ common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp)
204
+ cur_layer_idx = 0
205
+ for res in self.block_resolutions:
206
+ in_channels = channels_dict[res] if res < img_resolution else 0
207
+ tmp_channels = channels_dict[res]
208
+ out_channels = channels_dict[res // 2]
209
+ use_fp16 = (res >= fp16_resolution)
210
+ block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res,
211
+ first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs)
212
+ setattr(self, f'b{res}', block)
213
+ cur_layer_idx += block.num_layers
214
+ if c_dim > 0:
215
+ self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs)
216
+ self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs)
217
+ self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1]))
218
+
219
+ self.raw_fade = 1
220
+
221
+ def forward(self, img, c, update_emas=False, **block_kwargs):
222
+ self.raw_fade = max(0, self.raw_fade - 1/(500000/32))
223
+
224
+ image_raw = filtered_resizing(img['image_raw'], size=img['image'].shape[-1], f=self.resample_filter) * self.raw_fade
225
+ img = torch.cat([img['image'], image_raw], 1)
226
+
227
+ _ = update_emas # unused
228
+ x = None
229
+ for res in self.block_resolutions:
230
+ block = getattr(self, f'b{res}')
231
+ x, img = block(x, img, **block_kwargs)
232
+
233
+ cmap = None
234
+ if self.c_dim > 0:
235
+ cmap = self.mapping(None, c)
236
+ x = self.b4(x, img, cmap)
237
+ return x
238
+
239
+ def extra_repr(self):
240
+ return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}'
241
+
242
+ #----------------------------------------------------------------------------
243
+
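`DualDiscriminator` scores the super-resolved image together with a bilinearly resized copy of the raw neural-rendered image (and, optionally, the BEV map), so its effective input channel count is doubled before any BEV channels are added; the per-resolution channel widths follow the StyleGAN2-style schedule computed in the constructor. The sketch below reproduces that arithmetic for a 256x256, 3-channel configuration; the numbers are derived from the code above, not from a training run.

    import numpy as np

    img_resolution, img_channels, channel_base, channel_max = 256, 3, 32768, 512
    img_channels *= 2                                    # SR image concatenated with the resized raw image
    log2_res = int(np.log2(img_resolution))
    block_resolutions = [2 ** i for i in range(log2_res, 2, -1)]
    channels_dict = {res: min(channel_base // res, channel_max)
                     for res in block_resolutions + [4]}
    print(block_resolutions)   # [256, 128, 64, 32, 16, 8]
    print(channels_dict)       # {256: 128, 128: 256, 64: 512, 32: 512, 16: 512, 8: 512, 4: 512}
    print('input channels:', img_channels)               # 6, plus bev_channels when a BEV map is passed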
models/eg3d_generator.py ADDED
@@ -0,0 +1,315 @@
1
+ # python3.8
2
+ """Contains the implementation of generator described in EG3D."""
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from models.utils.official_stylegan2_model_helper import Generator as StyleGAN2Backbone
7
+ from models.utils.official_stylegan2_model_helper import FullyConnectedLayer
8
+ from models.utils.eg3d_superres import SuperresolutionHybrid2X
9
+ from models.utils.eg3d_superres import SuperresolutionHybrid4X
10
+ from models.utils.eg3d_superres import SuperresolutionHybrid8XDC
11
+ from models.rendering.renderer import Renderer
12
+ from models.rendering.feature_extractor import FeatureExtractor
13
+
14
+ class EG3DGenerator(nn.Module):
15
+
16
+ def __init__(
17
+ self,
18
+ z_dim, # Input latent (Z) dimensionality.
19
+ c_dim, # Conditioning label (C) dimensionality.
20
+ w_dim, # Intermediate latent (W) dimensionality.
21
+ img_resolution, # Output resolution.
22
+ img_channels, # Number of output color channels.
23
+ sr_num_fp16_res=0, # Number of fp16 layers of SR Network.
24
+ mapping_kwargs={}, # Arguments for MappingNetwork.
25
+ rendering_kwargs={}, # Arguments for rendering.
26
+ sr_kwargs={}, # Arguments for SuperResolution Network.
27
+ **synthesis_kwargs, # Arguments for SynthesisNetwork.
28
+ ):
29
+ super().__init__()
30
+ self.z_dim = z_dim
31
+ self.c_dim = c_dim
32
+ self.w_dim = w_dim
33
+ self.img_resolution = img_resolution
34
+ self.img_channels = img_channels
35
+
36
+ # Set up the overall renderer.
37
+ self.renderer = Renderer()
38
+
39
+ # Set up the feature extractor.
40
+ self.feature_extractor = FeatureExtractor(ref_mode='tri_plane')
41
+
42
+ # Set up the reference representation generator.
43
+ self.backbone = StyleGAN2Backbone(z_dim,
44
+ c_dim,
45
+ w_dim,
46
+ img_resolution=256,
47
+ img_channels=32 * 3,
48
+ mapping_kwargs=mapping_kwargs,
49
+ **synthesis_kwargs)
50
+
51
+ # Set up the post module in the feature extractor.
52
+ self.post_module = None
53
+
54
+ # Set up the post neural renderer.
55
+ self.post_neural_renderer = None
56
+ sr_kwargs_total = dict(
57
+ channels=32,
58
+ img_resolution=img_resolution,
59
+ sr_num_fp16_res=sr_num_fp16_res,
60
+ sr_antialias=rendering_kwargs['sr_antialias'],)
61
+ sr_kwargs_total.update(**sr_kwargs)
62
+ if img_resolution == 128:
63
+ self.post_neural_renderer = SuperresolutionHybrid2X(
64
+ **sr_kwargs_total)
65
+ elif img_resolution == 256:
66
+ self.post_neural_renderer = SuperresolutionHybrid4X(
67
+ **sr_kwargs_total)
68
+ elif img_resolution == 512:
69
+ self.post_neural_renderer = SuperresolutionHybrid8XDC(
70
+ **sr_kwargs_total)
71
+ else:
72
+ raise TypeError(f'Unsupported image resolution: {img_resolution}!')
73
+
74
+ # Set up the fully-connected layer head.
75
+ self.fc_head = OSGDecoder(
76
+ 32, {
77
+ 'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1),
78
+ 'decoder_output_dim': 32
79
+ })
80
+
81
+ # Set up some rendering related arguments.
82
+ self.neural_rendering_resolution = rendering_kwargs.get(
83
+ 'resolution', 64)
84
+ self.rendering_kwargs = rendering_kwargs
85
+
86
+ def mapping(self,
87
+ z,
88
+ c,
89
+ truncation_psi=1,
90
+ truncation_cutoff=None,
91
+ update_emas=False):
92
+ if self.rendering_kwargs['c_gen_conditioning_zero']:
93
+ c = torch.zeros_like(c)
94
+ return self.backbone.mapping(z,
95
+ c *
96
+ self.rendering_kwargs.get('c_scale', 0),
97
+ truncation_psi=truncation_psi,
98
+ truncation_cutoff=truncation_cutoff,
99
+ update_emas=update_emas)
100
+
101
+ def synthesis(self,
102
+ wp,
103
+ c,
104
+ neural_rendering_resolution=None,
105
+ update_emas=False,
106
+ **synthesis_kwargs):
107
+ cam2world_matrix = c[:, :16].view(-1, 4, 4)
108
+ if self.rendering_kwargs.get('random_pose', False):
109
+ cam2world_matrix = None
110
+
111
+ if neural_rendering_resolution is None:
112
+ neural_rendering_resolution = self.neural_rendering_resolution
113
+ else:
114
+ self.neural_rendering_resolution = neural_rendering_resolution
115
+
116
+ tri_planes = self.backbone.synthesis(wp,
117
+ update_emas=update_emas,
118
+ **synthesis_kwargs)
119
+ tri_planes = tri_planes.view(len(tri_planes), 3, -1,
120
+ tri_planes.shape[-2],
121
+ tri_planes.shape[-1])
122
+
123
+ rendering_result = self.renderer(
124
+ wp=wp,
125
+ feature_extractor=self.feature_extractor,
126
+ rendering_options=self.rendering_kwargs,
127
+ cam2world_matrix=cam2world_matrix,
128
+ position_encoder=None,
129
+ ref_representation=tri_planes,
130
+ post_module=self.post_module,
131
+ fc_head=self.fc_head)
132
+
133
+ feature_samples = rendering_result['composite_rgb']
134
+ depth_samples = rendering_result['composite_depth']
135
+
136
+ # Reshape to keep consistent with 'raw' neural-rendered image.
137
+ N = wp.shape[0]
138
+ H = W = self.neural_rendering_resolution
139
+ feature_image = feature_samples.permute(0, 2, 1).reshape(
140
+ N, feature_samples.shape[-1], H, W).contiguous()
141
+ depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
142
+
143
+ # Run the post neural renderer to get final image.
144
+ # Here, the post neural renderer is a super-resolution network.
145
+ rgb_image = feature_image[:, :3]
146
+ sr_image = self.post_neural_renderer(
147
+ rgb_image,
148
+ feature_image,
149
+ wp,
150
+ noise_mode=self.rendering_kwargs['superresolution_noise_mode'],
151
+ **{
152
+ k: synthesis_kwargs[k]
153
+ for k in synthesis_kwargs.keys() if k != 'noise_mode'
154
+ })
155
+
156
+ return {
157
+ 'image': sr_image,
158
+ 'image_raw': rgb_image,
159
+ 'image_depth': depth_image
160
+ }
161
+
162
+ def sample(self,
163
+ coordinates,
164
+ directions,
165
+ z,
166
+ c,
167
+ truncation_psi=1,
168
+ truncation_cutoff=None,
169
+ update_emas=False,
170
+ **synthesis_kwargs):
171
+ # Compute RGB features, density for arbitrary 3D coordinates.
172
+ # Mostly used for extracting shapes.
173
+ wp = self.mapping(z,
174
+ c,
175
+ truncation_psi=truncation_psi,
176
+ truncation_cutoff=truncation_cutoff,
177
+ update_emas=update_emas)
178
+ tri_planes = self.backbone.synthesis(wp,
179
+ update_emas=update_emas,
180
+ **synthesis_kwargs)
181
+ tri_planes = tri_planes.view(len(tri_planes), 3, -1,
182
+ tri_planes.shape[-2],
183
+ tri_planes.shape[-1])
184
+ result = self.renderer.get_sigma_rgb(
185
+ wp=wp,
186
+ points=coordinates,
187
+ feature_extractor=self.feature_extractor,
188
+ fc_head=self.fc_head,
189
+ rendering_options=self.rendering_kwargs,
190
+ ref_representation=tri_planes,
191
+ post_module=self.post_module,
192
+ ray_dirs=directions)
193
+
194
+ return result
195
+
196
+ def sample_mixed(self,
197
+ coordinates,
198
+ directions,
199
+ wp,
200
+ truncation_psi=1,
201
+ truncation_cutoff=None,
202
+ update_emas=False,
203
+ **synthesis_kwargs):
204
+ # Same as function `self.sample()`, but expects latent vectors 'wp'
205
+ # instead of Gaussian noise 'z'.
206
+ tri_planes = self.backbone.synthesis(wp,
207
+ update_emas=update_emas,
208
+ **synthesis_kwargs)
209
+ tri_planes = tri_planes.view(len(tri_planes), 3, -1,
210
+ tri_planes.shape[-2],
211
+ tri_planes.shape[-1])
212
+
213
+ result = self.renderer.get_sigma_rgb(
214
+ wp=wp,
215
+ points=coordinates,
216
+ feature_extractor=self.feature_extractor,
217
+ fc_head=self.fc_head,
218
+ rendering_options=self.rendering_kwargs,
219
+ ref_representation=tri_planes,
220
+ post_module=self.post_module,
221
+ ray_dirs=directions)
222
+
223
+ return result
224
+
225
+ def forward(self,
226
+ z,
227
+ c,
228
+ c_swapped=None, # `c_swapped` is swapped pose conditioning.
229
+ style_mixing_prob=0,
230
+ truncation_psi=1,
231
+ truncation_cutoff=None,
232
+ neural_rendering_resolution=None,
233
+ update_emas=False,
234
+ sample_mixed=False,
235
+ coordinates=None,
236
+ **synthesis_kwargs):
237
+
238
+ # Render a batch of generated images.
239
+ c_wp = c.clone()
240
+ if c_swapped is not None:
241
+ c_wp = c_swapped.clone()
242
+ wp = self.mapping(z,
243
+ c_wp,
244
+ truncation_psi=truncation_psi,
245
+ truncation_cutoff=truncation_cutoff,
246
+ update_emas=update_emas)
247
+ if style_mixing_prob > 0:
248
+ cutoff = torch.empty([], dtype=torch.int64,
249
+ device=wp.device).random_(1, wp.shape[1])
250
+ cutoff = torch.where(
251
+ torch.rand([], device=wp.device) < style_mixing_prob,
252
+ cutoff, torch.full_like(cutoff, wp.shape[1]))
253
+ wp[:, cutoff:] = self.mapping(torch.randn_like(z),
254
+ c,
255
+ update_emas=update_emas)[:, cutoff:]
256
+ if not sample_mixed:
257
+ gen_output = self.synthesis(
258
+ wp,
259
+ c,
260
+ update_emas=update_emas,
261
+ neural_rendering_resolution=neural_rendering_resolution,
262
+ **synthesis_kwargs)
263
+
264
+ return {
265
+ 'wp': wp,
266
+ 'gen_output': gen_output,
267
+ }
268
+
269
+ else:
270
+ # Only for density regularization in training process.
271
+ assert coordinates is not None
272
+ sample_sigma = self.sample_mixed(coordinates,
273
+ torch.randn_like(coordinates),
274
+ wp,
275
+ update_emas=False)['sigma']
276
+
277
+ return {
278
+ 'wp': wp,
279
+ 'sample_sigma': sample_sigma
280
+ }
281
+
282
+
283
+ class OSGDecoder(nn.Module):
284
+ """Defines fully-connected layer head in EG3D."""
285
+ def __init__(self, n_features, options):
286
+ super().__init__()
287
+ self.hidden_dim = 64
288
+
289
+ self.net = nn.Sequential(
290
+ FullyConnectedLayer(n_features,
291
+ self.hidden_dim,
292
+ lr_multiplier=options['decoder_lr_mul']),
293
+ nn.Softplus(),
294
+ FullyConnectedLayer(self.hidden_dim,
295
+ 1 + options['decoder_output_dim'],
296
+ lr_multiplier=options['decoder_lr_mul']))
297
+
298
+ def forward(self, point_features, wp=None, dirs=None):
299
+ # Aggregate features
300
+ # point_features.shape: [N, 3, M, C].
301
+ # Average across 'X, Y, Z' planes.
302
+ point_features = point_features.mean(1)
303
+ x = point_features
304
+
305
+ N, M, C = x.shape
306
+ x = x.view(N * M, C)
307
+
308
+ x = self.net(x)
309
+ x = x.view(N, M, -1)
310
+
311
+ # Uses sigmoid clamping from MipNeRF
312
+ rgb = torch.sigmoid(x[..., 1:]) * (1 + 2 * 0.001) - 0.001
313
+ sigma = x[..., 0:1]
314
+
315
+ return {'rgb': rgb, 'sigma': sigma}
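Unlike the BEV decoder above, the `OSGDecoder` in `EG3DGenerator` receives features sampled from three axis-aligned planes, shaped `[N, 3, M, C]`, and simply averages them across the plane axis before the MLP. A toy sketch of that aggregation step, with illustrative shapes:

    import torch

    N, M, C = 2, 4096, 32                  # batch, sampled points, channels per plane
    point_features = torch.randn(N, 3, M, C)
    x = point_features.mean(1)             # average across the X/Y/Z planes
    print(x.shape)                         # torch.Size([2, 4096, 32])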
models/eg3d_generator_fv.py ADDED
@@ -0,0 +1,320 @@
1
+ # python3.8
2
+ """Contains the implementation of generator described in EG3D."""
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import numpy as np
7
+ from models.utils.official_stylegan2_model_helper import MappingNetwork
8
+ from models.utils.official_stylegan2_model_helper import FullyConnectedLayer
9
+ from models.utils.eg3d_superres import SuperresolutionHybrid2X
10
+ from models.utils.eg3d_superres import SuperresolutionHybrid4X
11
+ from models.utils.eg3d_superres import SuperresolutionHybrid8XDC
12
+ from models.rendering.renderer import Renderer
13
+ from models.rendering.feature_extractor import FeatureExtractor
14
+ from models.volumegan_generator import FeatureVolume
15
+ from models.volumegan_generator import PositionEncoder
16
+
17
+
18
+ class EG3DGeneratorFV(nn.Module):
19
+
20
+ def __init__(
21
+ self,
22
+ # Input latent (Z) dimensionality.
23
+ z_dim,
24
+ # Conditioning label (C) dimensionality.
25
+ c_dim,
26
+ # Intermediate latent (W) dimensionality.
27
+ w_dim,
28
+ # Final output image resolution.
29
+ img_resolution,
30
+ # Number of output color channels.
31
+ img_channels,
32
+ # Number of fp16 layers of SR Network.
33
+ sr_num_fp16_res=0,
34
+ # Arguments for MappingNetwork.
35
+ mapping_kwargs={},
36
+ # Arguments for rendering.
37
+ rendering_kwargs={},
38
+ # Arguments for SuperResolution Network.
39
+ sr_kwargs={},
40
+ # Configs for FeatureVolume.
41
+ fv_cfg=dict(feat_res=32,
42
+ init_res=4,
43
+ base_channels=256,
44
+ output_channels=32,
45
+ w_dim=512),
46
+ # Configs for position encoder.
47
+ embed_cfg=dict(input_dim=3, max_freq_log2=10 - 1, N_freqs=10),
48
+ ):
49
+ super().__init__()
50
+ self.z_dim = z_dim
51
+ self.c_dim = c_dim
52
+ self.w_dim = w_dim
53
+ self.img_resolution = img_resolution
54
+ self.img_channels = img_channels
55
+
56
+ # Set up mapping network.
57
+ # Here `num_ws = 2`: one for FeatureVolume Network injection and one for
58
+ # post_neural_renderer injection.
59
+ num_ws = 2
60
+ self.mapping_network = MappingNetwork(z_dim=z_dim,
61
+ c_dim=c_dim,
62
+ w_dim=w_dim,
63
+ num_ws=num_ws,
64
+ **mapping_kwargs)
65
+
66
+ # Set up the overall renderer.
67
+ self.renderer = Renderer()
68
+
69
+ # Set up the feature extractor.
70
+ self.feature_extractor = FeatureExtractor(ref_mode='feature_volume')
71
+
72
+ # Set up the reference representation generator.
73
+ self.ref_representation_generator = FeatureVolume(**fv_cfg)
74
+
75
+ # Set up the position encoder.
76
+ self.position_encoder = PositionEncoder(**embed_cfg)
77
+
78
+ # Set up the post module in the feature extractor.
79
+ self.post_module = None
80
+
81
+ # Set up the post neural renderer.
82
+ self.post_neural_renderer = None
83
+ sr_kwargs_total = dict(
84
+ channels=32,
85
+ img_resolution=img_resolution,
86
+ sr_num_fp16_res=sr_num_fp16_res,
87
+ sr_antialias=rendering_kwargs['sr_antialias'],)
88
+ sr_kwargs_total.update(**sr_kwargs)
89
+ if img_resolution == 128:
90
+ self.post_neural_renderer = SuperresolutionHybrid2X(
91
+ **sr_kwargs_total)
92
+ elif img_resolution == 256:
93
+ self.post_neural_renderer = SuperresolutionHybrid4X(
94
+ **sr_kwargs_total)
95
+ elif img_resolution == 512:
96
+ self.post_neural_renderer = SuperresolutionHybrid8XDC(
97
+ **sr_kwargs_total)
98
+ else:
99
+ raise TypeError(f'Unsupported image resolution: {img_resolution}!')
100
+
101
+ # Set up the fully-connected layer head.
102
+ self.fc_head = OSGDecoder(
103
+ 32, {
104
+ 'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1),
105
+ 'decoder_output_dim': 32
106
+ })
107
+
108
+ # Set up some rendering related arguments.
109
+ self.neural_rendering_resolution = rendering_kwargs.get(
110
+ 'resolution', 64)
111
+ self.rendering_kwargs = rendering_kwargs
112
+
113
+ def mapping(self,
114
+ z,
115
+ c,
116
+ truncation_psi=1,
117
+ truncation_cutoff=None,
118
+ update_emas=False):
119
+ if self.rendering_kwargs['c_gen_conditioning_zero']:
120
+ c = torch.zeros_like(c)
121
+ return self.mapping_network(z,
122
+ c *
123
+ self.rendering_kwargs.get('c_scale', 0),
124
+ truncation_psi=truncation_psi,
125
+ truncation_cutoff=truncation_cutoff,
126
+ update_emas=update_emas)
127
+
128
+ def synthesis(self,
129
+ wp,
130
+ c,
131
+ neural_rendering_resolution=None,
132
+ update_emas=False,
133
+ **synthesis_kwargs):
134
+ cam2world_matrix = c[:, :16].view(-1, 4, 4)
135
+ if self.rendering_kwargs.get('random_pose', False):
136
+ cam2world_matrix = None
137
+
138
+ if neural_rendering_resolution is None:
139
+ neural_rendering_resolution = self.neural_rendering_resolution
140
+ else:
141
+ self.neural_rendering_resolution = neural_rendering_resolution
142
+
143
+ feature_volume = self.ref_representation_generator(wp)
144
+
145
+ rendering_result = self.renderer(
146
+ wp=wp,
147
+ feature_extractor=self.feature_extractor,
148
+ rendering_options=self.rendering_kwargs,
149
+ cam2world_matrix=cam2world_matrix,
150
+ position_encoder=self.position_encoder,
151
+ ref_representation=feature_volume,
152
+ post_module=self.post_module,
153
+ fc_head=self.fc_head)
154
+
155
+ feature_samples = rendering_result['composite_rgb']
156
+ depth_samples = rendering_result['composite_depth']
157
+
158
+ # Reshape to keep consistent with 'raw' neural-rendered image.
159
+ N = wp.shape[0]
160
+ H = W = self.neural_rendering_resolution
161
+ feature_image = feature_samples.permute(0, 2, 1).reshape(
162
+ N, feature_samples.shape[-1], H, W).contiguous()
163
+ depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
164
+
165
+ # Run the post neural renderer to get the final image.
166
+ # Here, the post neural renderer is a super-resolution network.
167
+ rgb_image = feature_image[:, :3]
168
+ sr_image = self.post_neural_renderer(
169
+ rgb_image,
170
+ feature_image,
171
+ wp,
172
+ noise_mode=self.rendering_kwargs['superresolution_noise_mode'],
173
+ **{
174
+ k: synthesis_kwargs[k]
175
+ for k in synthesis_kwargs.keys() if k != 'noise_mode'
176
+ })
177
+
178
+ return {
179
+ 'image': sr_image,
180
+ 'image_raw': rgb_image,
181
+ 'image_depth': depth_image
182
+ }
183
+
184
+ def sample(self,
185
+ coordinates,
186
+ directions,
187
+ z,
188
+ c,
189
+ truncation_psi=1,
190
+ truncation_cutoff=None,
191
+ update_emas=False):
192
+ # Compute RGB features, density for arbitrary 3D coordinates.
193
+ # Mostly used for extracting shapes.
194
+ wp = self.mapping_network(z,
195
+ c,
196
+ truncation_psi=truncation_psi,
197
+ truncation_cutoff=truncation_cutoff,
198
+ update_emas=update_emas)
199
+ feature_volume = self.ref_representation_generator(wp)
200
+ result = self.renderer.get_sigma_rgb(
201
+ wp=wp,
202
+ points=coordinates,
203
+ feature_extractor=self.feature_extractor,
204
+ fc_head=self.fc_head,
205
+ rendering_options=self.rendering_kwargs,
206
+ ref_representation=feature_volume,
207
+ position_encoder=self.position_encoder,
208
+ post_module=self.post_module,
209
+ ray_dirs=directions)
210
+
211
+ return result
212
+
213
+ def sample_mixed(self,
214
+ coordinates,
215
+ directions,
216
+ wp):
217
+ # Same as function `self.sample()`, but expects latent vectors 'wp'
218
+ # instead of Gaussian noise 'z'.
219
+ feature_volume = self.ref_representation_generator(wp)
220
+ result = self.renderer.get_sigma_rgb(
221
+ wp=wp,
222
+ points=coordinates,
223
+ feature_extractor=self.feature_extractor,
224
+ fc_head=self.fc_head,
225
+ rendering_options=self.rendering_kwargs,
226
+ ref_representation=feature_volume,
227
+ position_encoder=self.position_encoder,
228
+ post_module=self.post_module,
229
+ ray_dirs=directions)
230
+
231
+ return result
232
+
233
+ def forward(self,
234
+ z,
235
+ c,
236
+ c_swapped=None, # `c_swapped` is swapped pose conditioning.
237
+ style_mixing_prob=0,
238
+ truncation_psi=1,
239
+ truncation_cutoff=None,
240
+ neural_rendering_resolution=None,
241
+ update_emas=False,
242
+ sample_mixed=False,
243
+ coordinates=None,
244
+ **synthesis_kwargs):
245
+
246
+ # Render a batch of generated images.
247
+ c_wp = c.clone()
248
+ if c_swapped is not None:
249
+ c_wp = c_swapped.clone()
250
+ wp = self.mapping_network(z,
251
+ c_wp,
252
+ truncation_psi=truncation_psi,
253
+ truncation_cutoff=truncation_cutoff,
254
+ update_emas=update_emas)
255
+ if style_mixing_prob > 0:
256
+ cutoff = torch.empty([], dtype=torch.int64,
257
+ device=wp.device).random_(1, wp.shape[1])
258
+ cutoff = torch.where(
259
+ torch.rand([], device=wp.device) < style_mixing_prob, cutoff,
260
+ torch.full_like(cutoff, wp.shape[1]))
261
+ wp[:, cutoff:] = self.mapping_network(
262
+ torch.randn_like(z), c, update_emas=update_emas)[:, cutoff:]
263
+ if not sample_mixed:
264
+ gen_output = self.synthesis(
265
+ wp,
266
+ c,
267
+ update_emas=update_emas,
268
+ neural_rendering_resolution=neural_rendering_resolution,
269
+ **synthesis_kwargs)
270
+
271
+ return {
272
+ 'wp': wp,
273
+ 'gen_output': gen_output,
274
+ }
275
+
276
+ else:
277
+ # Only for density regularization in training process.
278
+ assert coordinates is not None
279
+ sample_sigma = self.sample_mixed(coordinates,
280
+ torch.randn_like(coordinates),
281
+ wp)['sigma']
282
+
283
+ return {
284
+ 'wp': wp,
285
+ 'sample_sigma': sample_sigma
286
+ }
287
+
288
+
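The following usage sketch is not part of the committed file; it only illustrates the call pattern of the generator class defined above. It assumes an already-constructed instance `generator` (building one requires the `mapping_kwargs` and `rendering_kwargs` configured elsewhere in this repository) and assumes the EG3D-style conditioning layout in which the first 16 entries of `c` hold the flattened 4x4 cam2world matrix; the identity pose and the batch size below are placeholders.

import torch

# Hypothetical, for illustration: `generator` is an instance of the class above,
# moved to the GPU and configured with a complete `rendering_kwargs` dict.
z = torch.randn(4, generator.z_dim, device='cuda')  # Gaussian latent codes.
c = torch.zeros(4, generator.c_dim, device='cuda')  # Pose conditioning.
c[:, :16] = torch.eye(4, device='cuda').flatten()   # Flattened cam2world matrix.

out = generator(z, c, truncation_psi=0.7)
sr_image = out['gen_output']['image']        # Super-resolved RGB at `img_resolution`.
raw_image = out['gen_output']['image_raw']   # RGB part of the raw neural rendering.
depth = out['gen_output']['image_depth']     # Depth at the neural rendering resolution.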
289
+ class OSGDecoder(nn.Module):
290
+ """Defines fully-connected layer head in EG3D."""
291
+ def __init__(self, n_features, options):
292
+ super().__init__()
293
+ self.hidden_dim = 64
294
+
295
+ self.net = nn.Sequential(
296
+ FullyConnectedLayer(n_features,
297
+ self.hidden_dim,
298
+ lr_multiplier=options['decoder_lr_mul']),
299
+ nn.Softplus(),
300
+ FullyConnectedLayer(self.hidden_dim,
301
+ 1 + options['decoder_output_dim'],
302
+ lr_multiplier=options['decoder_lr_mul']))
303
+
304
+ def forward(self, point_features, wp=None, dirs=None):
305
+ # point_features.shape: [N, C, M, 1].
306
+ point_features = point_features.squeeze(-1)
307
+ point_features = point_features.permute(0, 2, 1)
308
+ x = point_features
309
+
310
+ N, M, C = x.shape
311
+ x = x.reshape(N * M, C)
312
+
313
+ x = self.net(x)
314
+ x = x.reshape(N, M, -1)
315
+
316
+ # Uses sigmoid clamping from MipNeRF
317
+ rgb = torch.sigmoid(x[..., 1:]) * (1 + 2 * 0.001) - 0.001
318
+ sigma = x[..., 0:1]
319
+
320
+ return {'rgb': rgb, 'sigma': sigma}
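As a shape sanity check, the sketch below (not part of the committed file) exercises `OSGDecoder` on its own: it expects point features of shape [N, C, M, 1] with C equal to `n_features`, runs a two-layer MLP per point, and returns a sigmoid-clamped 32-channel `rgb` feature together with a raw `sigma`. It relies on `FullyConnectedLayer` from this file's imports.

import torch

# Same options as used for `self.fc_head` in the generator above.
decoder = OSGDecoder(32, {'decoder_lr_mul': 1, 'decoder_output_dim': 32})

N, C, M = 2, 32, 4096                      # batch size, feature channels, sampled points
point_features = torch.randn(N, C, M, 1)   # layout expected by `forward()`

out = decoder(point_features)
print(out['rgb'].shape)    # torch.Size([2, 4096, 32])
print(out['sigma'].shape)  # torch.Size([2, 4096, 1])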
models/ghfeat_encoder.py ADDED
@@ -0,0 +1,563 @@
1
+ # python3.7
2
+ """Contains the implementation of encoder used in GH-Feat (including IDInvert).
3
+
4
+ ResNet is used as the backbone.
5
+
6
+ GH-Feat paper: https://arxiv.org/pdf/2007.10379.pdf
7
+ IDInvert paper: https://arxiv.org/pdf/2004.00049.pdf
8
+
9
+ NOTE: Please use `latent_dim` and `num_latents_per_head` to control the
10
+ inversion space, such as Y-space used in GH-Feat and W-space used in IDInvert.
11
+ In addition, IDInvert sets `use_fpn` and `use_sam` as `False` by default.
12
+ """
13
+
14
+ import numpy as np
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ import torch.distributed as dist
20
+
21
+ __all__ = ['GHFeatEncoder']
22
+
23
+ # Resolutions allowed.
24
+ _RESOLUTIONS_ALLOWED = [8, 16, 32, 64, 128, 256, 512, 1024]
25
+
26
+ # pylint: disable=missing-function-docstring
27
+
28
+ class BasicBlock(nn.Module):
29
+ """Implementation of ResNet BasicBlock."""
30
+
31
+ expansion = 1
32
+
33
+ def __init__(self,
34
+ inplanes,
35
+ planes,
36
+ base_width=64,
37
+ stride=1,
38
+ groups=1,
39
+ dilation=1,
40
+ norm_layer=None,
41
+ downsample=None):
42
+ super().__init__()
43
+ if base_width != 64:
44
+ raise ValueError(f'BasicBlock of ResNet only supports '
45
+ f'`base_width=64`, but {base_width} received!')
46
+ if stride not in [1, 2]:
47
+ raise ValueError(f'BasicBlock of ResNet only supports `stride=1` '
48
+ f'and `stride=2`, but {stride} received!')
49
+ if groups != 1:
50
+ raise ValueError(f'BasicBlock of ResNet only supports `groups=1`, '
51
+ f'but {groups} received!')
52
+ if dilation != 1:
53
+ raise ValueError(f'BasicBlock of ResNet only supports '
54
+ f'`dilation=1`, but {dilation} received!')
55
+ assert self.expansion == 1
56
+
57
+ self.stride = stride
58
+ if norm_layer is None:
59
+ norm_layer = nn.BatchNorm2d
60
+ self.conv1 = nn.Conv2d(in_channels=inplanes,
61
+ out_channels=planes,
62
+ kernel_size=3,
63
+ stride=stride,
64
+ padding=1,
65
+ groups=1,
66
+ dilation=1,
67
+ bias=False)
68
+ self.bn1 = norm_layer(planes)
69
+ self.relu = nn.ReLU(inplace=True)
70
+ self.conv2 = nn.Conv2d(in_channels=planes,
71
+ out_channels=planes,
72
+ kernel_size=3,
73
+ stride=1,
74
+ padding=1,
75
+ groups=1,
76
+ dilation=1,
77
+ bias=False)
78
+ self.bn2 = norm_layer(planes)
79
+ self.downsample = downsample
80
+
81
+ def forward(self, x):
82
+ identity = self.downsample(x) if self.downsample is not None else x
83
+
84
+ out = self.conv1(x)
85
+ out = self.bn1(out)
86
+ out = self.relu(out)
87
+
88
+ out = self.conv2(out)
89
+ out = self.bn2(out)
90
+ out = self.relu(out + identity)
91
+
92
+ return out
93
+
94
+
95
+ class Bottleneck(nn.Module):
96
+ """Implementation of ResNet Bottleneck."""
97
+
98
+ expansion = 4
99
+
100
+ def __init__(self,
101
+ inplanes,
102
+ planes,
103
+ base_width=64,
104
+ stride=1,
105
+ groups=1,
106
+ dilation=1,
107
+ norm_layer=None,
108
+ downsample=None):
109
+ super().__init__()
110
+ if stride not in [1, 2]:
111
+ raise ValueError(f'Bottleneck of ResNet only supports `stride=1` '
112
+ f'and `stride=2`, but {stride} received!')
113
+
114
+ width = int(planes * (base_width / 64)) * groups
115
+ self.stride = stride
116
+ if norm_layer is None:
117
+ norm_layer = nn.BatchNorm2d
118
+ self.conv1 = nn.Conv2d(in_channels=inplanes,
119
+ out_channels=width,
120
+ kernel_size=1,
121
+ stride=1,
122
+ padding=0,
123
+ dilation=1,
124
+ groups=1,
125
+ bias=False)
126
+ self.bn1 = norm_layer(width)
127
+ self.conv2 = nn.Conv2d(in_channels=width,
128
+ out_channels=width,
129
+ kernel_size=3,
130
+ stride=stride,
131
+ padding=dilation,
132
+ groups=groups,
133
+ dilation=dilation,
134
+ bias=False)
135
+ self.bn2 = norm_layer(width)
136
+ self.conv3 = nn.Conv2d(in_channels=width,
137
+ out_channels=planes * self.expansion,
138
+ kernel_size=1,
139
+ stride=1,
140
+ padding=0,
141
+ dilation=1,
142
+ groups=1,
143
+ bias=False)
144
+ self.bn3 = norm_layer(planes * self.expansion)
145
+ self.relu = nn.ReLU(inplace=True)
146
+ self.downsample = downsample
147
+
148
+ def forward(self, x):
149
+ identity = self.downsample(x) if self.downsample is not None else x
150
+
151
+ out = self.conv1(x)
152
+ out = self.bn1(out)
153
+ out = self.relu(out)
154
+
155
+ out = self.conv2(out)
156
+ out = self.bn2(out)
157
+ out = self.relu(out)
158
+
159
+ out = self.conv3(out)
160
+ out = self.bn3(out)
161
+ out = self.relu(out + identity)
162
+
163
+ return out
164
+
165
+
166
+ class GHFeatEncoder(nn.Module):
167
+ """Define the ResNet-based encoder network for GAN inversion.
168
+
169
+ On top of the backbone, there are several task-heads to produce inverted
170
+ codes. Please use `latent_dim` and `num_latents_per_head` to define the
171
+ structure. For example, `latent_dim = [512] * 14` and
172
+ `num_latents_per_head = [4, 4, 6]` can be used for StyleGAN inversion with
173
+ 14-layer latent codes, where 3 task heads (corresponding to 4, 4, 6 layers,
174
+ respectively) are used.
175
+
176
+ Settings for the encoder network:
177
+
178
+ (1) resolution: The resolution of the output image.
179
+ (2) latent_dim: Dimension of the latent space. A number (one code will be
180
+ produced), or a list of numbers regarding layer-wise latent codes.
181
+ (3) num_latents_per_head: Number of latents that are produced by each head.
182
+ (4) image_channels: Number of channels of the output image. (default: 3)
183
+ (5) final_res: Final resolution of the convolutional layers. (default: 4)
184
+
185
+ ResNet-related settings:
186
+
187
+ (1) network_depth: Depth of the network, like 18 for ResNet18. (default: 18)
188
+ (2) inplanes: Number of channels of the first convolutional layer.
189
+ (default: 64)
190
+ (3) groups: Groups of the convolution, used in ResNet. (default: 1)
191
+ (4) width_per_group: Number of channels per group, used in ResNet.
192
+ (default: 64)
193
+ (5) replace_stride_with_dilation: Whether to replace stride with dilation,
194
+ used in ResNet. (default: None)
195
+ (6) norm_layer: Normalization layer used in the encoder. If set as `None`,
196
+ `nn.BatchNorm2d` will be used. Also, please NOTE that when using batch
197
+ normalization, the batch size is required to be larger than one for
198
+ training. (default: nn.BatchNorm2d)
199
+ (7) max_channels: Maximum number of channels in each layer. (default: 512)
200
+
201
+ Task-head related settings:
202
+
203
+ (1) use_fpn: Whether to use Feature Pyramid Network (FPN) before outputting
204
+ the latent code. (default: True)
205
+ (2) fpn_channels: Number of channels used in FPN. (default: 512)
206
+ (3) use_sam: Whether to use Spatial Alignment Module (SAM) before outputting
207
+ the latent code. (default: True)
208
+ (4) sam_channels: Number of channels used in SAM. (default: 512)
209
+ """
210
+
211
+ arch_settings = {
212
+ 18: (BasicBlock, [2, 2, 2, 2]),
213
+ 34: (BasicBlock, [3, 4, 6, 3]),
214
+ 50: (Bottleneck, [3, 4, 6, 3]),
215
+ 101: (Bottleneck, [3, 4, 23, 3]),
216
+ 152: (Bottleneck, [3, 8, 36, 3])
217
+ }
218
+
219
+ def __init__(self,
220
+ resolution,
221
+ latent_dim,
222
+ num_latents_per_head,
223
+ image_channels=3,
224
+ final_res=4,
225
+ network_depth=18,
226
+ inplanes=64,
227
+ groups=1,
228
+ width_per_group=64,
229
+ replace_stride_with_dilation=None,
230
+ norm_layer=nn.BatchNorm2d,
231
+ max_channels=512,
232
+ use_fpn=True,
233
+ fpn_channels=512,
234
+ use_sam=True,
235
+ sam_channels=512):
236
+ super().__init__()
237
+
238
+ if resolution not in _RESOLUTIONS_ALLOWED:
239
+ raise ValueError(f'Invalid resolution: `{resolution}`!\n'
240
+ f'Resolutions allowed: {_RESOLUTIONS_ALLOWED}.')
241
+ if network_depth not in self.arch_settings:
242
+ raise ValueError(f'Invalid network depth: `{network_depth}`!\n'
243
+ f'Options allowed: '
244
+ f'{list(self.arch_settings.keys())}.')
245
+ if isinstance(latent_dim, int):
246
+ latent_dim = [latent_dim]
247
+ assert isinstance(latent_dim, (list, tuple))
248
+ assert isinstance(num_latents_per_head, (list, tuple))
249
+ assert sum(num_latents_per_head) == len(latent_dim)
250
+
251
+ self.resolution = resolution
252
+ self.latent_dim = latent_dim
253
+ self.num_latents_per_head = num_latents_per_head
254
+ self.num_heads = len(self.num_latents_per_head)
255
+ self.image_channels = image_channels
256
+ self.final_res = final_res
257
+ self.inplanes = inplanes
258
+ self.network_depth = network_depth
259
+ self.groups = groups
260
+ self.dilation = 1
261
+ self.base_width = width_per_group
262
+ self.replace_stride_with_dilation = replace_stride_with_dilation
263
+ if norm_layer is None:
264
+ norm_layer = nn.BatchNorm2d
265
+ if norm_layer == nn.BatchNorm2d and dist.is_initialized():
266
+ norm_layer = nn.SyncBatchNorm
267
+ self.norm_layer = norm_layer
268
+ self.max_channels = max_channels
269
+ self.use_fpn = use_fpn
270
+ self.fpn_channels = fpn_channels
271
+ self.use_sam = use_sam
272
+ self.sam_channels = sam_channels
273
+
274
+ block_fn, num_blocks_per_stage = self.arch_settings[network_depth]
275
+
276
+ self.num_stages = int(np.log2(resolution // final_res)) - 1
277
+ # Add one block for additional stages.
278
+ for i in range(len(num_blocks_per_stage), self.num_stages):
279
+ num_blocks_per_stage.append(1)
280
+ if replace_stride_with_dilation is None:
281
+ replace_stride_with_dilation = [False] * self.num_stages
282
+
283
+ # Backbone.
284
+ self.conv1 = nn.Conv2d(in_channels=self.image_channels,
285
+ out_channels=self.inplanes,
286
+ kernel_size=7,
287
+ stride=2,
288
+ padding=3,
289
+ bias=False)
290
+ self.bn1 = norm_layer(self.inplanes)
291
+ self.relu = nn.ReLU(inplace=True)
292
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
293
+
294
+ self.stage_channels = [self.inplanes]
295
+ self.stages = nn.ModuleList()
296
+ for i in range(self.num_stages):
297
+ inplanes = self.inplanes if i == 0 else planes * block_fn.expansion
298
+ planes = min(self.max_channels, self.inplanes * (2 ** i))
299
+ num_blocks = num_blocks_per_stage[i]
300
+ stride = 1 if i == 0 else 2
301
+ dilate = replace_stride_with_dilation[i]
302
+ self.stages.append(self._make_stage(block_fn=block_fn,
303
+ inplanes=inplanes,
304
+ planes=planes,
305
+ num_blocks=num_blocks,
306
+ stride=stride,
307
+ dilate=dilate))
308
+ self.stage_channels.append(planes * block_fn.expansion)
309
+
310
+ if self.num_heads > len(self.stage_channels):
311
+ raise ValueError('Number of task heads is larger than number of '
312
+ 'stages! Please reduce the number of heads.')
313
+
314
+ # Task-head.
315
+ if self.num_heads == 1:
316
+ self.use_fpn = False
317
+ self.use_sam = False
318
+
319
+ if self.use_fpn:
320
+ fpn_pyramid_channels = self.stage_channels[-self.num_heads:]
321
+ self.fpn = FPN(pyramid_channels=fpn_pyramid_channels,
322
+ out_channels=self.fpn_channels)
323
+ if self.use_sam:
324
+ if self.use_fpn:
325
+ sam_pyramid_channels = [self.fpn_channels] * self.num_heads
326
+ else:
327
+ sam_pyramid_channels = self.stage_channels[-self.num_heads:]
328
+ self.sam = SAM(pyramid_channels=sam_pyramid_channels,
329
+ out_channels=self.sam_channels)
330
+
331
+ self.heads = nn.ModuleList()
332
+ for head_idx in range(self.num_heads):
333
+ # Parse in_channels.
334
+ if self.use_sam:
335
+ in_channels = self.sam_channels
336
+ elif self.use_fpn:
337
+ in_channels = self.fpn_channels
338
+ else:
339
+ in_channels = self.stage_channels[head_idx - self.num_heads]
340
+ in_channels = in_channels * final_res * final_res
341
+
342
+ # Parse out_channels.
343
+ start_latent_idx = sum(self.num_latents_per_head[:head_idx])
344
+ end_latent_idx = sum(self.num_latents_per_head[:head_idx + 1])
345
+ out_channels = sum(self.latent_dim[start_latent_idx:end_latent_idx])
346
+
347
+ self.heads.append(CodeHead(in_channels=in_channels,
348
+ out_channels=out_channels,
349
+ norm_layer=self.norm_layer))
350
+
351
+ def _make_stage(self,
352
+ block_fn,
353
+ inplanes,
354
+ planes,
355
+ num_blocks,
356
+ stride,
357
+ dilate):
358
+ norm_layer = self.norm_layer
359
+ downsample = None
360
+ previous_dilation = self.dilation
361
+ if dilate:
362
+ self.dilation *= stride
363
+ stride = 1
364
+ if stride != 1 or inplanes != planes * block_fn.expansion:
365
+ downsample = nn.Sequential(
366
+ nn.Conv2d(in_channels=inplanes,
367
+ out_channels=planes * block_fn.expansion,
368
+ kernel_size=1,
369
+ stride=stride,
370
+ padding=0,
371
+ dilation=1,
372
+ groups=1,
373
+ bias=False),
374
+ norm_layer(planes * block_fn.expansion),
375
+ )
376
+
377
+ blocks = []
378
+ blocks.append(block_fn(inplanes=inplanes,
379
+ planes=planes,
380
+ base_width=self.base_width,
381
+ stride=stride,
382
+ groups=self.groups,
383
+ dilation=previous_dilation,
384
+ norm_layer=norm_layer,
385
+ downsample=downsample))
386
+ for _ in range(1, num_blocks):
387
+ blocks.append(block_fn(inplanes=planes * block_fn.expansion,
388
+ planes=planes,
389
+ base_width=self.base_width,
390
+ stride=1,
391
+ groups=self.groups,
392
+ dilation=self.dilation,
393
+ norm_layer=norm_layer,
394
+ downsample=None))
395
+
396
+ return nn.Sequential(*blocks)
397
+
398
+ def forward(self, x):
399
+ x = self.conv1(x)
400
+ x = self.bn1(x)
401
+ x = self.relu(x)
402
+ x = self.maxpool(x)
403
+
404
+ features = [x]
405
+ for i in range(self.num_stages):
406
+ x = self.stages[i](x)
407
+ features.append(x)
408
+ features = features[-self.num_heads:]
409
+
410
+ if self.use_fpn:
411
+ features = self.fpn(features)
412
+ if self.use_sam:
413
+ features = self.sam(features)
414
+ else:
415
+ final_size = features[-1].shape[2:]
416
+ for i in range(self.num_heads - 1):
417
+ features[i] = F.adaptive_avg_pool2d(features[i], final_size)
418
+
419
+ outputs = []
420
+ for head_idx in range(self.num_heads):
421
+ codes = self.heads[head_idx](features[head_idx])
422
+ start_latent_idx = sum(self.num_latents_per_head[:head_idx])
423
+ end_latent_idx = sum(self.num_latents_per_head[:head_idx + 1])
424
+ split_size = self.latent_dim[start_latent_idx:end_latent_idx]
425
+ outputs.extend(torch.split(codes, split_size, dim=1))
426
+ max_dim = max(self.latent_dim)
427
+ for i, dim in enumerate(self.latent_dim):
428
+ if dim < max_dim:
429
+ outputs[i] = F.pad(outputs[i], (0, max_dim - dim))
430
+ outputs[i] = outputs[i].unsqueeze(1)
431
+
432
+ return torch.cat(outputs, dim=1)
433
+
434
+
435
+ class FPN(nn.Module):
436
+ """Implementation of Feature Pyramid Network (FPN).
437
+
438
+ The input of this module is a pyramid of features with reducing resolutions.
439
+ Then, this module fuses these multi-level features from `top_level` to
440
+ `bottom_level`. In particular, starting from the `top_level`, each feature
441
+ is convoluted, upsampled, and fused into its previous feature (which is also
442
+ convoluted).
443
+
444
+ Args:
445
+ pyramid_channels: A list of integers, each of which indicates the number
446
+ of channels of the feature from a particular level.
447
+ out_channels: Number of channels for each output.
448
+
449
+ Returns:
450
+ A list of feature maps, each of which has `out_channels` channels.
451
+ """
452
+
453
+ def __init__(self, pyramid_channels, out_channels):
454
+ super().__init__()
455
+ assert isinstance(pyramid_channels, (list, tuple))
456
+ self.num_levels = len(pyramid_channels)
457
+
458
+ self.lateral_layers = nn.ModuleList()
459
+ self.feature_layers = nn.ModuleList()
460
+ for i in range(self.num_levels):
461
+ in_channels = pyramid_channels[i]
462
+ self.lateral_layers.append(nn.Conv2d(in_channels=in_channels,
463
+ out_channels=out_channels,
464
+ kernel_size=3,
465
+ padding=1,
466
+ bias=True))
467
+ self.feature_layers.append(nn.Conv2d(in_channels=out_channels,
468
+ out_channels=out_channels,
469
+ kernel_size=3,
470
+ padding=1,
471
+ bias=True))
472
+
473
+ def forward(self, inputs):
474
+ if len(inputs) != self.num_levels:
475
+ raise ValueError('Number of inputs and `num_levels` mismatch!')
476
+
477
+ # Project all related features to `out_channels`.
478
+ laterals = []
479
+ for i in range(self.num_levels):
480
+ laterals.append(self.lateral_layers[i](inputs[i]))
481
+
482
+ # Fusion, starting from `top_level`.
483
+ for i in range(self.num_levels - 1, 0, -1):
484
+ scale_factor = laterals[i - 1].shape[2] // laterals[i].shape[2]
485
+ laterals[i - 1] = (laterals[i - 1] +
486
+ F.interpolate(laterals[i],
487
+ mode='nearest',
488
+ scale_factor=scale_factor))
489
+
490
+ # Get outputs.
491
+ outputs = []
492
+ for i, lateral in enumerate(laterals):
493
+ outputs.append(self.feature_layers[i](lateral))
494
+
495
+ return outputs
496
+
497
+
498
+ class SAM(nn.Module):
499
+ """Implementation of Spatial Alignment Module (SAM).
500
+
501
+ The input of this module is a pyramid of features with reducing resolutions.
502
+ Then this module downsamples all levels of feature to the minimum resolution
503
+ and fuses it with the smallest feature map.
504
+
505
+ Args:
506
+ pyramid_channels: A list of integers, each of which indicates the number
507
+ of channels of the feature from a particular level.
508
+ out_channels: Number of channels for each output.
509
+
510
+ Returns:
511
+ A list of feature maps, each of which has `out_channels` channels.
512
+ """
513
+
514
+ def __init__(self, pyramid_channels, out_channels):
515
+ super().__init__()
516
+ assert isinstance(pyramid_channels, (list, tuple))
517
+ self.num_levels = len(pyramid_channels)
518
+
519
+ self.fusion_layers = nn.ModuleList()
520
+ for i in range(self.num_levels):
521
+ in_channels = pyramid_channels[i]
522
+ self.fusion_layers.append(nn.Conv2d(in_channels=in_channels,
523
+ out_channels=out_channels,
524
+ kernel_size=3,
525
+ padding=1,
526
+ bias=True))
527
+
528
+ def forward(self, inputs):
529
+ if len(inputs) != self.num_levels:
530
+ raise ValueError('Number of inputs and `num_levels` mismatch!')
531
+
532
+ output_res = inputs[-1].shape[2:]
533
+ for i in range(self.num_levels - 1, -1, -1):
534
+ if i != self.num_levels - 1:
535
+ inputs[i] = F.adaptive_avg_pool2d(inputs[i], output_res)
536
+ inputs[i] = self.fusion_layers[i](inputs[i])
537
+ if i != self.num_levels - 1:
538
+ inputs[i] = inputs[i] + inputs[-1]
539
+
540
+ return inputs
541
+
542
+
543
+ class CodeHead(nn.Module):
544
+ """Implementation of the task-head to produce inverted codes."""
545
+
546
+ def __init__(self, in_channels, out_channels, norm_layer):
547
+ super().__init__()
548
+ self.fc = nn.Linear(in_channels, out_channels, bias=True)
549
+ if norm_layer is None:
550
+ self.norm = nn.Identity()
551
+ else:
552
+ self.norm = norm_layer(out_channels)
553
+
554
+ def forward(self, x):
555
+ if x.ndim > 2:
556
+ x = x.flatten(start_dim=1)
557
+ latent = self.fc(x)
558
+ latent = latent.unsqueeze(2).unsqueeze(3)
559
+ latent = self.norm(latent)
560
+
561
+ return latent.flatten(start_dim=1)
562
+
563
+ # pylint: enable=missing-function-docstring
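A minimal construction sketch (not part of the committed file), using the GH-Feat-style configuration mentioned in the class docstring; the resolution and batch size are illustrative and the default `use_fpn=True` / `use_sam=True` settings are kept.

import torch

# 14 layer-wise 512-d codes, produced by 3 task heads (4 + 4 + 6 codes each),
# as suggested by the `GHFeatEncoder` docstring above.
encoder = GHFeatEncoder(resolution=256,
                        latent_dim=[512] * 14,
                        num_latents_per_head=[4, 4, 6])

images = torch.randn(2, 3, 256, 256)  # placeholder batch of RGB images
codes = encoder(images)
print(codes.shape)  # torch.Size([2, 14, 512]): one 512-d code per latent layer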
models/inception_model.py ADDED
@@ -0,0 +1,562 @@
1
+ # python3.7
2
+ """Contains the Inception V3 model, which is used for inference ONLY.
3
+
4
+ This file is mostly borrowed from `torchvision/models/inception.py`.
5
+
6
+ The Inception model is widely used to compute the FID or IS metric for evaluating
7
+ generative models. However, the pre-trained model from torchvision is slightly
8
+ different from the TensorFlow version
9
+
10
+ http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
11
+
12
+ which is used by the official FID implementation
13
+
14
+ https://github.com/bioinf-jku/TTUR
15
+
16
+ In particular:
17
+
18
+ (1) The number of classes in the TensorFlow model is 1008 instead of 1000.
19
+ (2) The avg_pool() layers in the TensorFlow model do not include the padded zeros.
20
+ (3) The last Inception E Block in the TensorFlow model uses max_pool() instead of
21
+ avg_pool().
22
+
23
+ Hence, to align the evaluation results with those from the TensorFlow
24
+ implementation, we modified the inception model to support both versions. Please
25
+ use the `align_tf` argument to control the version.
26
+ """
27
+
28
+ import warnings
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+ import torch.distributed as dist
34
+
35
+ from utils.misc import download_url
36
+
37
+ __all__ = ['InceptionModel']
38
+
39
+ # pylint: disable=line-too-long
40
+
41
+ _MODEL_URL_SHA256 = {
42
+ # This model is provided by `torchvision`, which is ported from TensorFlow.
43
+ 'torchvision_official': (
44
+ 'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth',
45
+ '1a9a5a14f40645a370184bd54f4e8e631351e71399112b43ad0294a79da290c8' # hash sha256
46
+ ),
47
+
48
+ # This model is provided by https://github.com/mseitzer/pytorch-fid
49
+ 'tf_inception_v3': (
50
+ 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth',
51
+ '6726825d0af5f729cebd5821db510b11b1cfad8faad88a03f1befd49fb9129b2' # hash sha256
52
+ )
53
+ }
54
+
55
+
56
+ class InceptionModel(object):
57
+ """Defines the Inception (V3) model.
58
+
59
+ This is a static class, which is used to avoid building this model
60
+ repeatedly. Consequently, this model is particularly used for inference,
61
+ like computing FID. If training is required, please use the model from
62
+ `torchvision.models` or implement by yourself.
63
+
64
+ NOTE: The pre-trained model assumes the inputs to be with `RGB` channel
65
+ order and pixel range [-1, 1], and will also resize the images to shape
66
+ [299, 299] automatically. If your input is normalized by subtracting
67
+ (0.485, 0.456, 0.406) and dividing (0.229, 0.224, 0.225), please use
68
+ `transform_input` in the `forward()` function to un-normalize it.
69
+ """
70
+ models = dict()
71
+
72
+ @staticmethod
73
+ def build_model(align_tf=True):
74
+ """Builds the model and load pre-trained weights.
75
+
76
+ If `align_tf` is set as True, the model will predict 1008 classes, and
77
+ the pre-trained weights from `https://github.com/mseitzer/pytorch-fid`
78
+ will be loaded. Otherwise, the model will predict 1000 classes, and will
79
+ load the model from `torchvision`.
80
+
81
+ The built model supports following arguments when forwarding:
82
+
83
+ - transform_input: Whether to transform the input back to pixel range
84
+ (-1, 1). Please disable this argument if your input is already with
85
+ pixel range (-1, 1). (default: False)
86
+ - output_logits: Whether to output the categorical logits instead of
87
+ features. (default: False)
88
+ - remove_logits_bias: Whether to remove the bias when computing the
89
+ logits. The official implementation removes the bias by default.
90
+ Please refer to
91
+ `https://github.com/openai/improved-gan/blob/master/inception_score/model.py`.
92
+ (default: False)
93
+ - output_predictions: Whether to output the final predictions, i.e.,
94
+ `softmax(logits)`. (default: False)
95
+ """
96
+ if align_tf:
97
+ num_classes = 1008
98
+ model_source = 'tf_inception_v3'
99
+ else:
100
+ num_classes = 1000
101
+ model_source = 'torchvision_official'
102
+
103
+ fingerprint = model_source
104
+
105
+ if fingerprint not in InceptionModel.models:
106
+ # Build model.
107
+ model = Inception3(num_classes=num_classes,
108
+ aux_logits=False,
109
+ init_weights=False,
110
+ align_tf=align_tf)
111
+
112
+ # Download pre-trained weights.
113
+ if dist.is_initialized() and dist.get_rank() != 0:
114
+ dist.barrier() # Download by chief.
115
+
116
+ url, sha256 = _MODEL_URL_SHA256[model_source]
117
+ filename = f'inception_model_{model_source}_{sha256}.pth'
118
+ model_path, hash_check = download_url(url,
119
+ filename=filename,
120
+ sha256=sha256)
121
+ state_dict = torch.load(model_path, map_location='cpu')
122
+ if hash_check is False:
123
+ warnings.warn(f'Hash check failed! The remote file from URL '
124
+ f'`{url}` may be changed, or the downloading is '
125
+ f'interrupted. The loaded inception model may '
126
+ f'have unexpected behavior.')
127
+
128
+ if dist.is_initialized() and dist.get_rank() == 0:
129
+ dist.barrier() # Wait for other replicas.
130
+
131
+ # Load weights.
132
+ model.load_state_dict(state_dict, strict=False)
133
+ del state_dict
134
+
135
+ # For inference only.
136
+ model.eval().requires_grad_(False).cuda()
137
+ InceptionModel.models[fingerprint] = model
138
+
139
+ return InceptionModel.models[fingerprint]
140
+
141
+ # pylint: disable=missing-function-docstring
142
+ # pylint: disable=missing-class-docstring
143
+ # pylint: disable=super-with-arguments
144
+ # pylint: disable=consider-merging-isinstance
145
+ # pylint: disable=import-outside-toplevel
146
+ # pylint: disable=no-else-return
147
+
148
+ class Inception3(nn.Module):
149
+
150
+ def __init__(self, num_classes=1000, aux_logits=True, inception_blocks=None,
151
+ init_weights=True, align_tf=True):
152
+ super(Inception3, self).__init__()
153
+ if inception_blocks is None:
154
+ inception_blocks = [
155
+ BasicConv2d, InceptionA, InceptionB, InceptionC,
156
+ InceptionD, InceptionE, InceptionAux
157
+ ]
158
+ assert len(inception_blocks) == 7
159
+ conv_block = inception_blocks[0]
160
+ inception_a = inception_blocks[1]
161
+ inception_b = inception_blocks[2]
162
+ inception_c = inception_blocks[3]
163
+ inception_d = inception_blocks[4]
164
+ inception_e = inception_blocks[5]
165
+ inception_aux = inception_blocks[6]
166
+
167
+ self.aux_logits = aux_logits
168
+ self.align_tf = align_tf
169
+ self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2)
170
+ self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3)
171
+ self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1)
172
+ self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1)
173
+ self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3)
174
+ self.Mixed_5b = inception_a(192, pool_features=32, align_tf=self.align_tf)
175
+ self.Mixed_5c = inception_a(256, pool_features=64, align_tf=self.align_tf)
176
+ self.Mixed_5d = inception_a(288, pool_features=64, align_tf=self.align_tf)
177
+ self.Mixed_6a = inception_b(288)
178
+ self.Mixed_6b = inception_c(768, channels_7x7=128, align_tf=self.align_tf)
179
+ self.Mixed_6c = inception_c(768, channels_7x7=160, align_tf=self.align_tf)
180
+ self.Mixed_6d = inception_c(768, channels_7x7=160, align_tf=self.align_tf)
181
+ self.Mixed_6e = inception_c(768, channels_7x7=192, align_tf=self.align_tf)
182
+ if aux_logits:
183
+ self.AuxLogits = inception_aux(768, num_classes)
184
+ self.Mixed_7a = inception_d(768)
185
+ self.Mixed_7b = inception_e(1280, align_tf=self.align_tf)
186
+ self.Mixed_7c = inception_e(2048, use_max_pool=self.align_tf)
187
+ self.fc = nn.Linear(2048, num_classes)
188
+ if init_weights:
189
+ for m in self.modules():
190
+ if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
191
+ import scipy.stats as stats
192
+ stddev = m.stddev if hasattr(m, 'stddev') else 0.1
193
+ X = stats.truncnorm(-2, 2, scale=stddev)
194
+ values = torch.as_tensor(X.rvs(m.weight.numel()), dtype=m.weight.dtype)
195
+ values = values.view(m.weight.size())
196
+ with torch.no_grad():
197
+ m.weight.copy_(values)
198
+ elif isinstance(m, nn.BatchNorm2d):
199
+ nn.init.constant_(m.weight, 1)
200
+ nn.init.constant_(m.bias, 0)
201
+
202
+ @staticmethod
203
+ def _transform_input(x, transform_input=False):
204
+ if transform_input:
205
+ x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
206
+ x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
207
+ x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
208
+ x = torch.cat((x_ch0, x_ch1, x_ch2), 1)
209
+ return x
210
+
211
+ def _forward(self,
212
+ x,
213
+ output_logits=False,
214
+ remove_logits_bias=False,
215
+ output_predictions=False):
216
+ # Upsample if necessary.
217
+ if x.shape[2] != 299 or x.shape[3] != 299:
218
+ if self.align_tf:
219
+ theta = torch.eye(2, 3).to(x)
220
+ theta[0, 2] += theta[0, 0] / x.shape[3] - theta[0, 0] / 299
221
+ theta[1, 2] += theta[1, 1] / x.shape[2] - theta[1, 1] / 299
222
+ theta = theta.unsqueeze(0).repeat(x.shape[0], 1, 1)
223
+ grid = F.affine_grid(theta,
224
+ size=(x.shape[0], x.shape[1], 299, 299),
225
+ align_corners=False)
226
+ x = F.grid_sample(x, grid,
227
+ mode='bilinear',
228
+ padding_mode='border',
229
+ align_corners=False)
230
+ else:
231
+ x = F.interpolate(
232
+ x, size=(299, 299), mode='bilinear', align_corners=False)
233
+ if x.shape[1] == 1:
234
+ x = x.repeat((1, 3, 1, 1))
235
+
236
+ if self.align_tf:
237
+ x = (x * 127.5 + 127.5 - 128) / 128
238
+
239
+ # N x 3 x 299 x 299
240
+ x = self.Conv2d_1a_3x3(x)
241
+ # N x 32 x 149 x 149
242
+ x = self.Conv2d_2a_3x3(x)
243
+ # N x 32 x 147 x 147
244
+ x = self.Conv2d_2b_3x3(x)
245
+ # N x 64 x 147 x 147
246
+ x = F.max_pool2d(x, kernel_size=3, stride=2)
247
+ # N x 64 x 73 x 73
248
+ x = self.Conv2d_3b_1x1(x)
249
+ # N x 80 x 73 x 73
250
+ x = self.Conv2d_4a_3x3(x)
251
+ # N x 192 x 71 x 71
252
+ x = F.max_pool2d(x, kernel_size=3, stride=2)
253
+ # N x 192 x 35 x 35
254
+ x = self.Mixed_5b(x)
255
+ # N x 256 x 35 x 35
256
+ x = self.Mixed_5c(x)
257
+ # N x 288 x 35 x 35
258
+ x = self.Mixed_5d(x)
259
+ # N x 288 x 35 x 35
260
+ x = self.Mixed_6a(x)
261
+ # N x 768 x 17 x 17
262
+ x = self.Mixed_6b(x)
263
+ # N x 768 x 17 x 17
264
+ x = self.Mixed_6c(x)
265
+ # N x 768 x 17 x 17
266
+ x = self.Mixed_6d(x)
267
+ # N x 768 x 17 x 17
268
+ x = self.Mixed_6e(x)
269
+ # N x 768 x 17 x 17
270
+ if self.training and self.aux_logits:
271
+ aux = self.AuxLogits(x)
272
+ else:
273
+ aux = None
274
+ # N x 768 x 17 x 17
275
+ x = self.Mixed_7a(x)
276
+ # N x 1280 x 8 x 8
277
+ x = self.Mixed_7b(x)
278
+ # N x 2048 x 8 x 8
279
+ x = self.Mixed_7c(x)
280
+ # N x 2048 x 8 x 8
281
+ # Adaptive average pooling
282
+ x = F.adaptive_avg_pool2d(x, (1, 1))
283
+ # N x 2048 x 1 x 1
284
+ x = F.dropout(x, training=self.training)
285
+ # N x 2048 x 1 x 1
286
+ x = torch.flatten(x, 1)
287
+ # N x 2048
288
+ if output_logits or output_predictions:
289
+ x = self.fc(x)
290
+ # N x 1000 (num_classes)
291
+ if remove_logits_bias:
292
+ x = x - self.fc.bias.view(1, -1)
293
+ if output_predictions:
294
+ x = F.softmax(x, dim=1)
295
+ return x, aux
296
+
297
+ def forward(self,
298
+ x,
299
+ transform_input=False,
300
+ output_logits=False,
301
+ remove_logits_bias=False,
302
+ output_predictions=False):
303
+ x = self._transform_input(x, transform_input)
304
+ x, aux = self._forward(
305
+ x, output_logits, remove_logits_bias, output_predictions)
306
+ if self.training and self.aux_logits:
307
+ return x, aux
308
+ else:
309
+ return x
310
+
311
+
312
+ class InceptionA(nn.Module):
313
+
314
+ def __init__(self, in_channels, pool_features, conv_block=None, align_tf=False):
315
+ super(InceptionA, self).__init__()
316
+ if conv_block is None:
317
+ conv_block = BasicConv2d
318
+ self.branch1x1 = conv_block(in_channels, 64, kernel_size=1)
319
+
320
+ self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1)
321
+ self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2)
322
+
323
+ self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1)
324
+ self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1)
325
+ self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1)
326
+
327
+ self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1)
328
+ self.pool_include_padding = not align_tf
329
+
330
+ def _forward(self, x):
331
+ branch1x1 = self.branch1x1(x)
332
+
333
+ branch5x5 = self.branch5x5_1(x)
334
+ branch5x5 = self.branch5x5_2(branch5x5)
335
+
336
+ branch3x3dbl = self.branch3x3dbl_1(x)
337
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
338
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
339
+
340
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
341
+ count_include_pad=self.pool_include_padding)
342
+ branch_pool = self.branch_pool(branch_pool)
343
+
344
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
345
+ return outputs
346
+
347
+ def forward(self, x):
348
+ outputs = self._forward(x)
349
+ return torch.cat(outputs, 1)
350
+
351
+
352
+ class InceptionB(nn.Module):
353
+
354
+ def __init__(self, in_channels, conv_block=None):
355
+ super(InceptionB, self).__init__()
356
+ if conv_block is None:
357
+ conv_block = BasicConv2d
358
+ self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2)
359
+
360
+ self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1)
361
+ self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1)
362
+ self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2)
363
+
364
+ def _forward(self, x):
365
+ branch3x3 = self.branch3x3(x)
366
+
367
+ branch3x3dbl = self.branch3x3dbl_1(x)
368
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
369
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
370
+
371
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=2)
372
+
373
+ outputs = [branch3x3, branch3x3dbl, branch_pool]
374
+ return outputs
375
+
376
+ def forward(self, x):
377
+ outputs = self._forward(x)
378
+ return torch.cat(outputs, 1)
379
+
380
+
381
+ class InceptionC(nn.Module):
382
+
383
+ def __init__(self, in_channels, channels_7x7, conv_block=None, align_tf=False):
384
+ super(InceptionC, self).__init__()
385
+ if conv_block is None:
386
+ conv_block = BasicConv2d
387
+ self.branch1x1 = conv_block(in_channels, 192, kernel_size=1)
388
+
389
+ c7 = channels_7x7
390
+ self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1)
391
+ self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3))
392
+ self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0))
393
+
394
+ self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1)
395
+ self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0))
396
+ self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3))
397
+ self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0))
398
+ self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3))
399
+
400
+ self.branch_pool = conv_block(in_channels, 192, kernel_size=1)
401
+ self.pool_include_padding = not align_tf
402
+
403
+ def _forward(self, x):
404
+ branch1x1 = self.branch1x1(x)
405
+
406
+ branch7x7 = self.branch7x7_1(x)
407
+ branch7x7 = self.branch7x7_2(branch7x7)
408
+ branch7x7 = self.branch7x7_3(branch7x7)
409
+
410
+ branch7x7dbl = self.branch7x7dbl_1(x)
411
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
412
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
413
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
414
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
415
+
416
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
417
+ count_include_pad=self.pool_include_padding)
418
+ branch_pool = self.branch_pool(branch_pool)
419
+
420
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
421
+ return outputs
422
+
423
+ def forward(self, x):
424
+ outputs = self._forward(x)
425
+ return torch.cat(outputs, 1)
426
+
427
+
428
+ class InceptionD(nn.Module):
429
+
430
+ def __init__(self, in_channels, conv_block=None):
431
+ super(InceptionD, self).__init__()
432
+ if conv_block is None:
433
+ conv_block = BasicConv2d
434
+ self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1)
435
+ self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2)
436
+
437
+ self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1)
438
+ self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3))
439
+ self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0))
440
+ self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2)
441
+
442
+ def _forward(self, x):
443
+ branch3x3 = self.branch3x3_1(x)
444
+ branch3x3 = self.branch3x3_2(branch3x3)
445
+
446
+ branch7x7x3 = self.branch7x7x3_1(x)
447
+ branch7x7x3 = self.branch7x7x3_2(branch7x7x3)
448
+ branch7x7x3 = self.branch7x7x3_3(branch7x7x3)
449
+ branch7x7x3 = self.branch7x7x3_4(branch7x7x3)
450
+
451
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=2)
452
+ outputs = [branch3x3, branch7x7x3, branch_pool]
453
+ return outputs
454
+
455
+ def forward(self, x):
456
+ outputs = self._forward(x)
457
+ return torch.cat(outputs, 1)
458
+
459
+
460
+ class InceptionE(nn.Module):
461
+
462
+ def __init__(self, in_channels, conv_block=None, align_tf=False, use_max_pool=False):
463
+ super(InceptionE, self).__init__()
464
+ if conv_block is None:
465
+ conv_block = BasicConv2d
466
+ self.branch1x1 = conv_block(in_channels, 320, kernel_size=1)
467
+
468
+ self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1)
469
+ self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1))
470
+ self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0))
471
+
472
+ self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1)
473
+ self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1)
474
+ self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1))
475
+ self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0))
476
+
477
+ self.branch_pool = conv_block(in_channels, 192, kernel_size=1)
478
+ self.pool_include_padding = not align_tf
479
+ self.use_max_pool = use_max_pool
480
+
481
+ def _forward(self, x):
482
+ branch1x1 = self.branch1x1(x)
483
+
484
+ branch3x3 = self.branch3x3_1(x)
485
+ branch3x3 = [
486
+ self.branch3x3_2a(branch3x3),
487
+ self.branch3x3_2b(branch3x3),
488
+ ]
489
+ branch3x3 = torch.cat(branch3x3, 1)
490
+
491
+ branch3x3dbl = self.branch3x3dbl_1(x)
492
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
493
+ branch3x3dbl = [
494
+ self.branch3x3dbl_3a(branch3x3dbl),
495
+ self.branch3x3dbl_3b(branch3x3dbl),
496
+ ]
497
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
498
+
499
+ if self.use_max_pool:
500
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
501
+ else:
502
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
503
+ count_include_pad=self.pool_include_padding)
504
+ branch_pool = self.branch_pool(branch_pool)
505
+
506
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
507
+ return outputs
508
+
509
+ def forward(self, x):
510
+ outputs = self._forward(x)
511
+ return torch.cat(outputs, 1)
512
+
513
+
514
+ class InceptionAux(nn.Module):
515
+
516
+ def __init__(self, in_channels, num_classes, conv_block=None):
517
+ super(InceptionAux, self).__init__()
518
+ if conv_block is None:
519
+ conv_block = BasicConv2d
520
+ self.conv0 = conv_block(in_channels, 128, kernel_size=1)
521
+ self.conv1 = conv_block(128, 768, kernel_size=5)
522
+ self.conv1.stddev = 0.01
523
+ self.fc = nn.Linear(768, num_classes)
524
+ self.fc.stddev = 0.001
525
+
526
+ def forward(self, x):
527
+ # N x 768 x 17 x 17
528
+ x = F.avg_pool2d(x, kernel_size=5, stride=3)
529
+ # N x 768 x 5 x 5
530
+ x = self.conv0(x)
531
+ # N x 128 x 5 x 5
532
+ x = self.conv1(x)
533
+ # N x 768 x 1 x 1
534
+ # Adaptive average pooling
535
+ x = F.adaptive_avg_pool2d(x, (1, 1))
536
+ # N x 768 x 1 x 1
537
+ x = torch.flatten(x, 1)
538
+ # N x 768
539
+ x = self.fc(x)
540
+ # N x 1000
541
+ return x
542
+
543
+
544
+ class BasicConv2d(nn.Module):
545
+
546
+ def __init__(self, in_channels, out_channels, **kwargs):
547
+ super(BasicConv2d, self).__init__()
548
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
549
+ self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
550
+
551
+ def forward(self, x):
552
+ x = self.conv(x)
553
+ x = self.bn(x)
554
+ return F.relu(x, inplace=True)
555
+
556
+ # pylint: enable=line-too-long
557
+ # pylint: enable=missing-function-docstring
558
+ # pylint: enable=missing-class-docstring
559
+ # pylint: enable=super-with-arguments
560
+ # pylint: enable=consider-merging-isinstance
561
+ # pylint: enable=import-outside-toplevel
562
+ # pylint: enable=no-else-return
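A typical evaluation-time call looks like the sketch below (not part of the committed file). `build_model(align_tf=True)` caches a TF-aligned Inception-V3 on the GPU and downloads the weights on first use via `utils.misc.download_url`; inputs are RGB tensors in [-1, 1], which the model resizes to 299x299 internally. The batch below is a placeholder.

import torch

inception = InceptionModel.build_model(align_tf=True)

images = torch.rand(8, 3, 256, 256, device='cuda') * 2 - 1  # placeholder batch in [-1, 1]

with torch.no_grad():
    features = inception(images)                # [8, 2048] pool features, used for FID
    probs = inception(images,
                      output_predictions=True,
                      remove_logits_bias=True)  # [8, 1008] TF-style predictions, used for IS
print(features.shape, probs.shape)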
models/perceptual_model.py ADDED
@@ -0,0 +1,519 @@
1
+ # python3.7
2
+ """Contains the VGG16 model, which is used for inference ONLY.
3
+
4
+ VGG16 is commonly used for perceptual feature extraction. The model implemented
5
+ in this file can be used for evaluation (like computing LPIPS, perceptual path
6
+ length, etc.), OR be used in training for loss computation (like perceptual
7
+ loss, etc.).
8
+
9
+ The pre-trained model is officially shared by
10
+
11
+ https://www.robots.ox.ac.uk/~vgg/research/very_deep/
12
+
13
+ and ported by
14
+
15
+ https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt
16
+
17
+ Compared to the official VGG16 model, this ported model also supports evaluating
18
+ LPIPS, which is introduced in
19
+
20
+ https://github.com/richzhang/PerceptualSimilarity
21
+ """
22
+
23
+ import warnings
24
+ import numpy as np
25
+
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ import torch.distributed as dist
30
+
31
+ from utils.misc import download_url
32
+
33
+ __all__ = ['PerceptualModel']
34
+
35
+ # pylint: disable=line-too-long
36
+ _MODEL_URL_SHA256 = {
37
+ # This model is officially provided by `torchvision`.
38
+ 'torchvision_official': (
39
+ 'https://download.pytorch.org/models/vgg16-397923af.pth',
40
+ '397923af8e79cdbb6a7127f12361acd7a2f83e06b05044ddf496e83de57a5bf0' # hash sha256
41
+ ),
42
+
43
+ # This model is provided by https://github.com/NVlabs/stylegan2-ada-pytorch
44
+ 'vgg_perceptual_lpips': (
45
+ 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt',
46
+ 'b437eb095feaeb0b83eb3fa11200ebca4548ee39a07fb944a417ddc516cc07c3' # hash sha256
47
+ )
48
+ }
49
+ # pylint: enable=line-too-long
50
+
51
+
52
+ class PerceptualModel(object):
53
+ """Defines the perceptual model, which is based on VGG16 structure.
54
+
55
+ This is a static class, which is used to avoid building this model
56
+ repeatedly. Consequently, this model is particularly used for inference,
57
+ like computing LPIPS, or for loss computation, like perceptual loss. If
58
+ training is required, please use the model from `torchvision.models` or
59
+ implement by yourself.
60
+
61
+ NOTE: The pre-trained model assumes the inputs to be with `RGB` channel
62
+ order and pixel range [-1, 1], and will NOT resize the input automatically
63
+ if only perceptual feature is needed.
64
+ """
65
+ models = dict()
66
+
67
+ @staticmethod
68
+ def build_model(use_torchvision=False, no_top=True, enable_lpips=True):
69
+ """Builds the model and load pre-trained weights.
70
+
71
+ 1. If `use_torchvision` is set as True, the model released by
72
+ `torchvision` will be loaded, otherwise, the model released by
73
+ https://www.robots.ox.ac.uk/~vgg/research/very_deep/ will be used.
74
+ (default: False)
75
+
76
+ 2. To save computing resources, there is an option to only load the
77
+ backbone (i.e., without the last three fully-connected layers). This
78
+ is commonly used for perceptual loss or LPIPS loss computation.
79
+ Please use argument `no_top` to control this. (default: True)
80
+
81
+ 3. For LPIPS loss computation, some additional weights (which are used
82
+ for balancing the features from different resolutions) are employed
83
+ on top of the original VGG16 backbone. Details can be found at
84
+ https://github.com/richzhang/PerceptualSimilarity. Please use
85
+ `enable_lpips` to enable this feature. (default: True)
86
+
87
+ The built model supports following arguments when forwarding:
88
+
89
+ - resize_input: Whether to resize the input image to size [224, 224]
90
+ before forwarding. For feature-based computation (i.e., only
91
+ convolutional layers are used), image resizing is not essential.
92
+ (default: False)
93
+ - return_tensor: This field resolves the model behavior. Following
94
+ options are supported:
95
+ `feature1`: Before the first max pooling layer.
96
+ `pool1`: After the first max pooling layer.
97
+ `feature2`: Before the second max pooling layer.
98
+ `pool2`: After the second max pooling layer.
99
+ `feature3`: Before the third max pooling layer.
100
+ `pool3`: After the third max pooling layer.
101
+ `feature4`: Before the fourth max pooling layer.
102
+ `pool4`: After the fourth max pooling layer.
103
+ `feature5`: Before the fifth max pooling layer.
104
+ `pool5`: After the fifth max pooling layer.
105
+ `flatten`: The flattened feature, after `adaptive_avgpool`.
106
+ `feature`: The 4096d feature for logits computation. (default)
107
+ `logits`: The 1000d categorical logits.
108
+ `prediction`: The 1000d predicted probability.
109
+ `lpips`: The LPIPS score between two input images.
110
+ """
111
+ if use_torchvision:
112
+ model_source = 'torchvision_official'
113
+ align_tf_resize = False
114
+ is_torch_script = False
115
+ else:
116
+ model_source = 'vgg_perceptual_lpips'
117
+ align_tf_resize = True
118
+ is_torch_script = True
119
+
120
+ if enable_lpips and model_source != 'vgg_perceptual_lpips':
121
+ warnings.warn('The pre-trained model officially released by '
122
+ '`torchvision` does not support LPIPS computation! '
123
+ 'Equal weights will be used for each resolution.')
124
+
125
+ fingerprint = (model_source, no_top, enable_lpips)
126
+
127
+ if fingerprint not in PerceptualModel.models:
128
+ # Build model.
129
+ model = VGG16(align_tf_resize=align_tf_resize,
130
+ no_top=no_top,
131
+ enable_lpips=enable_lpips)
132
+
133
+ # Download pre-trained weights.
134
+ if dist.is_initialized() and dist.get_rank() != 0:
135
+ dist.barrier() # Download by chief.
136
+
137
+ url, sha256 = _MODEL_URL_SHA256[model_source]
138
+ filename = f'perceptual_model_{model_source}_{sha256}.pth'
139
+ model_path, hash_check = download_url(url,
140
+ filename=filename,
141
+ sha256=sha256)
142
+ if is_torch_script:
143
+ src_state_dict = torch.jit.load(model_path, map_location='cpu')
144
+ else:
145
+ src_state_dict = torch.load(model_path, map_location='cpu')
146
+ if hash_check is False:
147
+ warnings.warn(f'Hash check failed! The remote file from URL '
148
+ f'`{url}` may be changed, or the downloading is '
149
+ f'interrupted. The loaded perceptual model may '
150
+ f'have unexpected behavior.')
151
+
152
+ if dist.is_initialized() and dist.get_rank() == 0:
153
+ dist.barrier() # Wait for other replicas.
154
+
155
+ # Load weights.
156
+ dst_state_dict = _convert_weights(src_state_dict, model_source)
157
+ model.load_state_dict(dst_state_dict, strict=False)
158
+ del src_state_dict, dst_state_dict
159
+
160
+ # For inference only.
161
+ model.eval().requires_grad_(False).cuda()
162
+ PerceptualModel.models[fingerprint] = model
163
+
164
+ return PerceptualModel.models[fingerprint]
165
+
166
+
+ def _convert_weights(src_state_dict, model_source):
+ if model_source not in _MODEL_URL_SHA256:
+ raise ValueError(f'Invalid model source `{model_source}`!\n'
+ f'Sources allowed: {list(_MODEL_URL_SHA256.keys())}.')
+ if model_source == 'torchvision_official':
+ dst_to_src_var_mapping = {
+ 'conv11.weight': 'features.0.weight',
+ 'conv11.bias': 'features.0.bias',
+ 'conv12.weight': 'features.2.weight',
+ 'conv12.bias': 'features.2.bias',
+ 'conv21.weight': 'features.5.weight',
+ 'conv21.bias': 'features.5.bias',
+ 'conv22.weight': 'features.7.weight',
+ 'conv22.bias': 'features.7.bias',
+ 'conv31.weight': 'features.10.weight',
+ 'conv31.bias': 'features.10.bias',
+ 'conv32.weight': 'features.12.weight',
+ 'conv32.bias': 'features.12.bias',
+ 'conv33.weight': 'features.14.weight',
+ 'conv33.bias': 'features.14.bias',
+ 'conv41.weight': 'features.17.weight',
+ 'conv41.bias': 'features.17.bias',
+ 'conv42.weight': 'features.19.weight',
+ 'conv42.bias': 'features.19.bias',
+ 'conv43.weight': 'features.21.weight',
+ 'conv43.bias': 'features.21.bias',
+ 'conv51.weight': 'features.24.weight',
+ 'conv51.bias': 'features.24.bias',
+ 'conv52.weight': 'features.26.weight',
+ 'conv52.bias': 'features.26.bias',
+ 'conv53.weight': 'features.28.weight',
+ 'conv53.bias': 'features.28.bias',
+ 'fc1.weight': 'classifier.0.weight',
+ 'fc1.bias': 'classifier.0.bias',
+ 'fc2.weight': 'classifier.3.weight',
+ 'fc2.bias': 'classifier.3.bias',
+ 'fc3.weight': 'classifier.6.weight',
+ 'fc3.bias': 'classifier.6.bias',
+ }
+ elif model_source == 'vgg_perceptual_lpips':
+ src_state_dict = src_state_dict.state_dict()
+ dst_to_src_var_mapping = {
+ 'conv11.weight': 'layers.conv1.weight',
+ 'conv11.bias': 'layers.conv1.bias',
+ 'conv12.weight': 'layers.conv2.weight',
+ 'conv12.bias': 'layers.conv2.bias',
+ 'conv21.weight': 'layers.conv3.weight',
+ 'conv21.bias': 'layers.conv3.bias',
+ 'conv22.weight': 'layers.conv4.weight',
+ 'conv22.bias': 'layers.conv4.bias',
+ 'conv31.weight': 'layers.conv5.weight',
+ 'conv31.bias': 'layers.conv5.bias',
+ 'conv32.weight': 'layers.conv6.weight',
+ 'conv32.bias': 'layers.conv6.bias',
+ 'conv33.weight': 'layers.conv7.weight',
+ 'conv33.bias': 'layers.conv7.bias',
+ 'conv41.weight': 'layers.conv8.weight',
+ 'conv41.bias': 'layers.conv8.bias',
+ 'conv42.weight': 'layers.conv9.weight',
+ 'conv42.bias': 'layers.conv9.bias',
+ 'conv43.weight': 'layers.conv10.weight',
+ 'conv43.bias': 'layers.conv10.bias',
+ 'conv51.weight': 'layers.conv11.weight',
+ 'conv51.bias': 'layers.conv11.bias',
+ 'conv52.weight': 'layers.conv12.weight',
+ 'conv52.bias': 'layers.conv12.bias',
+ 'conv53.weight': 'layers.conv13.weight',
+ 'conv53.bias': 'layers.conv13.bias',
+ 'fc1.weight': 'layers.fc1.weight',
+ 'fc1.bias': 'layers.fc1.bias',
+ 'fc2.weight': 'layers.fc2.weight',
+ 'fc2.bias': 'layers.fc2.bias',
+ 'fc3.weight': 'layers.fc3.weight',
+ 'fc3.bias': 'layers.fc3.bias',
+ 'lpips.0.weight': 'lpips0',
+ 'lpips.1.weight': 'lpips1',
+ 'lpips.2.weight': 'lpips2',
+ 'lpips.3.weight': 'lpips3',
+ 'lpips.4.weight': 'lpips4',
+ }
+ else:
+ raise NotImplementedError(f'Model source `{model_source}` is '
+ f'not implemented!')
+
+ dst_state_dict = {}
+ for dst_name, src_name in dst_to_src_var_mapping.items():
+ if dst_name.startswith('lpips'):
+ dst_state_dict[dst_name] = src_state_dict[src_name].unsqueeze(0)
+ else:
+ dst_state_dict[dst_name] = src_state_dict[src_name].clone()
+ return dst_state_dict
+
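The mapping above renames checkpoint tensors into this module's layer names; for the `torchvision_official` branch the source names are exactly the keys of torchvision's stock VGG16 state dict, so the table can be sanity-checked directly. A small illustrative check, assuming `torchvision` is installed (the two-entry `mapping` below is just a subset of the full table):

import torchvision

vgg = torchvision.models.vgg16()  # random weights are fine for a key check
src_keys = set(vgg.state_dict().keys())
mapping = {
    'conv11.weight': 'features.0.weight',
    'fc3.bias': 'classifier.6.bias',
}
assert all(src_name in src_keys for src_name in mapping.values())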
+
+ _IMG_MEAN = (0.485, 0.456, 0.406)
+ _IMG_STD = (0.229, 0.224, 0.225)
+ _ALLOWED_RETURN = [
+ 'feature1', 'pool1', 'feature2', 'pool2', 'feature3', 'pool3', 'feature4',
+ 'pool4', 'feature5', 'pool5', 'flatten', 'feature', 'logits', 'prediction',
+ 'lpips'
+ ]
+
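The two constants above are the standard ImageNet statistics. Inside `VGG16.forward()` the [-1, 1] inputs are first mapped to [0, 1] and then standardized with them; in isolation that preprocessing amounts to the following minimal sketch (same constants, hypothetical helper name):

import torch

def preprocess(x):
    # x: float tensor in [-1, 1] with shape [N, 3, H, W].
    mean = torch.tensor((0.485, 0.456, 0.406)).view(1, 3, 1, 1)
    std = torch.tensor((0.229, 0.224, 0.225)).view(1, 3, 1, 1)
    return ((x + 1) / 2 - mean) / std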
+ # pylint: disable=missing-function-docstring
+
+ class VGG16(nn.Module):
+ """Defines the VGG16 structure.
+
+ This model takes `RGB` images with data format `NCHW` as raw inputs. The
+ pixel range is assumed to be [-1, 1].
+ """
+
+ def __init__(self, align_tf_resize=False, no_top=True, enable_lpips=True):
+ """Defines the network structure."""
+ super().__init__()
+
+ self.align_tf_resize = align_tf_resize
+ self.no_top = no_top
+ self.enable_lpips = enable_lpips
+
+ self.conv11 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
+ self.relu11 = nn.ReLU(inplace=True)
+ self.conv12 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
+ self.relu12 = nn.ReLU(inplace=True)
+ # output `feature1`, with shape [N, 64, 224, 224]
+
+ self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
+ # output `pool1`, with shape [N, 64, 112, 112]
+
+ self.conv21 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
+ self.relu21 = nn.ReLU(inplace=True)
+ self.conv22 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
+ self.relu22 = nn.ReLU(inplace=True)
+ # output `feature2`, with shape [N, 128, 112, 112]
+
+ self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
+ # output `pool2`, with shape [N, 128, 56, 56]
+
+ self.conv31 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
+ self.relu31 = nn.ReLU(inplace=True)
+ self.conv32 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+ self.relu32 = nn.ReLU(inplace=True)
+ self.conv33 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+ self.relu33 = nn.ReLU(inplace=True)
+ # output `feature3`, with shape [N, 256, 56, 56]
+
+ self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
+ # output `pool3`, with shape [N, 256, 28, 28]
+
+ self.conv41 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
+ self.relu41 = nn.ReLU(inplace=True)
+ self.conv42 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+ self.relu42 = nn.ReLU(inplace=True)
+ self.conv43 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+ self.relu43 = nn.ReLU(inplace=True)
+ # output `feature4`, with shape [N, 512, 28, 28]
+
+ self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
+ # output `pool4`, with shape [N, 512, 14, 14]
+
+ self.conv51 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+ self.relu51 = nn.ReLU(inplace=True)
+ self.conv52 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+ self.relu52 = nn.ReLU(inplace=True)
+ self.conv53 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+ self.relu53 = nn.ReLU(inplace=True)
+ # output `feature5`, with shape [N, 512, 14, 14]
+
+ self.pool5 = nn.MaxPool2d(kernel_size=2, stride=2)
+ # output `pool5`, with shape [N, 512, 7, 7]
+
+ if self.enable_lpips:
+ self.lpips = nn.ModuleList()
+ for idx, ch in enumerate([64, 128, 256, 512, 512]):
+ self.lpips.append(nn.Conv2d(ch, 1, kernel_size=1, bias=False))
+ self.lpips[idx].weight.data.copy_(torch.ones(1, ch, 1, 1))
+
+ if not self.no_top:
+ self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+ self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
+ # output `flatten`, with shape [N, 25088]
+
+ self.fc1 = nn.Linear(512 * 7 * 7, 4096)
+ self.fc1_relu = nn.ReLU(inplace=True)
+ self.fc1_dropout = nn.Dropout(0.5, inplace=False)
+ self.fc2 = nn.Linear(4096, 4096)
+ self.fc2_relu = nn.ReLU(inplace=True)
+ self.fc2_dropout = nn.Dropout(0.5, inplace=False)
+ # output `feature`, with shape [N, 4096]
+
+ self.fc3 = nn.Linear(4096, 1000)
+ # output `logits`, with shape [N, 1000]
+
+ self.out = nn.Softmax(dim=1)
+ # output `prediction`, with shape [N, 1000]
+
+ img_mean = np.array(_IMG_MEAN).reshape((1, 3, 1, 1)).astype(np.float32)
+ img_std = np.array(_IMG_STD).reshape((1, 3, 1, 1)).astype(np.float32)
+ self.register_buffer('img_mean', torch.from_numpy(img_mean))
+ self.register_buffer('img_std', torch.from_numpy(img_std))
+
+ def forward(self,
+ x,
+ y=None,
+ *,
+ resize_input=False,
+ return_tensor='feature'):
+ return_tensor = return_tensor.lower()
+ if return_tensor not in _ALLOWED_RETURN:
+ raise ValueError(f'Invalid output tensor name `{return_tensor}` '
+ f'for perceptual model (VGG16)!\n'
+ f'Names allowed: {_ALLOWED_RETURN}.')
+
+ if return_tensor == 'lpips' and y is None:
+ raise ValueError('Two images are required for LPIPS computation, '
+ 'but only one was received!')
+
+ if return_tensor == 'lpips':
+ assert x.shape == y.shape
+ x = torch.cat([x, y], dim=0)
+ features = []
+
+ if resize_input:
+ if self.align_tf_resize:
+ theta = torch.eye(2, 3).to(x)
+ theta[0, 2] += theta[0, 0] / x.shape[3] - theta[0, 0] / 224
+ theta[1, 2] += theta[1, 1] / x.shape[2] - theta[1, 1] / 224
+ theta = theta.unsqueeze(0).repeat(x.shape[0], 1, 1)
+ grid = F.affine_grid(theta,
+ size=(x.shape[0], x.shape[1], 224, 224),
+ align_corners=False)
+ x = F.grid_sample(x, grid,
+ mode='bilinear',
+ padding_mode='border',
+ align_corners=False)
+ else:
+ x = F.interpolate(x,
+ size=(224, 224),
+ mode='bilinear',
+ align_corners=False)
+ if x.shape[1] == 1:
+ x = x.repeat((1, 3, 1, 1))
+
+ x = (x + 1) / 2
+ x = (x - self.img_mean) / self.img_std
+
+ x = self.conv11(x)
+ x = self.relu11(x)
+ x = self.conv12(x)
+ x = self.relu12(x)
+ if return_tensor == 'feature1':
+ return x
+ if return_tensor == 'lpips':
+ features.append(x)
+
+ x = self.pool1(x)
+ if return_tensor == 'pool1':
+ return x
+
+ x = self.conv21(x)
+ x = self.relu21(x)
+ x = self.conv22(x)
+ x = self.relu22(x)
+ if return_tensor == 'feature2':
+ return x
+ if return_tensor == 'lpips':
+ features.append(x)
+
+ x = self.pool2(x)
+ if return_tensor == 'pool2':
+ return x
+
+ x = self.conv31(x)
+ x = self.relu31(x)
+ x = self.conv32(x)
+ x = self.relu32(x)
+ x = self.conv33(x)
+ x = self.relu33(x)
+ if return_tensor == 'feature3':
+ return x
+ if return_tensor == 'lpips':
+ features.append(x)
+
+ x = self.pool3(x)
+ if return_tensor == 'pool3':
+ return x
+
+ x = self.conv41(x)
+ x = self.relu41(x)
+ x = self.conv42(x)
+ x = self.relu42(x)
+ x = self.conv43(x)
+ x = self.relu43(x)
+ if return_tensor == 'feature4':
+ return x
+ if return_tensor == 'lpips':
+ features.append(x)
+
+ x = self.pool4(x)
+ if return_tensor == 'pool4':
+ return x
+
+ x = self.conv51(x)
+ x = self.relu51(x)
+ x = self.conv52(x)
+ x = self.relu52(x)
+ x = self.conv53(x)
+ x = self.relu53(x)
+ if return_tensor == 'feature5':
+ return x
+ if return_tensor == 'lpips':
+ features.append(x)
+
+ x = self.pool5(x)
+ if return_tensor == 'pool5':
+ return x
+
+ if return_tensor == 'lpips':
+ score = 0
+ assert len(features) == 5
+ for idx in range(5):
+ feature = features[idx]
+ norm = feature.norm(dim=1, keepdim=True)
+ feature = feature / (norm + 1e-10)
+ feature_x, feature_y = feature.chunk(2, dim=0)
+ diff = (feature_x - feature_y).square()
+ score += self.lpips[idx](diff).mean(dim=(2, 3), keepdim=False)
+ return score.sum(dim=1, keepdim=False)
+
+ x = self.avgpool(x)
+ x = self.flatten(x)
+ if return_tensor == 'flatten':
+ return x
+
+ x = self.fc1(x)
+ x = self.fc1_relu(x)
+ x = self.fc1_dropout(x)
+ x = self.fc2(x)
+ x = self.fc2_relu(x)
+ x = self.fc2_dropout(x)
+ if return_tensor == 'feature':
+ return x
+
+ x = self.fc3(x)
+ if return_tensor == 'logits':
+ return x
+
+ x = self.out(x)
+ if return_tensor == 'prediction':
+ return x
+
+ raise NotImplementedError(f'Output tensor name `{return_tensor}` is '
+ f'not implemented!')
+
+ # pylint: enable=missing-function-docstring
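Taken together, `'lpips'` returns, per sample, the sum over the five feature stages of a 1x1-conv-weighted, spatially averaged squared difference between channel-normalized activations, while the other return names expose intermediate tensors. A minimal usage sketch with randomly initialized weights (illustrative only, so the values are meaningless; real use goes through the pre-trained weights loaded above):

import torch

net = VGG16(align_tf_resize=True, no_top=True, enable_lpips=True).eval()

x = torch.rand(2, 3, 256, 256) * 2 - 1  # fake images in [-1, 1], NCHW
y = torch.rand(2, 3, 256, 256) * 2 - 1

with torch.no_grad():
    feat = net(x, resize_input=True, return_tensor='feature5')     # [2, 512, 14, 14]
    lpips_score = net(x, y, resize_input=True, return_tensor='lpips')  # [2]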