diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..ac481c8eb05e4d2496fbe076a38a7b4835dd733d
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index f45107c4085c66ecaf9def52dccec11b14de4bc1..b8e449f3ff8a4951e8122cefa463ce506b590246 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,11 +131,3 @@ dmypy.json
 wandb/
 *.lmdb/
 *.pkl
-
-# results
-results
-results_old
-log
-checkpoint
-*.pt
-*.old
diff --git a/.gitmodules b/.gitmodules
index 3af478d02d57a4780acfe58144c80583f6c61872..38b6813c174912e68f43ef6b96c6566b972cd351 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,4 @@
-[submodule "third_party/face_parsing"]
-	path = third_party/face_parsing
-	url = https://github.com/Time-Travel-Rephotography/face-parsing.PyTorch.git
-[submodule "models/encoder4editing"]
-	path = models/encoder4editing
-	url = https://github.com/Time-Travel-Rephotography/encoder4editing.git
-[submodule "losses/contextual_loss"]
-	path = losses/contextual_loss
-	url = https://github.com/Time-Travel-Rephotography/contextual_loss_pytorch.git
+[submodule "StyleGAN-Human"]
+	path = StyleGAN-Human
+	url = https://github.com/stylegan-human/StyleGAN-Human
+
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index c28c084daacd23c8c48b4fe550a7615f4b5faa5a..0000000000000000000000000000000000000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2020 Time-Travel-Rephotography
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/LICENSE-NVIDIA b/LICENSE-NVIDIA
deleted file mode 100644
index 288fb3247529fc0d19ee2040c29adc65886d9426..0000000000000000000000000000000000000000
--- a/LICENSE-NVIDIA
+++ /dev/null
@@ -1,101 +0,0 @@
-Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
-
-
-Nvidia Source Code License-NC
-
-=======================================================================
-
-1. Definitions
-
-"Licensor" means any person or entity that distributes its Work.
-
-"Software" means the original work of authorship made available under
-this License.
-
-"Work" means the Software and any additions to or derivative works of
-the Software that are made available under this License.
-
-"Nvidia Processors" means any central processing unit (CPU), graphics
-processing unit (GPU), field-programmable gate array (FPGA),
-application-specific integrated circuit (ASIC) or any combination
-thereof designed, made, sold, or provided by Nvidia or its affiliates.
-
-The terms "reproduce," "reproduction," "derivative works," and
-"distribution" have the meaning as provided under U.S. copyright law;
-provided, however, that for the purposes of this License, derivative
-works shall not include works that remain separable from, or merely
-link (or bind by name) to the interfaces of, the Work.
-
-Works, including the Software, are "made available" under this License
-by including in or with the Work either (a) a copyright notice
-referencing the applicability of this License to the Work, or (b) a
-copy of this License.
-
-2. License Grants
-
-    2.1 Copyright Grant. Subject to the terms and conditions of this
-    License, each Licensor grants to you a perpetual, worldwide,
-    non-exclusive, royalty-free, copyright license to reproduce,
-    prepare derivative works of, publicly display, publicly perform,
-    sublicense and distribute its Work and any resulting derivative
-    works in any form.
-
-3. Limitations
-
-    3.1 Redistribution. You may reproduce or distribute the Work only
-    if (a) you do so under this License, (b) you include a complete
-    copy of this License with your distribution, and (c) you retain
-    without modification any copyright, patent, trademark, or
-    attribution notices that are present in the Work.
-
-    3.2 Derivative Works. You may specify that additional or different
-    terms apply to the use, reproduction, and distribution of your
-    derivative works of the Work ("Your Terms") only if (a) Your Terms
-    provide that the use limitation in Section 3.3 applies to your
-    derivative works, and (b) you identify the specific derivative
-    works that are subject to Your Terms. Notwithstanding Your Terms,
-    this License (including the redistribution requirements in Section
-    3.1) will continue to apply to the Work itself.
-
-    3.3 Use Limitation. The Work and any derivative works thereof only
-    may be used or intended for use non-commercially. The Work or
-    derivative works thereof may be used or intended for use by Nvidia
-    or its affiliates commercially or non-commercially. As used herein,
-    "non-commercially" means for research or evaluation purposes only.
-
-    3.4 Patent Claims. If you bring or threaten to bring a patent claim
-    against any Licensor (including any claim, cross-claim or
-    counterclaim in a lawsuit) to enforce any patents that you allege
-    are infringed by any Work, then your rights under this License from
-    such Licensor (including the grants in Sections 2.1 and 2.2) will
-    terminate immediately.
-
-    3.5 Trademarks. This License does not grant any rights to use any
-    Licensor's or its affiliates' names, logos, or trademarks, except
-    as necessary to reproduce the notices described in this License.
-
-    3.6 Termination. If you violate any term of this License, then your
-    rights under this License (including the grants in Sections 2.1 and
-    2.2) will terminate immediately.
-
-4. Disclaimer of Warranty.
-
-THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
-NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
-THIS LICENSE. 
-
-5. Limitation of Liability.
-
-EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
-THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
-SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
-INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
-OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
-(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
-LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
-COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
-THE POSSIBILITY OF SUCH DAMAGES.
-
-=======================================================================
diff --git a/LICENSE-STYLEGAN2 b/LICENSE-STYLEGAN2
deleted file mode 100644
index 915ca760bc639695e152e784d9dc2dbf71369b67..0000000000000000000000000000000000000000
--- a/LICENSE-STYLEGAN2
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2019 Kim Seonghyeon
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
index d23d460c0d4710ebda3d28aafc52f857d9d28f01..320746dfd848fc52ee582fea57df0bbd9b628281 100644
--- a/README.md
+++ b/README.md
@@ -1,119 +1,13 @@
-# [SIGGRAPH Asia 2021] Time-Travel Rephotography
-<a href="https://arxiv.org/abs/2012.12261"><img src="https://img.shields.io/badge/arXiv-2008.00951-b31b1b.svg"></a>
-<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg"></a>
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/15D2WIF_vE2l48ddxEx45cM3RykZwQXM8?usp=sharing)
-### [[Project Website](https://time-travel-rephotography.github.io/)]
-
-<p align='center'>
-<img src="time-travel-rephotography.gif" width='100%'/>
-</p>
-
-Many historical people were only ever captured by old, faded, black and white photos, that are distorted due to the limitations of early cameras and the passage of time. This paper simulates traveling back in time with a modern camera to rephotograph famous subjects. Unlike conventional image restoration filters which apply independent operations like denoising, colorization, and superresolution, we leverage the StyleGAN2 framework to project old photos into the space of modern high-resolution photos, achieving all of these effects in a unified framework. A unique challenge with this approach is retaining the identity and pose of the subject in the original photo, while discarding the many artifacts frequently seen in low-quality antique photos. Our comparisons to current state-of-the-art restoration filters show significant improvements and compelling results for a variety of important historical people. 
-<br/>
-
-**Time-Travel Rephotography**
-<br/>
-[Xuan Luo](https://roxanneluo.github.io),
-[Xuaner Zhang](https://people.eecs.berkeley.edu/~cecilia77/),
-[Paul Yoo](https://www.linkedin.com/in/paul-yoo-768a3715b),
-[Ricardo Martin-Brualla](http://www.ricardomartinbrualla.com/),
-[Jason Lawrence](http://jasonlawrence.info/), and 
-[Steven M. Seitz](https://homes.cs.washington.edu/~seitz/)
-<br/>
-In SIGGRAPH Asia 2021.
-
-## Demo
-We provide an easy-to-get-started demo using Google Colab!
-The Colab will allow you to try our method on the sample Abraham Lincoln photo or **your own photos** using Cloud GPUs on Google Colab.
-
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/15D2WIF_vE2l48ddxEx45cM3RykZwQXM8?usp=sharing)
-
-Or you can run our method on your own machine following the instructions below.
- 
-## Prerequisite
-- Pull third-party packages.
-  ```
-  git submodule update --init --recursive
-  ```
-- Install python packages.
-  ```
-  conda create --name rephotography python=3.8.5
-  conda activate rephotography
-  conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch
-  pip install -r requirements.txt
-  ```
-
-## Quick Start
-Run our method on the example photo of Abraham Lincoln.
-- Download models:
-  ```
-  ./scripts/download_checkpoints.sh
-  ```
-- Run:
-  ```
-  ./scripts/run.sh b "dataset/Abraham Lincoln_01.png" 0.75 
-  ```
-- You can inspect the optimization process by  
-  ```
-  tensorboard --logdir "log/Abraham Lincoln_01"
-  ```
-- You can find your results as below.
-  ```
-  results/
-    Abraham Lincoln_01/       # intermediate outputs for histogram matching and face parsing
-    Abraham Lincoln_01_b.png  # the input after matching the histogram of the sibling image
-    Abraham Lincoln_01-b-G0.75-init(10,18)-s256-vgg1-vggface0.3-eye0.1-color1.0e+10-cx0.1(relu3_4,relu2_2,relu1_2)-NR5.0e+04-lr0.1_0.01-c32-wp(250,750)-init.png        # the sibling image
-    Abraham Lincoln_01-b-G0.75-init(10,18)-s256-vgg1-vggface0.3-eye0.1-color1.0e+10-cx0.1(relu3_4,relu2_2,relu1_2)-NR5.0e+04-lr0.1_0.01-c32-wp(250,750)-init.pt         # the sibing latent codes and initialized noise maps
-    Abraham Lincoln_01-b-G0.75-init(10,18)-s256-vgg1-vggface0.3-eye0.1-color1.0e+10-cx0.1(relu3_4,relu2_2,relu1_2)-NR5.0e+04-lr0.1_0.01-c32-wp(250,750).png             # the output result
-    Abraham Lincoln_01-b-G0.75-init(10,18)-s256-vgg1-vggface0.3-eye0.1-color1.0e+10-cx0.1(relu3_4,relu2_2,relu1_2)-NR5.0e+04-lr0.1_0.01-c32-wp(250,750).pt              # the final optimized latent codes and noise maps
-    Abraham Lincoln_01-b-G0.75-init(10,18)-s256-vgg1-vggface0.3-eye0.1-color1.0e+10-cx0.1(relu3_4,relu2_2,relu1_2)-NR5.0e+04-lr0.1_0.01-c32-wp(250,750)-rand.png        # the result with the final latent codes but random noise maps
-
-  ```
-
-## Run on Your Own Image
-- Crop and align the head regions of your images:
-  ```
-  python -m tools.data.align_images <input_raw_image_dir> <aligned_image_dir>
-  ```
-- Run:
-  ```
-  ./scripts/run.sh <spectral_sensitivity> <input_image_path> <blur_radius>
-  ```
-  The `spectral_sensitivity` can be `b` (blue-sensitive), `gb` (orthochromatic), or `g` (panchromatic). You can roughly estimate the `spectral_sensitivity` of your photo as follows. Use the *blue-sensitive* model for photos before 1873, manually select between blue-sensitive and *orthochromatic* for images from 1873 to 1906 and among all models for photos taken afterwards.
-
-  The `blur_radius` is the estimated gaussian blur radius in pixels if the input photot is resized to 1024x1024.
-  
-## Historical Wiki Face Dataset
-| Path      | Size | Description |
-|----------- | ----------- | ----------- |
-| [Historical Wiki Face Dataset.zip](https://drive.google.com/open?id=1mgC2U7quhKSz_lTL97M-0cPrIILTiUCE&authuser=xuanluo%40cs.washington.edu&usp=drive_fs)| 148 MB | Images|
-| [spectral_sensitivity.json](https://drive.google.com/open?id=1n3Bqd8G0g-wNpshlgoZiOMXxLlOycAXr&authuser=xuanluo%40cs.washington.edu&usp=drive_fs)| 6 KB | Spectral sensitivity (`b`, `gb`, or `g`). |
-| [blur_radius.json](https://drive.google.com/open?id=1n4vUsbQo2BcxtKVMGfD1wFHaINzEmAVP&authuser=xuanluo%40cs.washington.edu&usp=drive_fs)| 6 KB | Blur radius in pixels| 
-
-The `json`s are dictionares that map input names to the corresponding spectral sensitivity or blur radius.
-Due to copyright constraints, `Historical Wiki Face Dataset.zip` contains all images in the *Historical Wiki Face Dataset* that were used in our user study except the photo of [Mao Zedong](https://en.wikipedia.org/wiki/File:Mao_Zedong_in_1959_%28cropped%29.jpg). You can download it separately and crop it as [above](#run-on-your-own-image). 
-
-## Citation
-If you find our code useful, please consider citing our paper:
-```
-@article{Luo-Rephotography-2021,
-  author    = {Luo, Xuan and Zhang, Xuaner and Yoo, Paul and Martin-Brualla, Ricardo and Lawrence, Jason and Seitz, Steven M.},
-  title     = {Time-Travel Rephotography},
-  journal = {ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH Asia 2021)},
-  publisher = {ACM New York, NY, USA},
-  volume = {40},
-  number = {6},
-  articleno = {213},
-  doi = {https://doi.org/10.1145/3478513.3480485},
-  year = {2021},
-  month = {12}
-}
-```
-
-## License
-This work is licensed under MIT License. See [LICENSE](LICENSE) for details.
-
-Codes for the StyleGAN2 model come from [https://github.com/rosinality/stylegan2-pytorch](https://github.com/rosinality/stylegan2-pytorch).
-
-## Acknowledgments
-We thank [Nick Brandreth](https://www.nickbrandreth.com/) for capturing the dry plate photos. We thank Bo Zhang, Qingnan Fan, Roy Or-El, Aleksander Holynski and Keunhong Park for insightful advice.
+---
+title: Time TravelRephotography
+emoji: 🦀
+colorFrom: yellow
+colorTo: red
+sdk: gradio
+sdk_version: 2.9.4
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
diff --git a/app.py b/app.py
index e060194385954f228d451ca3dd6f2d0e5cf8e527..536be32fb8ca8e3abacf5ffa27fe97ebad2a31c0 100644
--- a/app.py
+++ b/app.py
@@ -1,172 +1,109 @@
-from argparse import Namespace
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import argparse
+import functools
 import os
-from os.path import join as pjoin
-import random
+import pickle
 import sys
-from typing import (
-    Iterable,
-    Optional,
-)
 
-import cv2
+import gradio as gr
 import numpy as np
-from PIL import Image
 import torch
-from torch.utils.tensorboard import SummaryWriter
-from torchvision.transforms import (
-    Compose,
-    Grayscale,
-    Resize,
-    ToTensor,
-    Normalize,
-)
-
-from losses.joint_loss import JointLoss
-from model import Generator
-from tools.initialize import Initializer
-from tools.match_skin_histogram import match_skin_histogram
-from utils.projector_arguments import ProjectorArguments
-from utils import torch_helpers as th
-from utils.torch_helpers import make_image
-from utils.misc import stem
-from utils.optimize import Optimizer
-from models.degrade import (
-    Degrade,
-    Downsample,
-)
-
-
-def set_random_seed(seed: int):
-    # FIXME (xuanluo): this setup still allows randomness somehow
-    torch.manual_seed(seed)
-    random.seed(seed)
-    np.random.seed(seed)
-
-
-def read_images(paths: str, max_size: Optional[int] = None):
-    transform = Compose(
-        [
-            Grayscale(),
-            ToTensor(),
-        ]
-    )
+import torch_utils
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
 
-    imgs = []
-    for path in paths:
-        img = Image.open(path)
-        if max_size is not None and img.width > max_size:
-            img = img.resize((max_size, max_size))
-        img = transform(img)
-        imgs.append(img)
-    imgs = torch.stack(imgs, 0)
-    return imgs
-
-
-def normalize(img: torch.Tensor, mean=0.5, std=0.5):
-    """[0, 1] -> [-1, 1]"""
-    return (img - mean) / std
-
-
-def create_generator(args: Namespace, device: torch.device):
-    generator = Generator(args.generator_size, 512, 8)
-    generator.load_state_dict(torch.load(args.ckpt)['g_ema'], strict=False)
-    generator.eval()
-    generator = generator.to(device)
-    return generator
-
-
-def save(
-        path_prefixes: Iterable[str],
-        imgs: torch.Tensor,  # BCHW
-        latents: torch.Tensor,
-        noises: torch.Tensor,
-        imgs_rand: Optional[torch.Tensor] = None,
-):
-    assert len(path_prefixes) == len(imgs) and len(latents) == len(path_prefixes)
-    if imgs_rand is not None:
-        assert len(imgs) == len(imgs_rand)
-    imgs_arr = make_image(imgs)
-    for path_prefix, img, latent, noise in zip(path_prefixes, imgs_arr, latents, noises):
-        os.makedirs(os.path.dirname(path_prefix), exist_ok=True)
-        cv2.imwrite(path_prefix + ".png", img[...,::-1])
-        torch.save({"latent": latent.detach().cpu(), "noise": noise.detach().cpu()},
-                path_prefix + ".pt")
-
-    if imgs_rand is not None:
-        imgs_arr = make_image(imgs_rand)
-        for path_prefix, img in zip(path_prefixes, imgs_arr):
-            cv2.imwrite(path_prefix + "-rand.png", img[...,::-1])
-
-
-def main(args):
-    opt_str = ProjectorArguments.to_string(args)
-    print(opt_str)
-
-    if args.rand_seed is not None:
-        set_random_seed(args.rand_seed)
-    device = th.device()
-
-    # read inputs. TODO imgs_orig has channel 1
-    imgs_orig = read_images([args.input], max_size=args.generator_size).to(device)
-    imgs = normalize(imgs_orig)  # actually this will be overwritten by the histogram matching result
-
-    # initialize
-    with torch.no_grad():
-        init = Initializer(args).to(device)
-        latent_init = init(imgs_orig)
-
-    # create generator
-    generator = create_generator(args, device)
-
-    # init noises
-    with torch.no_grad():
-        noises_init = generator.make_noise()
-
-    # create a new input by matching the input's histogram to the sibling image
-    with torch.no_grad():
-        sibling, _, sibling_rgbs = generator([latent_init], input_is_latent=True, noise=noises_init)
-    mh_dir = pjoin(args.results_dir, stem(args.input))
-    imgs = match_skin_histogram(
-        imgs, sibling,
-        args.spectral_sensitivity,
-        pjoin(mh_dir, "input_sibling"),
-        pjoin(mh_dir, "skin_mask"),
-        matched_hist_fn=mh_dir.rstrip(os.sep) + f"_{args.spectral_sensitivity}.png",
-        normalize=normalize,
-    ).to(device)
-    torch.cuda.empty_cache()
-    # TODO imgs has channel 3
-
-    degrade = Degrade(args).to(device)
-
-    rgb_levels = generator.get_latent_size(args.coarse_min) // 2 + len(args.wplus_step) - 1
-    criterion = JointLoss(
-            args, imgs,
-            sibling=sibling.detach(), sibling_rgbs=sibling_rgbs[:rgb_levels]).to(device)
-
-    # save initialization
-    save(
-        [pjoin(args.results_dir, f"{stem(args.input)}-{opt_str}-init")],
-        sibling, latent_init, noises_init,
-    )
+sys.path.insert(0, 'StyleGAN-Human')
+
+TITLE = 'StyleGAN-Human'
+DESCRIPTION = '''This is an unofficial demo for https://github.com/stylegan-human/StyleGAN-Human.
+Expected execution time on Hugging Face Spaces: 0.8s
+Related App: [StyleGAN-Human (Interpolation)](https://huggingface.co/spaces/hysts/StyleGAN-Human-Interpolation)
+'''
+ARTICLE = '<center><img src="https://visitor-badge.glitch.me/badge?page_id=hysts.stylegan-human" alt="visitor badge"/></center>'
+
+TOKEN = os.environ.get("HF_TOKEN")  # Hub token comes from the environment (e.g. a Space secret); None means anonymous access. Never commit credentials.
+
+
+def parse_args() -> argparse.Namespace:  # build and parse the command-line options read by main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--device', type=str, default='cpu')  # handed to torch.device() in main()
+    parser.add_argument('--theme', type=str)
+    parser.add_argument('--live', action='store_true')
+    parser.add_argument('--share', action='store_true')
+    parser.add_argument('--port', type=int)
+    parser.add_argument('--disable-queue',  # inverted flag: stored as args.enable_queue, which is True unless this is given
+                        dest='enable_queue',
+                        action='store_false')
+    parser.add_argument('--allow-flagging', type=str, default='never')
+    return parser.parse_args()
+
+
+def generate_z(z_dim: int, seed: int, device: torch.device) -> torch.Tensor:  # seeded standard-normal latent of shape (1, z_dim), float32, on `device`
+    return torch.from_numpy(np.random.RandomState(seed).randn(  # a fresh RandomState per call, so equal seeds give equal latents
+        1, z_dim)).to(device).float()
 
-    writer = SummaryWriter(pjoin(args.log_dir, f"{stem(args.input)}/{opt_str}"))
-    # start optimize
-    latent, noises = Optimizer.optimize(generator, criterion, degrade, imgs, latent_init, noises_init, args, writer=writer)
-
-    # generate output
-    img_out, _, _ = generator([latent], input_is_latent=True, noise=noises)
-    img_out_rand_noise, _, _ = generator([latent], input_is_latent=True)
-    # save output
-    save(
-        [pjoin(args.results_dir, f"{stem(args.input)}-{opt_str}")],
-        img_out, latent, noises,
-        imgs_rand=img_out_rand_noise
-    )
 
+@torch.inference_mode()  # run generation without autograd tracking
+def generate_image(seed: int, truncation_psi: float, model: nn.Module,
+                   device: torch.device) -> np.ndarray:
+    seed = int(np.clip(seed, 0, np.iinfo(np.uint32).max))  # clamp into the uint32 range and cast to int before it is used as a RandomState seed
+
+    z = generate_z(model.z_dim, seed, device)
+    label = torch.zeros([1, model.c_dim], device=device)  # all-zero label of width model.c_dim
+
+    out = model(z, label, truncation_psi=truncation_psi, force_fp32=True)
+    out = (out.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)  # move dim 1 last, rescale, clamp to [0, 255]; presumably the model emits NCHW in [-1, 1] - TODO confirm
+    return out[0].cpu().numpy()  # first batch item as a NumPy array
+
+
+def load_model(file_name: str, device: torch.device) -> nn.Module:  # download a pickled checkpoint from the Hub and return its 'G_ema' entry in eval mode on `device`
+    path = hf_hub_download('feng2022/Time-TravelRephotography',
+                           f'{file_name}',
+                           use_auth_token=TOKEN)
+    with open(path, 'rb') as f:
+        model = pickle.load(f)['G_ema']  # SECURITY: unpickling can execute arbitrary code; only load checkpoints from a trusted repository
+    model.eval()
+    model.to(device)
+    with torch.inference_mode():
+        z = torch.zeros((1, model.z_dim)).to(device)
+        label = torch.zeros([1, model.c_dim], device=device)
+        model(z, label, force_fp32=True)  # one forward pass whose result is discarded; presumably a warm-up - TODO confirm
+    return model
+
+
+def main():  # parse CLI options, load the generator, and launch the Gradio interface
+    args = parse_args()
+    device = torch.device(args.device)
+
+    model = load_model('stylegan_human_v2_1024.pkl', device)
+
+    func = functools.partial(generate_image, model=model, device=device)  # bind model/device so the UI supplies only seed and truncation_psi
+    func = functools.update_wrapper(func, generate_image)  # copy generate_image's metadata (name, docstring, ...) onto the partial
+
+    gr.Interface(
+        func,
+        [  # input order matches generate_image's leading positional parameters: seed, truncation_psi
+            gr.inputs.Number(default=0, label='Seed'),
+            gr.inputs.Slider(
+                0, 2, step=0.05, default=0.7, label='Truncation psi'),
+        ],
+        gr.outputs.Image(type='numpy', label='Output'),  # generate_image returns a NumPy array
+        title=TITLE,
+        description=DESCRIPTION,
+        article=ARTICLE,
+        theme=args.theme,
+        allow_flagging=args.allow_flagging,
+        live=args.live,
+    ).launch(
+        enable_queue=args.enable_queue,
+        server_port=args.port,
+        share=args.share,
+    )
 
-def parse_args():
-    return ProjectorArguments().parse()
 
-if __name__ == "__main__":
-    sys.exit(main(parse_args()))
+if __name__ == '__main__':
+    main()
+ 
\ No newline at end of file
diff --git a/dnnlib/__init__.py b/dnnlib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef2c9a6a3f95f9fe55baccad83c9e94842c42453
--- /dev/null
+++ b/dnnlib/__init__.py
@@ -0,0 +1,11 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+from .util import EasyDict, make_cache_dir_path
diff --git a/dnnlib/tflib/__init__.py b/dnnlib/tflib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7013e8cf7ed660e50bb984226c95052792979b12
--- /dev/null
+++ b/dnnlib/tflib/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+from . import autosummary
+from . import network
+from . import optimizer
+from . import tfutil
+from . import custom_ops
+
+from .tfutil import *
+from .network import Network
+
+from .optimizer import Optimizer
+
+from .custom_ops import get_plugin
diff --git a/dnnlib/tflib/autosummary.py b/dnnlib/tflib/autosummary.py
new file mode 100644
index 0000000000000000000000000000000000000000..ede0f23dc3106112d241c70a8d4c17b2fa2af50d
--- /dev/null
+++ b/dnnlib/tflib/autosummary.py
@@ -0,0 +1,193 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Helper for adding automatically tracked values to Tensorboard.
+
+Autosummary creates an identity op that internally keeps track of the input
+values and automatically shows up in TensorBoard. The reported value
+represents an average over input components. The average is accumulated
+constantly over time and flushed when save_summaries() is called.
+
+Notes:
+- The output tensor must be used as an input for something else in the
+  graph. Otherwise, the autosummary op will not get executed, and the average
+  value will not get accumulated.
+- It is perfectly fine to include autosummaries with the same name in
+  several places throughout the graph, even if they are executed concurrently.
+- It is ok to also pass in a python scalar or numpy array. In this case, it
+  is added to the average immediately.
+"""
+
+from collections import OrderedDict
+import numpy as np
+import tensorflow as tf
+from tensorboard import summary as summary_lib
+from tensorboard.plugins.custom_scalar import layout_pb2
+
+from . import tfutil
+from .tfutil import TfExpression
+from .tfutil import TfExpressionEx
+
+# Enable "Custom scalars" tab in TensorBoard for advanced formatting.
+# Disabled by default to reduce tfevents file size.
+enable_custom_scalars = False
+
+# Module-level autosummary state shared by the functions in this file.
+_dtype = tf.float64  # dtype of the accumulator variables and of the values cast into them
+_vars = OrderedDict()  # name => [var, ...]
+_immediate = OrderedDict()  # name => update_op, update_value
+_finalized = False  # set to True by finalize_autosummaries(); _create_var() asserts it is still False
+_merge_op = None  # merged summary op, created lazily on the first save_summaries() call
+
+
+def _create_var(name: str, value_expr: TfExpression) -> TfExpression:
+    """Internal helper for creating autosummary accumulators.
+
+    Creates a non-trainable 3-element variable holding [sum(1), sum(x), sum(x**2)]
+    for the tracked value, registers it in `_vars` under `name`, and returns the
+    op that folds the moments of `value_expr` into that variable.
+
+    Args:
+        name:       Autosummary name; "/" is replaced by "_" to form the TF name scope.
+        value_expr: TensorFlow expression to accumulate; it is cast to `_dtype`.
+
+    Returns:
+        The update op (an assign or assign_add on the accumulator variable).
+    """
+    assert not _finalized
+    name_id = name.replace("/", "_")
+    v = tf.cast(value_expr, _dtype)
+
+    # Element count of v: a Python constant when the static shape is fully known,
+    # otherwise computed in-graph from the dynamic shape.
+    if v.shape.is_fully_defined():
+        size = np.prod(v.shape.as_list())
+        size_expr = tf.constant(size, dtype=_dtype)
+    else:
+        size = None
+        size_expr = tf.reduce_prod(tf.cast(tf.shape(v), _dtype))
+
+    # Build the moments [count, sum(x), sum(x**2)]; a single element needs no reduction.
+    if size == 1:
+        if v.shape.ndims != 0:
+            v = tf.reshape(v, [])
+        v = [size_expr, v, tf.square(v)]
+    else:
+        v = [size_expr, tf.reduce_sum(v), tf.reduce_sum(tf.square(v))]
+    # Contribute all-zero moments when the sum is not finite, so such a sample is ignored.
+    v = tf.cond(tf.is_finite(v[1]), lambda: tf.stack(v), lambda: tf.zeros(3, dtype=_dtype))
+
+    with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.control_dependencies(None):
+        var = tf.Variable(tf.zeros(3, dtype=_dtype), trainable=False)  # [sum(1), sum(x), sum(x**2)]
+    # Plain assign while the variable is still uninitialized, accumulate afterwards.
+    update_op = tf.cond(tf.is_variable_initialized(var), lambda: tf.assign_add(var, v), lambda: tf.assign(var, v))
+
+    # Several accumulators may share one name; they are summed in finalize_autosummaries().
+    if name in _vars:
+        _vars[name].append(var)
+    else:
+        _vars[name] = [var]
+    return update_op
+
+
+def autosummary(name: str, value: TfExpressionEx, passthru: TfExpressionEx = None, condition: TfExpressionEx = True) -> TfExpressionEx:
+    """Create a new autosummary.
+
+    Args:
+        name:      Name to use in TensorBoard
+        value:     TensorFlow expression or python value to track
+        passthru:  Optionally return this TF node without modifications but tack an autosummary update side-effect to this node.
+        condition: The value is only accumulated when this is true. For a TensorFlow `value` it is converted to a
+                   tensor and evaluated in the graph; for a python `value` it must be a plain python value.
+
+    Returns:
+        For a TensorFlow `value`: an identity op of `passthru` (or of `value` if `passthru` is None) that
+        depends on the accumulator update. For a python `value`: `passthru` (or `value`) unchanged.
+
+    Example use of the passthru mechanism:
+
+    n = autosummary('l2loss', loss, passthru=n)
+
+    This is a shorthand for the following code:
+
+    with tf.control_dependencies([autosummary('l2loss', loss)]):
+        n = tf.identity(n)
+    """
+    tfutil.assert_tf_initialized()
+    name_id = name.replace("/", "_")
+
+    if tfutil.is_tf_expression(value):
+        # Graph mode: the update runs as a side effect whenever the returned tensor is evaluated.
+        with tf.name_scope("summary_" + name_id), tf.device(value.device):
+            condition = tf.convert_to_tensor(condition, name='condition')
+            update_op = tf.cond(condition, lambda: tf.group(_create_var(name, value)), tf.no_op)
+            with tf.control_dependencies([update_op]):
+                return tf.identity(value if passthru is None else passthru)
+
+    else:  # python scalar or numpy array
+        assert not tfutil.is_tf_expression(passthru)
+        assert not tfutil.is_tf_expression(condition)
+        if condition:
+            # Immediate mode: one placeholder-fed accumulator per name, created on first use and reused afterwards.
+            if name not in _immediate:
+                with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.device(None), tf.control_dependencies(None):
+                    update_value = tf.placeholder(_dtype)
+                    update_op = _create_var(name, update_value)
+                    _immediate[name] = update_op, update_value
+            update_op, update_value = _immediate[name]
+            tfutil.run(update_op, {update_value: value})
+        return value if passthru is None else passthru
+
+
+def finalize_autosummaries() -> None:
+    """Create the necessary ops to include autosummaries in TensorBoard report.
+    Note: This should be done only once per graph.
+    """
+    global _finalized
+    tfutil.assert_tf_initialized()
+
+    if _finalized:
+        return None
+
+    _finalized = True
+    tfutil.init_uninitialized_vars([var for vars_list in _vars.values() for var in vars_list])
+
+    # Create summary ops.
+    with tf.device(None), tf.control_dependencies(None):
+        for name, vars_list in _vars.items():
+            name_id = name.replace("/", "_")
+            with tfutil.absolute_name_scope("Autosummary/" + name_id):
+                moments = tf.add_n(vars_list)
+                moments /= moments[0]
+                with tf.control_dependencies([moments]):  # read before resetting
+                    reset_ops = [tf.assign(var, tf.zeros(3, dtype=_dtype)) for var in vars_list]
+                    with tf.name_scope(None), tf.control_dependencies(reset_ops):  # reset before reporting
+                        mean = moments[1]
+                        std = tf.sqrt(moments[2] - tf.square(moments[1]))
+                        tf.summary.scalar(name, mean)
+                        if enable_custom_scalars:
+                            tf.summary.scalar("xCustomScalars/" + name + "/margin_lo", mean - std)
+                            tf.summary.scalar("xCustomScalars/" + name + "/margin_hi", mean + std)
+
+    # Setup layout for custom scalars.
+    layout = None
+    if enable_custom_scalars:
+        cat_dict = OrderedDict()
+        for series_name in sorted(_vars.keys()):
+            p = series_name.split("/")
+            cat = p[0] if len(p) >= 2 else ""
+            chart = "/".join(p[1:-1]) if len(p) >= 3 else p[-1]
+            if cat not in cat_dict:
+                cat_dict[cat] = OrderedDict()
+            if chart not in cat_dict[cat]:
+                cat_dict[cat][chart] = []
+            cat_dict[cat][chart].append(series_name)
+        categories = []
+        for cat_name, chart_dict in cat_dict.items():
+            charts = []
+            for chart_name, series_names in chart_dict.items():
+                series = []
+                for series_name in series_names:
+                    series.append(layout_pb2.MarginChartContent.Series(
+                        value=series_name,
+                        lower="xCustomScalars/" + series_name + "/margin_lo",
+                        upper="xCustomScalars/" + series_name + "/margin_hi"))
+                margin = layout_pb2.MarginChartContent(series=series)
+                charts.append(layout_pb2.Chart(title=chart_name, margin=margin))
+            categories.append(layout_pb2.Category(title=cat_name, chart=charts))
+        layout = summary_lib.custom_scalar_pb(layout_pb2.Layout(category=categories))
+    return layout
+
+def save_summaries(file_writer, global_step=None):
+    """Call FileWriter.add_summary() with all summaries in the default graph,
+    automatically finalizing and merging them on the first call.
+    """
+    global _merge_op
+    tfutil.assert_tf_initialized()
+
+    if _merge_op is None:
+        layout = finalize_autosummaries()
+        if layout is not None:
+            file_writer.add_summary(layout)
+        with tf.device(None), tf.control_dependencies(None):
+            _merge_op = tf.summary.merge_all()
+
+    file_writer.add_summary(_merge_op.eval(), global_step)
diff --git a/dnnlib/tflib/custom_ops.py b/dnnlib/tflib/custom_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a09ac5dc2a5de80d22a5593ed7725551737d59af
--- /dev/null
+++ b/dnnlib/tflib/custom_ops.py
@@ -0,0 +1,171 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""TensorFlow custom ops builder.
+"""
+
+import os
+import re
+import uuid
+import hashlib
+import tempfile
+import shutil
+import tensorflow as tf
+from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module
+
+#----------------------------------------------------------------------------
+# Global options.
+
+# Directory in which get_plugin() stores the compiled plugin binaries.
+cuda_cache_path = os.path.join(os.path.dirname(__file__), '_cudacache')
+# Mixed into the build hash by get_plugin(), so changing it yields new cache file names.
+cuda_cache_version_tag = 'v1'
+do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe!
+verbose = True # Print status messages to stdout.
+
+# Candidate host-compiler directories; _find_compiler_bindir() returns the first one that exists.
+compiler_bindir_search_path = [
+    'C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Tools/MSVC/14.14.26428/bin/Hostx64/x64',
+    'C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/VC/Tools/MSVC/14.23.28105/bin/Hostx64/x64',
+    'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin',
+]
+
+#----------------------------------------------------------------------------
+# Internal helper funcs.
+
+def _find_compiler_bindir():
+    for compiler_path in compiler_bindir_search_path:
+        if os.path.isdir(compiler_path):
+            return compiler_path
+    return None
+
+def _get_compute_cap(device):
+    caps_str = device.physical_device_desc
+    m = re.search('compute capability: (\\d+).(\\d+)', caps_str)
+    major = m.group(1)
+    minor = m.group(2)
+    return (major, minor)
+
+def _get_cuda_gpu_arch_string():
+    gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU']
+    if len(gpus) == 0:
+        raise RuntimeError('No GPU devices found')
+    (major, minor) = _get_compute_cap(gpus[0])
+    return 'sm_%s%s' % (major, minor)
+
+def _run_cmd(cmd):
+    with os.popen(cmd) as pipe:
+        output = pipe.read()
+        status = pipe.close()
+    if status is not None:
+        raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output))
+
+def _prepare_nvcc_cli(opts):
+    cmd = 'nvcc ' + opts.strip()
+    cmd += ' --disable-warnings'
+    cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')
+
+    compiler_bindir = _find_compiler_bindir()
+    if compiler_bindir is None:
+        # Require that _find_compiler_bindir succeeds on Windows.  Allow
+        # nvcc to use whatever is the default on Linux.
+        if os.name == 'nt':
+            raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__)
+    else:
+        cmd += ' --compiler-bindir "%s"' % compiler_bindir
+    cmd += ' 2>&1'
+    return cmd
+
+#----------------------------------------------------------------------------
+# Main entry point.
+
+_plugin_cache = dict()
+
+def get_plugin(cuda_file):
+    cuda_file_base = os.path.basename(cuda_file)
+    cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)
+
+    # Already in cache?
+    if cuda_file in _plugin_cache:
+        return _plugin_cache[cuda_file]
+
+    # Setup plugin.
+    if verbose:
+        print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
+    try:
+        # Hash CUDA source.
+        md5 = hashlib.md5()
+        with open(cuda_file, 'rb') as f:
+            md5.update(f.read())
+        md5.update(b'\n')
+
+        # Hash headers included by the CUDA code by running it through the preprocessor.
+        if not do_not_hash_included_headers:
+            if verbose:
+                print('Preprocessing... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
+                _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
+                with open(tmp_file, 'rb') as f:
+                    bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros
+                    good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
+                    for ln in f:
+                        if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas
+                            ln = ln.replace(bad_file_str, good_file_str)
+                            md5.update(ln)
+                    md5.update(b'\n')
+
+        # Select compiler options.
+        compile_opts = ''
+        if os.name == 'nt':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
+        elif os.name == 'posix':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
+            compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
+        else:
+            assert False # not Windows or Linux, w00t?
+        compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
+        compile_opts += ' --use_fast_math'
+        nvcc_cmd = _prepare_nvcc_cli(compile_opts)
+
+        # Hash build configuration.
+        md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
+        md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
+        md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')
+
+        # Compile if not already compiled.
+        bin_file_ext = '.dll' if os.name == 'nt' else '.so'
+        bin_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
+        if not os.path.isfile(bin_file):
+            if verbose:
+                print('Compiling... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
+                _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
+                os.makedirs(cuda_cache_path, exist_ok=True)
+                intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
+                shutil.copyfile(tmp_file, intermediate_file)
+                os.rename(intermediate_file, bin_file) # atomic
+
+        # Load.
+        if verbose:
+            print('Loading... ', end='', flush=True)
+        plugin = tf.load_op_library(bin_file)
+
+        # Add to cache.
+        _plugin_cache[cuda_file] = plugin
+        if verbose:
+            print('Done.', flush=True)
+        return plugin
+
+    except:
+        if verbose:
+            print('Failed!', flush=True)
+        raise
+
+#----------------------------------------------------------------------------
diff --git a/dnnlib/tflib/network.py b/dnnlib/tflib/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfa73dc5ff2051689d16159871d2bc7e31294502
--- /dev/null
+++ b/dnnlib/tflib/network.py
@@ -0,0 +1,592 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Helper for managing networks."""
+
+import types
+import inspect
+import re
+import uuid
+import sys
+import numpy as np
+import tensorflow as tf
+
+from collections import OrderedDict
+from typing import Any, List, Tuple, Union
+
+from . import tfutil
+from .. import util
+
+from .tfutil import TfExpression, TfExpressionEx
+
+_import_handlers = []  # Custom import handlers for dealing with legacy data in pickle import.
+_import_module_src = dict()  # Source code for temporary modules created during pickle import.
+
+
+def import_handler(handler_func):
+    """Function decorator for declaring custom import handlers.
+
+    The handler is appended to `_import_handlers`. During pickle import
+    (`Network.__setstate__`) each registered handler is called in registration
+    order with the state dict and must return the (possibly modified) state.
+    The decorated function itself is returned unchanged.
+    """
+    _import_handlers.append(handler_func)
+    return handler_func
+
+
+class Network:
+    """Generic network abstraction.
+
+    Acts as a convenience wrapper for a parameterized network construction
+    function, providing several utility methods and convenient access to
+    the inputs/outputs/weights.
+
+    Network objects can be safely pickled and unpickled for long-term
+    archival purposes. The pickling works reliably as long as the underlying
+    network construction function is defined in a standalone Python module
+    that has no side effects or application-specific imports.
+
+    Args:
+        name: Network name. Used to select TensorFlow name and variable scopes.
+        func_name: Fully qualified name of the underlying network construction function, or a top-level function object.
+        static_kwargs: Keyword arguments to be passed in to the network construction function.
+
+    Attributes:
+        name: User-specified name, defaults to build func name if None.
+        scope: Unique TensorFlow scope containing template graph and variables, derived from the user-specified name.
+        static_kwargs: Arguments passed to the user-supplied build func.
+        components: Container for sub-networks. Passed to the build func, and retained between calls.
+        num_inputs: Number of input tensors.
+        num_outputs: Number of output tensors.
+        input_shapes: Input tensor shapes (NC or NCHW), including minibatch dimension.
+        output_shapes: Output tensor shapes (NC or NCHW), including minibatch dimension.
+        input_shape: Short-hand for input_shapes[0].
+        output_shape: Short-hand for output_shapes[0].
+        input_templates: Input placeholders in the template graph.
+        output_templates: Output tensors in the template graph.
+        input_names: Name string for each input.
+        output_names: Name string for each output.
+        own_vars: Variables defined by this network (local_name => var), excluding sub-networks.
+        vars: All variables (local_name => var).
+        trainables: All trainable variables (local_name => var).
+        var_global_to_local: Mapping from variable global names to local names.
+    """
+
+    def __init__(self, name: str = None, func_name: Any = None, **static_kwargs):
+        """Create a network from its build function and initialize its own variables.
+
+        See the class docstring for the meaning of `name`, `func_name` and
+        `static_kwargs`. `static_kwargs` must be pickleable.
+        """
+        tfutil.assert_tf_initialized()
+        assert isinstance(name, str) or name is None
+        assert func_name is not None
+        assert isinstance(func_name, str) or util.is_top_level_function(func_name)
+        assert util.is_pickleable(static_kwargs)
+
+        self._init_fields()
+        self.name = name
+        self.static_kwargs = util.EasyDict(static_kwargs)
+
+        # Locate the user-specified network build function.
+        if util.is_top_level_function(func_name):
+            func_name = util.get_top_level_function_name(func_name)
+        module, self._build_func_name = util.get_module_from_obj_name(func_name)
+        self._build_func = util.get_obj_from_module(module, self._build_func_name)
+        assert callable(self._build_func)
+
+        # Dig up source code for the module containing the build function.
+        # Modules created during pickle import have their source recorded in _import_module_src;
+        # for any other module the source is read with inspect.getsource().
+        self._build_module_src = _import_module_src.get(module, None)
+        if self._build_module_src is None:
+            self._build_module_src = inspect.getsource(module)
+
+        # Init TensorFlow graph.
+        self._init_graph()
+        self.reset_own_vars()
+
+    def _init_fields(self) -> None:
+        """Reset every public attribute and private field of the network to its empty default."""
+        # Public attributes; their meaning is described in the class docstring.
+        self.name = None
+        self.scope = None
+        self.static_kwargs = util.EasyDict()
+        self.components = util.EasyDict()
+        self.num_inputs = 0
+        self.num_outputs = 0
+        self.input_shapes = [[]]
+        self.output_shapes = [[]]
+        self.input_shape = []
+        self.output_shape = []
+        self.input_templates = []
+        self.output_templates = []
+        self.input_names = []
+        self.output_names = []
+        self.own_vars = OrderedDict()
+        self.vars = OrderedDict()
+        self.trainables = OrderedDict()
+        self.var_global_to_local = OrderedDict()
+
+        # Private fields.
+        self._build_func = None  # User-supplied build function that constructs the network.
+        self._build_func_name = None  # Name of the build function.
+        self._build_module_src = None  # Full source code of the module containing the build function.
+        self._run_cache = dict()  # Cached graph data for Network.run().
+
+    def _init_graph(self) -> None:
+        """Build the template graph and populate the input/output/variable attributes.
+
+        Requires `self._build_func`, `self._build_func_name`, `self.static_kwargs`
+        and `self.components` to be set. Picks a unique scope, calls the build
+        function once on float32 placeholders, and records shapes, names and
+        variables of the result.
+
+        Raises:
+            ValueError: If an input or output has undefined rank, or if the
+                components are not uniquely named Network instances.
+        """
+        # Collect inputs.
+        self.input_names = []
+
+        # Every positional-or-keyword parameter without a default value is a network input.
+        for param in inspect.signature(self._build_func).parameters.values():
+            if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty:
+                self.input_names.append(param.name)
+
+        self.num_inputs = len(self.input_names)
+        assert self.num_inputs >= 1
+
+        # Choose name and scope.
+        if self.name is None:
+            self.name = self._build_func_name
+        assert re.match("^[A-Za-z0-9_.\\-]*$", self.name)
+        with tf.name_scope(None):
+            self.scope = tf.get_default_graph().unique_name(self.name, mark_as_used=True)
+
+        # Finalize build func kwargs.
+        build_kwargs = dict(self.static_kwargs)
+        build_kwargs["is_template_graph"] = True
+        build_kwargs["components"] = self.components
+
+        # Build template graph.
+        with tfutil.absolute_variable_scope(self.scope, reuse=False), tfutil.absolute_name_scope(self.scope):  # ignore surrounding scopes
+            assert tf.get_variable_scope().name == self.scope
+            assert tf.get_default_graph().get_name_scope() == self.scope
+            with tf.control_dependencies(None):  # ignore surrounding control dependencies
+                self.input_templates = [tf.placeholder(tf.float32, name=name) for name in self.input_names]
+                out_expr = self._build_func(*self.input_templates, **build_kwargs)
+
+        # Collect outputs.
+        assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple)
+        self.output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr)
+        self.num_outputs = len(self.output_templates)
+        assert self.num_outputs >= 1
+        assert all(tfutil.is_tf_expression(t) for t in self.output_templates)
+
+        # Perform sanity checks.
+        if any(t.shape.ndims is None for t in self.input_templates):
+            raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.")
+        if any(t.shape.ndims is None for t in self.output_templates):
+            raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.")
+        if any(not isinstance(comp, Network) for comp in self.components.values()):
+            raise ValueError("Components of a Network must be Networks themselves.")
+        if len(self.components) != len(set(comp.name for comp in self.components.values())):
+            raise ValueError("Components of a Network must have unique names.")
+
+        # List inputs and outputs.
+        self.input_shapes = [t.shape.as_list() for t in self.input_templates]
+        self.output_shapes = [t.shape.as_list() for t in self.output_templates]
+        self.input_shape = self.input_shapes[0]
+        self.output_shape = self.output_shapes[0]
+        self.output_names = [t.name.split("/")[-1].split(":")[0] for t in self.output_templates]
+
+        # List variables.
+        # Local names drop the "<scope>/" prefix and the ":<index>" suffix of the global variable name;
+        # variables of sub-networks are prefixed with "<component name>/".
+        self.own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/"))
+        self.vars = OrderedDict(self.own_vars)
+        self.vars.update((comp.name + "/" + name, var) for comp in self.components.values() for name, var in comp.vars.items())
+        self.trainables = OrderedDict((name, var) for name, var in self.vars.items() if var.trainable)
+        self.var_global_to_local = OrderedDict((var.name.split(":")[0], name) for name, var in self.vars.items())
+
+    def reset_own_vars(self) -> None:
+        """Re-initialize all variables of this network, excluding sub-networks."""
+        tfutil.run([var.initializer for var in self.own_vars.values()])
+
+    def reset_vars(self) -> None:
+        """Re-initialize all variables of this network, including sub-networks."""
+        tfutil.run([var.initializer for var in self.vars.values()])
+
+    def reset_trainables(self) -> None:
+        """Re-initialize all trainable variables of this network, including sub-networks."""
+        tfutil.run([var.initializer for var in self.trainables.values()])
+
+    def get_output_for(self, *in_expr: TfExpression, return_as_list: bool = False, **dynamic_kwargs) -> Union[TfExpression, List[TfExpression]]:
+        """Construct TensorFlow expression(s) for the output(s) of this network, given the input expression(s).
+
+        Args:
+            *in_expr: One expression per network input, in the order of `input_names`. Individual
+                entries may be None, but not all of them; a None input is replaced by zeros.
+            return_as_list: If True, always return a list, even for a single output.
+            **dynamic_kwargs: Extra keyword arguments for the build function; they override `static_kwargs`.
+
+        Returns:
+            Whatever the build function returns (a single expression or a tuple), or a list of
+            expressions when `return_as_list` is True.
+        """
+        assert len(in_expr) == self.num_inputs
+        assert not all(expr is None for expr in in_expr)
+
+        # Finalize build func kwargs.
+        build_kwargs = dict(self.static_kwargs)
+        build_kwargs.update(dynamic_kwargs)
+        build_kwargs["is_template_graph"] = False
+        build_kwargs["components"] = self.components
+
+        # Build TensorFlow graph to evaluate the network.
+        with tfutil.absolute_variable_scope(self.scope, reuse=True), tf.name_scope(self.name):
+            assert tf.get_variable_scope().name == self.scope
+            valid_inputs = [expr for expr in in_expr if expr is not None]
+            final_inputs = []
+            for expr, name, shape in zip(in_expr, self.input_names, self.input_shapes):
+                if expr is not None:
+                    expr = tf.identity(expr, name=name)
+                else:
+                    # Missing input: zeros whose first dimension matches the first valid input
+                    # and whose remaining dimensions come from the template input shape.
+                    expr = tf.zeros([tf.shape(valid_inputs[0])[0]] + shape[1:], name=name)
+                final_inputs.append(expr)
+            out_expr = self._build_func(*final_inputs, **build_kwargs)
+
+        # Propagate input shapes back to the user-specified expressions.
+        for expr, final in zip(in_expr, final_inputs):
+            if isinstance(expr, tf.Tensor):
+                expr.set_shape(final.shape)
+
+        # Express outputs in the desired format.
+        assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple)
+        if return_as_list:
+            out_expr = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr)
+        return out_expr
+
+    def get_var_local_name(self, var_or_global_name: Union[TfExpression, str]) -> str:
+        """Get the local name of a given variable, without any surrounding name scopes."""
+        assert tfutil.is_tf_expression(var_or_global_name) or isinstance(var_or_global_name, str)
+        global_name = var_or_global_name if isinstance(var_or_global_name, str) else var_or_global_name.name
+        return self.var_global_to_local[global_name]
+
+    def find_var(self, var_or_local_name: Union[TfExpression, str]) -> TfExpression:
+        """Find variable by local or global name."""
+        assert tfutil.is_tf_expression(var_or_local_name) or isinstance(var_or_local_name, str)
+        return self.vars[var_or_local_name] if isinstance(var_or_local_name, str) else var_or_local_name
+
+    def get_var(self, var_or_local_name: Union[TfExpression, str]) -> np.ndarray:
+        """Get the value of a given variable as NumPy array.
+        Note: This method is very inefficient -- prefer to use tflib.run(list_of_vars) whenever possible."""
+        return self.find_var(var_or_local_name).eval()
+
+    def set_var(self, var_or_local_name: Union[TfExpression, str], new_value: Union[int, float, np.ndarray]) -> None:
+        """Set the value of a given variable based on the given NumPy array.
+        Note: This method is very inefficient -- prefer to use tflib.set_vars() whenever possible."""
+        tfutil.set_vars({self.find_var(var_or_local_name): new_value})
+
+    def __getstate__(self) -> dict:
+        """Pickle export."""
+        state = dict()
+        state["version"]            = 4
+        state["name"]               = self.name
+        state["static_kwargs"]      = dict(self.static_kwargs)
+        state["components"]         = dict(self.components)
+        state["build_module_src"]   = self._build_module_src
+        state["build_func_name"]    = self._build_func_name
+        state["variables"]          = list(zip(self.own_vars.keys(), tfutil.run(list(self.own_vars.values()))))
+        return state
+
+    def __setstate__(self, state: dict) -> None:
+        """Pickle import.
+
+        Rebuilds the network from a state dict as produced by `__getstate__`:
+        runs the registered import handlers, re-creates the build module from the
+        stored source code, rebuilds the graph and restores the variable values.
+
+        SECURITY NOTE: the stored module source is executed with `exec()`, so
+        unpickling a Network runs arbitrary code. Only load pickles from trusted sources.
+        """
+        # pylint: disable=attribute-defined-outside-init
+        tfutil.assert_tf_initialized()
+        self._init_fields()
+
+        # Execute custom import handlers.
+        for handler in _import_handlers:
+            state = handler(state)
+
+        # Set basic fields.
+        assert state["version"] in [2, 3, 4]
+        self.name = state["name"]
+        self.static_kwargs = util.EasyDict(state["static_kwargs"])
+        self.components = util.EasyDict(state.get("components", {}))
+        self._build_module_src = state["build_module_src"]
+        self._build_func_name = state["build_func_name"]
+
+        # Create temporary module from the imported source code.
+        # The module is registered in sys.modules under a unique name and its source is
+        # remembered in _import_module_src so that Network.__init__ can find it again.
+        module_name = "_tflib_network_import_" + uuid.uuid4().hex
+        module = types.ModuleType(module_name)
+        sys.modules[module_name] = module
+        _import_module_src[module] = self._build_module_src
+        exec(self._build_module_src, module.__dict__) # pylint: disable=exec-used
+
+        # Locate network build function in the temporary module.
+        self._build_func = util.get_obj_from_module(module, self._build_func_name)
+        assert callable(self._build_func)
+
+        # Init TensorFlow graph.
+        self._init_graph()
+        self.reset_own_vars()
+        tfutil.set_vars({self.find_var(name): value for name, value in state["variables"]})
+
+    def clone(self, name: str = None, **new_static_kwargs) -> "Network":
+        """Create a clone of this network with its own copy of the variables."""
+        # pylint: disable=protected-access
+        net = object.__new__(Network)
+        net._init_fields()
+        net.name = name if name is not None else self.name
+        net.static_kwargs = util.EasyDict(self.static_kwargs)
+        net.static_kwargs.update(new_static_kwargs)
+        net._build_module_src = self._build_module_src
+        net._build_func_name = self._build_func_name
+        net._build_func = self._build_func
+        net._init_graph()
+        net.copy_vars_from(self)
+        return net
+
+    def copy_own_vars_from(self, src_net: "Network") -> None:
+        """Copy the values of all variables from the given network, excluding sub-networks."""
+        names = [name for name in self.own_vars.keys() if name in src_net.own_vars]
+        tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names}))
+
+    def copy_vars_from(self, src_net: "Network") -> None:
+        """Copy the values of all variables from the given network, including sub-networks."""
+        names = [name for name in self.vars.keys() if name in src_net.vars]
+        tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names}))
+
+    def copy_trainables_from(self, src_net: "Network") -> None:
+        """Copy the values of all trainable variables from the given network, including sub-networks."""
+        names = [name for name in self.trainables.keys() if name in src_net.trainables]
+        tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names}))
+
+    def convert(self, new_func_name: str, new_name: str = None, **new_static_kwargs) -> "Network":
+        """Create new network with the given parameters, and copy all variables from this network."""
+        if new_name is None:
+            new_name = self.name
+        static_kwargs = dict(self.static_kwargs)
+        static_kwargs.update(new_static_kwargs)
+        net = Network(name=new_name, func_name=new_func_name, **static_kwargs)
+        net.copy_vars_from(self)
+        return net
+
+    def setup_as_moving_average_of(self, src_net: "Network", beta: TfExpressionEx = 0.99, beta_nontrainable: TfExpressionEx = 0.0) -> tf.Operation:
+        """Construct a TensorFlow op that updates the variables of this network
+        to be slightly closer to those of the given network."""
+        with tfutil.absolute_name_scope(self.scope + "/_MovingAvg"):
+            ops = []
+            for name, var in self.vars.items():
+                if name in src_net.vars:
+                    cur_beta = beta if name in self.trainables else beta_nontrainable
+                    new_value = tfutil.lerp(src_net.vars[name], var, cur_beta)
+                    ops.append(var.assign(new_value))
+            return tf.group(*ops)
+
+    def run(self,
+            *in_arrays: Tuple[Union[np.ndarray, None], ...],
+            input_transform: dict = None,
+            output_transform: dict = None,
+            return_as_list: bool = False,
+            print_progress: bool = False,
+            minibatch_size: int = None,
+            num_gpus: int = 1,
+            assume_frozen: bool = False,
+            **dynamic_kwargs) -> Union[np.ndarray, Tuple[np.ndarray, ...], List[np.ndarray]]:
+        """Run this network for the given NumPy array(s), and return the output(s) as NumPy array(s).
+
+        Args:
+            input_transform:    A dict specifying a custom transformation to be applied to the input tensor(s) before evaluating the network.
+                                The dict must contain a 'func' field that points to a top-level function. The function is called with the input
+                                TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs.
+            output_transform:   A dict specifying a custom transformation to be applied to the output tensor(s) after evaluating the network.
+                                The dict must contain a 'func' field that points to a top-level function. The function is called with the output
+                                TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs.
+            return_as_list:     True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs.
+            print_progress:     Print progress to the console? Useful for very large input arrays.
+            minibatch_size:     Maximum minibatch size to use, None = disable batching.
+            num_gpus:           Number of GPUs to use.
+            assume_frozen:      Improve multi-GPU performance by assuming that the trainable parameters will remain changed between calls.
+            dynamic_kwargs:     Additional keyword arguments to be passed into the network build function.
+        """
+        assert len(in_arrays) == self.num_inputs
+        assert not all(arr is None for arr in in_arrays)
+        assert input_transform is None or util.is_top_level_function(input_transform["func"])
+        assert output_transform is None or util.is_top_level_function(output_transform["func"])
+        output_transform, dynamic_kwargs = _handle_legacy_output_transforms(output_transform, dynamic_kwargs)
+        num_items = in_arrays[0].shape[0]
+        if minibatch_size is None:
+            minibatch_size = num_items
+
+        # Construct unique hash key from all arguments that affect the TensorFlow graph.
+        key = dict(input_transform=input_transform, output_transform=output_transform, num_gpus=num_gpus, assume_frozen=assume_frozen, dynamic_kwargs=dynamic_kwargs)
+        def unwind_key(obj):
+            if isinstance(obj, dict):
+                return [(key, unwind_key(value)) for key, value in sorted(obj.items())]
+            if callable(obj):
+                return util.get_top_level_function_name(obj)
+            return obj
+        key = repr(unwind_key(key))
+
+        # Build graph.
+        if key not in self._run_cache:
+            with tfutil.absolute_name_scope(self.scope + "/_Run"), tf.control_dependencies(None):
+                with tf.device("/cpu:0"):
+                    in_expr = [tf.placeholder(tf.float32, name=name) for name in self.input_names]
+                    in_split = list(zip(*[tf.split(x, num_gpus) for x in in_expr]))
+
+                out_split = []
+                for gpu in range(num_gpus):
+                    with tf.device("/gpu:%d" % gpu):
+                        net_gpu = self.clone() if assume_frozen else self
+                        in_gpu = in_split[gpu]
+
+                        if input_transform is not None:
+                            in_kwargs = dict(input_transform)
+                            in_gpu = in_kwargs.pop("func")(*in_gpu, **in_kwargs)
+                            in_gpu = [in_gpu] if tfutil.is_tf_expression(in_gpu) else list(in_gpu)
+
+                        assert len(in_gpu) == self.num_inputs
+                        out_gpu = net_gpu.get_output_for(*in_gpu, return_as_list=True, **dynamic_kwargs)
+
+                        if output_transform is not None:
+                            out_kwargs = dict(output_transform)
+                            out_gpu = out_kwargs.pop("func")(*out_gpu, **out_kwargs)
+                            out_gpu = [out_gpu] if tfutil.is_tf_expression(out_gpu) else list(out_gpu)
+
+                        assert len(out_gpu) == self.num_outputs
+                        out_split.append(out_gpu)
+
+                with tf.device("/cpu:0"):
+                    out_expr = [tf.concat(outputs, axis=0) for outputs in zip(*out_split)]
+                    self._run_cache[key] = in_expr, out_expr
+
+        # Run minibatches.
+        in_expr, out_expr = self._run_cache[key]
+        out_arrays = [np.empty([num_items] + expr.shape.as_list()[1:], expr.dtype.name) for expr in out_expr]
+
+        for mb_begin in range(0, num_items, minibatch_size):
+            if print_progress:
+                print("\r%d / %d" % (mb_begin, num_items), end="")
+
+            mb_end = min(mb_begin + minibatch_size, num_items)
+            mb_num = mb_end - mb_begin
+            mb_in = [src[mb_begin : mb_end] if src is not None else np.zeros([mb_num] + shape[1:]) for src, shape in zip(in_arrays, self.input_shapes)]
+            mb_out = tf.get_default_session().run(out_expr, dict(zip(in_expr, mb_in)))
+
+            for dst, src in zip(out_arrays, mb_out):
+                dst[mb_begin: mb_end] = src
+
+        # Done.
+        if print_progress:
+            print("\r%d / %d" % (num_items, num_items))
+
+        if not return_as_list:
+            out_arrays = out_arrays[0] if len(out_arrays) == 1 else tuple(out_arrays)
+        return out_arrays
+
+    def list_ops(self) -> List[TfExpression]:
+        include_prefix = self.scope + "/"
+        exclude_prefix = include_prefix + "_"
+        ops = tf.get_default_graph().get_operations()
+        ops = [op for op in ops if op.name.startswith(include_prefix)]
+        ops = [op for op in ops if not op.name.startswith(exclude_prefix)]
+        return ops
+
+    def list_layers(self) -> List[Tuple[str, TfExpression, List[TfExpression]]]:
+        """Returns a list of (layer_name, output_expr, trainable_vars) tuples corresponding to
+        individual layers of the network. Mainly intended to be used for reporting.
+
+        The layers are discovered by walking the name-scope hierarchy of the ops returned by
+        `list_ops()` and of the variables in `self.vars`, starting at `self.scope`."""
+        layers = []
+
+        def recurse(scope, parent_ops, parent_vars, level):
+            # scope:        Full name scope being examined.
+            # parent_ops:   Ops of the enclosing scope; narrowed down to `scope` below.
+            # parent_vars:  (local_name, var) pairs of the enclosing scope; narrowed down likewise.
+            # level:        Recursion depth, 0 for the network's own scope.
+
+            # Ignore specific patterns.
+            if any(p in scope for p in ["/Shape", "/strided_slice", "/Cast", "/concat", "/Assign"]):
+                return
+
+            # Filter ops and vars by scope.
+            # Op names are matched against the full scope, variable names against the
+            # scope with the leading "<self.scope>/" stripped off.
+            global_prefix = scope + "/"
+            local_prefix = global_prefix[len(self.scope) + 1:]
+            cur_ops = [op for op in parent_ops if op.name.startswith(global_prefix) or op.name == global_prefix[:-1]]
+            cur_vars = [(name, var) for name, var in parent_vars if name.startswith(local_prefix) or name == local_prefix[:-1]]
+            if not cur_ops and not cur_vars:
+                return
+
+            # Filter out all ops related to variables.
+            # Only ops nested below a variable op's name are dropped; the variable op itself stays.
+            for var in [op for op in cur_ops if op.type.startswith("Variable")]:
+                var_prefix = var.name + "/"
+                cur_ops = [op for op in cur_ops if not op.name.startswith(var_prefix)]
+
+            # Scope does not contain ops as immediate children => recurse deeper.
+            # Identity/Cast/Transpose ops do not count as immediate children, and the
+            # top level (level == 0) is always split further.
+            contains_direct_ops = any("/" not in op.name[len(global_prefix):] and op.type not in ["Identity", "Cast", "Transpose"] for op in cur_ops)
+            if (level == 0 or not contains_direct_ops) and (len(cur_ops) + len(cur_vars)) > 1:
+                visited = set()
+                for rel_name in [op.name[len(global_prefix):] for op in cur_ops] + [name[len(local_prefix):] for name, _var in cur_vars]:
+                    # Descend once into each distinct first path component.
+                    token = rel_name.split("/")[0]
+                    if token not in visited:
+                        recurse(global_prefix + token, cur_ops, cur_vars, level + 1)
+                        visited.add(token)
+                return
+
+            # Report layer.
+            # The layer output is the first output of the last remaining op, or the
+            # last variable when the scope has no ops left.
+            layer_name = scope[len(self.scope) + 1:]
+            layer_output = cur_ops[-1].outputs[0] if cur_ops else cur_vars[-1][1]
+            layer_trainables = [var for _name, var in cur_vars if var.trainable]
+            layers.append((layer_name, layer_output, layer_trainables))
+
+        recurse(self.scope, self.list_ops(), list(self.vars.items()), 0)
+        return layers
+
+    def print_layers(self, title: str = None, hide_layers_with_no_params: bool = False) -> None:
+        """Print a summary table of the network structure."""
+        rows = [[title if title is not None else self.name, "Params", "OutputShape", "WeightShape"]]
+        rows += [["---"] * 4]
+        total_params = 0
+
+        for layer_name, layer_output, layer_trainables in self.list_layers():
+            num_params = sum(int(np.prod(var.shape.as_list())) for var in layer_trainables)
+            weights = [var for var in layer_trainables if var.name.endswith("/weight:0")]
+            weights.sort(key=lambda x: len(x.name))
+            if len(weights) == 0 and len(layer_trainables) == 1:
+                weights = layer_trainables
+            total_params += num_params
+
+            if not hide_layers_with_no_params or num_params != 0:
+                num_params_str = str(num_params) if num_params > 0 else "-"
+                output_shape_str = str(layer_output.shape)
+                weight_shape_str = str(weights[0].shape) if len(weights) >= 1 else "-"
+                rows += [[layer_name, num_params_str, output_shape_str, weight_shape_str]]
+
+        rows += [["---"] * 4]
+        rows += [["Total", str(total_params), "", ""]]
+
+        widths = [max(len(cell) for cell in column) for column in zip(*rows)]
+        print()
+        for row in rows:
+            print("  ".join(cell + " " * (width - len(cell)) for cell, width in zip(row, widths)))
+        print()
+
+    def setup_weight_histograms(self, title: str = None) -> None:
+        """Construct summary ops to include histograms of all trainable parameters in TensorBoard."""
+        if title is None:
+            title = self.name
+
+        with tf.name_scope(None), tf.device(None), tf.control_dependencies(None):
+            for local_name, var in self.trainables.items():
+                if "/" in local_name:
+                    p = local_name.split("/")
+                    name = title + "_" + p[-1] + "/" + "_".join(p[:-1])
+                else:
+                    name = title + "_toplevel/" + local_name
+
+                tf.summary.histogram(name, var)
+
+#----------------------------------------------------------------------------
+# Backwards-compatible emulation of legacy output transformation in Network.run().
+
+_print_legacy_warning = True
+
+def _handle_legacy_output_transforms(output_transform, dynamic_kwargs):
+    global _print_legacy_warning
+    legacy_kwargs = ["out_mul", "out_add", "out_shrink", "out_dtype"]
+    if not any(kwarg in dynamic_kwargs for kwarg in legacy_kwargs):
+        return output_transform, dynamic_kwargs
+
+    if _print_legacy_warning:
+        _print_legacy_warning = False
+        print()
+        print("WARNING: Old-style output transformations in Network.run() are deprecated.")
+        print("Consider using 'output_transform=dict(func=tflib.convert_images_to_uint8)'")
+        print("instead of 'out_mul=127.5, out_add=127.5, out_dtype=np.uint8'.")
+        print()
+    assert output_transform is None
+
+    new_kwargs = dict(dynamic_kwargs)
+    new_transform = {kwarg: new_kwargs.pop(kwarg) for kwarg in legacy_kwargs if kwarg in dynamic_kwargs}
+    new_transform["func"] = _legacy_output_transform_func
+    return new_transform, new_kwargs
+
+def _legacy_output_transform_func(*expr, out_mul=1.0, out_add=0.0, out_shrink=1, out_dtype=None):
+    if out_mul != 1.0:
+        expr = [x * out_mul for x in expr]
+
+    if out_add != 0.0:
+        expr = [x + out_add for x in expr]
+
+    if out_shrink > 1:
+        ksize = [1, 1, out_shrink, out_shrink]
+        expr = [tf.nn.avg_pool(x, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW") for x in expr]
+
+    if out_dtype is not None:
+        if tf.as_dtype(out_dtype).is_integer:
+            expr = [tf.round(x) for x in expr]
+        expr = [tf.saturate_cast(x, out_dtype) for x in expr]
+    return expr
diff --git a/dnnlib/tflib/ops/__init__.py b/dnnlib/tflib/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ee9b7dd51bb69639724580f167b2eac39666266
--- /dev/null
+++ b/dnnlib/tflib/ops/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+# empty
diff --git a/dnnlib/tflib/ops/fused_bias_act.cu b/dnnlib/tflib/ops/fused_bias_act.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d1d0bd6bc46ec3fc94c80f1f7e05c66a96a9072e
--- /dev/null
+++ b/dnnlib/tflib/ops/fused_bias_act.cu
@@ -0,0 +1,190 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, visit
+// https://nvlabs.github.io/stylegan2/license.html
+
+#define EIGEN_USE_GPU
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include <stdio.h>
+
+using namespace tensorflow;
+using namespace tensorflow::shape_inference;
+
+// Evaluates CUDA_CALL once; if the result is not cudaSuccess, reports it through
+// OP_REQUIRES as an Internal error whose message is the CUDA error name.
+#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal(cudaGetErrorName(err))); } while (false)
+
+//------------------------------------------------------------------------
+// CUDA kernel.
+
+// Argument bundle passed by value to FusedBiasActKernel.
+// The attribute fields (grad .. gain) are read from the op attributes in the
+// FusedBiasActOp constructor; pointers and sizes are filled in on each Compute().
+template <class T>
+struct FusedBiasActKernelParams
+{
+    const T*    x;      // [sizeX]
+    const T*    b;      // [sizeB] or NULL
+    const T*    ref;    // [sizeX] or NULL
+    T*          y;      // [sizeX]
+
+    int         grad;   // Derivative order; the kernel switches on act * 10 + grad.
+    int         axis;   // Dimension of x that b runs along; used on the host to derive stepB.
+    int         act;    // Activation index; the kernel has cases for 1 (linear) through 9 (swish).
+    float       alpha;  // Slope multiplier read by the lrelu cases.
+    float       gain;   // Factor every output element is multiplied by.
+
+    int         sizeX;  // Number of elements in x.
+    int         sizeB;  // Number of elements in b.
+    int         stepB;  // Product of the dimensions of x that follow `axis`.
+    int         loopX;  // Upper bound on the number of elements one thread processes.
+};
+
+// Element-wise kernel computing y = gain * f(x + b), where the case evaluated
+// for f is selected by p.act * 10 + p.grad: the activation itself (grad 0) or
+// one of its two gradient forms (grad 1, grad 2). In the gradient cases `x`
+// carries the incoming gradient and `ref` the forward tensor that the
+// derivative is expressed in.
+// Each block covers p.loopX * blockDim.x consecutive elements; every thread
+// handles up to p.loopX of them, spaced blockDim.x apart.
+template <class T>
+static __global__ void FusedBiasActKernel(const FusedBiasActKernelParams<T> p)
+{
+    // Inputs beyond these magnitudes are short-circuited to the saturated
+    // result in the tanh / sigmoid / softplus / swish cases below.
+    const float expRange        = 80.0f;
+    const float halfExpRange    = 40.0f;
+    const float seluScale       = 1.0507009873554804934193349852946f;
+    const float seluAlpha       = 1.6732632423543772848170429916717f;
+
+    // Loop over elements.
+    int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
+    for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
+    {
+        // Load and apply bias.
+        float x = (float)p.x[xi];
+        if (p.b)
+            x += (float)p.b[(xi / p.stepB) % p.sizeB];
+        float ref = (p.ref) ? (float)p.ref[xi] : 0.0f;
+
+        // Undo the output gain on the reference value. Swish (act 9) is
+        // excluded: its gradient cases take ref as the pre-activation input.
+        // Logical && states the intent; the previous bitwise & only worked
+        // because both operands were already boolean.
+        if (p.gain != 0.0f && p.act != 9)
+            ref /= p.gain;
+
+        // Evaluate activation func.
+        float y;
+        switch (p.act * 10 + p.grad)
+        {
+            // linear
+            default:
+            case 10: y = x; break;
+            case 11: y = x; break;
+            case 12: y = 0.0f; break;
+
+            // relu
+            case 20: y = (x > 0.0f) ? x : 0.0f; break;
+            case 21: y = (ref > 0.0f) ? x : 0.0f; break;
+            case 22: y = 0.0f; break;
+
+            // lrelu
+            case 30: y = (x > 0.0f) ? x : x * p.alpha; break;
+            case 31: y = (ref > 0.0f) ? x : x * p.alpha; break;
+            case 32: y = 0.0f; break;
+
+            // tanh
+            case 40: { float c = expf(x); float d = 1.0f / c; y = (x < -expRange) ? -1.0f : (x > expRange) ? 1.0f : (c - d) / (c + d); } break;
+            case 41: y = x * (1.0f - ref * ref); break;
+            case 42: y = x * (1.0f - ref * ref) * (-2.0f * ref); break;
+
+            // sigmoid
+            case 50: y = (x < -expRange) ? 0.0f : 1.0f / (expf(-x) + 1.0f); break;
+            case 51: y = x * ref * (1.0f - ref); break;
+            case 52: y = x * ref * (1.0f - ref) * (1.0f - 2.0f * ref); break;
+
+            // elu
+            case 60: y = (x >= 0.0f) ? x : expf(x) - 1.0f; break;
+            case 61: y = (ref >= 0.0f) ? x : x * (ref + 1.0f); break;
+            case 62: y = (ref >= 0.0f) ? 0.0f : x * (ref + 1.0f); break;
+
+            // selu
+            case 70: y = (x >= 0.0f) ? seluScale * x : (seluScale * seluAlpha) * (expf(x) - 1.0f); break;
+            case 71: y = (ref >= 0.0f) ? x * seluScale : x * (ref + seluScale * seluAlpha); break;
+            case 72: y = (ref >= 0.0f) ? 0.0f : x * (ref + seluScale * seluAlpha); break;
+
+            // softplus
+            case 80: y = (x > expRange) ? x : logf(expf(x) + 1.0f); break;
+            case 81: y = x * (1.0f - expf(-ref)); break;
+            case 82: { float c = expf(-ref); y = x * c * (1.0f - c); } break;
+
+            // swish
+            case 90: y = (x < -expRange) ? 0.0f : x / (expf(-x) + 1.0f); break;
+            case 91: { float c = expf(ref); float d = c + 1.0f; y = (ref > halfExpRange) ? x : x * c * (ref + d) / (d * d); } break;
+            case 92: { float c = expf(ref); float d = c + 1.0f; y = (ref > halfExpRange) ? 0.0f : x * c * (ref * (2.0f - d) + 2.0f * d) / (d * d * d); } break;
+        }
+
+        // Apply gain and store.
+        p.y[xi] = (T)(y * p.gain);
+    }
+}
+
+//------------------------------------------------------------------------
+// TensorFlow op.
+
+// TensorFlow GPU kernel wrapping FusedBiasActKernel.
+// Attributes are parsed and validated once at construction; Compute() validates
+// the inputs, allocates the output with the shape of x and launches the kernel
+// on the op's GPU stream.
+template <class T>
+struct FusedBiasActOp : public OpKernel
+{
+    // Attribute values captured at construction; copied into the per-call params in Compute().
+    FusedBiasActKernelParams<T> m_attribs;
+
+    FusedBiasActOp(OpKernelConstruction* ctx) : OpKernel(ctx)
+    {
+        // Zero everything first so that the pointer and size fields start out as NULL / 0.
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("grad", &m_attribs.grad));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &m_attribs.axis));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("act", &m_attribs.act));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &m_attribs.alpha));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("gain", &m_attribs.gain));
+        OP_REQUIRES(ctx, m_attribs.grad >= 0, errors::InvalidArgument("grad must be non-negative"));
+        OP_REQUIRES(ctx, m_attribs.axis >= 0, errors::InvalidArgument("axis must be non-negative"));
+        OP_REQUIRES(ctx, m_attribs.act >= 0, errors::InvalidArgument("act must be non-negative"));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        FusedBiasActKernelParams<T> p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
+
+        const Tensor& x     = ctx->input(0); // [...]
+        const Tensor& b     = ctx->input(1); // [sizeB] or [0]
+        const Tensor& ref   = ctx->input(2); // x.shape or [0]
+        // Empty b / ref tensors are passed to the kernel as NULL, i.e. "not present".
+        p.x = x.flat<T>().data();
+        p.b = (b.NumElements()) ? b.flat<T>().data() : NULL;
+        p.ref = (ref.NumElements()) ? ref.flat<T>().data() : NULL;
+        OP_REQUIRES(ctx, b.NumElements() == 0 || m_attribs.axis < x.dims(), errors::InvalidArgument("axis out of bounds"));
+        OP_REQUIRES(ctx, b.dims() == 1, errors::InvalidArgument("b must have rank 1"));
+        OP_REQUIRES(ctx, b.NumElements() == 0 || b.NumElements() == x.dim_size(m_attribs.axis), errors::InvalidArgument("b has wrong number of elements"));
+        // ref must be empty for the forward pass (grad == 0) and match x otherwise.
+        OP_REQUIRES(ctx, ref.NumElements() == ((p.grad == 0) ? 0 : x.NumElements()), errors::InvalidArgument("ref has wrong number of elements"));
+        // The kernel indexes elements with int.
+        OP_REQUIRES(ctx, x.NumElements() <= kint32max, errors::InvalidArgument("x is too large"));
+
+        p.sizeX = (int)x.NumElements();
+        p.sizeB = (int)b.NumElements();
+        // stepB = number of consecutive elements of x that share one bias entry
+        // (product of the dimensions after `axis`).
+        p.stepB = 1;
+        for (int i = m_attribs.axis + 1; i < x.dims(); i++)
+            p.stepB *= (int)x.dim_size(i);
+
+        Tensor* y = NULL; // x.shape
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y));
+        p.y = y->flat<T>().data();
+
+        // One block covers loopX * blockSize elements; round the grid size up
+        // so that all sizeX elements are covered.
+        p.loopX = 4;
+        int blockSize = 4 * 32;
+        int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
+        void* args[] = {&p};
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)FusedBiasActKernel<T>, gridSize, blockSize, args, 0, stream));
+    }
+};
+
+// Op interface: inputs x, b, ref and output y all share the dtype T (float or half).
+// The attribute defaults declared here are grad = 0, axis = 1, act = 0, alpha = 0.0, gain = 1.0.
+REGISTER_OP("FusedBiasAct")
+    .Input      ("x: T")
+    .Input      ("b: T")
+    .Input      ("ref: T")
+    .Output     ("y: T")
+    .Attr       ("T: {float, half}")
+    .Attr       ("grad: int = 0")
+    .Attr       ("axis: int = 1")
+    .Attr       ("act: int = 0")
+    .Attr       ("alpha: float = 0.0")
+    .Attr       ("gain: float = 1.0");
+// Kernels are registered for the GPU device only, one per supported dtype.
+REGISTER_KERNEL_BUILDER(Name("FusedBiasAct").Device(DEVICE_GPU).TypeConstraint<float>("T"), FusedBiasActOp<float>);
+REGISTER_KERNEL_BUILDER(Name("FusedBiasAct").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), FusedBiasActOp<Eigen::half>);
+
+//------------------------------------------------------------------------
diff --git a/dnnlib/tflib/ops/fused_bias_act.py b/dnnlib/tflib/ops/fused_bias_act.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b0dfd08d475f4d6759fd4bbdc133aef85f3bb24
--- /dev/null
+++ b/dnnlib/tflib/ops/fused_bias_act.py
@@ -0,0 +1,198 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Custom TensorFlow ops for efficient bias and activation."""
+
+import os
+import numpy as np
+import tensorflow as tf
+from .. import custom_ops
+from ...util import EasyDict
+
+def _get_plugin():
+    return custom_ops.get_plugin(os.path.splitext(__file__)[0] + '.cu')
+
+#----------------------------------------------------------------------------
+
+# Table of supported activation functions, keyed by the name accepted as `act`.
+# Fields of each entry:
+#   func:           Reference implementation built from standard TensorFlow ops; called as func(x, alpha=...).
+#   def_alpha:      Value used for `alpha` when the caller passes None.
+#   def_gain:       Value used for `gain` when the caller passes None.
+#   cuda_idx:       Activation index passed to the CUDA op as its `act` attribute.
+#   ref:            Which forward tensor the CUDA gradient passes receive as `ref`: the input 'x' or the output 'y'.
+#   zero_2nd_grad:  True = use the cheaper gradient wrapper that never computes a second-order gradient w.r.t. x.
+activation_funcs = {
+    'linear':   EasyDict(func=lambda x, **_:        x,                          def_alpha=None, def_gain=1.0,           cuda_idx=1, ref='y', zero_2nd_grad=True),
+    'relu':     EasyDict(func=lambda x, **_:        tf.nn.relu(x),              def_alpha=None, def_gain=np.sqrt(2),    cuda_idx=2, ref='y', zero_2nd_grad=True),
+    'lrelu':    EasyDict(func=lambda x, alpha, **_: tf.nn.leaky_relu(x, alpha), def_alpha=0.2,  def_gain=np.sqrt(2),    cuda_idx=3, ref='y', zero_2nd_grad=True),
+    'tanh':     EasyDict(func=lambda x, **_:        tf.nn.tanh(x),              def_alpha=None, def_gain=1.0,           cuda_idx=4, ref='y', zero_2nd_grad=False),
+    'sigmoid':  EasyDict(func=lambda x, **_:        tf.nn.sigmoid(x),           def_alpha=None, def_gain=1.0,           cuda_idx=5, ref='y', zero_2nd_grad=False),
+    'elu':      EasyDict(func=lambda x, **_:        tf.nn.elu(x),               def_alpha=None, def_gain=1.0,           cuda_idx=6, ref='y', zero_2nd_grad=False),
+    'selu':     EasyDict(func=lambda x, **_:        tf.nn.selu(x),              def_alpha=None, def_gain=1.0,           cuda_idx=7, ref='y', zero_2nd_grad=False),
+    'softplus': EasyDict(func=lambda x, **_:        tf.nn.softplus(x),          def_alpha=None, def_gain=1.0,           cuda_idx=8, ref='y', zero_2nd_grad=False),
+    'swish':    EasyDict(func=lambda x, **_:        tf.nn.sigmoid(x) * x,       def_alpha=None, def_gain=np.sqrt(2),    cuda_idx=9, ref='x', zero_2nd_grad=False),
+}
+
+#----------------------------------------------------------------------------
+
+def fused_bias_act(x, b=None, axis=1, act='linear', alpha=None, gain=None, impl='cuda'):
+    r"""Fused bias and activation function.
+
+    Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
+    and scales the result by `gain`. Each of the steps is optional. In most cases,
+    the fused op is considerably more efficient than performing the same calculation
+    using standard TensorFlow ops. It supports first and second order gradients,
+    but not third order gradients.
+
+    Args:
+        x:      Input activation tensor. Can have any shape, but if `b` is defined, the
+                dimension corresponding to `axis`, as well as the rank, must be known.
+        b:      Bias vector, or `None` to disable. Must be a 1D tensor of the same type
+                as `x`. The shape must be known, and it must match the dimension of `x`
+                corresponding to `axis`.
+        axis:   The dimension in `x` corresponding to the elements of `b`.
+                The value of `axis` is ignored if `b` is not specified.
+        act:    Name of the activation function to evaluate, or `"linear"` to disable.
+                Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
+                See `activation_funcs` for a full list. `None` is not allowed.
+        alpha:  Shape parameter for the activation function, or `None` to use the default.
+        gain:   Scaling factor for the output tensor, or `None` to use default.
+                See `activation_funcs` for the default scaling of each activation function.
+                If unsure, consider specifying `1.0`.
+        impl:   Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the same shape and datatype as `x`.
+    """
+
+    impl_dict = {
+        'ref':  _fused_bias_act_ref,
+        'cuda': _fused_bias_act_cuda,
+    }
+    return impl_dict[impl](x=x, b=b, axis=axis, act=act, alpha=alpha, gain=gain)
+
+#----------------------------------------------------------------------------
+
+def _fused_bias_act_ref(x, b, axis, act, alpha, gain):
+    """Slow reference implementation of `fused_bias_act()` using standard TensorFlow ops."""
+
+    # Validate arguments.
+    x = tf.convert_to_tensor(x)
+    b = tf.convert_to_tensor(b) if b is not None else tf.constant([], dtype=x.dtype)
+    act_spec = activation_funcs[act]
+    assert b.shape.rank == 1 and (b.shape[0] == 0 or b.shape[0] == x.shape[axis])
+    assert b.shape[0] == 0 or 0 <= axis < x.shape.rank
+    if alpha is None:
+        alpha = act_spec.def_alpha
+    if gain is None:
+        gain = act_spec.def_gain
+
+    # Add bias.
+    if b.shape[0] != 0:
+        x += tf.reshape(b, [-1 if i == axis else 1 for i in range(x.shape.rank)])
+
+    # Evaluate activation function.
+    x = act_spec.func(x, alpha=alpha)
+
+    # Scale by gain.
+    if gain != 1:
+        x *= gain
+    return x
+
+#----------------------------------------------------------------------------
+
+def _fused_bias_act_cuda(x, b, axis, act, alpha, gain):
+    """Fast CUDA implementation of `fused_bias_act()` using custom ops."""
+
+    # Validate arguments.
+    x = tf.convert_to_tensor(x)
+    empty_tensor = tf.constant([], dtype=x.dtype)
+    b = tf.convert_to_tensor(b) if b is not None else empty_tensor
+    act_spec = activation_funcs[act]
+    assert b.shape.rank == 1 and (b.shape[0] == 0 or b.shape[0] == x.shape[axis])
+    assert b.shape[0] == 0 or 0 <= axis < x.shape.rank
+    if alpha is None:
+        alpha = act_spec.def_alpha
+    if gain is None:
+        gain = act_spec.def_gain
+
+    # Special cases.
+    if act == 'linear' and b is None and gain == 1.0:
+        return x
+    if act_spec.cuda_idx is None:
+        return _fused_bias_act_ref(x=x, b=b, axis=axis, act=act, alpha=alpha, gain=gain)
+
+    # CUDA kernel.
+    cuda_kernel = _get_plugin().fused_bias_act
+    cuda_kwargs = dict(axis=axis, act=act_spec.cuda_idx, alpha=alpha, gain=gain)
+
+    # Forward pass: y = func(x, b).
+    def func_y(x, b):
+        y = cuda_kernel(x=x, b=b, ref=empty_tensor, grad=0, **cuda_kwargs)
+        y.set_shape(x.shape)
+        return y
+
+    # Backward pass: dx, db = grad(dy, x, y)
+    def grad_dx(dy, x, y):
+        ref = {'x': x, 'y': y}[act_spec.ref]
+        dx = cuda_kernel(x=dy, b=empty_tensor, ref=ref, grad=1, **cuda_kwargs)
+        dx.set_shape(x.shape)
+        return dx
+    def grad_db(dx):
+        if b.shape[0] == 0:
+            return empty_tensor
+        db = dx
+        if axis < x.shape.rank - 1:
+            db = tf.reduce_sum(db, list(range(axis + 1, x.shape.rank)))
+        if axis > 0:
+            db = tf.reduce_sum(db, list(range(axis)))
+        db.set_shape(b.shape)
+        return db
+
+    # Second order gradients: d_dy, d_x = grad2(d_dx, d_db, x, y)
+    def grad2_d_dy(d_dx, d_db, x, y):
+        ref = {'x': x, 'y': y}[act_spec.ref]
+        d_dy = cuda_kernel(x=d_dx, b=d_db, ref=ref, grad=1, **cuda_kwargs)
+        d_dy.set_shape(x.shape)
+        return d_dy
+    def grad2_d_x(d_dx, d_db, x, y):
+        ref = {'x': x, 'y': y}[act_spec.ref]
+        d_x = cuda_kernel(x=d_dx, b=d_db, ref=ref, grad=2, **cuda_kwargs)
+        d_x.set_shape(x.shape)
+        return d_x
+
+    # Fast version for piecewise-linear activation funcs.
+    @tf.custom_gradient
+    def func_zero_2nd_grad(x, b):
+        y = func_y(x, b)
+        @tf.custom_gradient
+        def grad(dy):
+            dx = grad_dx(dy, x, y)
+            db = grad_db(dx)
+            def grad2(d_dx, d_db):
+                d_dy = grad2_d_dy(d_dx, d_db, x, y)
+                return d_dy
+            return (dx, db), grad2
+        return y, grad
+
+    # Slow version for general activation funcs.
+    @tf.custom_gradient
+    def func_nonzero_2nd_grad(x, b):
+        y = func_y(x, b)
+        def grad_wrap(dy):
+            @tf.custom_gradient
+            def grad_impl(dy, x):
+                dx = grad_dx(dy, x, y)
+                db = grad_db(dx)
+                def grad2(d_dx, d_db):
+                    d_dy = grad2_d_dy(d_dx, d_db, x, y)
+                    d_x = grad2_d_x(d_dx, d_db, x, y)
+                    return d_dy, d_x
+                return (dx, db), grad2
+            return grad_impl(dy, x)
+        return y, grad_wrap
+
+    # Which version to use?
+    if act_spec.zero_2nd_grad:
+        return func_zero_2nd_grad(x, b)
+    return func_nonzero_2nd_grad(x, b)
+
+#----------------------------------------------------------------------------
diff --git a/dnnlib/tflib/ops/upfirdn_2d.cu b/dnnlib/tflib/ops/upfirdn_2d.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d2a2cea5c7f533a98819f2d40c1f56e488fd35aa
--- /dev/null
+++ b/dnnlib/tflib/ops/upfirdn_2d.cu
@@ -0,0 +1,328 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, visit
+// https://nvlabs.github.io/stylegan2/license.html
+
+#define EIGEN_USE_GPU
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include <stdio.h>
+
+using namespace tensorflow;
+using namespace tensorflow::shape_inference;
+
+//------------------------------------------------------------------------
+// Helpers.
+// Runs CUDA_CALL once and, unless it returns cudaSuccess, fails the op through OP_REQUIRES with errors::Internal(<CUDA error name>).
+#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal(cudaGetErrorName(err))); } while (false)
+
+// Integer division rounding toward negative infinity; every caller in this file passes b >= 1.
+static __host__ __device__ __forceinline__ int floorDiv(int a, int b)
+{
+    // a / b truncates toward zero, so step down once when that overshoots a.
+    const int quot = a / b;
+    return (quot * b > a) ? quot - 1 : quot;
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel params.
+
+template <class T>
+struct UpFirDn2DKernelParams // Argument bundle passed by value to both kernels; UpFirDn2DOp::Compute() fills it in.
+{
+    const T*    x;          // [majorDim, inH, inW, minorDim]
+    const T*    k;          // [kernelH, kernelW]
+    T*          y;          // [majorDim, outH, outW, minorDim]
+
+    int         upx;        // Upsampling factor along X (op attribute, validated >= 1).
+    int         upy;        // Upsampling factor along Y (op attribute, validated >= 1).
+    int         downx;      // Downsampling factor along X (op attribute, validated >= 1).
+    int         downy;      // Downsampling factor along Y (op attribute, validated >= 1).
+    int         padx0;      // Padding on the left side (op attribute); read by the kernels.
+    int         padx1;      // Padding on the right side (op attribute); only used on the host to derive outW.
+    int         pady0;      // Padding on the top side (op attribute); read by the kernels.
+    int         pady1;      // Padding on the bottom side (op attribute); only used on the host to derive outH.
+
+    int         majorDim;   // x.dim_size(0)
+    int         inH;        // x.dim_size(1)
+    int         inW;        // x.dim_size(2)
+    int         minorDim;   // x.dim_size(3)
+    int         kernelH;    // k.dim_size(0)
+    int         kernelW;    // k.dim_size(1)
+    int         outH;       // Output height computed in Compute().
+    int         outW;       // Output width computed in Compute().
+    int         loopMajor;  // Number of consecutive major indices each block iterates over (grid z covers the groups).
+    int         loopX;      // Number of output columns (large kernel) or output tiles (small kernel) iterated over along X.
+};
+
+//------------------------------------------------------------------------
+// General CUDA implementation for large filter kernels.
+
+template <class T>
+static __global__ void UpFirDn2DKernel_large(const UpFirDn2DKernelParams<T> p)
+{
+    // Calculate thread index: grid x enumerates (outY, minorIdx) pairs, grid y the output columns, grid z groups of major indices.
+    int minorIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    int outY = minorIdx / p.minorDim;
+    minorIdx -= outY * p.minorDim;
+    int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;
+    int majorIdxBase = blockIdx.z * p.loopMajor;
+    if (outXBase >= p.outW || outY >= p.outH || majorIdxBase >= p.majorDim)
+        return;
+
+    // Setup Y receptive field: first input row (clamped to [0, inH]), number of rows h, and the filter row paired with that first row.
+    int midY = outY * p.downy + p.upy - 1 - p.pady0;
+    int inY = min(max(floorDiv(midY, p.upy), 0), p.inH);
+    int h = min(max(floorDiv(midY + p.kernelH, p.upy), 0), p.inH) - inY;
+    int kernelY = midY + p.kernelH - (inY + 1) * p.upy;
+
+    // Loop over majorDim and outX: up to loopMajor consecutive major indices and up to loopX columns spaced blockDim.y apart.
+    for (int loopMajor = 0, majorIdx = majorIdxBase; loopMajor < p.loopMajor && majorIdx < p.majorDim; loopMajor++, majorIdx++)
+    for (int loopX = 0, outX = outXBase; loopX < p.loopX && outX < p.outW; loopX++, outX += blockDim.y)
+    {
+        // Setup X receptive field, mirroring the Y setup above (w is the number of input columns).
+        int midX = outX * p.downx + p.upx - 1 - p.padx0;
+        int inX = min(max(floorDiv(midX, p.upx), 0), p.inW);
+        int w = min(max(floorDiv(midX + p.kernelW, p.upx), 0), p.inW) - inX;
+        int kernelX = midX + p.kernelW - (inX + 1) * p.upx;
+
+        // Initialize pointers and per-step strides: x is walked forwards, the filter backwards in steps of upx / upy taps.
+        const T* xp = &p.x[((majorIdx * p.inH + inY) * p.inW + inX) * p.minorDim + minorIdx];
+        const T* kp = &p.k[kernelY * p.kernelW + kernelX];
+        int xpx = p.minorDim;
+        int kpx = -p.upx;
+        int xpy = p.inW * p.minorDim;
+        int kpy = -p.upy * p.kernelW;
+
+        // Inner loop: accumulate in float regardless of T; nothing is dereferenced when h or w is zero.
+        float v = 0.0f;
+        for (int y = 0; y < h; y++)
+        {
+            for (int x = 0; x < w; x++)
+            {
+                v += (float)(*xp) * (float)(*kp);
+                xp += xpx;
+                kp += kpx;
+            }
+            xp += xpy - w * xpx; // undo the w column steps, then move one input row down
+            kp += kpy - w * kpx; // undo the w tap steps, then move upy filter rows back
+        }
+
+        // Store result, converting the float accumulator back to T.
+        p.y[((majorIdx * p.outH + outY) * p.outW + outX) * p.minorDim + minorIdx] = (T)v;
+    }
+}
+
+//------------------------------------------------------------------------
+// Specialized CUDA implementation for small filter kernels.
+
+template <class T, int upx, int upy, int downx, int downy, int kernelW, int kernelH, int tileOutW, int tileOutH>
+static __global__ void UpFirDn2DKernel_small(const UpFirDn2DKernelParams<T> p)
+{
+    // Expected by the inner loop below, which visits kernelW / upx by kernelH / upy taps:
+    //   kernelW % upx == 0 and kernelH % upy == 0 (not enforced here; the instantiations in UpFirDn2DOp satisfy it).
+    const int tileInW = ((tileOutW - 1) * downx + kernelW - 1) / upx + 1; // input columns staged per output tile
+    const int tileInH = ((tileOutH - 1) * downy + kernelH - 1) / upy + 1; // input rows staged per output tile
+    __shared__ volatile float sk[kernelH][kernelW]; // flipped filter, zero-filled up to the template size
+    __shared__ volatile float sx[tileInH][tileInW]; // input tile, zero-filled outside the image
+
+    // Calculate tile index: grid x enumerates (tile row, minorIdx) pairs, grid y runs of loopX tiles, grid z groups of major indices.
+    int minorIdx = blockIdx.x;
+    int tileOutY = minorIdx / p.minorDim;
+    minorIdx -= tileOutY * p.minorDim;
+    tileOutY *= tileOutH;
+    int tileOutXBase = blockIdx.y * p.loopX * tileOutW;
+    int majorIdxBase = blockIdx.z * p.loopMajor;
+    if (tileOutXBase >= p.outW | tileOutY >= p.outH | majorIdxBase >= p.majorDim) // bitwise | on the comparison results (no short-circuit); depends only on blockIdx, so the whole block exits together
+        return;
+
+    // Load filter kernel (flipped). Taps beyond the runtime filter size p.kernelW x p.kernelH are stored as zero.
+    for (int tapIdx = threadIdx.x; tapIdx < kernelH * kernelW; tapIdx += blockDim.x)
+    {
+        int ky = tapIdx / kernelW;
+        int kx = tapIdx - ky * kernelW;
+        float v = 0.0f;
+        if (kx < p.kernelW & ky < p.kernelH)
+            v = (float)p.k[(p.kernelH - 1 - ky) * p.kernelW + (p.kernelW - 1 - kx)];
+        sk[ky][kx] = v;
+    }
+
+    // Loop over majorDim and outX: up to loopMajor consecutive major indices and up to loopX tiles along X per block.
+    for (int loopMajor = 0, majorIdx = majorIdxBase; loopMajor < p.loopMajor & majorIdx < p.majorDim; loopMajor++, majorIdx++)
+    for (int loopX = 0, tileOutX = tileOutXBase; loopX < p.loopX & tileOutX < p.outW; loopX++, tileOutX += tileOutW)
+    {
+        // Load input pixels. The barrier keeps sx from being overwritten while a previous iteration may still read it.
+        int tileMidX = tileOutX * downx + upx - 1 - p.padx0;
+        int tileMidY = tileOutY * downy + upy - 1 - p.pady0;
+        int tileInX = floorDiv(tileMidX, upx);
+        int tileInY = floorDiv(tileMidY, upy);
+        __syncthreads();
+        for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW; inIdx += blockDim.x)
+        {
+            int relInY = inIdx / tileInW;
+            int relInX = inIdx - relInY * tileInW;
+            int inX = relInX + tileInX;
+            int inY = relInY + tileInY;
+            float v = 0.0f;
+            if (inX >= 0 & inY >= 0 & inX < p.inW & inY < p.inH)
+                v = (float)p.x[((majorIdx * p.inH + inY) * p.inW + inX) * p.minorDim + minorIdx];
+            sx[relInY][relInX] = v;
+        }
+
+        // Loop over output pixels. The barrier makes the freshly written sx (and sk) visible to every thread first.
+        __syncthreads();
+        for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW; outIdx += blockDim.x)
+        {
+            int relOutY = outIdx / tileOutW;
+            int relOutX = outIdx - relOutY * tileOutW;
+            int outX = relOutX + tileOutX;
+            int outY = relOutY + tileOutY;
+
+            // Setup receptive field: first input pixel relative to the tile and the matching tap in the flipped filter.
+            int midX = tileMidX + relOutX * downx;
+            int midY = tileMidY + relOutY * downy;
+            int inX = floorDiv(midX, upx);
+            int inY = floorDiv(midY, upy);
+            int relInX = inX - tileInX;
+            int relInY = inY - tileInY;
+            int kernelX = (inX + 1) * upx - midX - 1; // flipped
+            int kernelY = (inY + 1) * upy - midY - 1; // flipped
+
+            // Inner loop: compile-time trip counts, one filter tap every upx / upy positions.
+            float v = 0.0f;
+            #pragma unroll
+            for (int y = 0; y < kernelH / upy; y++)
+                #pragma unroll
+                for (int x = 0; x < kernelW / upx; x++)
+                    v += sx[relInY + y][relInX + x] * sk[kernelY + y * upy][kernelX + x * upx];
+
+            // Store result; tiles may overhang the output, so out-of-range pixels are dropped.
+            if (outX < p.outW & outY < p.outH)
+                p.y[((majorIdx * p.outH + outY) * p.outW + outX) * p.minorDim + minorIdx] = (T)v;
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// TensorFlow op.
+
+template <class T>
+struct UpFirDn2DOp : public OpKernel // Kernel for the "UpFirDn2D" op: validates inputs, sizes the output, then launches one of the CUDA kernels above.
+{
+    UpFirDn2DKernelParams<T> m_attribs; // Only the attribute fields are set here; pointers and sizes are filled per call in Compute().
+
+    UpFirDn2DOp(OpKernelConstruction* ctx) : OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("upx", &m_attribs.upx));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("upy", &m_attribs.upy));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("downx", &m_attribs.downx));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("downy", &m_attribs.downy));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("padx0", &m_attribs.padx0));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("padx1", &m_attribs.padx1));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("pady0", &m_attribs.pady0));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("pady1", &m_attribs.pady1));
+        OP_REQUIRES(ctx, m_attribs.upx >= 1 && m_attribs.upy >= 1, errors::InvalidArgument("upx and upy must be at least 1x1"));
+        OP_REQUIRES(ctx, m_attribs.downx >= 1 && m_attribs.downy >= 1, errors::InvalidArgument("downx and downy must be at least 1x1"));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        UpFirDn2DKernelParams<T> p = m_attribs; // per-call copy, so m_attribs itself is never modified here
+        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
+
+        const Tensor& x = ctx->input(0); // [majorDim, inH, inW, minorDim]
+        const Tensor& k = ctx->input(1); // [kernelH, kernelW]
+        p.x = x.flat<T>().data();
+        p.k = k.flat<T>().data();
+        OP_REQUIRES(ctx, x.dims() == 4, errors::InvalidArgument("input must have rank 4"));
+        OP_REQUIRES(ctx, k.dims() == 2, errors::InvalidArgument("kernel must have rank 2"));
+        OP_REQUIRES(ctx, x.NumElements() <= kint32max, errors::InvalidArgument("input too large")); // the kernels index with int
+        OP_REQUIRES(ctx, k.NumElements() <= kint32max, errors::InvalidArgument("kernel too large"));
+
+        p.majorDim  = (int)x.dim_size(0);
+        p.inH       = (int)x.dim_size(1);
+        p.inW       = (int)x.dim_size(2);
+        p.minorDim  = (int)x.dim_size(3);
+        p.kernelH   = (int)k.dim_size(0);
+        p.kernelW   = (int)k.dim_size(1);
+        OP_REQUIRES(ctx, p.kernelW >= 1 && p.kernelH >= 1, errors::InvalidArgument("kernel must be at least 1x1"));
+
+        p.outW = (p.inW * p.upx + p.padx0 + p.padx1 - p.kernelW + p.downx) / p.downx; // size after upsample + pad + filtering, divided by downx
+        p.outH = (p.inH * p.upy + p.pady0 + p.pady1 - p.kernelH + p.downy) / p.downy; // same along Y
+        OP_REQUIRES(ctx, p.outW >= 1 && p.outH >= 1, errors::InvalidArgument("output must be at least 1x1"));
+
+        Tensor* y = NULL; // [majorDim, outH, outW, minorDim]
+        TensorShape ys;
+        ys.AddDim(p.majorDim);
+        ys.AddDim(p.outH);
+        ys.AddDim(p.outW);
+        ys.AddDim(p.minorDim);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, ys, &y));
+        p.y = y->flat<T>().data();
+        OP_REQUIRES(ctx, y->NumElements() <= kint32max, errors::InvalidArgument("output too large"));
+
+        // Choose CUDA kernel to use. Candidates run from the largest template to the smallest and each match overwrites the previous one, so the smallest fitting specialization wins; with no match the general kernel is kept.
+        void* cudaKernel = (void*)UpFirDn2DKernel_large<T>;
+        int tileOutW = -1;
+        int tileOutH = -1;
+        if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 7 && p.kernelH <= 7) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 1,1, 7,7, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 1,1, 6,6, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 5 && p.kernelH <= 5) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 1,1, 5,5, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 1,1, 4,4, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 3 && p.kernelH <= 3) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 1,1, 3,3, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 8 && p.kernelH <= 8) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 2,2, 1,1, 8,8, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 2,2, 1,1, 6,6, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 2,2, 1,1, 4,4, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 2 && p.kernelH <= 2) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 2,2, 1,1, 2,2, 64,16>; tileOutW = 64; tileOutH = 16; }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 8 && p.kernelH <= 8) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 2,2, 8,8, 32,8>;  tileOutW = 32; tileOutH = 8;  }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 2,2, 6,6, 32,8>;  tileOutW = 32; tileOutH = 8;  }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 2,2, 4,4, 32,8>;  tileOutW = 32; tileOutH = 8;  }
+        if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 2 && p.kernelH <= 2) { cudaKernel = (void*)UpFirDn2DKernel_small<T, 1,1, 2,2, 2,2, 32,8>;  tileOutW = 32; tileOutH = 8;  }
+
+        // Choose launch params. In both branches loopMajor grows with majorDim so that grid z never exceeds 16384 blocks.
+        dim3 blockSize;
+        dim3 gridSize;
+        if (tileOutW > 0 && tileOutH > 0) // small
+        {
+            p.loopMajor = (p.majorDim - 1) / 16384 + 1;
+            p.loopX = 1;
+            blockSize = dim3(32 * 8, 1, 1); // 256 threads that stride over the tile elements
+            gridSize = dim3(((p.outH - 1) / tileOutH + 1) * p.minorDim, (p.outW - 1) / (p.loopX * tileOutW) + 1, (p.majorDim - 1) / p.loopMajor + 1);
+        }
+        else // large
+        {
+            p.loopMajor = (p.majorDim - 1) / 16384 + 1;
+            p.loopX = 4;
+            blockSize = dim3(4, 32, 1); // x covers (outY, minorIdx) pairs, y covers output columns
+            gridSize = dim3((p.outH * p.minorDim - 1) / blockSize.x + 1, (p.outW - 1) / (p.loopX * blockSize.y) + 1, (p.majorDim - 1) / p.loopMajor + 1);
+        }
+
+        // Launch CUDA kernel on the op's stream; only the launch status is checked, the kernel itself runs asynchronously.
+        void* args[] = {&p};
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(cudaKernel, gridSize, blockSize, args, 0, stream));
+    }
+};
+
+REGISTER_OP("UpFirDn2D") // No shape function is attached here; the Python wrapper in upfirdn_2d.py calls set_shape() on the result.
+    .Input      ("x: T") // [majorDim, inH, inW, minorDim]
+    .Input      ("k: T") // [kernelH, kernelW]
+    .Output     ("y: T") // [majorDim, outH, outW, minorDim]
+    .Attr       ("T: {float, half}")
+    .Attr       ("upx: int = 1")
+    .Attr       ("upy: int = 1")
+    .Attr       ("downx: int = 1")
+    .Attr       ("downy: int = 1")
+    .Attr       ("padx0: int = 0")
+    .Attr       ("padx1: int = 0")
+    .Attr       ("pady0: int = 0")
+    .Attr       ("pady1: int = 0");
+REGISTER_KERNEL_BUILDER(Name("UpFirDn2D").Device(DEVICE_GPU).TypeConstraint<float>("T"), UpFirDn2DOp<float>); // GPU only; this file registers no CPU kernel
+REGISTER_KERNEL_BUILDER(Name("UpFirDn2D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), UpFirDn2DOp<Eigen::half>);
+
+//------------------------------------------------------------------------
diff --git a/dnnlib/tflib/ops/upfirdn_2d.py b/dnnlib/tflib/ops/upfirdn_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..22e4b14fd5436e42336d3dd82f6135876076c518
--- /dev/null
+++ b/dnnlib/tflib/ops/upfirdn_2d.py
@@ -0,0 +1,366 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Custom TensorFlow ops for efficient resampling of 2D images."""
+
+import os
+import numpy as np
+import tensorflow as tf
+from .. import custom_ops
+
+def _get_plugin():
+    return custom_ops.get_plugin(os.path.splitext(__file__)[0] + '.cu')  # plugin source path: this module's path with its extension replaced by '.cu'
+
+#----------------------------------------------------------------------------
+
+def upfirdn_2d(x, k, upx=1, upy=1, downx=1, downy=1, padx0=0, padx1=0, pady0=0, pady1=0, impl='cuda'):
+    r"""Upsample, pad, FIR filter, and downsample a batch of 2D images.
+
+    Accepts a batch of 2D images of the shape `[majorDim, inH, inW, minorDim]`
+    and performs the following operations for each image, batched across
+    `majorDim` and `minorDim`:
+
+    1. Upsample the image by inserting `upx - 1` / `upy - 1` zeros after each pixel.
+
+    2. Pad the upsampled image with zeros by the specified number of pixels on
+       each side (`padx0`, `padx1`, `pady0`, `pady1`). Specifying a negative
+       value corresponds to cropping the image.
+
+    3. Convolve the image with the specified 2D FIR filter (`k`), shrinking the
+       image so that the footprint of all output pixels lies within the input image.
+
+    4. Downsample the image by throwing away pixels (`downx`, `downy`).
+
+    This sequence of operations bears close resemblance to scipy.signal.upfirdn().
+    The fused op is considerably more efficient than performing the same calculation
+    using standard TensorFlow ops. It supports gradients of arbitrary order.
+
+    Args:
+        x:      Input tensor of the shape `[majorDim, inH, inW, minorDim]`.
+        k:      2D FIR filter of the shape `[firH, firW]`; converted to a float32 NumPy array.
+        upx:    Integer upsampling factor along the X-axis (default: 1).
+        upy:    Integer upsampling factor along the Y-axis (default: 1).
+        downx:  Integer downsampling factor along the X-axis (default: 1).
+        downy:  Integer downsampling factor along the Y-axis (default: 1).
+        padx0:  Number of upsampled pixels to pad on the left side (default: 0).
+        padx1:  Number of upsampled pixels to pad on the right side (default: 0).
+        pady0:  Number of upsampled pixels to pad on the top side (default: 0).
+        pady1:  Number of upsampled pixels to pad on the bottom side (default: 0).
+        impl:   Name of the implementation to use. Can be `"ref"` or `"cuda"` (default); other names raise `KeyError`.
+
+    Returns:
+        Tensor of the shape `[majorDim, outH, outW, minorDim]`, and same datatype as `x`.
+    """
+
+    impl_dict = {
+        'ref':  _upfirdn_2d_ref,
+        'cuda': _upfirdn_2d_cuda,
+    }
+    return impl_dict[impl](x=x, k=k, upx=upx, upy=upy, downx=downx, downy=downy, padx0=padx0, padx1=padx1, pady0=pady0, pady1=pady1)
+
+#----------------------------------------------------------------------------
+
+def _upfirdn_2d_ref(x, k, upx, upy, downx, downy, padx0, padx1, pady0, pady1):
+    """Slow reference implementation of `upfirdn_2d()` using standard TensorFlow ops (reshape, pad, conv2d, slicing)."""
+
+    x = tf.convert_to_tensor(x)
+    k = np.asarray(k, dtype=np.float32)
+    assert x.shape.rank == 4
+    inH = x.shape[1].value  # static size; the asserts and reshapes below use it as a Python int
+    inW = x.shape[2].value
+    minorDim = _shape(x, 3)  # may be a Python int or a scalar tensor
+    kernelH, kernelW = k.shape
+    assert inW >= 1 and inH >= 1
+    assert kernelW >= 1 and kernelH >= 1
+    assert isinstance(upx, int) and isinstance(upy, int)
+    assert isinstance(downx, int) and isinstance(downy, int)
+    assert isinstance(padx0, int) and isinstance(padx1, int)
+    assert isinstance(pady0, int) and isinstance(pady1, int)
+
+    # Upsample (insert zeros): give each pixel its own 1x1 cell, zero-pad every cell to upy x upx, then merge the cells back.
+    x = tf.reshape(x, [-1, inH, 1, inW, 1, minorDim])
+    x = tf.pad(x, [[0, 0], [0, 0], [0, upy - 1], [0, 0], [0, upx - 1], [0, 0]])
+    x = tf.reshape(x, [-1, inH * upy, inW * upx, minorDim])
+
+    # Pad (crop if negative): positive amounts are added as zeros first, negative amounts are then sliced away.
+    x = tf.pad(x, [[0, 0], [max(pady0, 0), max(pady1, 0)], [max(padx0, 0), max(padx1, 0)], [0, 0]])
+    x = x[:, max(-pady0, 0) : x.shape[1].value - max(-pady1, 0), max(-padx0, 0) : x.shape[2].value - max(-padx1, 0), :]
+
+    # Convolve with filter: fold minorDim into the batch axis so one single-channel conv covers every channel; the filter is flipped on both axes.
+    x = tf.transpose(x, [0, 3, 1, 2])
+    x = tf.reshape(x, [-1, 1, inH * upy + pady0 + pady1, inW * upx + padx0 + padx1])
+    w = tf.constant(k[::-1, ::-1, np.newaxis, np.newaxis], dtype=x.dtype)
+    x = tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='VALID', data_format='NCHW')
+    x = tf.reshape(x, [-1, minorDim, inH * upy + pady0 + pady1 - kernelH + 1, inW * upx + padx0 + padx1 - kernelW + 1])
+    x = tf.transpose(x, [0, 2, 3, 1])
+
+    # Downsample (throw away pixels): keep every downy-th row and every downx-th column.
+    return x[:, ::downy, ::downx, :]
+
+#----------------------------------------------------------------------------
+
+def _upfirdn_2d_cuda(x, k, upx, upy, downx, downy, padx0, padx1, pady0, pady1):
+    """Fast CUDA implementation of `upfirdn_2d()` using custom ops, with the gradient expressed as another call to the same op."""
+
+    x = tf.convert_to_tensor(x)
+    k = np.asarray(k, dtype=np.float32)
+    majorDim, inH, inW, minorDim = x.shape.as_list()  # static shape; the unpacking requires exactly four dimensions
+    kernelH, kernelW = k.shape
+    assert inW >= 1 and inH >= 1
+    assert kernelW >= 1 and kernelH >= 1
+    assert isinstance(upx, int) and isinstance(upy, int)
+    assert isinstance(downx, int) and isinstance(downy, int)
+    assert isinstance(padx0, int) and isinstance(padx1, int)
+    assert isinstance(pady0, int) and isinstance(pady1, int)
+
+    outW = (inW * upx + padx0 + padx1 - kernelW) // downx + 1  # static output size, passed to set_shape() below
+    outH = (inH * upy + pady0 + pady1 - kernelH) // downy + 1
+    assert outW >= 1 and outH >= 1
+
+    kc = tf.constant(k, dtype=x.dtype)
+    gkc = tf.constant(k[::-1, ::-1], dtype=x.dtype)  # filter flipped on both axes, used by the gradient op
+    gpadx0 = kernelW - padx0 - 1  # padding for the gradient op, whose result is declared as [inH, inW] below
+    gpady0 = kernelH - pady0 - 1
+    gpadx1 = inW * upx - outW * downx + padx0 - upx + 1
+    gpady1 = inH * upy - outH * downy + pady0 - upy + 1
+
+    @tf.custom_gradient
+    def func(x):
+        y = _get_plugin().up_fir_dn2d(x=x, k=kc, upx=upx, upy=upy, downx=downx, downy=downy, padx0=padx0, padx1=padx1, pady0=pady0, pady1=pady1)
+        y.set_shape([majorDim, outH, outW, minorDim])
+        @tf.custom_gradient
+        def grad(dy):
+            dx = _get_plugin().up_fir_dn2d(x=dy, k=gkc, upx=downx, upy=downy, downx=upx, downy=upy, padx0=gpadx0, padx1=gpadx1, pady0=gpady0, pady1=gpady1)  # same op with up/down factors swapped
+            dx.set_shape([majorDim, inH, inW, minorDim])
+            return dx, func  # the gradient of `grad` is the forward `func` again, which is what allows higher-order gradients
+        return y, grad
+    return func(x)
+
+#----------------------------------------------------------------------------
+
+def filter_2d(x, k, gain=1, data_format='NCHW', impl='cuda'):
+    r"""Apply a FIR filter to every image in a batch without changing its size.
+
+    The filter is normalized to unit sum and then multiplied by `gain`, so a
+    constant input is scaled by `gain`. Pixels outside the image count as zero;
+    the image is padded by `firN - 1` pixels in total along each spatial axis.
+
+    Args:
+        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        k:            FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+        gain:         Scaling factor for signal magnitude (default: 1.0).
+        data_format:  `'NCHW'` or `'NHWC'` (default: `'NCHW'`).
+        impl:         Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the same shape and datatype as `x`.
+    """
+
+    fir = _setup_kernel(k) * gain
+    half, odd = divmod(fir.shape[0] - 1, 2)
+    # When the total padding is odd, the extra pixel goes on the leading (left/top) side.
+    return _simple_upfirdn_2d(x, fir, pad0=half + odd, pad1=half, data_format=data_format, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def upsample_2d(x, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'):
+    r"""Enlarge every image in a batch by an integer factor using a FIR filter.
+
+    Takes images of the shape `[N, C, H, W]` or `[N, H, W, C]`, inserts zeros
+    between the pixels and smooths the result with `k`. The filter is normalized
+    so that a constant input comes out scaled by `gain`. Pixels outside the
+    image are treated as zero, and the padding is chosen so that both spatial
+    axes grow by exactly `factor`.
+
+    Args:
+        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        k:            FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+                      The default is `[1] * factor`, which corresponds to nearest-neighbor
+                      upsampling.
+        factor:       Integer upsampling factor (default: 2).
+        gain:         Scaling factor for signal magnitude (default: 1.0).
+        data_format:  `'NCHW'` or `'NHWC'` (default: `'NCHW'`).
+        impl:         Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the shape `[N, C, H * factor, W * factor]` or
+        `[N, H * factor, W * factor, C]`, and same datatype as `x`.
+    """
+
+    assert isinstance(factor, int) and factor >= 1
+    taps = [1] * factor if k is None else k
+    # The extra factor ** 2 offsets the zeros inserted by upsampling, keeping a constant input scaled by `gain`.
+    fir = _setup_kernel(taps) * (gain * (factor ** 2))
+    extra = fir.shape[0] - factor
+    return _simple_upfirdn_2d(x, fir, up=factor, pad0=(extra+1)//2+factor-1, pad1=extra//2, data_format=data_format, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def downsample_2d(x, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'):
+    r"""Shrink every image in a batch by an integer factor using a FIR filter.
+
+    Takes images of the shape `[N, C, H, W]` or `[N, H, W, C]`, smooths them
+    with `k` and keeps every `factor`-th pixel along both spatial axes. The
+    filter is normalized so that a constant input comes out scaled by `gain`.
+    Pixels outside the image are treated as zero; the padding depends on how
+    much larger the filter is than `factor`.
+
+    Args:
+        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        k:            FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+                      The default is `[1] * factor`, which corresponds to average pooling.
+        factor:       Integer downsampling factor (default: 2).
+        gain:         Scaling factor for signal magnitude (default: 1.0).
+        data_format:  `'NCHW'` or `'NHWC'` (default: `'NCHW'`).
+        impl:         Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the shape `[N, C, H // factor, W // factor]` or
+        `[N, H // factor, W // factor, C]`, and same datatype as `x`.
+    """
+
+    assert isinstance(factor, int) and factor >= 1
+    taps = [1] * factor if k is None else k
+    fir = _setup_kernel(taps) * gain
+    # Split the total padding (filter size minus factor) over both sides; the leading side gets the extra pixel when odd.
+    half, odd = divmod(fir.shape[0] - factor, 2)
+    return _simple_upfirdn_2d(x, fir, down=factor, pad0=half + odd, pad1=half, data_format=data_format, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def upsample_conv_2d(x, w, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'):
+    r"""Fused `upsample_2d()` followed by `tf.nn.conv2d()`.
+
+    Padding is performed only once at the beginning, not between the operations.
+    The fused op is considerably more efficient than performing the same calculation
+    using standard TensorFlow ops. It supports gradients of arbitrary order.
+
+    Args:
+        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        w:            Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`.
+                      Grouped convolution can be performed by `inChannels = C // numGroups`.
+        k:            FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+                      The default is `[1] * factor`, which corresponds to nearest-neighbor
+                      upsampling.
+        factor:       Integer upsampling factor (default: 2).
+        gain:         Scaling factor for signal magnitude (default: 1.0).
+        data_format:  `'NCHW'` or `'NHWC'` (default: `'NCHW'`).
+        impl:         Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the shape `[N, outChannels, H * factor, W * factor]` or
+        `[N, H * factor, W * factor, outChannels]`, and same datatype as `x`.
+    """
+
+    assert isinstance(factor, int) and factor >= 1
+
+    # Check weight shape: rank 4 with a square, statically known spatial size.
+    w = tf.convert_to_tensor(w)
+    assert w.shape.rank == 4
+    convH = w.shape[0].value
+    convW = w.shape[1].value
+    inC = _shape(w, 2)
+    outC = _shape(w, 3)
+    assert convW == convH
+
+    # Setup filter kernel; `p` is the total FIR padding left after accounting for the convolution's own footprint.
+    if k is None:
+        k = [1] * factor
+    k = _setup_kernel(k) * (gain * (factor ** 2))
+    p = (k.shape[0] - factor) - (convW - 1)
+
+    # Determine data dimensions; any `data_format` other than 'NCHW' takes the channels-last branch.
+    if data_format == 'NCHW':
+        stride = [1, 1, factor, factor]
+        output_shape = [_shape(x, 0), outC, (_shape(x, 2) - 1) * factor + convH, (_shape(x, 3) - 1) * factor + convW]
+        num_groups = _shape(x, 1) // inC
+    else:
+        stride = [1, factor, factor, 1]
+        output_shape = [_shape(x, 0), (_shape(x, 1) - 1) * factor + convH, (_shape(x, 2) - 1) * factor + convW, outC]
+        num_groups = _shape(x, 3) // inC
+
+    # Transpose weights: flip spatially and swap the in/out channel axes within each group, giving the layout passed to conv2d_transpose.
+    w = tf.reshape(w, [convH, convW, inC, num_groups, -1])
+    w = tf.transpose(w[::-1, ::-1], [0, 1, 4, 3, 2])
+    w = tf.reshape(w, [convH, convW, -1, num_groups * inC])
+
+    # Execute. NOTE(review): the padding below yields a spatial size of `H * factor - factor + 2`, i.e. `H * factor` only for factor == 2 -- confirm other factors are unsupported.
+    x = tf.nn.conv2d_transpose(x, w, output_shape=output_shape, strides=stride, padding='VALID', data_format=data_format)
+    return _simple_upfirdn_2d(x, k, pad0=(p+1)//2+factor-1, pad1=p//2+1, data_format=data_format, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def conv_downsample_2d(x, w, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'):
+    r"""Fused `tf.nn.conv2d()` followed by `downsample_2d()`.
+
+    Padding is performed only once at the beginning, not between the operations.
+    The fused op is considerably more efficient than performing the same calculation
+    using standard TensorFlow ops. It supports gradients of arbitrary order.
+
+    Args:
+        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        w:            Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`.
+                      Grouped convolution can be performed by `inChannels = C // numGroups`.
+        k:            FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+                      The default is `[1] * factor`, which corresponds to average pooling.
+        factor:       Integer downsampling factor (default: 2).
+        gain:         Scaling factor for signal magnitude (default: 1.0).
+        data_format:  `'NCHW'` or `'NHWC'` (default: `'NCHW'`).
+        impl:         Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+    Returns:
+        Tensor of the shape `[N, outChannels, H // factor, W // factor]` or
+        `[N, H // factor, W // factor, outChannels]`, and same datatype as `x`.
+    """
+
+    assert isinstance(factor, int) and factor >= 1
+    w = tf.convert_to_tensor(w)
+    convH, convW, _inC, _outC = w.shape.as_list()  # static weight shape; only the spatial sizes are used here
+    assert convW == convH
+    if k is None:
+        k = [1] * factor
+    k = _setup_kernel(k) * gain
+    p = (k.shape[0] - factor) + (convW - 1)  # total padding covering both the FIR filter and the convolution footprint
+    if data_format == 'NCHW':
+        s = [1, 1, factor, factor]
+    else:
+        s = [1, factor, factor, 1]
+    x = _simple_upfirdn_2d(x, k, pad0=(p+1)//2, pad1=p//2, data_format=data_format, impl=impl)  # FIR filter at full resolution, no downsampling yet
+    return tf.nn.conv2d(x, w, strides=s, padding='VALID', data_format=data_format)  # the strided convolution performs the actual downsampling
+
+#----------------------------------------------------------------------------
+# Internal helper funcs.
+
+def _shape(tf_expr, dim_idx):
+    """Return dimension `dim_idx` of `tf_expr`: its static value when known, otherwise an element of `tf.shape()`."""
+    static_shape = tf_expr.shape
+    static_dim = None if static_shape.rank is None else static_shape[dim_idx].value
+    # Fall back to the dynamic shape op only when no static size is available.
+    return tf.shape(tf_expr)[dim_idx] if static_dim is None else static_dim
+
+def _setup_kernel(k):
+    """Return `k` as a square 2D float32 filter normalized to unit sum; a 1D `k` is expanded via its outer product."""
+    k = np.asarray(k, dtype=np.float32)
+    if k.ndim == 1:
+        k = np.outer(k, k)
+    assert k.ndim == 2 and k.shape[0] == k.shape[1]
+    # Divide out of place: np.asarray may return the caller's own float32 array, which `k /= ...` would overwrite.
+    return k / np.sum(k)
+
+def _simple_upfirdn_2d(x, k, up=1, down=1, pad0=0, pad1=0, data_format='NCHW', impl='cuda'):
+    """Call `upfirdn_2d()` with identical factors and padding on both axes, for either supported data format."""
+    assert data_format in ['NCHW', 'NHWC']
+    assert x.shape.rank == 4
+    channels_first = (data_format == 'NCHW')
+    # NCHW has no trailing channel axis, so fold N and C into the major axis and use a minor axis of size 1.
+    img = tf.reshape(x, [-1, _shape(x, 2), _shape(x, 3), 1]) if channels_first else x
+    res = upfirdn_2d(img, k, upx=up, upy=up, downx=down, downy=down, padx0=pad0, padx1=pad1, pady0=pad0, pady1=pad1, impl=impl)
+    # Undo the fold, restoring the input's channel count in front of the new spatial size.
+    return tf.reshape(res, [-1, _shape(x, 1), _shape(res, 1), _shape(res, 2)]) if channels_first else res
+
+#----------------------------------------------------------------------------
diff --git a/dnnlib/tflib/optimizer.py b/dnnlib/tflib/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d5dcc6172209985308784c9b9e590759612a0b
--- /dev/null
+++ b/dnnlib/tflib/optimizer.py
@@ -0,0 +1,338 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Helper wrapper for a Tensorflow optimizer."""
+
+import numpy as np
+import tensorflow as tf
+
+from collections import OrderedDict
+from typing import List, Union
+
+from . import autosummary
+from . import tfutil
+from .. import util
+
+from .tfutil import TfExpression, TfExpressionEx
+
+try:
+    # TensorFlow 1.13
+    from tensorflow.python.ops import nccl_ops
+except:
+    # Older TensorFlow versions
+    import tensorflow.contrib.nccl as nccl_ops
+
+class Optimizer:
+    """A Wrapper for tf.train.Optimizer.
+
+    Automatically takes care of:
+    - Gradient averaging for multi-GPU training.
+    - Gradient accumulation for arbitrarily large minibatches.
+    - Dynamic loss scaling and typecasts for FP16 training.
+    - Ignoring corrupted gradients that contain NaNs/Infs.
+    - Reporting statistics.
+    - Well-chosen default settings.
+    """
+
+    def __init__(self,
+        name:                   str             = "Train",                  # Name string that will appear in TensorFlow graph.
+        tf_optimizer:           str             = "tf.train.AdamOptimizer", # Underlying optimizer class.
+        learning_rate:          TfExpressionEx  = 0.001,                    # Learning rate. Can vary over time.
+        minibatch_multiplier:   TfExpressionEx  = None,                     # Treat N consecutive minibatches as one by accumulating gradients.
+        share:                  "Optimizer"     = None,                     # Share internal state with a previously created optimizer?
+        use_loss_scaling:       bool            = False,                    # Enable dynamic loss scaling for robust mixed-precision training?
+        loss_scaling_init:      float           = 64.0,                     # Log2 of initial loss scaling factor.
+        loss_scaling_inc:       float           = 0.0005,                   # Log2 of per-minibatch loss scaling increment when there is no overflow.
+        loss_scaling_dec:       float           = 1.0,                      # Log2 of per-minibatch loss scaling decrement when there is an overflow.
+        report_mem_usage:       bool            = False,                    # Report fine-grained memory usage statistics in TensorBoard?
+        **kwargs):
+
+        # Public fields.
+        self.name                   = name
+        self.learning_rate          = learning_rate
+        self.minibatch_multiplier   = minibatch_multiplier
+        self.id                     = self.name.replace("/", ".")  # Slash-free name; prefixes the name scopes and autosummary tags used below.
+        self.scope                  = tf.get_default_graph().unique_name(self.id)
+        self.optimizer_class        = util.get_obj_by_name(tf_optimizer)  # Look up the object named by the string; checked to be callable below.
+        self.optimizer_kwargs       = dict(kwargs)  # Extra keyword arguments are forwarded to the optimizer constructor.
+        self.use_loss_scaling       = use_loss_scaling
+        self.loss_scaling_init      = loss_scaling_init
+        self.loss_scaling_inc       = loss_scaling_inc
+        self.loss_scaling_dec       = loss_scaling_dec
+
+        # Private fields.
+        self._updates_applied       = False
+        self._devices               = OrderedDict() # device_name => EasyDict()
+        self._shared_optimizers     = OrderedDict() # device_name => optimizer_class instance
+        self._gradient_shapes       = None          # [shape, ...]
+        self._report_mem_usage      = report_mem_usage
+
+        # Validate arguments.
+        assert callable(self.optimizer_class)
+
+        # Share internal state if requested.
+        if share is not None:
+            assert isinstance(share, Optimizer)
+            assert self.optimizer_class is share.optimizer_class
+            assert self.learning_rate is share.learning_rate  # Identity check: must be the very same object, not merely an equal value.
+            assert self.optimizer_kwargs == share.optimizer_kwargs
+            self._shared_optimizers = share._shared_optimizers # pylint: disable=protected-access
+
+    def _get_device(self, device_name: str):
+        """Get internal state for the given TensorFlow device."""
+        tfutil.assert_tf_initialized()
+        if device_name in self._devices:
+            return self._devices[device_name]  # Already set up => reuse the cached state.
+
+        # Initialize fields.
+        device = util.EasyDict()
+        device.name             = device_name
+        device.optimizer        = None          # Underlying optimizer:     optimizer_class
+        device.loss_scaling_var = None          # Log2 of loss scaling:     tf.Variable
+        device.grad_raw         = OrderedDict() # Raw gradients:            var => [grad, ...]
+        device.grad_clean       = OrderedDict() # Clean gradients:          var => grad
+        device.grad_acc_vars    = OrderedDict() # Accumulation sums:        var => tf.Variable
+        device.grad_acc_count   = None          # Accumulation counter:     tf.Variable
+        device.grad_acc         = OrderedDict() # Accumulated gradients:    var => grad
+
+        # Setup TensorFlow objects.
+        with tfutil.absolute_name_scope(self.scope + "/Devices"), tf.device(device_name), tf.control_dependencies(None):
+            if device_name not in self._shared_optimizers:
+                optimizer_name = self.scope.replace("/", "_") + "_opt%d" % len(self._shared_optimizers)
+                self._shared_optimizers[device_name] = self.optimizer_class(name=optimizer_name, learning_rate=self.learning_rate, **self.optimizer_kwargs)
+            device.optimizer = self._shared_optimizers[device_name]  # May have been created by an Optimizer passed as `share`.
+            if self.use_loss_scaling:
+                device.loss_scaling_var = tf.Variable(np.float32(self.loss_scaling_init), trainable=False, name="loss_scaling_var")
+
+        # Register device.
+        self._devices[device_name] = device
+        return device
+
+    def register_gradients(self, loss: TfExpression, trainable_vars: Union[List, dict]) -> None:
+        """Register the gradients of the given loss function with respect to the given variables.
+        Intended to be called once per GPU."""
+        tfutil.assert_tf_initialized()
+        assert not self._updates_applied
+        device = self._get_device(loss.device)  # Per-device state is keyed by the device the loss is placed on.
+
+        # Validate trainables.
+        if isinstance(trainable_vars, dict):
+            trainable_vars = list(trainable_vars.values())  # allow passing in Network.trainables as vars
+        assert isinstance(trainable_vars, list) and len(trainable_vars) >= 1
+        assert all(tfutil.is_tf_expression(expr) for expr in trainable_vars + [loss])
+        assert all(var.device == device.name for var in trainable_vars)
+
+        # Validate shapes. The first call records them; later calls must match in count, order and shape.
+        if self._gradient_shapes is None:
+            self._gradient_shapes = [var.shape.as_list() for var in trainable_vars]
+        assert len(trainable_vars) == len(self._gradient_shapes)
+        assert all(var.shape.as_list() == var_shape for var, var_shape in zip(trainable_vars, self._gradient_shapes))
+
+        # Report memory usage if requested.
+        deps = []
+        if self._report_mem_usage:
+            self._report_mem_usage = False  # Cleared immediately, so only the first call adds the report.
+            try:
+                with tf.name_scope(self.id + '_mem'), tf.device(device.name), tf.control_dependencies([loss]):
+                    deps.append(autosummary.autosummary(self.id + "/mem_usage_gb", tf.contrib.memory_stats.BytesInUse() / 2**30))
+            except tf.errors.NotFoundError:
+                pass  # Best-effort: presumably the BytesInUse op is unavailable here, so reporting is skipped.
+
+        # Compute gradients.
+        with tf.name_scope(self.id + "_grad"), tf.device(device.name), tf.control_dependencies(deps):
+            loss = self.apply_loss_scaling(tf.cast(loss, tf.float32))
+            gate = tf.train.Optimizer.GATE_NONE  # disable gating to reduce memory usage
+            grad_list = device.optimizer.compute_gradients(loss=loss, var_list=trainable_vars, gate_gradients=gate)
+
+        # Register gradients.
+        for grad, var in grad_list:
+            if var not in device.grad_raw:
+                device.grad_raw[var] = []
+            device.grad_raw[var].append(grad)  # Repeated calls on one device collect several entries per var; apply_updates() combines them.
+
+    def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
+        """Construct training op to update the registered variables based on their gradients."""
+        tfutil.assert_tf_initialized()
+        assert not self._updates_applied
+        self._updates_applied = True  # From now on register_gradients() and apply_updates() fail their asserts.
+        all_ops = []
+
+        # Check for no-op.
+        if allow_no_op and len(self._devices) == 0:
+            with tfutil.absolute_name_scope(self.scope):
+                return tf.no_op(name='TrainingOp')
+
+        # Clean up gradients.
+        for device_idx, device in enumerate(self._devices.values()):
+            with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), tf.device(device.name):
+                for var, grad in device.grad_raw.items():
+
+                    # Filter out disconnected gradients and convert to float32.
+                    grad = [g for g in grad if g is not None]
+                    grad = [tf.cast(g, tf.float32) for g in grad]
+
+                    # Sum within the device.
+                    if len(grad) == 0:
+                        grad = tf.zeros(var.shape)  # No gradients => zero.
+                    elif len(grad) == 1:
+                        grad = grad[0]              # Single gradient => use as is.
+                    else:
+                        grad = tf.add_n(grad)       # Multiple gradients => sum.
+
+                    # Scale as needed.
+                    scale = 1.0 / len(device.grad_raw[var]) / len(self._devices)  # Divides by all registered entries (None ones included) and by the device count.
+                    scale = tf.constant(scale, dtype=tf.float32, name="scale")
+                    if self.minibatch_multiplier is not None:
+                        scale /= tf.cast(self.minibatch_multiplier, tf.float32)
+                    scale = self.undo_loss_scaling(scale)  # Folds the inverse loss-scaling factor into the same multiplier.
+                    device.grad_clean[var] = grad * scale
+
+        # Sum gradients across devices.
+        if len(self._devices) > 1:
+            with tfutil.absolute_name_scope(self.scope + "/Broadcast"), tf.device(None):
+                for all_vars in zip(*[device.grad_clean.keys() for device in self._devices.values()]):  # i-th variable of every device; relies on identical registration order.
+                    if len(all_vars) > 0 and all(dim > 0 for dim in all_vars[0].shape.as_list()): # NCCL does not support zero-sized tensors.
+                        all_grads = [device.grad_clean[var] for device, var in zip(self._devices.values(), all_vars)]
+                        all_grads = nccl_ops.all_sum(all_grads)
+                        for device, var, grad in zip(self._devices.values(), all_vars, all_grads):
+                            device.grad_clean[var] = grad
+
+        # Apply updates separately on each device.
+        for device_idx, device in enumerate(self._devices.values()):
+            with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), tf.device(device.name):
+                # pylint: disable=cell-var-from-loop
+
+                # Accumulate gradients over time.
+                if self.minibatch_multiplier is None:
+                    acc_ok = tf.constant(True, name='acc_ok')  # No accumulation => every step is allowed to apply.
+                    device.grad_acc = OrderedDict(device.grad_clean)
+                else:
+                    # Create variables.
+                    with tf.control_dependencies(None):
+                        for var in device.grad_clean.keys():
+                            device.grad_acc_vars[var] = tf.Variable(tf.zeros(var.shape), trainable=False, name="grad_acc_var")
+                        device.grad_acc_count = tf.Variable(tf.zeros([]), trainable=False, name="grad_acc_count")
+
+                    # Track counter.
+                    count_cur = device.grad_acc_count + 1.0
+                    count_inc_op = lambda: tf.assign(device.grad_acc_count, count_cur)
+                    count_reset_op = lambda: tf.assign(device.grad_acc_count, tf.zeros([]))
+                    acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier, tf.float32))  # True on the step that completes an accumulation window.
+                    all_ops.append(tf.cond(acc_ok, count_reset_op, count_inc_op))
+
+                    # Track gradients.
+                    for var, grad in device.grad_clean.items():
+                        acc_var = device.grad_acc_vars[var]
+                        acc_cur = acc_var + grad
+                        device.grad_acc[var] = acc_cur
+                        with tf.control_dependencies([acc_cur]):  # Compute the running sum before acc_var is overwritten.
+                            acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
+                            acc_reset_op = lambda: tf.assign(acc_var, tf.zeros(var.shape))
+                            all_ops.append(tf.cond(acc_ok, acc_reset_op, acc_inc_op))
+
+                # No overflow => apply gradients.
+                all_ok = tf.reduce_all(tf.stack([acc_ok] + [tf.reduce_all(tf.is_finite(g)) for g in device.grad_acc.values()]))  # acc_ok AND every accumulated gradient is finite.
+                apply_op = lambda: device.optimizer.apply_gradients([(tf.cast(grad, var.dtype), var) for var, grad in device.grad_acc.items()])
+                all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))
+
+                # Adjust loss scaling.
+                if self.use_loss_scaling:
+                    ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var, self.loss_scaling_inc)
+                    ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var, self.loss_scaling_dec)
+                    ls_update_op = lambda: tf.group(tf.cond(all_ok, ls_inc_op, ls_dec_op))
+                    all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))  # Adjusted only on steps where acc_ok holds.
+
+                # Last device => report statistics.
+                if device_idx == len(self._devices) - 1:
+                    all_ops.append(autosummary.autosummary(self.id + "/learning_rate", self.learning_rate))
+                    all_ops.append(autosummary.autosummary(self.id + "/overflow_frequency", tf.where(all_ok, 0, 1), condition=acc_ok))
+                    if self.use_loss_scaling:
+                        all_ops.append(autosummary.autosummary(self.id + "/loss_scaling_log2", device.loss_scaling_var))
+
+        # Initialize variables.
+        self.reset_optimizer_state()
+        if self.use_loss_scaling:
+            tfutil.init_uninitialized_vars([device.loss_scaling_var for device in self._devices.values()])
+        if self.minibatch_multiplier is not None:
+            tfutil.run([var.initializer for device in self._devices.values() for var in list(device.grad_acc_vars.values()) + [device.grad_acc_count]])
+
+        # Group everything into a single op.
+        with tfutil.absolute_name_scope(self.scope):
+            return tf.group(*all_ops, name="TrainingOp")
+
+    def reset_optimizer_state(self) -> None:
+        """Reset internal state of the underlying optimizer."""
+        tfutil.assert_tf_initialized()
+        tfutil.run([var.initializer for device in self._devices.values() for var in device.optimizer.variables()])  # Re-runs the initializer of every optimizer variable on every device.
+
+    def get_loss_scaling_var(self, device: str) -> Union[tf.Variable, None]:
+        """Get or create variable representing log2 of the current dynamic loss scaling factor."""
+        return self._get_device(device).loss_scaling_var  # None when use_loss_scaling is False.
+
+    def apply_loss_scaling(self, value: TfExpression) -> TfExpression:
+        """Apply dynamic loss scaling for the given expression."""
+        assert tfutil.is_tf_expression(value)
+        if not self.use_loss_scaling:
+            return value
+        return value * tfutil.exp2(self.get_loss_scaling_var(value.device))  # The variable holds log2 of the factor, hence exp2.
+
+    def undo_loss_scaling(self, value: TfExpression) -> TfExpression:
+        """Undo the effect of dynamic loss scaling for the given expression."""
+        assert tfutil.is_tf_expression(value)
+        if not self.use_loss_scaling:
+            return value
+        return value * tfutil.exp2(-self.get_loss_scaling_var(value.device)) # pylint: disable=invalid-unary-operand-type
+
+
+class SimpleAdam:
+    """Simplified version of tf.train.AdamOptimizer that behaves identically when used with dnnlib.tflib.Optimizer."""
+
+    def __init__(self, name="Adam", learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
+        self.name = name
+        self.learning_rate = learning_rate
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.all_state_vars = []  # Grows with the state variables created by each apply_gradients() call.
+
+    def variables(self):
+        return self.all_state_vars  # Returns the internal list itself, not a copy.
+
+    def compute_gradients(self, loss, var_list, gate_gradients=tf.train.Optimizer.GATE_NONE):
+        assert gate_gradients == tf.train.Optimizer.GATE_NONE  # Only ungated gradient computation is supported.
+        return list(zip(tf.gradients(loss, var_list), var_list))  # [(grad, var), ...] pairs.
+
+    def apply_gradients(self, grads_and_vars):
+        with tf.name_scope(self.name):
+            state_vars = []
+            update_ops = []
+
+            # Adjust learning rate to deal with startup bias.
+            with tf.control_dependencies(None):  # Create state variables free of any surrounding control dependencies.
+                b1pow_var = tf.Variable(dtype=tf.float32, initial_value=1, trainable=False)
+                b2pow_var = tf.Variable(dtype=tf.float32, initial_value=1, trainable=False)
+                state_vars += [b1pow_var, b2pow_var]
+            b1pow_new = b1pow_var * self.beta1  # Running product of beta1, i.e. beta1**t after t updates.
+            b2pow_new = b2pow_var * self.beta2  # Running product of beta2, i.e. beta2**t after t updates.
+            update_ops += [tf.assign(b1pow_var, b1pow_new), tf.assign(b2pow_var, b2pow_new)]
+            lr_new = self.learning_rate * tf.sqrt(1 - b2pow_new) / (1 - b1pow_new)
+
+            # Construct ops to update each variable.
+            for grad, var in grads_and_vars:
+                with tf.control_dependencies(None):
+                    m_var = tf.Variable(dtype=tf.float32, initial_value=tf.zeros_like(var), trainable=False)
+                    v_var = tf.Variable(dtype=tf.float32, initial_value=tf.zeros_like(var), trainable=False)
+                    state_vars += [m_var, v_var]
+                m_new = self.beta1 * m_var + (1 - self.beta1) * grad  # Exponential moving average of the gradient.
+                v_new = self.beta2 * v_var + (1 - self.beta2) * tf.square(grad)  # Exponential moving average of the squared gradient.
+                var_delta = lr_new * m_new / (tf.sqrt(v_new) + self.epsilon)
+                update_ops += [tf.assign(m_var, m_new), tf.assign(v_var, v_new), tf.assign_sub(var, var_delta)]
+
+            # Group everything together.
+            self.all_state_vars += state_vars
+            return tf.group(*update_ops)
diff --git a/dnnlib/tflib/tfutil.py b/dnnlib/tflib/tfutil.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b04c59e41a1b1548bc798379ceb551a488ed2a6
--- /dev/null
+++ b/dnnlib/tflib/tfutil.py
@@ -0,0 +1,254 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Miscellaneous helper utils for Tensorflow."""
+
+import os
+import numpy as np
+import tensorflow as tf
+
+# Silence deprecation warnings from TensorFlow 1.13 onwards
+import logging
+logging.getLogger('tensorflow').setLevel(logging.ERROR)
+import tensorflow.contrib   # requires TensorFlow 1.x!
+tf.contrib = tensorflow.contrib
+
+from typing import Any, Iterable, List, Union
+
+TfExpression = Union[tf.Tensor, tf.Variable, tf.Operation]  # Same three types that is_tf_expression() tests with isinstance().
+"""A type that represents a valid Tensorflow expression."""
+
+TfExpressionEx = Union[TfExpression, int, float, np.ndarray]  # TfExpression plus plain Python numbers and NumPy arrays.
+"""A type that can be converted to a valid Tensorflow expression."""
+
+
+def run(*args, **kwargs) -> Any:
+    """Run the specified ops in the default session."""
+    assert_tf_initialized()  # Raises RuntimeError when there is no default session.
+    return tf.get_default_session().run(*args, **kwargs)  # Arguments are forwarded unchanged to the session's run().
+
+
+def is_tf_expression(x: Any) -> bool:
+    """Check whether the input is a valid Tensorflow expression, i.e., Tensorflow Tensor, Variable, or Operation."""
+    return isinstance(x, (tf.Tensor, tf.Variable, tf.Operation))  # The same three types that make up the TfExpression alias.
+
+
+def shape_to_list(shape: Iterable[tf.Dimension]) -> List[Union[int, None]]:
+    """Convert a Tensorflow shape to a list of ints. Retained for backwards compatibility -- use TensorShape.as_list() in new code."""
+    return [dim.value for dim in shape]  # Per the return annotation, an entry may be None (dimension not statically known).
+
+
+def flatten(x: TfExpressionEx) -> TfExpression:
+    """Shortcut function for flattening a tensor."""
+    with tf.name_scope("Flatten"):
+        return tf.reshape(x, [-1])  # A single -1 dimension collapses everything into a rank-1 tensor.
+
+
+def log2(x: TfExpressionEx) -> TfExpression:
+    """Logarithm in base 2."""
+    with tf.name_scope("Log2"):
+        return tf.log(x) * np.float32(1.0 / np.log(2.0))  # log2(x) = ln(x) / ln(2); the constant is precomputed with NumPy as float32.
+
+
+def exp2(x: TfExpressionEx) -> TfExpression:
+    """Exponent in base 2."""
+    with tf.name_scope("Exp2"):
+        return tf.exp(x * np.float32(np.log(2.0)))  # 2**x = exp(x * ln(2)); the constant is precomputed with NumPy as float32.
+
+
+def lerp(a: TfExpressionEx, b: TfExpressionEx, t: TfExpressionEx) -> TfExpressionEx:
+    """Linear interpolation."""
+    with tf.name_scope("Lerp"):
+        return a + (b - a) * t  # t=0 gives a, t=1 gives b; t is not clamped, so other values extrapolate.
+
+
+def lerp_clip(a: TfExpressionEx, b: TfExpressionEx, t: TfExpressionEx) -> TfExpression:
+    """Linear interpolation with clip."""
+    with tf.name_scope("LerpClip"):
+        return a + (b - a) * tf.clip_by_value(t, 0.0, 1.0)  # t is clamped to [0, 1] first, so the result stays between a and b.
+
+
+def absolute_name_scope(scope: str) -> tf.name_scope:
+    """Forcefully enter the specified name scope, ignoring any surrounding scopes."""
+    return tf.name_scope(scope + "/")  # The trailing "/" is what makes TF 1.x take the name as an absolute scope instead of nesting it.
+
+
+def absolute_variable_scope(scope: str, **kwargs) -> tf.variable_scope:
+    """Forcefully enter the specified variable scope, ignoring any surrounding scopes."""
+    return tf.variable_scope(tf.VariableScope(name=scope, **kwargs), auxiliary_name_scope=False)  # kwargs go to tf.VariableScope; no accompanying name scope is opened.
+
+
+def _sanitize_tf_config(config_dict: dict = None) -> dict:  # Returns a new dict: built-in defaults overlaid with `config_dict`.
+    # Defaults. Keys are dotted paths: "rnd.*" = random seeds, "env.*" = environment variables, anything else = tf.ConfigProto field.
+    cfg = dict()
+    cfg["rnd.np_random_seed"]               = None      # Random seed for NumPy. None = keep as is.
+    cfg["rnd.tf_random_seed"]               = "auto"    # Random seed for TensorFlow. 'auto' = derive from NumPy random state. None = keep as is.
+    cfg["env.TF_CPP_MIN_LOG_LEVEL"]         = "1"       # 0 = Print all available debug info from TensorFlow. 1 = Print warnings and errors, but disable debug info.
+    cfg["graph_options.place_pruned_graph"] = True      # False = Check that all ops are available on the designated device. True = Skip the check for ops that are not used.
+    cfg["gpu_options.allow_growth"]         = True      # False = Allocate all GPU memory at the beginning. True = Allocate only as much GPU memory as needed.
+
+    # Remove defaults for environment variables that are already set.
+    for key in list(cfg):  # Iterate over a snapshot of the keys because entries are deleted inside the loop.
+        fields = key.split(".")
+        if fields[0] == "env":
+            assert len(fields) == 2
+            if fields[1] in os.environ:
+                del cfg[key]
+
+    # User overrides. Applied last, so an explicit "env.*" entry still wins over an existing environment variable.
+    if config_dict is not None:
+        cfg.update(config_dict)
+    return cfg
+
+
+def init_tf(config_dict: dict = None) -> None:
+    """Initialize TensorFlow session using good default settings."""
+    # Skip if already initialized. In that case `config_dict` is ignored entirely.
+    if tf.get_default_session() is not None:
+        return
+
+    # Setup config dict and random seeds.
+    cfg = _sanitize_tf_config(config_dict)
+    np_random_seed = cfg["rnd.np_random_seed"]
+    if np_random_seed is not None:
+        np.random.seed(np_random_seed)
+    tf_random_seed = cfg["rnd.tf_random_seed"]
+    if tf_random_seed == "auto":
+        tf_random_seed = np.random.randint(1 << 31)  # Drawn from NumPy's global RNG, so it follows np_random_seed when one was set above.
+    if tf_random_seed is not None:
+        tf.set_random_seed(tf_random_seed)
+
+    # Setup environment variables.
+    for key, value in cfg.items():
+        fields = key.split(".")
+        if fields[0] == "env":
+            assert len(fields) == 2
+            os.environ[fields[1]] = str(value)  # Values are stringified because os.environ only accepts strings.
+
+    # Create default TensorFlow session.
+    create_session(cfg, force_as_default=True)
+
+
+def assert_tf_initialized():
+    """Check that TensorFlow session has been initialized."""
+    if tf.get_default_session() is None:  # No default session is active.
+        raise RuntimeError("No default TensorFlow session found. Please call dnnlib.tflib.init_tf().")
+
+
+def create_session(config_dict: dict = None, force_as_default: bool = False) -> tf.Session:
+    """Create tf.Session based on config dict."""
+    # Setup TensorFlow config proto.
+    cfg = _sanitize_tf_config(config_dict)
+    config_proto = tf.ConfigProto()
+    for key, value in cfg.items():
+        fields = key.split(".")
+        if fields[0] not in ["rnd", "env"]:  # "rnd.*" and "env.*" entries are consumed by init_tf(), not by the proto.
+            obj = config_proto
+            for field in fields[:-1]:
+                obj = getattr(obj, field)  # Walk the dotted path, e.g. "gpu_options.allow_growth" => config_proto.gpu_options.
+            setattr(obj, fields[-1], value)
+
+    # Create session.
+    session = tf.Session(config=config_proto)
+    if force_as_default:
+        # pylint: disable=protected-access
+        session._default_session = session.as_default()  # Keep the context manager referenced on the session object.
+        session._default_session.enforce_nesting = False  # NOTE(review): presumably lets the context be left out of order -- confirm against TF internals.
+        session._default_session.__enter__()  # Entered manually and never exited here, so the session remains the default after returning.
+    return session
+
+
+def init_uninitialized_vars(target_vars: List[tf.Variable] = None) -> None:
+    """Initialize all tf.Variables that have not already been initialized.
+
+    Equivalent to the following, but more efficient and does not bloat the tf graph:
+    tf.variables_initializer(tf.report_uninitialized_variables()).run()
+    """
+    assert_tf_initialized()
+    if target_vars is None:
+        target_vars = tf.global_variables()  # Default: every global variable in the graph.
+
+    test_vars = []
+    test_ops = []
+
+    with tf.control_dependencies(None):  # ignore surrounding control_dependencies
+        for var in target_vars:
+            assert is_tf_expression(var)
+
+            try:
+                tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/IsVariableInitialized:0"))  # Found => an earlier call already handled this variable; skip it.
+            except KeyError:
+                # Op does not exist => variable may be uninitialized.
+                test_vars.append(var)
+
+                with absolute_name_scope(var.name.split(":")[0]):  # Place the test op under the variable's own name so the lookup above finds it next time.
+                    test_ops.append(tf.is_variable_initialized(var))
+
+    init_vars = [var for var, inited in zip(test_vars, run(test_ops)) if not inited]  # One session run evaluates all tests together.
+    run([var.initializer for var in init_vars])
+
+
+def set_vars(var_to_value_dict: dict) -> None:
+    """Set the values of given tf.Variables.
+
+    Equivalent to the following, but more efficient and does not bloat the tf graph:
+    tflib.run([tf.assign(var, value) for var, value in var_to_value_dict.items()])
+    """
+    assert_tf_initialized()
+    ops = []
+    feed_dict = {}
+
+    for var, value in var_to_value_dict.items():
+        assert is_tf_expression(var)
+
+        try:
+            setter = tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/setter:0"))  # look for existing op
+        except KeyError:
+            with absolute_name_scope(var.name.split(":")[0]):
+                with tf.control_dependencies(None):  # ignore surrounding control_dependencies
+                    setter = tf.assign(var, tf.placeholder(var.dtype, var.shape, "new_value"), name="setter")  # create new setter
+
+        ops.append(setter)
+        feed_dict[setter.op.inputs[1]] = value  # inputs[1] of the assign op is its "new_value" placeholder.
+
+    run(ops, feed_dict)  # A single session run assigns all variables.
+
+
+def create_var_with_large_initial_value(initial_value: np.ndarray, *args, **kwargs):
+    """Create tf.Variable with large initial value without bloating the tf graph."""
+    assert_tf_initialized()
+    assert isinstance(initial_value, np.ndarray)
+    zeros = tf.zeros(initial_value.shape, initial_value.dtype)  # The graph initializer is zeros of matching shape/dtype, not the data itself.
+    var = tf.Variable(zeros, *args, **kwargs)  # Extra positional/keyword arguments are forwarded to tf.Variable.
+    set_vars({var: initial_value})  # The real values are fed in at run time through a placeholder.
+    return var
+
+
+def convert_images_from_uint8(images, drange=[-1,1], nhwc_to_nchw=False):  # The list default for `drange` is only read, never mutated.
+    """Convert a minibatch of images from uint8 to float32 with configurable dynamic range.
+    Can be used as an input transformation for Network.run().
+    """
+    images = tf.cast(images, tf.float32)
+    if nhwc_to_nchw:
+        images = tf.transpose(images, [0, 3, 1, 2])  # Move the last axis to position 1 (NHWC => NCHW).
+    return images * ((drange[1] - drange[0]) / 255) + drange[0]  # Maps 0 => drange[0] and 255 => drange[1].
+
+
+def convert_images_to_uint8(images, drange=[-1,1], nchw_to_nhwc=False, shrink=1):  # The list default for `drange` is only read, never mutated.
+    """Convert a minibatch of images from float32 to uint8 with configurable dynamic range.
+    Can be used as an output transformation for Network.run().
+    """
+    images = tf.cast(images, tf.float32)
+    if shrink > 1:
+        ksize = [1, 1, shrink, shrink]  # Window over the last two axes; the pooling below is hard-wired to NCHW input.
+        images = tf.nn.avg_pool(images, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW")
+    if nchw_to_nhwc:
+        images = tf.transpose(images, [0, 2, 3, 1])  # Runs after the pooling, which needs NCHW.
+    scale = 255 / (drange[1] - drange[0])
+    images = images * scale + (0.5 - drange[0] * scale)  # Maps drange[0] => 0.5 and drange[1] => 255.5; the 0.5 presumably gives round-to-nearest on the cast.
+    return tf.saturate_cast(images, tf.uint8)  # Out-of-range values are clamped rather than wrapped.
diff --git a/dnnlib/util.py b/dnnlib/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2bf7a73d546895ac6eb73d9c56db2a04b096f3e
--- /dev/null
+++ b/dnnlib/util.py
@@ -0,0 +1,479 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Miscellaneous utility classes and functions."""
+
+import ctypes
+import fnmatch
+import importlib
+import inspect
+import numpy as np
+import os
+import shutil
+import sys
+import types
+import io
+import pickle
+import re
+import requests
+import html
+import hashlib
+import glob
+import tempfile
+import urllib
+import urllib.request
+import uuid
+
+from distutils.util import strtobool
+from typing import Any, List, Tuple, Union
+
+
+# Util classes
+# ------------------------------------------------------------------------------------------
+
+
+class EasyDict(dict):
+    """A dict whose items can also be read, assigned and deleted using attribute syntax (d.key)."""
+    def __getattr__(self, name: str) -> Any:
+        # Reached only when ordinary attribute lookup fails; a missing key must surface as AttributeError.
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(name)
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        self.__setitem__(name, value)
+
+    def __delattr__(self, name: str) -> None:
+        self.__delitem__(name)
+
+
+class Logger(object):
+    """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file."""
+
+    def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True):
+        self.file = None
+        # Mirror output into a file only when a file name is given.
+        if file_name is not None:
+            self.file = open(file_name, file_mode)
+
+        self.should_flush = should_flush
+        self.stdout = sys.stdout
+        self.stderr = sys.stderr
+        # Install this object as both streams; the originals saved above are put back by close().
+        sys.stdout = self
+        sys.stderr = self
+
+    def __enter__(self) -> "Logger":
+        return self
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        self.close()
+
+    def write(self, text: Union[str, bytes]) -> None:
+        """Write text to stdout (and a file) and optionally flush."""
+        if isinstance(text, bytes):
+            text = text.decode()
+        if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
+            return
+
+        if self.file is not None:
+            self.file.write(text)
+        # Everything, including text written via sys.stderr, is forwarded to the saved stdout.
+        self.stdout.write(text)
+
+        if self.should_flush:
+            self.flush()
+
+    def flush(self) -> None:
+        """Flush written text to both stdout and a file, if open."""
+        if self.file is not None:
+            self.file.flush()
+
+        self.stdout.flush()
+
+    def close(self) -> None:
+        """Flush, close possible files, and remove stdout/stderr mirroring."""
+        self.flush()
+
+        # if using multiple loggers, prevent closing in wrong order: only restore a stream this logger still owns
+        if sys.stdout is self:
+            sys.stdout = self.stdout
+        if sys.stderr is self:
+            sys.stderr = self.stderr
+
+        if self.file is not None:
+            self.file.close()
+            self.file = None  # later write()/flush()/close() calls then skip the file
+
+
+# Cache directories
+# ------------------------------------------------------------------------------------------
+
+_dnnlib_cache_dir = None  # explicit override set via set_cache_dir(); checked first by make_cache_dir_path()
+
+def set_cache_dir(path: str) -> None:
+    global _dnnlib_cache_dir  # rebind the module-level override rather than creating a local
+    _dnnlib_cache_dir = path
+
+def make_cache_dir_path(*paths: str) -> str:
+    """Join `paths` onto the cache root: set_cache_dir() override, else $DNNLIB_CACHE_DIR, else <home>/.cache/dnnlib, else temp dir."""
+    if _dnnlib_cache_dir is not None:
+        return os.path.join(_dnnlib_cache_dir, *paths)
+    env = os.environ
+    if 'DNNLIB_CACHE_DIR' in env:
+        return os.path.join(env['DNNLIB_CACHE_DIR'], *paths)
+    home_roots = [env[key] for key in ('HOME', 'USERPROFILE') if key in env]  # HOME wins over USERPROFILE
+    root = home_roots[0] if home_roots else tempfile.gettempdir()
+    return os.path.join(root, '.cache', 'dnnlib', *paths)
+
+# Small util functions
+# ------------------------------------------------------------------------------------------
+
+
+def format_time(seconds: Union[int, float]) -> str:
+    """Format a duration in seconds as a compact string such as "42s", "3m 07s", "2h 00m 05s" or "1d 03h 25m"."""
+    total = int(np.rint(seconds))
+    minutes, secs = divmod(total, 60)
+    hours, mins = divmod(minutes, 60)
+    if total < 60:
+        return "{0}s".format(total)
+    if total < 60 * 60:
+        return "{0}m {1:02}s".format(minutes, secs)
+    if total < 24 * 60 * 60:
+        return "{0}h {1:02}m {2:02}s".format(hours, mins, secs)
+    return "{0}d {1:02}h {2:02}m".format(hours // 24, hours % 24, mins)
+
+
+def ask_yes_no(question: str) -> bool:
+    """Prompt with `question` until the reply is a recognized yes/no word; return True for yes and False for no."""
+    yes, no = ("y", "yes", "t", "true", "on", "1"), ("n", "no", "f", "false", "off", "0")  # the words distutils' strtobool accepts
+    while True:
+        print("{0} [y/n]".format(question))
+        answer = input().lower()
+        if answer in yes or answer in no:
+            return answer in yes  # a real bool as annotated; strtobool returned the ints 1/0
+
+
+def tuple_product(t: Tuple) -> Any:
+    """Return the product of all elements of `t`, accumulated left to right; an empty input yields 1."""
+    # Start from the multiplicative identity so that the empty product is well defined.
+    product = 1
+    for factor in t:
+        product *= factor
+
+    return product
+
+
+_str_to_ctype = {  # type name -> ctypes type; byte sizes are cross-checked against NumPy in get_dtype_and_ctype()
+    "uint8": ctypes.c_ubyte,
+    "uint16": ctypes.c_uint16,
+    "uint32": ctypes.c_uint32,
+    "uint64": ctypes.c_uint64,
+    "int8": ctypes.c_byte,
+    "int16": ctypes.c_int16,
+    "int32": ctypes.c_int32,
+    "int64": ctypes.c_int64,
+    "float32": ctypes.c_float,
+    "float64": ctypes.c_double
+}
+
+
+def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
+    """Map a type name, or an object exposing one via `__name__` or `name`, to the NumPy dtype and ctypes type of that name.
+
+    Raises RuntimeError when no name can be derived; unknown names and byte-size mismatches trip assertions."""
+    if isinstance(type_obj, str):
+        type_str = type_obj
+    else:
+        # `__name__` takes precedence over `name`, matching the order in which they are probed.
+        for attr in ("__name__", "name"):
+            if hasattr(type_obj, attr):
+                type_str = getattr(type_obj, attr)
+                break
+        else:
+            raise RuntimeError("Cannot infer type name from input")
+
+    assert type_str in _str_to_ctype
+    my_ctype = _str_to_ctype[type_str]
+    my_dtype = np.dtype(type_str)
+    assert my_dtype.itemsize == ctypes.sizeof(my_ctype)
+
+    return my_dtype, my_ctype
+
+
+def is_pickleable(obj: Any) -> bool:
+    """Return True if `obj` can be serialized with pickle.dump(), False if the attempt raises an ordinary exception."""
+    try:
+        pickle.dump(obj, io.BytesIO())
+    except Exception:  # not a bare except: Ctrl-C / SystemExit must not be reported as "not pickleable"
+        return False
+    return True
+
+
+# Functionality to import modules/objects by name, and call functions by name
+# ------------------------------------------------------------------------------------------
+
+def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
+    """Find the importable module behind a dotted object name, trying the longest module prefix first.
+    Returns the module and the object name (original name with module part removed)."""
+
+    # allow the shorthands "np." and "tf."; the dot is escaped so that e.g. "npyscreen.Form" is left untouched
+    obj_name = re.sub(r"^np\.", "numpy.", obj_name)
+    obj_name = re.sub(r"^tf\.", "tensorflow.", obj_name)
+
+    # list alternatives for (module_name, local_obj_name)
+    parts = obj_name.split(".")
+    name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]
+
+    # try each alternative in turn
+    for module_name, local_obj_name in name_pairs:
+        try:
+            module = importlib.import_module(module_name) # may raise ImportError
+            get_obj_from_module(module, local_obj_name) # may raise AttributeError
+            return module, local_obj_name
+        except Exception: # keep probing after any ordinary failure, but let Ctrl-C / SystemExit through
+            pass
+
+    # maybe some of the modules themselves contain errors? re-raise anything but a plain "module not found"
+    for module_name, _local_obj_name in name_pairs:
+        try:
+            importlib.import_module(module_name) # may raise ImportError
+        except ImportError:
+            if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"):
+                raise
+
+    # maybe the requested attribute is missing? (the AttributeError is deliberately not caught here)
+    for module_name, local_obj_name in name_pairs:
+        try:
+            module = importlib.import_module(module_name) # may raise ImportError
+            get_obj_from_module(module, local_obj_name) # may raise AttributeError
+        except ImportError:
+            pass
+
+    # we are out of luck, but we have no idea why
+    raise ImportError(obj_name)
+
+
+def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
+    """Resolve the dotted path `obj_name` by successive getattr() calls starting at `module`; '' returns `module` itself."""
+    target = module
+    # An empty path has no components; ''.split(".") would yield [''] and trigger getattr(module, '').
+    attr_chain = obj_name.split(".") if obj_name != '' else []
+    for attr in attr_chain:
+        target = getattr(target, attr)
+    return target
+
+
+def get_obj_by_name(name: str) -> Any:
+    """Resolve a dotted name to the Python object it refers to, importing the containing module as needed."""
+    # get_module_from_obj_name() returns (module, remaining attribute path) -- exactly the arguments needed here.
+    return get_obj_from_module(*get_module_from_obj_name(name))
+
+
+def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
+    """Look up the callable named by the keyword-only `func_name` and invoke it with the remaining arguments."""
+    assert func_name is not None
+    # `func_name` is a dotted path, e.g. 'training.dataset.ImageFolderDataset'.
+    target = get_obj_by_name(func_name)
+    assert callable(target)
+    return target(*args, **kwargs)
+
+
+def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any:
+    """Construct the class named by the keyword-only `class_name` with the given arguments; forwards to call_func_by_name()."""
+    return call_func_by_name(*args, func_name=class_name, **kwargs)
+
+
+def get_module_dir_by_obj_name(obj_name: str) -> str:
+    """Return the directory of the file that defines the module which `obj_name` resolves into."""
+    owning_module = get_module_from_obj_name(obj_name)[0]
+    return os.path.dirname(inspect.getfile(owning_module))
+
+
+def is_top_level_function(obj: Any) -> bool:
+    """Return True if `obj` is callable and its `__name__` is bound in the namespace of the module named by `obj.__module__`."""
+    return callable(obj) and obj.__name__ in vars(sys.modules[obj.__module__])
+
+
+def get_top_level_function_name(obj: Any) -> str:
+    """Return "<module>.<name>" for a top-level function; for '__main__' the script's file name without extension is used."""
+    assert is_top_level_function(obj)
+    module_name = obj.__module__
+    if module_name == '__main__':
+        module_name = os.path.splitext(os.path.basename(sys.modules['__main__'].__file__))[0]
+    return "{0}.{1}".format(module_name, obj.__name__)
+
+
+# File system helpers
+# ------------------------------------------------------------------------------------------
+
+def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
+    """Walk `dir_path` and list every file whose name matches none of the fnmatch patterns in `ignores`.
+
+    Directories whose name matches a pattern are not descended into. Each result is a pair (path, relative path):
+    the first is the file path joined onto `dir_path` as given, the second is relative to `dir_path`, with the last
+    component of `dir_path` prepended when `add_base_to_relative` is set."""
+    assert os.path.isdir(dir_path)
+    base_name = os.path.basename(os.path.normpath(dir_path))
+    patterns = [] if ignores is None else ignores
+
+    def is_ignored(name: str) -> bool:
+        return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)
+
+    pairs = []
+
+    for root, dirs, files in os.walk(dir_path, topdown=True):
+        # Prune via slice assignment: os.walk only skips directories removed from this very list object.
+        dirs[:] = [d for d in dirs if not is_ignored(d)]
+
+        for file_name in files:
+            if is_ignored(file_name):
+                continue
+
+            absolute_path = os.path.join(root, file_name)
+            relative_path = os.path.relpath(absolute_path, dir_path)
+            if add_base_to_relative:
+                relative_path = os.path.join(base_name, relative_path)
+
+            pairs.append((absolute_path, relative_path))
+
+    return pairs
+
+
+def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
+    """Copy each (src, dst) pair in `files` with shutil.copyfile(), creating missing destination directories.
+
+    A `dst` without a directory component (e.g. "out.txt") is written relative to the current working directory."""
+    for pair in files:
+        src, dst = pair[0], pair[1]
+        target_dir_name = os.path.dirname(dst)
+        # dirname() is '' for a bare file name and os.makedirs('') raises; exist_ok tolerates concurrent creation
+        if target_dir_name:
+            os.makedirs(target_dir_name, exist_ok=True)
+        shutil.copyfile(src, dst)
+
+
+# URL helpers
+# ------------------------------------------------------------------------------------------
+
+def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
+    """Return True if `obj` is a string that parses as a URL with a scheme and a dotted network location
+    (or, with `allow_file_urls`, any string starting with 'file://')."""
+    if not (isinstance(obj, str) and "://" in obj):
+        return False
+    if allow_file_urls and obj.startswith('file://'):
+        return True
+    try:
+        # Validate the URL as given and again after joining it with "/".
+        for candidate in (obj, requests.compat.urljoin(obj, "/")):
+            res = requests.compat.urlparse(candidate)
+            if not (res.scheme and res.netloc and "." in res.netloc):
+                return False
+    except:
+        return False
+    return True
+
+
+def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any:
+    """Return a binary-mode file object (or, with return_filename, a local path) for a URL, file:// URL or plain local file name, downloading and caching as needed."""
+    assert num_attempts >= 1
+    assert not (return_filename and (not cache))
+
+    # Doesn't look like an URL scheme so interpret it as a local filename.
+    if not re.match('^[a-z]+://', url):
+        return url if return_filename else open(url, "rb")
+
+    # Handle file URLs.  This code handles unusual file:// patterns that
+    # arise on Windows:
+    #
+    # file:///c:/foo.txt
+    #
+    # which would translate to a local '/c:/foo.txt' filename that's
+    # invalid.  Drop the forward slash for such pathnames.
+    #
+    # If you touch this code path, you should test it on both Linux and
+    # Windows.
+    #
+    # Some internet resources suggest using urllib.request.url2pathname(),
+    # but that converts forward slashes to backslashes and this causes
+    # its own set of problems.
+    if url.startswith('file://'):
+        filename = urllib.parse.urlparse(url).path
+        if re.match(r'^/[a-zA-Z]:', filename):
+            filename = filename[1:]
+        return filename if return_filename else open(filename, "rb")
+
+    assert is_url(url)
+
+    # Lookup from cache.
+    if cache_dir is None:
+        cache_dir = make_cache_dir_path('downloads')
+
+    url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
+    if cache:
+        cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*"))
+        if len(cache_files) == 1:
+            filename = cache_files[0]
+            return filename if return_filename else open(filename, "rb")
+
+    # Download.
+    url_name = None
+    url_data = None
+    with requests.Session() as session:
+        if verbose:
+            print("Downloading %s ..." % url, end="", flush=True)
+        for attempts_left in reversed(range(num_attempts)):
+            try:
+                with session.get(url) as res:
+                    res.raise_for_status()
+                    if len(res.content) == 0:
+                        raise IOError("No data received")
+
+                    if len(res.content) < 8192:
+                        content_str = res.content.decode("utf-8")
+                        if "download_warning" in res.headers.get("Set-Cookie", ""):
+                            links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
+                            if len(links) == 1:
+                                url = requests.compat.urljoin(url, links[0])
+                                raise IOError("Google Drive virus checker nag")
+                        if "Google Drive - Quota exceeded" in content_str:
+                            raise IOError("Google Drive download quota exceeded -- please try again later")
+
+                    match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
+                    url_name = match[1] if match else url
+                    url_data = res.content
+                    if verbose:
+                        print(" done")
+                    break
+            except KeyboardInterrupt:
+                raise
+            except:
+                if not attempts_left:
+                    if verbose:
+                        print(" failed")
+                    raise
+                if verbose:
+                    print(".", end="", flush=True)
+
+    # Save to cache.
+    if cache:
+        safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name)[:128]  # capped: url_name may be the whole URL, too long for a file name
+        cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name)
+        temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name)
+        os.makedirs(cache_dir, exist_ok=True)
+        with open(temp_file, "wb") as f:
+            f.write(url_data)
+        os.replace(temp_file, cache_file) # atomic
+        if return_filename:
+            return cache_file
+
+    # Return data as file object.
+    assert not return_filename
+    return io.BytesIO(url_data)
diff --git a/losses/color_transfer_loss.py b/losses/color_transfer_loss.py
deleted file mode 100644
index febfb5db954078c0839c93a3dd11a86451839c8c..0000000000000000000000000000000000000000
--- a/losses/color_transfer_loss.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from typing import List, Optional
-
-import torch
-from torch import nn
-from torch.nn.functional import (
-    smooth_l1_loss,
-)
-
-
-def flatten_CHW(im: torch.Tensor) -> torch.Tensor:
-    """
-    (B, C, H, W) -> (B, -1)
-    """
-    B = im.shape[0]
-    return im.reshape(B, -1)
-
-
-def stddev(x: torch.Tensor) -> torch.Tensor:
-    """
-    x: (B, -1), assume with mean normalized
-    Retuens:
-        stddev: (B)
-    """
-    return torch.sqrt(torch.mean(x * x, dim=-1))
-
-
-def gram_matrix(input_):
-    B, C = input_.shape[:2]
-    features = input_.view(B, C, -1)
-    N = features.shape[-1]
-    G = torch.bmm(features, features.transpose(1, 2))  # C x C
-    return G.div(C * N)
-
-
-class ColorTransferLoss(nn.Module):
-    """Penalize the gram matrix difference between StyleGAN2's ToRGB outputs"""
-    def __init__(
-        self,
-        init_rgbs,
-        scale_rgb: bool = False
-    ):
-        super().__init__()
-
-        with torch.no_grad():
-            init_feats = [x.detach() for x in init_rgbs]
-            self.stds = [stddev(flatten_CHW(rgb)) if scale_rgb else 1 for rgb in init_feats]  # (B, 1, 1, 1) or scalar
-            self.grams = [gram_matrix(rgb / std) for rgb, std in zip(init_feats, self.stds)]
-
-    def forward(self, rgbs: List[torch.Tensor], level: int = None):
-        if level is None:
-            level = len(self.grams)
-
-        feats = rgbs
-        loss = 0
-        for i, (rgb, std) in enumerate(zip(feats[:level], self.stds[:level])):
-            G = gram_matrix(rgb / std)
-            loss = loss + smooth_l1_loss(G, self.grams[i])
-
-        return loss
-
diff --git a/losses/joint_loss.py b/losses/joint_loss.py
deleted file mode 100644
index 3c18a6c5205415643f3c380750b0666e9278fdb4..0000000000000000000000000000000000000000
--- a/losses/joint_loss.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-from typing import (
-    Dict,
-    Iterable,
-    Optional,
-    Tuple,
-)
-
-import numpy as np
-import torch
-from torch import nn
-
-from utils.misc import (
-    optional_string,
-    iterable_to_str,
-)
-
-from .contextual_loss import ContextualLoss
-from .color_transfer_loss import ColorTransferLoss
-from .regularize_noise import NoiseRegularizer
-from .reconstruction import (
-    EyeLoss,
-    FaceLoss,
-    create_perceptual_loss,
-    ReconstructionArguments,
-)
-
-class LossArguments:
-    @staticmethod
-    def add_arguments(parser: ArgumentParser):
-        ReconstructionArguments.add_arguments(parser)
-
-        parser.add_argument("--color_transfer", type=float, default=1e10, help="color transfer loss weight")
-        parser.add_argument("--eye", type=float, default=0.1, help="eye loss weight")
-        parser.add_argument('--noise_regularize', type=float, default=5e4)
-        # contextual loss
-        parser.add_argument("--contextual", type=float, default=0.1, help="contextual loss weight")
-        parser.add_argument("--cx_layers", nargs='*', help="contextual loss layers",
-                            choices=['relu1_2', 'relu2_2', 'relu3_4', 'relu4_4', 'relu5_4'],
-                            default=['relu3_4', 'relu2_2', 'relu1_2'])
-
-    @staticmethod
-    def to_string(args: Namespace) -> str:
-        return (
-            ReconstructionArguments.to_string(args)
-            + optional_string(args.eye > 0, f"-eye{args.eye}")
-            + optional_string(args.color_transfer, f"-color{args.color_transfer:.1e}")
-            + optional_string(
-                args.contextual,
-                f"-cx{args.contextual}({iterable_to_str(args.cx_layers)})"
-            )
-            #+ optional_string(args.mse, f"-mse{args.mse}")
-            + optional_string(args.noise_regularize, f"-NR{args.noise_regularize:.1e}")
-        )
-
-
-class BakedMultiContextualLoss(nn.Module):
-    """Random sample different image patches for different vgg layers."""
-    def __init__(self, sibling: torch.Tensor, args: Namespace, size: int = 256):
-        super().__init__()
-
-        self.cxs = nn.ModuleList([ContextualLoss(use_vgg=True, vgg_layers=[layer])
-            for layer in args.cx_layers])
-        self.size = size
-        self.sibling = sibling.detach()
-
-    def forward(self, img: torch.Tensor):
-        cx_loss = 0
-        for cx in self.cxs:
-            h, w = np.random.randint(0, high=img.shape[-1] - self.size, size=2)
-            cx_loss = cx(self.sibling[..., h:h+self.size, w:w+self.size], img[..., h:h+self.size, w:w+self.size]) + cx_loss
-        return cx_loss
-
-
-class BakedContextualLoss(ContextualLoss):
-    def __init__(self, sibling: torch.Tensor, args: Namespace, size: int = 256):
-        super().__init__(use_vgg=True, vgg_layers=args.cx_layers)
-        self.size = size
-        self.sibling = sibling.detach()
-
-    def forward(self, img: torch.Tensor):
-        h, w = np.random.randint(0, high=img.shape[-1] - self.size, size=2)
-        return super().forward(self.sibling[..., h:h+self.size, w:w+self.size], img[..., h:h+self.size, w:w+self.size])
-
-
-class JointLoss(nn.Module):
-    def __init__(
-            self,
-            args: Namespace,
-            target: torch.Tensor,
-            sibling: Optional[torch.Tensor],
-            sibling_rgbs: Optional[Iterable[torch.Tensor]] = None,
-    ):
-        super().__init__()
-
-        self.weights = {
-            "face": 1., "eye": args.eye,
-            "contextual": args.contextual, "color_transfer": args.color_transfer,
-            "noise": args.noise_regularize,
-        }
-
-        reconstruction = {}
-        if args.vgg > 0 or args.vggface > 0:
-            percept = create_perceptual_loss(args)
-            reconstruction.update(
-                {"face": FaceLoss(target, input_size=args.generator_size, size=args.recon_size, percept=percept)}
-            )
-            if args.eye > 0:
-                reconstruction.update(
-                    {"eye": EyeLoss(target, input_size=args.generator_size, percept=percept)}
-                )
-        self.reconstruction = nn.ModuleDict(reconstruction)
-
-        exemplar = {}
-        if args.contextual > 0 and len(args.cx_layers) > 0:
-            assert sibling is not None
-            exemplar.update(
-                {"contextual": BakedContextualLoss(sibling, args)}
-            )
-        if args.color_transfer > 0:
-            assert sibling_rgbs is not None
-            self.sibling_rgbs = sibling_rgbs
-            exemplar.update(
-                {"color_transfer": ColorTransferLoss(init_rgbs=sibling_rgbs)}
-            )
-        self.exemplar = nn.ModuleDict(exemplar)
-
-        if args.noise_regularize > 0:
-            self.noise_criterion = NoiseRegularizer()
-
-    def forward(
-            self, img, degrade=None, noises=None, rgbs=None, rgb_level: Optional[int] = None
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            rgbs: results from the ToRGB layers
-        """
-        # TODO: add current optimization resolution for noises
-
-        losses = {}
-
-        # reconstruction losses
-        for name, criterion in self.reconstruction.items():
-            losses[name] = criterion(img, degrade=degrade)
-
-        # exemplar losses
-        if 'contextual' in self.exemplar:
-            losses["contextual"] = self.exemplar["contextual"](img)
-        if "color_transfer" in self.exemplar:
-            assert rgbs is not None
-            losses["color_transfer"] = self.exemplar["color_transfer"](rgbs, level=rgb_level)
-
-        # noise regularizer
-        if self.weights["noise"] > 0:
-            losses["noise"] = self.noise_criterion(noises)
-
-        total_loss = 0
-        for name, loss in losses.items():
-            total_loss = total_loss + self.weights[name] * loss
-        return total_loss, losses
-
-    def update_sibling(self, sibling: torch.Tensor):
-        assert "contextual" in self.exemplar
-        self.exemplar["contextual"].sibling = sibling.detach()
diff --git a/losses/perceptual_loss.py b/losses/perceptual_loss.py
deleted file mode 100644
index 742b2b07f7ec65a4252146e55d0ddbbd10061917..0000000000000000000000000000000000000000
--- a/losses/perceptual_loss.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-Code borrowed from https://gist.github.com/alper111/8233cdb0414b4cb5853f2f730ab95a49#file-vgg_perceptual_loss-py-L5
-"""
-import torch
-import torchvision
-from models.vggface import VGGFaceFeats
-
-
-def cos_loss(fi, ft):
-    return 1 - torch.nn.functional.cosine_similarity(fi, ft).mean()
-
-
-class VGGPerceptualLoss(torch.nn.Module):
-    def __init__(self, resize=False):
-        super(VGGPerceptualLoss, self).__init__()
-        blocks = []
-        blocks.append(torchvision.models.vgg16(pretrained=True).features[:4].eval())
-        blocks.append(torchvision.models.vgg16(pretrained=True).features[4:9].eval())
-        blocks.append(torchvision.models.vgg16(pretrained=True).features[9:16].eval())
-        blocks.append(torchvision.models.vgg16(pretrained=True).features[16:23].eval())
-        for bl in blocks:
-            for p in bl:
-                p.requires_grad = False
-        self.blocks = torch.nn.ModuleList(blocks)
-        self.transform = torch.nn.functional.interpolate
-        self.mean = torch.nn.Parameter(torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1))
-        self.std = torch.nn.Parameter(torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1))
-        self.resize = resize
-
-    def forward(self, input, target, max_layer=4, cos_dist: bool = False):
-        target = (target + 1) * 0.5
-        input = (input + 1) * 0.5
-
-        if input.shape[1] != 3:
-            input = input.repeat(1, 3, 1, 1)
-            target = target.repeat(1, 3, 1, 1)
-        input = (input-self.mean) / self.std
-        target = (target-self.mean) / self.std
-        if self.resize:
-            input = self.transform(input, mode='bilinear', size=(224, 224), align_corners=False)
-            target = self.transform(target, mode='bilinear', size=(224, 224), align_corners=False)
-        x = input
-        y = target
-        loss = 0.0
-        loss_func = cos_loss if cos_dist else torch.nn.functional.l1_loss
-        for bi, block in enumerate(self.blocks[:max_layer]):
-            x = block(x)
-            y = block(y)
-            loss += loss_func(x, y.detach())
-        return loss
-
-
-class VGGFacePerceptualLoss(torch.nn.Module):
-    def __init__(self, weight_path: str = "checkpoint/vgg_face_dag.pt", resize: bool = False):
-        super().__init__()
-        self.vgg = VGGFaceFeats()
-        self.vgg.load_state_dict(torch.load(weight_path))
-
-        mean = torch.tensor(self.vgg.meta["mean"]).view(1, 3, 1, 1) / 255.0
-        self.register_buffer("mean", mean)
-
-        self.transform = torch.nn.functional.interpolate
-        self.resize = resize
-
-    def forward(self, input, target, max_layer: int = 4, cos_dist: bool = False):
-        target = (target + 1) * 0.5
-        input = (input + 1) * 0.5
-
-        # preprocessing
-        if input.shape[1] != 3:
-            input = input.repeat(1, 3, 1, 1)
-            target = target.repeat(1, 3, 1, 1)
-        input = input - self.mean
-        target = target - self.mean
-        if self.resize:
-            input = self.transform(input, mode='bilinear', size=(224, 224), align_corners=False)
-            target = self.transform(target, mode='bilinear', size=(224, 224), align_corners=False)
-
-        input_feats = self.vgg(input)
-        target_feats = self.vgg(target)
-
-        loss_func = cos_loss if cos_dist else torch.nn.functional.l1_loss
-        # calc perceptual loss
-        loss = 0.0
-        for fi, ft in zip(input_feats[:max_layer], target_feats[:max_layer]):
-            loss = loss + loss_func(fi, ft.detach())
-        return loss
-
-
-class PerceptualLoss(torch.nn.Module):
-    def __init__(
-            self, lambda_vggface: float = 0.025 / 0.15, lambda_vgg: float = 1,  eps: float = 1e-8, cos_dist: bool = False
-    ):
-        super().__init__()
-        self.register_buffer("lambda_vggface", torch.tensor(lambda_vggface))
-        self.register_buffer("lambda_vgg", torch.tensor(lambda_vgg))
-        self.cos_dist = cos_dist
-
-        if lambda_vgg > eps:
-            self.vgg = VGGPerceptualLoss()
-        if lambda_vggface > eps:
-            self.vggface = VGGFacePerceptualLoss()
-
-    def forward(self, input, target, eps=1e-8, use_vggface: bool = True, use_vgg=True, max_vgg_layer=4):
-        loss = 0.0
-        if self.lambda_vgg > eps and use_vgg:
-            loss = loss + self.lambda_vgg * self.vgg(input, target, max_layer=max_vgg_layer)
-        if self.lambda_vggface > eps and use_vggface:
-            loss = loss + self.lambda_vggface * self.vggface(input, target, cos_dist=self.cos_dist)
-        return loss
-
diff --git a/losses/reconstruction.py b/losses/reconstruction.py
deleted file mode 100644
index 4338f095cc9f579afa952782250230a3db48325e..0000000000000000000000000000000000000000
--- a/losses/reconstruction.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-from typing import Optional
-
-import numpy as np
-import torch
-from torch import nn
-
-from losses.perceptual_loss import PerceptualLoss
-from models.degrade import Downsample
-from utils.misc import optional_string
-
-
-class ReconstructionArguments:
-    @staticmethod
-    def add_arguments(parser: ArgumentParser):
-        parser.add_argument("--vggface", type=float, default=0.3, help="vggface")
-        parser.add_argument("--vgg", type=float, default=1, help="vgg")
-        parser.add_argument('--recon_size', type=int, default=256, help="size for face reconstruction loss")
-
-    @staticmethod
-    def to_string(args: Namespace) -> str:
-        return (
-            f"s{args.recon_size}"
-            + optional_string(args.vgg > 0, f"-vgg{args.vgg}")
-            + optional_string(args.vggface > 0, f"-vggface{args.vggface}")
-        )
-
-
-def create_perceptual_loss(args: Namespace):
-    return PerceptualLoss(lambda_vgg=args.vgg, lambda_vggface=args.vggface, cos_dist=False)
-
-
-class EyeLoss(nn.Module):
-    def __init__(
-            self,
-            target: torch.Tensor,
-            input_size: int = 1024,
-            input_channels: int = 3,
-            percept: Optional[nn.Module] = None,
-            args: Optional[Namespace] = None
-    ):
-        """
-        target: target image
-        """
-        assert not (percept is None and args is None)
-
-        super().__init__()
-
-        self.target = target
-
-        target_size = target.shape[-1]
-        self.downsample = Downsample(input_size, target_size, input_channels) \
-                if target_size != input_size else (lambda x: x)
-
-        self.percept = percept if percept is not None else create_perceptual_loss(args)
-
-        eye_size = np.array((224, 224))
-        btlrs = []
-        for sgn in [1, -1]:
-            center = np.array((480, 384 * sgn))   # (y, x)
-            b, t = center[0] - eye_size[0] // 2, center[0] + eye_size[0] // 2
-            l, r = center[1] - eye_size[1] // 2, center[1] + eye_size[1] // 2
-            btlrs.append((np.array((b, t, l, r)) / 1024 * target_size).astype(int))
-        self.btlrs = np.stack(btlrs, axis=0)
-
-    def forward(self, img: torch.Tensor, degrade: nn.Module = None):
-        """
-        img: it should be the degraded version of the generated image
-        """
-        if degrade is not None:
-            img = degrade(img, downsample=self.downsample)
-
-        loss = 0
-        for (b, t, l, r) in self.btlrs:
-            loss = loss + self.percept(
-                img[:, :, b:t, l:r], self.target[:, :, b:t, l:r],
-                use_vggface=False, max_vgg_layer=4,
-                # use_vgg=False,
-            )
-        return loss
-
-
-class FaceLoss(nn.Module):
-    def __init__(
-            self,
-            target: torch.Tensor,
-            input_size: int = 1024,
-            input_channels: int = 3,
-            size: int = 256,
-            percept: Optional[nn.Module] = None,
-            args: Optional[Namespace] = None
-    ):
-        """
-        target: target image
-        """
-        assert not (percept is None and args is None)
-
-        super().__init__()
-
-        target_size = target.shape[-1]
-        self.target = target if target_size == size \
-                else Downsample(target_size, size, target.shape[1]).to(target.device)(target)
-
-        self.downsample = Downsample(input_size, size, input_channels) \
-                if size != input_size else (lambda x: x)
-
-        self.percept = percept if percept is not None else create_perceptual_loss(args)
-
-    def forward(self, img: torch.Tensor, degrade: nn.Module = None):
-        """
-        img: it should be the degraded version of the generated image
-        """
-        if degrade is not None:
-            img = degrade(img, downsample=self.downsample)
-        loss = self.percept(img, self.target)
-        return loss
diff --git a/losses/regularize_noise.py b/losses/regularize_noise.py
deleted file mode 100644
index b02e442de388479c9592c1f3eafc0e108376d8c6..0000000000000000000000000000000000000000
--- a/losses/regularize_noise.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Iterable
-
-import torch
-from torch import nn
-
-
-class NoiseRegularizer(nn.Module):
-    def forward(self, noises: Iterable[torch.Tensor]):
-        loss = 0
-
-        for noise in noises:
-            size = noise.shape[2]
-
-            while True:
-                loss = (
-                    loss
-                    + (noise * torch.roll(noise, shifts=1, dims=3)).mean().pow(2)
-                    + (noise * torch.roll(noise, shifts=1, dims=2)).mean().pow(2)
-                )
-
-                if size <= 8:
-                    break
-
-                noise = noise.reshape([1, 1, size // 2, 2, size // 2, 2])
-                noise = noise.mean([3, 5])
-                size //= 2
-
-        return loss
-
-    @staticmethod
-    def normalize(noises: Iterable[torch.Tensor]):
-        for noise in noises:
-            mean = noise.mean()
-            std = noise.std()
-
-            noise.data.add_(-mean).div_(std)
-
diff --git a/models/__init__.py b/models/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/models/degrade.py b/models/degrade.py
deleted file mode 100644
index 573f211ca6f05c3822bd2ed415af102303febe89..0000000000000000000000000000000000000000
--- a/models/degrade.py
+++ /dev/null
@@ -1,122 +0,0 @@
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-from utils.misc import optional_string
-
-from .gaussian_smoothing import GaussianSmoothing
-
-
-class DegradeArguments:
-    @staticmethod
-    def add_arguments(parser: ArgumentParser):
-        parser.add_argument('--spectral_sensitivity', choices=["g", "b", "gb"], default="g",
-            help="Type of spectral sensitivity. g: grayscale (panchromatic), b: blue-sensitive, gb: green+blue (orthochromatic)")
-        parser.add_argument('--gaussian', type=float, default=0,
-            help="estimated blur radius in pixels of the input photo if it is scaled to 1024x1024")
-
-    @staticmethod
-    def to_string(args: Namespace) -> str:
-        return (
-            f"{args.spectral_sensitivity}"
-            + optional_string(args.gaussian > 0, f"-G{args.gaussian}")
-        )
-
-
-class CameraResponse(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-        self.register_parameter("gamma", nn.Parameter(torch.ones(1)))
-        self.register_parameter("offset", nn.Parameter(torch.zeros(1)))
-        self.register_parameter("gain", nn.Parameter(torch.ones(1)))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = torch.clamp(x, max=1, min=-1+1e-2)
-        x = (1 + x) * 0.5
-        x = self.offset + self.gain * torch.pow(x, self.gamma)
-        x = (x - 0.5) * 2
-        # b = torch.clamp(b, max=1, min=-1)
-        return x
-
-
-class SpectralResponse(nn.Module):
-    # TODO: use enum instead for color mode
-    def __init__(self, spectral_sensitivity: str = 'b'):
-        assert spectral_sensitivity in ("g", "b", "gb"), f"spectral_sensitivity {spectral_sensitivity} is not implemented."
-
-        super().__init__()
-
-        self.spectral_sensitivity = spectral_sensitivity
-
-        if self.spectral_sensitivity == "g":
-            self.register_buffer("to_gray", torch.tensor([0.299, 0.587, 0.114]).reshape(1, -1, 1, 1))
-
-    def forward(self, rgb: torch.Tensor) -> torch.Tensor:
-        if self.spectral_sensitivity == "b":
-            x = rgb[:, -1:]
-        elif self.spectral_sensitivity == "gb":
-            x = (rgb[:, 1:2] + rgb[:, -1:]) * 0.5
-        else:
-            assert self.spectral_sensitivity == "g"
-            x = (rgb * self.to_gray).sum(dim=1, keepdim=True)
-        return x
-
-
-class Downsample(nn.Module):
-    """Antialiasing downsampling"""
-    def __init__(self, input_size: int, output_size: int, channels: int):
-        super().__init__()
-        if input_size % output_size == 0:
-            self.stride = input_size // output_size
-            self.grid = None
-        else:
-            self.stride = 1
-            step = input_size / output_size
-            x = torch.arange(output_size) * step
-            Y, X = torch.meshgrid(x, x)
-            grid = torch.stack((X, Y), dim=-1)
-            grid /= torch.Tensor((input_size - 1, input_size - 1)).view(1, 1, -1)
-            grid = grid * 2 - 1
-            self.register_buffer("grid", grid)
-        sigma = 0.5 * input_size / output_size
-        #print(f"{input_size} -> {output_size}: sigma={sigma}")
-        self.blur = GaussianSmoothing(channels, int(2 * (sigma * 2) + 1 + 0.5), sigma)
-
-    def forward(self, im: torch.Tensor):
-        out = self.blur(im, stride=self.stride)
-        if self.grid is not None:
-            out = F.grid_sample(out, self.grid[None].expand(im.shape[0], -1, -1, -1))
-        return out
-
-
-
-class Degrade(nn.Module):
-    """
-    Simulate the degradation of antique film
-    """
-    def __init__(self, args:Namespace):
-        super().__init__()
-        self.srf = SpectralResponse(args.spectral_sensitivity)
-        self.crf = CameraResponse()
-        self.gaussian = None
-        if args.gaussian is not None and args.gaussian > 0:
-            self.gaussian = GaussianSmoothing(3, 2 * int(args.gaussian * 2 + 0.5) + 1, args.gaussian)
-
-    def forward(self, img: torch.Tensor, downsample: nn.Module = None):
-        if self.gaussian is not None:
-            img = self.gaussian(img)
-        if downsample is not None:
-            img = downsample(img)
-        img = self.srf(img)
-        img = self.crf(img)
-        # Note that I changed it back to 3 channels
-        return img.repeat((1, 3, 1, 1)) if img.shape[1] == 1 else img
-
-
-
diff --git a/models/encoder.py b/models/encoder.py
deleted file mode 100644
index 5a9516d5066150d6521a8c812fe710c4c583c589..0000000000000000000000000000000000000000
--- a/models/encoder.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from argparse import Namespace, ArgumentParser
-from functools import partial
-
-from torch import nn
-
-from .resnet import ResNetBasicBlock, activation_func, norm_module, Conv2dAuto
-
-
-def add_arguments(parser: ArgumentParser) -> ArgumentParser:
-    parser.add_argument("--latent_size", type=int, default=512, help="latent size")
-    return parser
-
-
-def create_model(args) -> nn.Module:
-    in_channels = 3 if "rgb" in args and args.rgb else 1
-    return Encoder(in_channels, args.encoder_size, latent_size=args.latent_size)
-
-
-class Flatten(nn.Module):
-    def forward(self, input_):
-        return input_.view(input_.size(0), -1)
-
-
-class Encoder(nn.Module):
-    def __init__(
-            self, in_channels: int, size: int, latent_size: int = 512,
-            activation: str = 'leaky_relu', norm: str = "instance"
-    ):
-        super().__init__()
-
-        out_channels0 = 64
-        norm_m = norm_module(norm)
-        self.conv0 = nn.Sequential(
-            Conv2dAuto(in_channels, out_channels0, kernel_size=5),
-            norm_m(out_channels0),
-            activation_func(activation),
-        )
-
-        pool_kernel = 2
-        self.pool = nn.AvgPool2d(pool_kernel)
-
-        num_channels = [128, 256, 512, 512]
-        # FIXME: this is a hack
-        if size >= 256:
-            num_channels.append(512)
-
-        residual = partial(ResNetBasicBlock, activation=activation, norm=norm, bias=True)
-        residual_blocks = nn.ModuleList()
-        for in_channel, out_channel in zip([out_channels0] + num_channels[:-1], num_channels):
-            residual_blocks.append(residual(in_channel, out_channel))
-            residual_blocks.append(nn.AvgPool2d(pool_kernel))
-        self.residual_blocks = nn.Sequential(*residual_blocks)
-
-        self.last = nn.Sequential(
-            nn.ReLU(),
-            nn.AvgPool2d(4),    # TODO: not sure whehter this would cause problem
-            Flatten(),
-            nn.Linear(num_channels[-1], latent_size, bias=True)
-        )
-
-    def forward(self, input_):
-        out = self.conv0(input_)
-        out = self.pool(out)
-        out = self.residual_blocks(out)
-        out = self.last(out)
-        return out
diff --git a/models/gaussian_smoothing.py b/models/gaussian_smoothing.py
deleted file mode 100644
index f7803dad0d8c34bc93fc9e80b3b9fea200bf0c78..0000000000000000000000000000000000000000
--- a/models/gaussian_smoothing.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import math
-import numbers
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-
-class GaussianSmoothing(nn.Module):
-    """
-    Apply gaussian smoothing on a
-    1d, 2d or 3d tensor. Filtering is performed seperately for each channel
-    in the input using a depthwise convolution.
-    Arguments:
-        channels (int, sequence): Number of channels of the input tensors. Output will
-            have this number of channels as well.
-        kernel_size (int, sequence): Size of the gaussian kernel.
-        sigma (float, sequence): Standard deviation of the gaussian kernel.
-        dim (int, optional): The number of dimensions of the data.
-            Default value is 2 (spatial).
-    """
-    def __init__(self, channels, kernel_size, sigma, dim=2):
-        super(GaussianSmoothing, self).__init__()
-        if isinstance(kernel_size, numbers.Number):
-            kernel_size = [kernel_size] * dim
-        if isinstance(sigma, numbers.Number):
-            sigma = [sigma] * dim
-
-        # The gaussian kernel is the product of the
-        # gaussian function of each dimension.
-        kernel = 1
-        meshgrids = torch.meshgrid(
-            [
-                torch.arange(size, dtype=torch.float32)
-                for size in kernel_size
-            ]
-        )
-        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
-            mean = (size - 1) / 2
-            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \
-                      torch.exp(-((mgrid - mean) / (2 * std)) ** 2)
-
-        # Make sure sum of values in gaussian kernel equals 1.
-        kernel = kernel / torch.sum(kernel)
-
-        # Reshape to depthwise convolutional weight
-        kernel = kernel.view(1, 1, *kernel.size())
-        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
-
-        self.register_buffer('weight', kernel)
-        self.groups = channels
-
-        if dim == 1:
-            self.conv = F.conv1d
-        elif dim == 2:
-            self.conv = F.conv2d
-        elif dim == 3:
-            self.conv = F.conv3d
-        else:
-            raise RuntimeError(
-                'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)
-            )
-
-    def forward(self, input, stride: int = 1):
-        """
-        Apply gaussian filter to input.
-        Arguments:
-            input (torch.Tensor): Input to apply gaussian filter on.
-            stride for applying conv
-        Returns:
-            filtered (torch.Tensor): Filtered output.
-        """
-        padding = (self.weight.shape[-1] - 1) // 2
-        return self.conv(input, weight=self.weight, groups=self.groups, padding=padding, stride=stride)
-
diff --git a/models/resnet.py b/models/resnet.py
deleted file mode 100644
index c1cefd7a01e44f18f35e0176df50fb50a4f29b5e..0000000000000000000000000000000000000000
--- a/models/resnet.py
+++ /dev/null
@@ -1,99 +0,0 @@
-from functools import partial
-
-from torch import nn
-
-
-def activation_func(activation: str):
-    return  nn.ModuleDict([
-        ['relu', nn.ReLU(inplace=True)],
-        ['leaky_relu', nn.LeakyReLU(negative_slope=0.01, inplace=True)],
-        ['selu', nn.SELU(inplace=True)],
-        ['none', nn.Identity()]
-    ])[activation]
-
-
-def norm_module(norm: str):
-    return {
-        'batch': nn.BatchNorm2d,
-        'instance': nn.InstanceNorm2d,
-    }[norm]
-
-
-class Conv2dAuto(nn.Conv2d):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # dynamic add padding based on the kernel_size
-        self.padding = (self.kernel_size[0] // 2, self.kernel_size[1] // 2)
-
-
-conv3x3 = partial(Conv2dAuto, kernel_size=3)
-
-
-class ResidualBlock(nn.Module):
-    def __init__(self, in_channels: int, out_channels: int, activation: str = 'relu'):
-        super().__init__()
-        self.in_channels, self.out_channels = in_channels, out_channels
-        self.blocks = nn.Identity()
-        self.activate = activation_func(activation)
-        self.shortcut = nn.Identity()
-
-    def forward(self, x):
-        residual = x
-        if self.should_apply_shortcut:
-            residual = self.shortcut(x)
-        x = self.blocks(x)
-        x += residual
-        x = self.activate(x)
-        return x
-
-    @property
-    def should_apply_shortcut(self):
-        return self.in_channels != self.out_channels
-
-
-class ResNetResidualBlock(ResidualBlock):
-    def __init__(
-            self, in_channels: int, out_channels: int,
-            expansion: int = 1, downsampling: int = 1,
-            conv=conv3x3, norm: str = 'batch', *args, **kwargs
-    ):
-        super().__init__(in_channels, out_channels, *args, **kwargs)
-        self.expansion, self.downsampling = expansion, downsampling
-        self.conv, self.norm = conv, norm_module(norm)
-        self.shortcut = nn.Sequential(
-            nn.Conv2d(self.in_channels, self.expanded_channels, kernel_size=1,
-                      stride=self.downsampling, bias=False),
-            self.norm(self.expanded_channels)) if self.should_apply_shortcut else None
-
-    @property
-    def expanded_channels(self):
-        return self.out_channels * self.expansion
-
-    @property
-    def should_apply_shortcut(self):
-        return self.in_channels != self.expanded_channels
-
-
-def conv_norm(in_channels: int, out_channels: int, conv, norm, *args, **kwargs):
-    return nn.Sequential(conv(in_channels, out_channels, *args, **kwargs), norm(out_channels))
-
-
-class ResNetBasicBlock(ResNetResidualBlock):
-    """
-    Basic ResNet block composed by two layers of 3x3conv/batchnorm/activation
-    """
-    expansion = 1
-
-    def __init__(
-            self, in_channels: int, out_channels: int, bias: bool = False, *args, **kwargs
-    ):
-        super().__init__(in_channels, out_channels, *args, **kwargs)
-        self.blocks = nn.Sequential(
-            conv_norm(
-                self.in_channels, self.out_channels, conv=self.conv, norm=self.norm,
-                bias=bias, stride=self.downsampling
-            ),
-            self.activate,
-            conv_norm(self.out_channels, self.expanded_channels, conv=self.conv, norm=self.norm, bias=bias),
-        )
-
diff --git a/models/vggface.py b/models/vggface.py
deleted file mode 100644
index 0a822079e3a67ae3292e8c5c413abe0d33999561..0000000000000000000000000000000000000000
--- a/models/vggface.py
+++ /dev/null
@@ -1,150 +0,0 @@
-
-import torch
-import torch.nn as nn
-
-
-class Vgg_face_dag(nn.Module):
-
-    def __init__(self):
-        super(Vgg_face_dag, self).__init__()
-        self.meta = {'mean': [129.186279296875, 104.76238250732422, 93.59396362304688],
-                     'std': [1, 1, 1],
-                     'imageSize': [224, 224, 3]}
-        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu1_1 = nn.ReLU(inplace=True)
-        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu1_2 = nn.ReLU(inplace=True)
-        self.pool1 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False)
-        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu2_1 = nn.ReLU(inplace=True)
-        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu2_2 = nn.ReLU(inplace=True)
-        self.pool2 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False)
-        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu3_1 = nn.ReLU(inplace=True)
-        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu3_2 = nn.ReLU(inplace=True)
-        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu3_3 = nn.ReLU(inplace=True)
-        self.pool3 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False)
-        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu4_1 = nn.ReLU(inplace=True)
-        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu4_2 = nn.ReLU(inplace=True)
-        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu4_3 = nn.ReLU(inplace=True)
-        self.pool4 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False)
-        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu5_1 = nn.ReLU(inplace=True)
-        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu5_2 = nn.ReLU(inplace=True)
-        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
-        self.relu5_3 = nn.ReLU(inplace=True)
-        self.pool5 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False)
-        self.fc6 = nn.Linear(in_features=25088, out_features=4096, bias=True)
-        self.relu6 = nn.ReLU(inplace=True)
-        self.dropout6 = nn.Dropout(p=0.5)
-        self.fc7 = nn.Linear(in_features=4096, out_features=4096, bias=True)
-        self.relu7 = nn.ReLU(inplace=True)
-        self.dropout7 = nn.Dropout(p=0.5)
-        self.fc8 = nn.Linear(in_features=4096, out_features=2622, bias=True)
-
-    def forward(self, x0):
-        x1 = self.conv1_1(x0)
-        x2 = self.relu1_1(x1)
-        x3 = self.conv1_2(x2)
-        x4 = self.relu1_2(x3)
-        x5 = self.pool1(x4)
-        x6 = self.conv2_1(x5)
-        x7 = self.relu2_1(x6)
-        x8 = self.conv2_2(x7)
-        x9 = self.relu2_2(x8)
-        x10 = self.pool2(x9)
-        x11 = self.conv3_1(x10)
-        x12 = self.relu3_1(x11)
-        x13 = self.conv3_2(x12)
-        x14 = self.relu3_2(x13)
-        x15 = self.conv3_3(x14)
-        x16 = self.relu3_3(x15)
-        x17 = self.pool3(x16)
-        x18 = self.conv4_1(x17)
-        x19 = self.relu4_1(x18)
-        x20 = self.conv4_2(x19)
-        x21 = self.relu4_2(x20)
-        x22 = self.conv4_3(x21)
-        x23 = self.relu4_3(x22)
-        x24 = self.pool4(x23)
-        x25 = self.conv5_1(x24)
-        x26 = self.relu5_1(x25)
-        x27 = self.conv5_2(x26)
-        x28 = self.relu5_2(x27)
-        x29 = self.conv5_3(x28)
-        x30 = self.relu5_3(x29)
-        x31_preflatten = self.pool5(x30)
-        x31 = x31_preflatten.view(x31_preflatten.size(0), -1)
-        x32 = self.fc6(x31)
-        x33 = self.relu6(x32)
-        x34 = self.dropout6(x33)
-        x35 = self.fc7(x34)
-        x36 = self.relu7(x35)
-        x37 = self.dropout7(x36)
-        x38 = self.fc8(x37)
-        return x38
-
-
-def vgg_face_dag(weights_path=None, **kwargs):
-    """
-    load imported model instance
-
-    Args:
-        weights_path (str): If set, loads model weights from the given path
-    """
-    model = Vgg_face_dag()
-    if weights_path:
-        state_dict = torch.load(weights_path)
-        model.load_state_dict(state_dict)
-    return model
-
-
-class VGGFaceFeats(Vgg_face_dag):
-    def forward(self, x0):
-        x1 = self.conv1_1(x0)
-        x2 = self.relu1_1(x1)
-        x3 = self.conv1_2(x2)
-        x4 = self.relu1_2(x3)
-        x5 = self.pool1(x4)
-        x6 = self.conv2_1(x5)
-        x7 = self.relu2_1(x6)
-        x8 = self.conv2_2(x7)
-        x9 = self.relu2_2(x8)
-        x10 = self.pool2(x9)
-        x11 = self.conv3_1(x10)
-        x12 = self.relu3_1(x11)
-        x13 = self.conv3_2(x12)
-        x14 = self.relu3_2(x13)
-        x15 = self.conv3_3(x14)
-        x16 = self.relu3_3(x15)
-        x17 = self.pool3(x16)
-        x18 = self.conv4_1(x17)
-        x19 = self.relu4_1(x18)
-        x20 = self.conv4_2(x19)
-        x21 = self.relu4_2(x20)
-        x22 = self.conv4_3(x21)
-        x23 = self.relu4_3(x22)
-        x24 = self.pool4(x23)
-        x25 = self.conv5_1(x24)
-        # x26 = self.relu5_1(x25)
-        # x27 = self.conv5_2(x26)
-        # x28 = self.relu5_2(x27)
-        # x29 = self.conv5_3(x28)
-        # x30 = self.relu5_3(x29)
-        # x31_preflatten = self.pool5(x30)
-        # x31 = x31_preflatten.view(x31_preflatten.size(0), -1)
-        # x32 = self.fc6(x31)
-        # x33 = self.relu6(x32)
-        # x34 = self.dropout6(x33)
-        # x35 = self.fc7(x34)
-        # x36 = self.relu7(x35)
-        # x37 = self.dropout7(x36)
-        # x38 = self.fc8(x37)
-        return x1, x6, x11, x18, x25
diff --git a/op/upfirdn2d_kernel.cu b/op/upfirdn2d_kernel.cu
deleted file mode 100644
index 2a710aa6adc3d43ac93136a1814e3c39970e1c7e..0000000000000000000000000000000000000000
--- a/op/upfirdn2d_kernel.cu
+++ /dev/null
@@ -1,272 +0,0 @@
-// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
-//
-// This work is made available under the Nvidia Source Code License-NC.
-// To view a copy of this license, visit
-// https://nvlabs.github.io/stylegan2/license.html
-
-#include <torch/types.h>
-
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAApplyUtils.cuh>
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-
-static __host__ __device__ __forceinline__ int floor_div(int a, int b) {
-    int c = a / b;
-
-    if (c * b > a) {
-        c--;
-    }
-
-    return c;
-}
-
-
-struct UpFirDn2DKernelParams {
-    int up_x;
-    int up_y;
-    int down_x;
-    int down_y;
-    int pad_x0;
-    int pad_x1;
-    int pad_y0;
-    int pad_y1;
-
-    int major_dim;
-    int in_h;
-    int in_w;
-    int minor_dim;
-    int kernel_h;
-    int kernel_w;
-    int out_h;
-    int out_w;
-    int loop_major;
-    int loop_x;
-};
-
-
-template <typename scalar_t, int up_x, int up_y, int down_x, int down_y, int kernel_h, int kernel_w, int tile_out_h, int tile_out_w>
-__global__ void upfirdn2d_kernel(scalar_t* out, const scalar_t* input, const scalar_t* kernel, const UpFirDn2DKernelParams p) {
-    const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1;
-    const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1;
-
-    __shared__ volatile float sk[kernel_h][kernel_w];
-    __shared__ volatile float sx[tile_in_h][tile_in_w];
-
-    int minor_idx = blockIdx.x;
-    int tile_out_y = minor_idx / p.minor_dim;
-    minor_idx -= tile_out_y * p.minor_dim;
-    tile_out_y *= tile_out_h;
-    int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w;
-    int major_idx_base = blockIdx.z * p.loop_major;
-
-    if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h | major_idx_base >= p.major_dim) {
-        return;
-    }
-
-    for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w; tap_idx += blockDim.x) {
-        int ky = tap_idx / kernel_w;
-        int kx = tap_idx - ky * kernel_w;
-        scalar_t v = 0.0;
-
-        if (kx < p.kernel_w & ky < p.kernel_h) {
-            v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)];
-        }
-
-        sk[ky][kx] = v;
-    }
-
-    for (int loop_major = 0, major_idx = major_idx_base; loop_major < p.loop_major & major_idx < p.major_dim; loop_major++, major_idx++) {
-        for (int loop_x = 0, tile_out_x = tile_out_x_base; loop_x < p.loop_x & tile_out_x < p.out_w; loop_x++, tile_out_x += tile_out_w) {
-            int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0;
-            int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0;
-            int tile_in_x = floor_div(tile_mid_x, up_x);
-            int tile_in_y = floor_div(tile_mid_y, up_y);
-
-            __syncthreads();
-
-            for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w; in_idx += blockDim.x) {
-                int rel_in_y = in_idx / tile_in_w;
-                int rel_in_x = in_idx - rel_in_y * tile_in_w;
-                int in_x = rel_in_x + tile_in_x;
-                int in_y = rel_in_y + tile_in_y;
-
-                scalar_t v = 0.0;
-
-                if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) {
-                    v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim + minor_idx];
-                }
-
-                sx[rel_in_y][rel_in_x] = v;
-            }
-
-            __syncthreads();
-            for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w; out_idx += blockDim.x) {
-                int rel_out_y = out_idx / tile_out_w;
-                int rel_out_x = out_idx - rel_out_y * tile_out_w;
-                int out_x = rel_out_x + tile_out_x;
-                int out_y = rel_out_y + tile_out_y;
-
-                int mid_x = tile_mid_x + rel_out_x * down_x;
-                int mid_y = tile_mid_y + rel_out_y * down_y;
-                int in_x = floor_div(mid_x, up_x);
-                int in_y = floor_div(mid_y, up_y);
-                int rel_in_x = in_x - tile_in_x;
-                int rel_in_y = in_y - tile_in_y;
-                int kernel_x = (in_x + 1) * up_x - mid_x - 1;
-                int kernel_y = (in_y + 1) * up_y - mid_y - 1;
-
-                scalar_t v = 0.0;
-
-                #pragma unroll
-                for (int y = 0; y < kernel_h / up_y; y++)
-                    #pragma unroll
-                    for (int x = 0; x < kernel_w / up_x; x++)
-                        v += sx[rel_in_y + y][rel_in_x + x] * sk[kernel_y + y * up_y][kernel_x + x * up_x];
-
-                if (out_x < p.out_w & out_y < p.out_h) {
-                    out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + minor_idx] = v;
-                }
-            }
-        }
-    }
-}
-
-
-torch::Tensor upfirdn2d_op(const torch::Tensor& input, const torch::Tensor& kernel,
-    int up_x, int up_y, int down_x, int down_y,
-    int pad_x0, int pad_x1, int pad_y0, int pad_y1) {
-    int curDevice = -1;
-    cudaGetDevice(&curDevice);
-    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
-
-    UpFirDn2DKernelParams p;
-
-    auto x = input.contiguous();
-    auto k = kernel.contiguous();
-
-    p.major_dim = x.size(0);
-    p.in_h = x.size(1);
-    p.in_w = x.size(2);
-    p.minor_dim = x.size(3);
-    p.kernel_h = k.size(0);
-    p.kernel_w = k.size(1);
-    p.up_x = up_x;
-    p.up_y = up_y;
-    p.down_x = down_x;
-    p.down_y = down_y;
-    p.pad_x0 = pad_x0;
-    p.pad_x1 = pad_x1;
-    p.pad_y0 = pad_y0;
-    p.pad_y1 = pad_y1;
-
-    p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) / p.down_y;
-    p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) / p.down_x;
-
-    auto out = at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options());
-
-    int mode = -1;
-
-    int tile_out_h;
-    int tile_out_w;
-
-    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && p.kernel_h <= 4 && p.kernel_w <= 4) {
-        mode = 1;
-        tile_out_h = 16;
-        tile_out_w = 64;
-    }
-
-    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && p.kernel_h <= 3 && p.kernel_w <= 3) {
-        mode = 2;
-        tile_out_h = 16;
-        tile_out_w = 64;
-    }
-
-    if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && p.kernel_h <= 4 && p.kernel_w <= 4) {
-        mode = 3;
-        tile_out_h = 16;
-        tile_out_w = 64;
-    }
-
-    if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && p.kernel_h <= 2 && p.kernel_w <= 2) {
-        mode = 4;
-        tile_out_h = 16;
-        tile_out_w = 64;
-    }
-
-    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && p.kernel_h <= 4 && p.kernel_w <= 4) {
-        mode = 5;
-        tile_out_h = 8;
-        tile_out_w = 32;
-    }
-
-    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && p.kernel_h <= 2 && p.kernel_w <= 2) {
-        mode = 6;
-        tile_out_h = 8;
-        tile_out_w = 32;
-    }
-
-    dim3 block_size;
-    dim3 grid_size;
-
-    if (tile_out_h > 0 && tile_out_w) {
-        p.loop_major = (p.major_dim - 1) / 16384 + 1;
-        p.loop_x = 1;
-        block_size = dim3(32 * 8, 1, 1);
-        grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim,
-                         (p.out_w - 1) / (p.loop_x * tile_out_w) + 1,
-                         (p.major_dim - 1) / p.loop_major + 1);
-    }
-
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
-        switch (mode) {
-        case 1:
-            upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 4, 4, 16, 64><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-
-        case 2:
-            upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 3, 3, 16, 64><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-
-        case 3:
-            upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 4, 4, 16, 64><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-
-        case 4:
-            upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 2, 2, 16, 64><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-
-        case 5:
-            upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 4, 4, 8, 32><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-
-        case 6:
-            upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 4, 4, 8, 32><<<grid_size, block_size, 0, stream>>>(
-                out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), p
-            );
-
-            break;
-        }
-    });
-
-    return out;
-}
\ No newline at end of file
diff --git a/optim/__init__.py b/optim/__init__.py
deleted file mode 100644
index 7dc5f600cff942786dcbd7eeee84ddc1920a81df..0000000000000000000000000000000000000000
--- a/optim/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from torch.optim import Adam
-from torch.optim.lbfgs import LBFGS
-from .radam import RAdam
-
-
-OPTIMIZER_MAP = {
-    "adam": Adam,
-    "radam": RAdam,
-    "lbfgs": LBFGS,
-}
-
-
-def get_optimizer_class(optimizer_name):
-    name = optimizer_name.lower()
-    return OPTIMIZER_MAP[name]
diff --git a/optim/radam.py b/optim/radam.py
deleted file mode 100644
index 35e797d231f6dc16e286ae7999a61132293b0d36..0000000000000000000000000000000000000000
--- a/optim/radam.py
+++ /dev/null
@@ -1,250 +0,0 @@
-import math
-import torch
-from torch.optim.optimizer import Optimizer, required
-
-
-class RAdam(Optimizer):
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-
-        self.degenerated_to_sgd = degenerated_to_sgd
-        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
-            for param in params:
-                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
-                    param['buffer'] = [[None, None, None] for _ in range(10)]
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
-                        buffer=[[None, None, None] for _ in range(10)])
-        super(RAdam, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(RAdam, self).__setstate__(state)
-
-    def step(self, closure=None):
-
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data.float()
-                if grad.is_sparse:
-                    raise RuntimeError('RAdam does not support sparse gradients')
-
-                p_data_fp32 = p.data.float()
-
-                state = self.state[p]
-
-                if len(state) == 0:
-                    state['step'] = 0
-                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
-                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
-                else:
-                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
-                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-
-                state['step'] += 1
-                buffered = group['buffer'][int(state['step'] % 10)]
-                if state['step'] == buffered[0]:
-                    N_sma, step_size = buffered[1], buffered[2]
-                else:
-                    buffered[0] = state['step']
-                    beta2_t = beta2 ** state['step']
-                    N_sma_max = 2 / (1 - beta2) - 1
-                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
-                    buffered[1] = N_sma
-
-                    # more conservative since it's an approximated value
-                    if N_sma >= 5:
-                        step_size = math.sqrt(
-                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
-                                        N_sma_max - 2)) / (1 - beta1 ** state['step'])
-                    elif self.degenerated_to_sgd:
-                        step_size = 1.0 / (1 - beta1 ** state['step'])
-                    else:
-                        step_size = -1
-                    buffered[2] = step_size
-
-                # more conservative since it's an approximated value
-                if N_sma >= 5:
-                    if group['weight_decay'] != 0:
-                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
-                    denom = exp_avg_sq.sqrt().add_(group['eps'])
-                    p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
-                    p.data.copy_(p_data_fp32)
-                elif step_size > 0:
-                    if group['weight_decay'] != 0:
-                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
-                    p_data_fp32.add_(-step_size * group['lr'], exp_avg)
-                    p.data.copy_(p_data_fp32)
-
-        return loss
-
-
-class PlainRAdam(Optimizer):
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-
-        self.degenerated_to_sgd = degenerated_to_sgd
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-
-        super(PlainRAdam, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(PlainRAdam, self).__setstate__(state)
-
-    def step(self, closure=None):
-
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data.float()
-                if grad.is_sparse:
-                    raise RuntimeError('RAdam does not support sparse gradients')
-
-                p_data_fp32 = p.data.float()
-
-                state = self.state[p]
-
-                if len(state) == 0:
-                    state['step'] = 0
-                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
-                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
-                else:
-                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
-                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-
-                state['step'] += 1
-                beta2_t = beta2 ** state['step']
-                N_sma_max = 2 / (1 - beta2) - 1
-                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
-
-                # more conservative since it's an approximated value
-                if N_sma >= 5:
-                    if group['weight_decay'] != 0:
-                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
-                    step_size = group['lr'] * math.sqrt(
-                        (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
-                                    N_sma_max - 2)) / (1 - beta1 ** state['step'])
-                    denom = exp_avg_sq.sqrt().add_(group['eps'])
-                    p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
-                    p.data.copy_(p_data_fp32)
-                elif self.degenerated_to_sgd:
-                    if group['weight_decay'] != 0:
-                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
-                    step_size = group['lr'] / (1 - beta1 ** state['step'])
-                    p_data_fp32.add_(-step_size, exp_avg)
-                    p.data.copy_(p_data_fp32)
-
-        return loss
-
-
-class AdamW(Optimizer):
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-
-        defaults = dict(lr=lr, betas=betas, eps=eps,
-                        weight_decay=weight_decay, warmup=warmup)
-        super(AdamW, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(AdamW, self).__setstate__(state)
-
-    def step(self, closure=None):
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data.float()
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-
-                p_data_fp32 = p.data.float()
-
-                state = self.state[p]
-
-                if len(state) == 0:
-                    state['step'] = 0
-                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
-                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
-                else:
-                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
-                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                state['step'] += 1
-
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-
-                denom = exp_avg_sq.sqrt().add_(group['eps'])
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-
-                if group['warmup'] > state['step']:
-                    scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup']
-                else:
-                    scheduled_lr = group['lr']
-
-                step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
-
-                if group['weight_decay'] != 0:
-                    p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32)
-
-                p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
-
-                p.data.copy_(p_data_fp32)
-
-        return loss
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d89276270612d4a258c8439142144673608de573..d5695a6521b7a1deb4966ed08472cc5f6f0504d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,25 +1,5 @@
-# Torch
-#--find-links https://download.pytorch.org/whl/torch_stable.html
-#torch==1.4.0+cu100
-#torchvision==0.11.2+cu100
-#torchaudio==0.10.1+cu100
-#setuptools==59.5.0
-
-Pillow
-ninja
-tqdm
-opencv-python
-scikit-image
-numpy
-
-tensorboard
-
-# for face alignment
-tensorflow
-#keras
-#bz2
-dlib
-scipy
-
-matplotlib
-pprintpp
+numpy==1.22.3
+Pillow==9.1.0
+scipy==1.8.0
+torch==1.11.0
+torchvision==0.12.0
\ No newline at end of file
diff --git a/scripts/download_checkpoints.sh b/scripts/download_checkpoints.sh
deleted file mode 100644
index d253eb9da7541f1e4483cc15f4ce44a646354f73..0000000000000000000000000000000000000000
--- a/scripts/download_checkpoints.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-set -exo
-
-mkdir -p checkpoint
-gdown https://drive.google.com/uc?id=1hWc2JLM58_PkwfLG23Q5IH3Ysj2Mo1nr -O checkpoint/e4e_ffhq_encode.pt
-gdown https://drive.google.com/uc?id=1hvAAql9Jo0wlmLBSHRIGrtXHcKQE-Whn -O checkpoint/stylegan2-ffhq-config-f.pt
-gdown https://drive.google.com/uc?id=1mbGWbjivZxMGxZqyyOHbE310aOkYe2BR -O checkpoint/vgg_face_dag.pt
-mkdir -p checkpoint/encoder
-gdown https://drive.google.com/uc?id=1ha4WXsaIpZfMHsqNLvqOPlUXsgh9VawU -O checkpoint/encoder/checkpoint_b.pt
-gdown https://drive.google.com/uc?id=1hfxDLujRIGU0G7pOdW9MMSBRzxZBmSKJ -O checkpoint/encoder/checkpoint_g.pt
-gdown https://drive.google.com/uc?id=1htekHopgxaW-MIjs6pYy7pyIK0v7Q0iS -O checkpoint/encoder/checkpoint_gb.pt
-
-pushd third_party/face_parsing
-./scripts/download_checkpoints.sh
-popd
diff --git a/scripts/install.sh b/scripts/install.sh
deleted file mode 100644
index 7f9d8f49eb0b5359766eff5fd83de6cddee90eeb..0000000000000000000000000000000000000000
--- a/scripts/install.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-# conda create -n stylegan python=3.7
-# conda activate stylegan
-conda install -c conda-forge/label/gcc7 opencv --yes
-conda install tensorflow-gpu=1.15 cudatoolkit=10.0 --yes
-conda install pytorch torchvision cudatoolkit=10.0 -c pytorch --yes
-pip install -r requirements.txt
diff --git a/scripts/run.sh b/scripts/run.sh
deleted file mode 100644
index 9edd891342c9722d12ac2d28329ef04188792c21..0000000000000000000000000000000000000000
--- a/scripts/run.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-set -x
-
-# Example command
-# ```
-# ./scripts/run.sh b  "dataset/Abraham Lincoln_01.png" 0.75
-# ```
-
-spectral_sensitivity="$1"
-path="$2"
-blur_radius="$3"
-
-
-list="$(dirname "${path}")"
-list="$(basename "${list}")"
-
-if [ "${spectral_sensitivity}" == "b" ]; then
-  FLAGS=(--spectral_sensitivity b --encoder_ckpt checkpoint/encoder/checkpoint_b.pt);
-elif [ "${spectral_sensitivity}" == "gb" ]; then
-  FLAGS=(--spectral_sensitivity "gb" --encoder_ckpt checkpoint/encoder/checkpoint_gb.pt);
-else
-  FLAGS=(--spectral_sensitivity "g" --encoder_ckpt checkpoint/encoder/checkpoint_g.pt);
-fi
-
-name="${path%.*}"
-name="${name##*/}"
-echo "${name}"
-
-# TODO: I did l2 or cos for contextual
-time python projector.py \
-    "${path}"  \
-    --gaussian "${blur_radius}" \
-    --log_dir "log/"  \
-    --results_dir "results/" \
-    "${FLAGS[@]}"
diff --git a/tools/__init__.py b/tools/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/tools/data/__init__.py b/tools/data/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/tools/data/align_images.py b/tools/data/align_images.py
deleted file mode 100644
index 0ea56d222cde6888002fe82f1f9b34312d4dd48f..0000000000000000000000000000000000000000
--- a/tools/data/align_images.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import argparse
-import json
-import os
-from os.path import join as pjoin
-import sys
-import bz2
-import numpy as np
-import cv2
-from tqdm import tqdm
-from tensorflow.keras.utils import get_file
-from utils.ffhq_dataset.face_alignment import image_align
-from utils.ffhq_dataset.landmarks_detector import LandmarksDetector
-
-LANDMARKS_MODEL_URL = 'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
-
-
-def unpack_bz2(src_path):
-    data = bz2.BZ2File(src_path).read()
-    dst_path = src_path[:-4]
-    with open(dst_path, 'wb') as fp:
-        fp.write(data)
-    return dst_path
-
-
-class SizePathMap(dict):
-    """{size: {aligned_face_path0, aligned_face_path1, ...}, ...}"""
-    def add_item(self, size, path):
-        if size not in self:
-            self[size] = set()
-        self[size].add(path)
-
-    def get_sizes(self):
-        sizes = []
-        for key, paths in self.items():
-            sizes.extend([key,]*len(paths))
-        return sizes
-
-    def serialize(self):
-        result = {}
-        for key, paths in self.items():
-            result[key] = list(paths)
-        return result
-
-
-def main(args):
-    landmarks_model_path = unpack_bz2(get_file('shape_predictor_68_face_landmarks.dat.bz2',
-                                               LANDMARKS_MODEL_URL, cache_subdir='temp'))
-
-    landmarks_detector = LandmarksDetector(landmarks_model_path)
-    face_sizes = SizePathMap()
-    raw_img_dir = args.raw_image_dir
-    img_names = [n for n in os.listdir(raw_img_dir) if os.path.isfile(pjoin(raw_img_dir, n))]
-    aligned_image_dir = args.aligned_image_dir
-    os.makedirs(aligned_image_dir, exist_ok=True)
-    pbar = tqdm(img_names)
-    for img_name in pbar:
-        pbar.set_description(img_name)
-        if os.path.splitext(img_name)[-1] == '.txt':
-            continue
-        raw_img_path = os.path.join(raw_img_dir, img_name)
-        try:
-            for i, face_landmarks in enumerate(landmarks_detector.get_landmarks(raw_img_path), start=1):
-                face_img_name = '%s_%02d.png' % (os.path.splitext(img_name)[0], i)
-                aligned_face_path = os.path.join(aligned_image_dir, face_img_name)
-
-                face_size = image_align(
-                    raw_img_path, aligned_face_path, face_landmarks, resize=args.resize
-                )
-                face_sizes.add_item(face_size, aligned_face_path)
-                pbar.set_description(f"{img_name}: {face_size}")
-
-                if args.draw:
-                    visual = LandmarksDetector.draw(cv2.imread(raw_img_path), face_landmarks)
-                    cv2.imwrite(
-                            pjoin(args.aligned_image_dir, os.path.splitext(face_img_name)[0] + "_landmarks.png"),
-                            visual
-                    )
-        except Exception as e:
-            print('[Error]', e, 'error happened when processing', raw_img_path)
-
-    print(args.raw_image_dir, ':')
-    sizes = face_sizes.get_sizes()
-    results = {
-            'mean_size': np.mean(sizes),
-            'num_faces_detected': len(sizes),
-            'num_images': len(img_names),
-            'sizes': sizes,
-            'size_path_dict': face_sizes.serialize(),
-        }
-    print('\t', results)
-    if args.out_stats is not None:
-        os.makedirs(os.path.dirname(args.out_stats), exist_ok=True)
-        with open(out_stats, 'w') as f:
-            json.dump(results, f)
-
-
-def parse_args(args=None, namespace=None):
-    parser = argparse.ArgumentParser(description="""
-        Extracts and aligns all faces from images using DLib and a function from original FFHQ dataset preparation step
-        python align_images.py /raw_images /aligned_images
-        """
-    )
-    parser.add_argument('raw_image_dir')
-    parser.add_argument('aligned_image_dir')
-    parser.add_argument('--resize',
-            help="True if want to resize to 1024",
-            action='store_true')
-    parser.add_argument('--draw',
-            help="True if want to visualize landmarks",
-            action='store_true')
-    parser.add_argument('--out_stats',
-            help="output_fn for statistics of faces", default=None)
-    return parser.parse_args(args=args, namespace=namespace)
-
-
-if __name__ == "__main__":
-    main(parse_args())
diff --git a/tools/initialize.py b/tools/initialize.py
deleted file mode 100644
index 855bd49aac15503896030830f9b081d15957ae03..0000000000000000000000000000000000000000
--- a/tools/initialize.py
+++ /dev/null
@@ -1,160 +0,0 @@
-from argparse import ArgumentParser, Namespace
-from typing import (
-    List,
-    Tuple,
-)
-
-import numpy as np
-from PIL import Image
-import torch
-from torch import nn
-import torch.nn.functional as F
-from torchvision.transforms import (
-    Compose,
-    Grayscale,
-    Resize,
-    ToTensor,
-)
-
-from models.encoder import Encoder
-from models.encoder4editing import (
-    get_latents as get_e4e_latents,
-    setup_model as setup_e4e_model,
-)
-from utils.misc import (
-    optional_string,
-    iterable_to_str,
-    stem,
-)
-
-
-
-class ColorEncoderArguments:
-    def __init__(self):
-        parser = ArgumentParser("Encode an image via a feed-forward encoder")
-
-        self.add_arguments(parser)
-
-        self.parser = parser
-
-    @staticmethod
-    def add_arguments(parser: ArgumentParser):
-        parser.add_argument("--encoder_ckpt", default=None,
-                            help="encoder checkpoint path. initialize w with encoder output if specified")
-        parser.add_argument("--encoder_size", type=int, default=256,
-                            help="Resize to this size to pass as input to the encoder")
-
-
-class InitializerArguments:
-    @classmethod
-    def add_arguments(cls, parser: ArgumentParser):
-        ColorEncoderArguments.add_arguments(parser)
-        cls.add_e4e_arguments(parser)
-        parser.add_argument("--mix_layer_range", default=[10, 18], type=int, nargs=2,
-                help="replace layers <start> to <end> in the e4e code by the color code")
-
-        parser.add_argument("--init_latent", default=None, help="path to init wp")
-
-    @staticmethod
-    def to_string(args: Namespace):
-        return (f"init{stem(args.init_latent).lstrip('0')[:10]}" if args.init_latent
-               else f"init({iterable_to_str(args.mix_layer_range)})")
-            #+ optional_string(args.init_noise > 0, f"-initN{args.init_noise}")
-
-    @staticmethod
-    def add_e4e_arguments(parser: ArgumentParser):
-        parser.add_argument("--e4e_ckpt", default='checkpoint/e4e_ffhq_encode.pt',
-                            help="e4e checkpoint path.")
-        parser.add_argument("--e4e_size", type=int, default=256,
-                            help="Resize to this size to pass as input to the e4e")
-
-
-
-def create_color_encoder(args: Namespace):
-    encoder = Encoder(1, args.encoder_size, 512)
-    ckpt = torch.load(args.encoder_ckpt)
-    encoder.load_state_dict(ckpt["model"])
-    return encoder
-
-
-def transform_input(img: Image):
-    tsfm = Compose([
-        Grayscale(),
-        Resize(args.encoder_size),
-        ToTensor(),
-    ])
-    return tsfm(img)
-
-
-def encode_color(imgs: torch.Tensor, args: Namespace) -> torch.Tensor:
-    assert args.encoder_size is not None
-
-    imgs = Resize(args.encoder_size)(imgs)
-
-    color_encoder = create_color_encoder(args).to(imgs.device)
-    color_encoder.eval()
-    with torch.no_grad():
-        latent = color_encoder(imgs)
-    return latent.detach()
-
-
-def resize(imgs: torch.Tensor, size: int) -> torch.Tensor:
-    return F.interpolate(imgs, size=size, mode='bilinear')
-
-
-class Initializer(nn.Module):
-    def __init__(self, args: Namespace):
-        super().__init__()
-
-        self.path = None
-        if args.init_latent is not None:
-            self.path = args.init_latent
-            return
-
-
-        assert args.encoder_size is not None
-        self.color_encoder = create_color_encoder(args)
-        self.color_encoder.eval()
-        self.color_encoder_size = args.encoder_size
-
-        self.e4e, e4e_opts = setup_e4e_model(args.e4e_ckpt)
-        assert 'cars_' not in e4e_opts.dataset_type
-        self.e4e.decoder.eval()
-        self.e4e.eval()
-        self.e4e_size = args.e4e_size
-
-        self.mix_layer_range = args.mix_layer_range
-
-    def encode_color(self, imgs: torch.Tensor) -> torch.Tensor:
-        """
-        Get the color W code
-        """
-        imgs = resize(imgs, self.color_encoder_size)
-
-        latent = self.color_encoder(imgs)
-
-        return latent
-
-    def encode_shape(self, imgs: torch.Tensor) -> torch.Tensor:
-        imgs = resize(imgs, self.e4e_size)
-        imgs = (imgs - 0.5) / 0.5
-        if imgs.shape[1] == 1: # 1 channel
-            imgs = imgs.repeat(1, 3, 1, 1)
-        return get_e4e_latents(self.e4e, imgs)
-
-    def load(self, device: torch.device):
-        latent_np = np.load(self.path)
-        return torch.tensor(latent_np, device=device)[None, ...]
-
-    def forward(self, imgs: torch.Tensor) -> torch.Tensor:
-        if self.path is not None:
-            return self.load(imgs.device)
-
-        shape_code = self.encode_shape(imgs)
-        color_code = self.encode_color(imgs)
-
-        # style mix
-        latent = shape_code
-        start, end = self.mix_layer_range
-        latent[:, start:end] = color_code
-        return latent
diff --git a/tools/match_histogram.py b/tools/match_histogram.py
deleted file mode 100644
index be55a7788363bc3b212b82864547592faa936b87..0000000000000000000000000000000000000000
--- a/tools/match_histogram.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-import os
-from os.path import join as pjoin
-from typing import Optional
-import sys
-
-import numpy as np
-import cv2
-from skimage import exposure
-
-
-# sys.path.append('Face_Detection')
-# from align_warp_back_multiple_dlib import match_histograms
-
-
-def calculate_cdf(histogram):
-    """
-    This method calculates the cumulative distribution function
-    :param array histogram: The values of the histogram
-    :return: normalized_cdf: The normalized cumulative distribution function
-    :rtype: array
-    """
-    # Get the cumulative sum of the elements
-    cdf = histogram.cumsum()
-
-    # Normalize the cdf
-    normalized_cdf = cdf / float(cdf.max())
-
-    return normalized_cdf
-
-
-def calculate_lookup(src_cdf, ref_cdf):
-    """
-    This method creates the lookup table
-    :param array src_cdf: The cdf for the source image
-    :param array ref_cdf: The cdf for the reference image
-    :return: lookup_table: The lookup table
-    :rtype: array
-    """
-    lookup_table = np.zeros(256)
-    lookup_val = 0
-    for src_pixel_val in range(len(src_cdf)):
-        lookup_val
-        for ref_pixel_val in range(len(ref_cdf)):
-            if ref_cdf[ref_pixel_val] >= src_cdf[src_pixel_val]:
-                lookup_val = ref_pixel_val
-                break
-        lookup_table[src_pixel_val] = lookup_val
-    return lookup_table
-
-
-def match_histograms(src_image, ref_image, src_mask=None, ref_mask=None):
-    """
-    This method matches the source image histogram to the
-    reference signal
-    :param image src_image: The original source image
-    :param image  ref_image: The reference image
-    :return: image_after_matching
-    :rtype: image (array)
-    """
-    # Split the images into the different color channels
-    # b means blue, g means green and r means red
-    src_b, src_g, src_r = cv2.split(src_image)
-    ref_b, ref_g, ref_r = cv2.split(ref_image)
-
-    def rv(im):
-        if ref_mask is None:
-            return im.flatten()
-        return im[ref_mask]
-
-    def sv(im):
-        if src_mask is None:
-            return im.flatten()
-        return im[src_mask]
-
-    # Compute the b, g, and r histograms separately
-    # The flatten() Numpy method returns a copy of the array c
-    # collapsed into one dimension.
-    src_hist_blue, bin_0 = np.histogram(sv(src_b), 256, [0, 256])
-    src_hist_green, bin_1 = np.histogram(sv(src_g), 256, [0, 256])
-    src_hist_red, bin_2 = np.histogram(sv(src_r), 256, [0, 256])
-    ref_hist_blue, bin_3 = np.histogram(rv(ref_b), 256, [0, 256])
-    ref_hist_green, bin_4 = np.histogram(rv(ref_g), 256, [0, 256])
-    ref_hist_red, bin_5 = np.histogram(rv(ref_r), 256, [0, 256])
-
-    # Compute the normalized cdf for the source and reference image
-    src_cdf_blue = calculate_cdf(src_hist_blue)
-    src_cdf_green = calculate_cdf(src_hist_green)
-    src_cdf_red = calculate_cdf(src_hist_red)
-    ref_cdf_blue = calculate_cdf(ref_hist_blue)
-    ref_cdf_green = calculate_cdf(ref_hist_green)
-    ref_cdf_red = calculate_cdf(ref_hist_red)
-
-    # Make a separate lookup table for each color
-    blue_lookup_table = calculate_lookup(src_cdf_blue, ref_cdf_blue)
-    green_lookup_table = calculate_lookup(src_cdf_green, ref_cdf_green)
-    red_lookup_table = calculate_lookup(src_cdf_red, ref_cdf_red)
-
-    # Use the lookup function to transform the colors of the original
-    # source image
-    blue_after_transform = cv2.LUT(src_b, blue_lookup_table)
-    green_after_transform = cv2.LUT(src_g, green_lookup_table)
-    red_after_transform = cv2.LUT(src_r, red_lookup_table)
-
-    # Put the image back together
-    image_after_matching = cv2.merge([blue_after_transform, green_after_transform, red_after_transform])
-    image_after_matching = cv2.convertScaleAbs(image_after_matching)
-
-    return image_after_matching
-
-
-def convert_to_BW(im, mode):
-    if mode == "b":
-        gray = im[..., 0]
-    elif mode == "gb":
-        gray = (im[..., 0].astype(float) + im[..., 1]) / 2.0
-    else:
-        gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-    gray = gray.astype(np.uint8)
-
-    return np.stack([gray] * 3, axis=-1)
-
-
-def parse_args(args=None, namespace: Optional[Namespace] = None):
-    parser = ArgumentParser('match histogram of src to ref')
-    parser.add_argument('src')
-    parser.add_argument('ref')
-    parser.add_argument('--out', default=None, help="converted src that matches ref")
-    parser.add_argument('--src_mask', default=None, help="mask on which to match the histogram")
-    parser.add_argument('--ref_mask', default=None, help="mask on which to match the histogram")
-    parser.add_argument('--spectral_sensitivity', choices=['b', 'gb', 'g'], help="match the histogram of corresponding sensitive channel(s)")
-    parser.add_argument('--crop', type=int, default=0, help="crop the boundary to match")
-    return parser.parse_args(args=args, namespace=namespace)
-
-
-def main(args):
-    A = cv2.imread(args.ref)
-    A = convert_to_BW(A, args.spectral_sensitivity)
-    B = cv2.imread(args.src, 0)
-    B = np.stack((B,) * 3, axis=-1)
-
-    mask_A = cv2.resize(cv2.imread(args.ref_mask, 0), A.shape[:2][::-1],
-                        interpolation=cv2.INTER_NEAREST) > 0 if args.ref_mask else None
-    mask_B = cv2.resize(cv2.imread(args.src_mask, 0), B.shape[:2][::-1],
-                        interpolation=cv2.INTER_NEAREST) > 0 if args.src_mask else None
-
-    if args.crop > 0:
-        c = args.crop
-        bc = int(c / A.shape[0] * B.shape[0] + 0.5)
-        A = A[c:-c, c:-c]
-        B = B[bc:-bc, bc:-bc]
-
-    B = match_histograms(B, A, src_mask=mask_B, ref_mask=mask_A)
-    # B = exposure.match_histograms(B, A, multichannel=True)
-
-    if args.out:
-        os.makedirs(os.path.dirname(args.out), exist_ok=True)
-        cv2.imwrite(args.out, B)
-
-    return B
-
-
-if __name__ == "__main__":
-    main(parse_args())
diff --git a/tools/match_skin_histogram.py b/tools/match_skin_histogram.py
deleted file mode 100644
index 6c35072eca46fdbb88ae87e66c30dd10f76d3257..0000000000000000000000000000000000000000
--- a/tools/match_skin_histogram.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from argparse import Namespace
-import os
-from os.path import join as pjoin
-from typing import Optional
-
-import cv2
-import torch
-
-from tools import (
-    parse_face,
-    match_histogram,
-)
-from utils.torch_helpers import make_image
-from utils.misc import stem
-
-
-def match_skin_histogram(
-        imgs: torch.Tensor,
-        sibling_img: torch.Tensor,
-        spectral_sensitivity,
-        im_sibling_dir: str,
-        mask_dir: str,
-        matched_hist_fn: Optional[str] = None,
-        normalize=None,  # normalize the range of the tensor
-):
-    """
-    Extract the skin of the input and sibling images. Create a new input image by matching
-    its histogram to the sibling.
-    """
-    # TODO: Currently only allows imgs of batch size 1
-    im_sibling_dir = os.path.abspath(im_sibling_dir)
-    mask_dir = os.path.abspath(mask_dir)
-
-    img_np = make_image(imgs)[0]
-    sibling_np = make_image(sibling_img)[0][...,::-1]
-
-    # save img, sibling
-    os.makedirs(im_sibling_dir, exist_ok=True)
-    im_name, sibling_name = 'input.png', 'sibling.png'
-    cv2.imwrite(pjoin(im_sibling_dir, im_name), img_np)
-    cv2.imwrite(pjoin(im_sibling_dir, sibling_name), sibling_np)
-
-    # face parsing
-    parse_face.main(
-        Namespace(in_dir=im_sibling_dir, out_dir=mask_dir, include_hair=False)
-    )
-
-    # match_histogram
-    mh_args = match_histogram.parse_args(
-        args=[
-            pjoin(im_sibling_dir, im_name),
-            pjoin(im_sibling_dir, sibling_name),
-        ],
-        namespace=Namespace(
-            out=matched_hist_fn if matched_hist_fn else pjoin(im_sibling_dir, "match_histogram.png"),
-            src_mask=pjoin(mask_dir, im_name),
-            ref_mask=pjoin(mask_dir, sibling_name),
-            spectral_sensitivity=spectral_sensitivity,
-        )
-    )
-    matched_np = match_histogram.main(mh_args) / 255.0  # [0, 1]
-    matched = torch.FloatTensor(matched_np).permute(2, 0, 1)[None,...]  #BCHW
-
-    if normalize is not None:
-        matched = normalize(matched)
-
-    return matched
diff --git a/tools/parse_face.py b/tools/parse_face.py
deleted file mode 100644
index e85142622219a8f8b4d48b9ace56585910fe4892..0000000000000000000000000000000000000000
--- a/tools/parse_face.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from argparse import ArgumentParser
-import os
-from os.path import join as pjoin
-from subprocess import run
-
-import numpy as np
-import cv2
-from tqdm import tqdm
-
-
-def create_skin_mask(anno_dir, mask_dir, skin_thresh=13, include_hair=False):
-    names = os.listdir(anno_dir)
-    names = [n for n in names if n.endswith('.png')]
-    os.makedirs(mask_dir, exist_ok=True)
-    for name in tqdm(names):
-        anno = cv2.imread(pjoin(anno_dir, name), 0)
-        mask = np.logical_and(0 < anno, anno <= skin_thresh)
-        if include_hair:
-            mask |= anno == 17
-        cv2.imwrite(pjoin(mask_dir, name), mask * 255)
-
-
-def main(args):
-    FACE_PARSING_DIR = 'third_party/face_parsing'
-
-    main_env = os.getcwd()
-    os.chdir(FACE_PARSING_DIR)
-    tmp_parse_dir = pjoin(args.out_dir, 'face_parsing')
-    cmd = [
-        'python',
-        'test.py',
-        args.in_dir,
-        tmp_parse_dir,
-    ]
-    print(' '.join(cmd))
-    run(cmd)
-
-    create_skin_mask(tmp_parse_dir, args.out_dir, include_hair=args.include_hair)
-
-    os.chdir(main_env)
-
-
-def parse_args(args=None, namespace=None):
-    parser = ArgumentParser("Face Parsing and generate skin (& hair) mask")
-    parser.add_argument('in_dir')
-    parser.add_argument('out_dir')
-    parser.add_argument('--include_hair', action="store_true", help="include hair in the mask")
-    return parser.parse_args(args=args, namespace=namespace)
-
-
-if __name__ == "__main__":
-    main(parse_args())
-
-
-
diff --git a/torch_utils/__init__.py b/torch_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b0f4efcbe1e3cd4199eeecb043d5afe1548307
--- /dev/null
+++ b/torch_utils/__init__.py
@@ -0,0 +1,11 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# empty
diff --git a/torch_utils/custom_ops.py b/torch_utils/custom_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda77a69777a69bd3eda96713c29f66fe3b016b9
--- /dev/null
+++ b/torch_utils/custom_ops.py
@@ -0,0 +1,238 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import glob
+import torch
+import torch.utils.cpp_extension
+import importlib
+import hashlib
+import shutil
+from pathlib import Path
+import re
+import uuid
+
+from torch.utils.file_baton import FileBaton
+
+#----------------------------------------------------------------------------
+# Global options.
+
+verbosity = 'brief' # Verbosity level: 'none', 'brief', 'full'
+
+#----------------------------------------------------------------------------
+# Internal helper funcs.
+
+def _find_compiler_bindir():
+    """Return the last sorted hit of the first install-path glob that matches, or None."""
+    search_globs = (
+        'C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64',
+        'C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64',
+        'C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64',
+        'C:/Program Files (x86)/Microsoft Visual Studio */vc/bin',
+    )
+    for candidates in map(glob.glob, search_globs):
+        if candidates:
+            return sorted(candidates)[-1]
+    return None
+
+def _get_mangled_gpu_name():
+    """Return the current CUDA device name, sanitised.
+
+    The name is lower-cased and every character outside ``[a-z0-9_-]`` is
+    replaced by ``-``.
+    """
+    name = torch.cuda.get_device_name().lower()
+    safe_chars = (c if re.match('[a-z0-9_-]+', c) else '-' for c in name)
+    return ''.join(safe_chars)
+
+
+#----------------------------------------------------------------------------
+# Main entry point for compiling and loading C++/CUDA plugins.
+
+_cached_plugins = dict()
+
+def get_plugin(module_name, sources, **build_kwargs): # Build if needed, import and cache the extension `module_name`; returns the module.
+    assert verbosity in ['none', 'brief', 'full']
+
+    # Already cached?
+    if module_name in _cached_plugins:
+        return _cached_plugins[module_name]
+
+    # Print status.
+    if verbosity == 'full':
+        print(f'Setting up PyTorch plugin "{module_name}"...')
+    elif verbosity == 'brief':
+        print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True)
+
+    try: # pylint: disable=too-many-nested-blocks
+        # Make sure we can find the necessary compiler binaries.
+        if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0:
+            compiler_bindir = _find_compiler_bindir()
+            if compiler_bindir is None:
+                raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".')
+            os.environ['PATH'] += ';' + compiler_bindir # Changes PATH for the rest of this process.
+
+        # Compile and load.
+        verbose_build = (verbosity == 'full')
+
+        # Incremental build md5sum trickery.  Copies all the input source files
+        # into a cached build directory under a combined md5 digest of the input
+        # source files.  Copying is done only if the combined digest has changed.
+        # This keeps input file timestamps and filenames the same as in previous
+        # extension builds, allowing for fast incremental rebuilds.
+        #
+        # This optimization is done only in case all the source files reside in
+        # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR
+        # environment variable is set (we take this as a signal that the user
+        # actually cares about this.)
+        source_dirs_set = set(os.path.dirname(source) for source in sources)
+        if len(source_dirs_set) == 1 and ('TORCH_EXTENSIONS_DIR' in os.environ):
+            all_source_files = sorted(list(x for x in Path(list(source_dirs_set)[0]).iterdir() if x.is_file())) # Every regular file in the directory, not only `sources`.
+
+            # Compute a combined hash digest for all source files in the same
+            # custom op directory (usually .cu, .cpp, .py and .h files).
+            hash_md5 = hashlib.md5()
+            for src in all_source_files:
+                with open(src, 'rb') as f:
+                    hash_md5.update(f.read())
+            build_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access
+            digest_build_dir = os.path.join(build_dir, hash_md5.hexdigest())
+
+            if not os.path.isdir(digest_build_dir): # Copy only the first time this digest is seen.
+                os.makedirs(digest_build_dir, exist_ok=True)
+                baton = FileBaton(os.path.join(digest_build_dir, 'lock'))
+                if baton.try_acquire():
+                    try:
+                        for src in all_source_files:
+                            shutil.copyfile(src, os.path.join(digest_build_dir, os.path.basename(src)))
+                    finally:
+                        baton.release()
+                else:
+                    # Someone else is copying source files under the digest dir,
+                    # wait until done and continue.
+                    baton.wait()
+            digest_sources = [os.path.join(digest_build_dir, os.path.basename(x)) for x in sources] # Build from the copies, not the originals.
+            torch.utils.cpp_extension.load(name=module_name, build_directory=build_dir,
+                verbose=verbose_build, sources=digest_sources, **build_kwargs)
+        else:
+            torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs)
+        module = importlib.import_module(module_name) # Import by name after load() has run.
+
+    except: # Catches everything only to print the status; the exception is always re-raised.
+        if verbosity == 'brief':
+            print('Failed!')
+        raise
+
+    # Print status and add to cache.
+    if verbosity == 'full':
+        print(f'Done setting up PyTorch plugin "{module_name}".')
+    elif verbosity == 'brief':
+        print('Done.')
+    _cached_plugins[module_name] = module
+    return module
+
+#----------------------------------------------------------------------------
+def get_plugin_v3(module_name, sources, headers=None, source_dir=None, **build_kwargs): # Build if needed, import and cache `module_name`; `headers` are hashed and copied but not passed to load().
+    assert verbosity in ['none', 'brief', 'full']
+    if headers is None:
+        headers = []
+    if source_dir is not None: # File names are joined onto source_dir.
+        sources = [os.path.join(source_dir, fname) for fname in sources]
+        headers = [os.path.join(source_dir, fname) for fname in headers]
+
+    # Already cached?
+    if module_name in _cached_plugins:
+        return _cached_plugins[module_name]
+
+    # Print status.
+    if verbosity == 'full':
+        print(f'Setting up PyTorch plugin "{module_name}"...')
+    elif verbosity == 'brief':
+        print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True)
+    verbose_build = (verbosity == 'full')
+
+    # Compile and load.
+    try: # pylint: disable=too-many-nested-blocks
+        # Make sure we can find the necessary compiler binaries.
+        if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0:
+            compiler_bindir = _find_compiler_bindir()
+            if compiler_bindir is None:
+                raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".')
+            os.environ['PATH'] += ';' + compiler_bindir # Changes PATH for the rest of this process.
+
+        # Some containers set TORCH_CUDA_ARCH_LIST to a list that can either
+        # break the build or unnecessarily restrict what's available to nvcc.
+        # Unset it to let nvcc decide based on what's available on the
+        # machine.
+        os.environ['TORCH_CUDA_ARCH_LIST'] = '' # NOTE: set to '' rather than deleted, and not restored afterwards.
+
+        # Incremental build md5sum trickery.  Copies all the input source files
+        # into a cached build directory under a combined md5 digest of the input
+        # source files.  Copying is done only if the combined digest has changed.
+        # This keeps input file timestamps and filenames the same as in previous
+        # extension builds, allowing for fast incremental rebuilds.
+        #
+        # This optimization is done only in case all the source files reside in
+        # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR
+        # environment variable is set (we take this as a signal that the user
+        # actually cares about this.)
+        #
+        # EDIT: We now do it regardless of TORCH_EXTENSIONS_DIR, in order to work
+        # around the *.cu dependency bug in ninja config.
+        #
+        all_source_files = sorted(sources + headers)
+        all_source_dirs = set(os.path.dirname(fname) for fname in all_source_files)
+        if len(all_source_dirs) == 1: # and ('TORCH_EXTENSIONS_DIR' in os.environ):
+
+            # Compute combined hash digest for all source files.
+            hash_md5 = hashlib.md5()
+            for src in all_source_files:
+                with open(src, 'rb') as f:
+                    hash_md5.update(f.read())
+
+            # Select cached build directory name.
+            source_digest = hash_md5.hexdigest()
+            build_top_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access
+            cached_build_dir = os.path.join(build_top_dir, f'{source_digest}-{_get_mangled_gpu_name()}') # One directory per (source digest, GPU name) pair.
+
+            if not os.path.isdir(cached_build_dir):
+                tmpdir = f'{build_top_dir}/srctmp-{uuid.uuid4().hex}' # Fill a uniquely named temp dir first, then move it into place.
+                os.makedirs(tmpdir)
+                for src in all_source_files:
+                    shutil.copyfile(src, os.path.join(tmpdir, os.path.basename(src)))
+                try:
+                    os.replace(tmpdir, cached_build_dir) # atomic
+                except OSError:
+                    # source directory already exists, delete tmpdir and its contents.
+                    shutil.rmtree(tmpdir)
+                    if not os.path.isdir(cached_build_dir): raise
+
+            # Compile.
+            cached_sources = [os.path.join(cached_build_dir, os.path.basename(fname)) for fname in sources]
+            torch.utils.cpp_extension.load(name=module_name, build_directory=cached_build_dir,
+                verbose=verbose_build, sources=cached_sources, **build_kwargs)
+        else:
+            torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs)
+
+        # Load.
+        module = importlib.import_module(module_name)
+
+    except: # Catches everything only to print the status; the exception is always re-raised.
+        if verbosity == 'brief':
+            print('Failed!')
+        raise
+
+    # Print status and add to cache dict.
+    if verbosity == 'full':
+        print(f'Done setting up PyTorch plugin "{module_name}".')
+    elif verbosity == 'brief':
+        print('Done.')
+    _cached_plugins[module_name] = module
+    return module
\ No newline at end of file
diff --git a/torch_utils/misc.py b/torch_utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd512ab8b61ece35d81ec35f43948a843efbbce1
--- /dev/null
+++ b/torch_utils/misc.py
@@ -0,0 +1,264 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import re
+import contextlib
+import numpy as np
+import torch
+import warnings
+import dnnlib
+
+#----------------------------------------------------------------------------
+# Cached construction of constant tensors. Avoids CPU=>GPU copy when the
+# same constant is used multiple times.
+
+_constant_cache = dict()
+
+def constant(value, shape=None, dtype=None, device=None, memory_format=None): # Return a tensor holding `value`, reusing a cached one when possible.
+    value = np.asarray(value)
+    if shape is not None:
+        shape = tuple(shape)
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if device is None:
+        device = torch.device('cpu')
+    if memory_format is None:
+        memory_format = torch.contiguous_format
+
+    key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format) # Keyed on the raw bytes plus every option that affects the result.
+    tensor = _constant_cache.get(key, None)
+    if tensor is None:
+        tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
+        if shape is not None:
+            tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape)) # The empty tensor only supplies the target shape.
+        tensor = tensor.contiguous(memory_format=memory_format)
+        _constant_cache[key] = tensor
+    return tensor # Same object as the cache entry; in-place edits would affect later callers.
+
+#----------------------------------------------------------------------------
+# Replace NaN/Inf with specified numerical values.
+
+try:
+    nan_to_num = torch.nan_to_num # 1.8.0a0
+except AttributeError: # torch.nan_to_num is unavailable: define a local replacement.
+    def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin
+        assert isinstance(input, torch.Tensor)
+        if posinf is None:
+            posinf = torch.finfo(input.dtype).max
+        if neginf is None:
+            neginf = torch.finfo(input.dtype).min
+        assert nan == 0 # Only nan=0 is supported by the nansum() approach below.
+        return torch.clamp(input.unsqueeze(0).nansum(0), min=neginf, max=posinf, out=out) # nansum over the added singleton dim zeroes NaNs; clamp bounds +/-Inf.
+
+#----------------------------------------------------------------------------
+# Symbolic assert.
+
+try:
+    symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access
+except AttributeError: # torch._assert is missing: use the older name instead.
+    symbolic_assert = torch.Assert # 1.7.0
+
+#----------------------------------------------------------------------------
+# Context manager to suppress known warnings in torch.jit.trace().
+
+class suppress_tracer_warnings(warnings.catch_warnings): # Exit behaviour is inherited unchanged from catch_warnings.
+    def __enter__(self):
+        super().__enter__()
+        warnings.simplefilter('ignore', category=torch.jit.TracerWarning) # Only TracerWarning is silenced inside the block.
+        return self
+
+#----------------------------------------------------------------------------
+# Assert that the shape of a tensor matches the given list of integers.
+# None indicates that the size of a dimension is allowed to vary.
+# Performs symbolic assertion when used in torch.jit.trace().
+
+def assert_shape(tensor, ref_shape): # Raises AssertionError on a rank or size mismatch; returns nothing.
+    if tensor.ndim != len(ref_shape):
+        raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}')
+    for idx, (size, ref_size) in enumerate(zip(tensor.shape, ref_shape)):
+        if ref_size is None:
+            pass # None: any size is accepted for this dimension.
+        elif isinstance(ref_size, torch.Tensor): # Expected size given as a tensor: check via symbolic_assert.
+            with suppress_tracer_warnings(): # as_tensor results are registered as constants
+                symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}')
+        elif isinstance(size, torch.Tensor): # Actual size is a tensor (presumably while tracing -- TODO confirm).
+            with suppress_tracer_warnings(): # as_tensor results are registered as constants
+                symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}')
+        elif size != ref_size: # Plain integers on both sides: compare directly.
+            raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}')
+
+#----------------------------------------------------------------------------
+# Function decorator that calls torch.autograd.profiler.record_function().
+
+def profiled_function(fn): # Wrap `fn` so every call runs inside a record_function range named after it.
+    def decorator(*args, **kwargs):
+        with torch.autograd.profiler.record_function(fn.__name__):
+            return fn(*args, **kwargs)
+    decorator.__name__ = fn.__name__ # Only the name is copied; other metadata such as __doc__ is not.
+    return decorator
+
+#----------------------------------------------------------------------------
+# Sampler for torch.utils.data.DataLoader that loops over the dataset
+# indefinitely, shuffling items as it goes.
+
+class InfiniteSampler(torch.utils.data.Sampler): # Never-ending index stream over `dataset`, split across `num_replicas` ranks.
+    def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5):
+        assert len(dataset) > 0
+        assert num_replicas > 0
+        assert 0 <= rank < num_replicas
+        assert 0 <= window_size <= 1 # Fraction of the dataset size used as the reshuffle window.
+        super().__init__(dataset)
+        self.dataset = dataset
+        self.rank = rank
+        self.num_replicas = num_replicas
+        self.shuffle = shuffle
+        self.seed = seed
+        self.window_size = window_size
+
+    def __iter__(self):
+        order = np.arange(len(self.dataset))
+        rnd = None
+        window = 0 # Stays 0 without shuffling, so the order is never changed below.
+        if self.shuffle:
+            rnd = np.random.RandomState(self.seed) # Same seed gives the same sequence on every call.
+            rnd.shuffle(order)
+            window = int(np.rint(order.size * self.window_size))
+
+        idx = 0
+        while True: # Infinite generator: the consumer decides when to stop.
+            i = idx % order.size
+            if idx % self.num_replicas == self.rank: # Position idx is yielded only by the rank equal to idx % num_replicas.
+                yield order[i]
+            if window >= 2: # Swap slot i with a random slot 0..window-1 places earlier (wrapping around).
+                j = (i - rnd.randint(window)) % order.size
+                order[i], order[j] = order[j], order[i]
+            idx += 1
+
+#----------------------------------------------------------------------------
+# Utilities for operating with torch.nn.Module parameters and buffers.
+
+def params_and_buffers(module): # Return all parameters followed by all buffers of `module` as one list.
+    assert isinstance(module, torch.nn.Module)
+    return list(module.parameters()) + list(module.buffers())
+
+def named_params_and_buffers(module): # Return (name, tensor) pairs: parameters first, then buffers.
+    assert isinstance(module, torch.nn.Module)
+    return list(module.named_parameters()) + list(module.named_buffers())
+
+def copy_params_and_buffers(src_module, dst_module, require_all=False): # Copy same-named tensors from src_module into dst_module in place.
+    assert isinstance(src_module, torch.nn.Module)
+    assert isinstance(dst_module, torch.nn.Module)
+    src_tensors = {name: tensor for name, tensor in named_params_and_buffers(src_module)}
+    for name, tensor in named_params_and_buffers(dst_module):
+        assert (name in src_tensors) or (not require_all) # With require_all, every destination tensor must exist in the source.
+        if name in src_tensors: # Names missing from the source are left untouched.
+            tensor.copy_(src_tensors[name].detach()).requires_grad_(tensor.requires_grad) # Re-applies the destination's own requires_grad flag after the copy.
+
+#----------------------------------------------------------------------------
+# Context manager for easily enabling/disabling DistributedDataParallel
+# synchronization.
+
+@contextlib.contextmanager
+def ddp_sync(module, sync): # Run the with-body under module.no_sync() only for a DDP module with sync falsy.
+    assert isinstance(module, torch.nn.Module)
+    if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel):
+        yield # Nothing to disable: sync requested, or the module is not DDP-wrapped.
+    else:
+        with module.no_sync():
+            yield
+
+#----------------------------------------------------------------------------
+# Check DistributedDataParallel consistency across processes.
+
+def check_ddp_consistency(module, ignore_regex=None): # Assert every parameter/buffer equals the copy broadcast from src=0.
+    assert isinstance(module, torch.nn.Module)
+    for name, tensor in named_params_and_buffers(module):
+        fullname = type(module).__name__ + '.' + name
+        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname): # The regex must match the whole '<ClassName>.<tensor name>'.
+            continue
+        tensor = tensor.detach()
+        other = tensor.clone()
+        torch.distributed.broadcast(tensor=other, src=0) # Broadcast into the clone, so the module's own tensor is not written.
+        assert (nan_to_num(tensor) == nan_to_num(other)).all(), fullname # NaNs are mapped to numbers first so matching NaNs compare equal.
+
+#----------------------------------------------------------------------------
+# Print summary table of module hierarchy.
+
+def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True):
+    """Run `module(*inputs)` once, print a per-submodule summary table, return the outputs.
+    Modules nested deeper than `max_nesting` levels are not listed; `skip_redundant`
+    drops rows that contribute no new parameters, buffers or output tensors."""
+    assert isinstance(module, torch.nn.Module)
+    assert not isinstance(module, torch.jit.ScriptModule)
+    assert isinstance(inputs, (tuple, list))
+
+    # Register hooks.
+    entries = []
+    nesting = [0]
+    def pre_hook(_mod, _inputs):
+        nesting[0] += 1
+    def post_hook(mod, _inputs, outputs):
+        nesting[0] -= 1
+        if nesting[0] <= max_nesting:
+            outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs]
+            outputs = [t for t in outputs if isinstance(t, torch.Tensor)]
+            entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs))
+    hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()]
+    hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()]
+
+    # Run module; always detach the hooks, even if the forward pass raises.
+    try:
+        outputs = module(*inputs)
+    finally:
+        for hook in hooks:
+            hook.remove()
+    # Identify unique outputs, parameters, and buffers.
+    tensors_seen = set()
+    for e in entries:
+        e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen]
+        e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen]
+        e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen]
+        tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs}
+    # Filter out redundant entries.
+    if skip_redundant:
+        entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)]
+    # Construct table.
+    rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']]
+    rows += [['---'] * len(rows[0])]
+    param_total = buffer_total = 0
+    submodule_names = {mod: name for name, mod in module.named_modules()}
+    for e in entries:
+        name = '<top-level>' if e.mod is module else submodule_names[e.mod]
+        param_size = sum(t.numel() for t in e.unique_params)
+        buffer_size = sum(t.numel() for t in e.unique_buffers)
+        output_shapes = [str(list(t.shape)) for t in e.outputs] # Each output's own shape, not the first output's.
+        output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs]
+        rows += [[
+            name + (':0' if len(e.outputs) >= 2 else ''),
+            str(param_size) if param_size else '-',
+            str(buffer_size) if buffer_size else '-',
+            (output_shapes + ['-'])[0],
+            (output_dtypes + ['-'])[0],
+        ]]
+        for idx in range(1, len(e.outputs)): # Extra outputs get their own '<name>:<idx>' rows.
+            rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]]
+        param_total += param_size
+        buffer_total += buffer_size
+    rows += [['---'] * len(rows[0])]
+    rows += [['Total', str(param_total), str(buffer_total), '-', '-']]
+    # Print table.
+    widths = [max(len(cell) for cell in column) for column in zip(*rows)]
+    print()
+    for row in rows:
+        print('  '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths)))
+    print()
+    return outputs
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/models.py b/torch_utils/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..762550239ba6f1e09f4887bf1b27fd421745a589
--- /dev/null
+++ b/torch_utils/models.py
@@ -0,0 +1,756 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
+
+import math
+import random
+import functools
+import operator
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+import torch.nn.init as init
+from torch.autograd import Function
+
+from .op_edit import FusedLeakyReLU, fused_leaky_relu, upfirdn2d
+
+
+class PixelNorm(nn.Module):
+    """Divide the input by the root-mean-square of its values along dim 1 (eps 1e-8)."""
+
+    def forward(self, input):
+        mean_sq = torch.mean(input ** 2, dim=1, keepdim=True)
+        return input * torch.rsqrt(mean_sq + 1e-8)
+
+
+def make_kernel(k):
+    """Return `k` as a float32 tensor scaled to sum to 1; a 1-D `k` becomes its outer product."""
+    kernel = torch.tensor(k, dtype=torch.float32)
+    if kernel.ndim == 1:
+        kernel = kernel[:, None] * kernel[None, :]
+    return kernel / kernel.sum()
+
+
+class Upsample(nn.Module):  # Upsamples by `factor` through upfirdn2d with a normalised kernel.
+    def __init__(self, kernel, factor=2):
+        super().__init__()
+        # `kernel` goes through make_kernel(); the result is then scaled by factor**2.
+        self.factor = factor
+        kernel = make_kernel(kernel) * (factor ** 2)
+        self.register_buffer("kernel", kernel)
+        # p: kernel size minus the factor; it is split across the two pad values below.
+        p = kernel.shape[0] - factor
+
+        pad0 = (p + 1) // 2 + factor - 1
+        pad1 = p // 2
+
+        self.pad = (pad0, pad1)
+
+    def forward(self, input):
+        out = upfirdn2d(input, self.kernel, up=self.factor, down=1, pad=self.pad)
+        return out
+
+
+class Downsample(nn.Module):  # Downsamples by `factor` through upfirdn2d with a normalised kernel.
+    def __init__(self, kernel, factor=2):
+        super().__init__()
+        # `kernel` goes through make_kernel(); no extra gain is applied here.
+        self.factor = factor
+        kernel = make_kernel(kernel)
+        self.register_buffer("kernel", kernel)
+        # p: kernel size minus the factor; it is split across the two pad values below.
+        p = kernel.shape[0] - factor
+
+        pad0 = (p + 1) // 2
+        pad1 = p // 2
+
+        self.pad = (pad0, pad1)
+
+    def forward(self, input):
+        out = upfirdn2d(input, self.kernel, up=1, down=self.factor, pad=self.pad)
+        return out
+
+
+class Blur(nn.Module):  # Applies upfirdn2d with a fixed kernel and caller-supplied padding.
+    def __init__(self, kernel, pad, upsample_factor=1):
+        super().__init__()
+
+        kernel = make_kernel(kernel)
+        # Kernel is scaled by upsample_factor**2 only when that factor exceeds 1.
+        if upsample_factor > 1:
+            kernel = kernel * (upsample_factor ** 2)
+
+        self.register_buffer("kernel", kernel)
+        # `pad` is stored as given and forwarded unchanged to upfirdn2d.
+        self.pad = pad
+
+    def forward(self, input):
+        out = upfirdn2d(input, self.kernel, pad=self.pad)
+        return out
+
+
+class EqualConv2d(nn.Module):  # 2-D convolution whose weight is rescaled by 1/sqrt(fan_in) on every forward call.
+    def __init__(
+        self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True
+    ):
+        super().__init__()
+        # Weight is stored as plain N(0, 1) samples; the scale is applied in forward().
+        self.weight = nn.Parameter(
+            torch.randn(out_channel, in_channel, kernel_size, kernel_size)
+        )
+        self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2)
+
+        self.stride = stride
+        self.padding = padding
+
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_channel))
+
+        else:
+            self.bias = None  # F.conv2d accepts bias=None.
+
+    def forward(self, input):
+        out = F.conv2d(
+            input,
+            self.weight * self.scale,
+            bias=self.bias,
+            stride=self.stride,
+            padding=self.padding,
+        )
+        return out
+
+    def __repr__(self):  # Shows in/out channels, kernel size, stride and padding.
+        return (
+            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},"
+            f" {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})"
+        )
+
+
+class EqualLinear(nn.Module):
+    """Linear layer whose weight is rescaled by (1/sqrt(in_dim)) * lr_mul at call time.
+
+    A truthy `activation` selects the fused leaky-ReLU path. `bias=False` is
+    supported on both paths.
+    """
+
+    def __init__(
+        self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None
+    ):
+        super().__init__()
+        # Stored weights are divided by lr_mul; forward() multiplies it back via scale.
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
+        else:
+            self.bias = None
+        self.activation = activation
+        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+        self.lr_mul = lr_mul
+
+    def forward(self, input):
+        weight = self.weight * self.scale
+        bias = None if self.bias is None else self.bias * self.lr_mul
+        if not self.activation:
+            return F.linear(input, weight, bias=bias)
+        if bias is None:  # keep the fused call's signature; adding zeros changes nothing
+            bias = weight.new_zeros(weight.shape[0])
+        return fused_leaky_relu(F.linear(input, weight), bias)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})"
+
+
+class ScaledLeakyReLU(nn.Module):
+    """Leaky ReLU whose output is multiplied by sqrt(2)."""
+    def __init__(self, negative_slope=0.2):
+        super().__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, input):
+        return math.sqrt(2) * F.leaky_relu(input, negative_slope=self.negative_slope)
+
+
+class ModulatedConv2d(nn.Module):  # Convolution whose weights are scaled per sample by a style vector.
+    def __init__(
+        self,
+        in_channel,
+        out_channel,
+        kernel_size,
+        style_dim,
+        demodulate=True,
+        upsample=False,
+        downsample=False,
+        blur_kernel=[1, 3, 3, 1],
+    ):
+        super().__init__()
+
+        self.eps = 1e-8  # NOTE(review): forward() uses a literal 1e-8 rather than this attribute.
+        self.kernel_size = kernel_size
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        self.upsample = upsample
+        self.downsample = downsample
+        # When upsampling, the blur is applied after the transposed convolution (see forward).
+        if upsample:
+            factor = 2
+            p = (len(blur_kernel) - factor) - (kernel_size - 1)
+            pad0 = (p + 1) // 2 + factor - 1
+            pad1 = p // 2 + 1
+            self.blur = Blur(blur_kernel, pad=(pad0, pad1), upsample_factor=factor)
+        # When downsampling, the blur is applied before the strided convolution (see forward).
+        if downsample:
+            factor = 2
+            p = (len(blur_kernel) - factor) + (kernel_size - 1)
+            pad0 = (p + 1) // 2
+            pad1 = p // 2
+            self.blur = Blur(blur_kernel, pad=(pad0, pad1))
+
+        fan_in = in_channel * kernel_size ** 2
+        self.scale = 1 / math.sqrt(fan_in)
+        self.padding = kernel_size // 2
+        self.weight = nn.Parameter(
+            torch.randn(1, out_channel, in_channel, kernel_size, kernel_size)
+        )
+        self.modulation = EqualLinear(style_dim, in_channel, bias_init=1)  # Maps a style vector to one factor per input channel.
+        self.demodulate = demodulate
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, "
+            f"upsample={self.upsample}, downsample={self.downsample})"
+        )
+
+    def forward(self, input, style):
+        batch, in_channel, height, width = input.shape
+        # Per-sample weights: the leading 1 of self.weight broadcasts against the batch dim of style.
+        style = self.modulation(style).view(batch, 1, in_channel, 1, 1)
+        weight = self.scale * self.weight * style
+
+        if self.demodulate:  # Scale each (sample, out_channel) filter by the inverse of its L2 norm.
+            demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + 1e-8)
+            weight = weight * demod.view(batch, self.out_channel, 1, 1, 1)
+        # Fold the batch into the channel axis so one grouped conv (groups=batch) handles all samples.
+        weight = weight.view(
+            batch * self.out_channel, in_channel, self.kernel_size, self.kernel_size
+        )
+
+        if self.upsample:
+            input = input.view(1, batch * in_channel, height, width)
+            weight = weight.view(
+                batch, self.out_channel, in_channel, self.kernel_size, self.kernel_size
+            )
+            weight = weight.transpose(1, 2).reshape(
+                batch * in_channel, self.out_channel, self.kernel_size, self.kernel_size
+            )  # conv_transpose2d takes weights laid out as (in, out/groups, kH, kW).
+            out = F.conv_transpose2d(input, weight, padding=0, stride=2, groups=batch)
+            _, _, height, width = out.shape
+            out = out.view(batch, self.out_channel, height, width)
+            out = self.blur(out)
+
+        elif self.downsample:
+            input = self.blur(input)
+            _, _, height, width = input.shape  # The blur changes the spatial size, so re-read it.
+            input = input.view(1, batch * in_channel, height, width)
+            out = F.conv2d(input, weight, padding=0, stride=2, groups=batch)
+            _, _, height, width = out.shape
+            out = out.view(batch, self.out_channel, height, width)
+
+        else:
+            input = input.view(1, batch * in_channel, height, width)
+            out = F.conv2d(input, weight, padding=self.padding, groups=batch)
+            _, _, height, width = out.shape
+            out = out.view(batch, self.out_channel, height, width)  # Unfold back to (batch, out_channel, H, W).
+
+        return out
+
+
+class NoiseInjection(nn.Module):  # adds noise to an image, scaled by a single learnable weight
+    def __init__(self):
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(1))  # zero-initialised, so noise has no effect at first
+
+    def forward(self, image, noise=None):  # returns image + weight * noise
+        if noise is None:
+            batch, _, height, width = image.shape
+            noise = image.new_empty(batch, 1, height, width).normal_()  # fresh N(0, 1) noise, one channel
+        return image + self.weight * noise
+
+
+class ConstantInput(nn.Module):  # learnable constant tensor, tiled to the batch size of its argument
+    def __init__(self, channel, size=4):
+        super().__init__()
+        self.input = nn.Parameter(torch.randn(1, channel, size, size // 2))  # width is half the height
+
+    def forward(self, input):  # only input.shape[0] (the batch size) is read
+        batch = input.shape[0]
+        out = self.input.repeat(batch, 1, 1, 1)
+        return out
+
+
+class StyledConv(nn.Module):  # ModulatedConv2d -> NoiseInjection -> FusedLeakyReLU
+    def __init__(
+        self,
+        in_channel,
+        out_channel,
+        kernel_size,
+        style_dim,
+        upsample=False,
+        blur_kernel=[1, 3, 3, 1],  # shared default list; it is only passed on, never mutated here
+        demodulate=True,
+    ):
+        super().__init__()
+        self.conv = ModulatedConv2d(
+            in_channel,
+            out_channel,
+            kernel_size,
+            style_dim,
+            upsample=upsample,
+            blur_kernel=blur_kernel,
+            demodulate=demodulate,
+        )
+        self.noise = NoiseInjection()
+        self.activate = FusedLeakyReLU(out_channel)
+
+    def forward(self, input, style, noise=None):  # noise=None lets NoiseInjection sample fresh noise
+        out = self.conv(input, style)
+        out = self.noise(out, noise=noise)
+        out = self.activate(out)
+        return out
+
+
+class ToRGB(nn.Module):  # 1x1 modulated conv to 3 channels plus bias, optionally summed with a skip image
+    def __init__(self, in_channel, style_dim, upsample=True, blur_kernel=[1, 3, 3, 1]):
+        super().__init__()
+        if upsample:
+            self.upsample = Upsample(blur_kernel)  # attribute is not created when upsample=False
+
+        self.conv = ModulatedConv2d(in_channel, 3, 1, style_dim, demodulate=False)
+        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
+
+    def forward(self, input, style, skip=None):  # returns conv(input, style) + bias (+ upsampled skip)
+        out = self.conv(input, style)
+        out = out + self.bias
+
+        if skip is not None:
+            skip = self.upsample(skip)  # NOTE(review): AttributeError if built with upsample=False
+            out = out + skip
+
+        return out
+
+
+class Generator(nn.Module):  # style-based generator; feature maps are half as wide as they are tall
+    def __init__(
+        self,
+        size,  # output height in pixels; log2(size) determines the number of layers
+        style_dim,  # width of the mapping network and of every latent vector
+        n_mlp,  # number of EqualLinear layers in the mapping network
+        channel_multiplier=1,
+        blur_kernel=[1, 3, 3, 1],
+        lr_mlp=0.01,
+        small=False,  # 64-channel table, sizes up to 64 only
+        small_isaac=False,  # alternative table covering sizes up to 128
+    ):
+        super().__init__()
+
+        self.size = size
+
+        if small and size > 64:
+            raise ValueError("small only works for sizes <= 64")
+
+        self.style_dim = style_dim
+        layers = [PixelNorm()]
+
+        for i in range(n_mlp):
+            layers.append(
+                EqualLinear(
+                    style_dim, style_dim, lr_mul=lr_mlp, activation="fused_lrelu"
+                )
+            )
+
+        self.style = nn.Sequential(*layers)  # mapping network
+
+        if small:  # self.channels maps resolution -> channel count
+            self.channels = {
+                4: 64 * channel_multiplier,
+                8: 64 * channel_multiplier,
+                16: 64 * channel_multiplier,
+                32: 64 * channel_multiplier,
+                64: 64 * channel_multiplier,
+            }
+        elif small_isaac:
+            self.channels = {4: 256, 8: 256, 16: 256, 32: 256, 64: 128, 128: 128}
+        else:
+            self.channels = {
+                4: 512,
+                8: 512,
+                16: 512,
+                32: 512,
+                64: 256 * channel_multiplier,
+                128: 128 * channel_multiplier,
+                256: 64 * channel_multiplier,
+                512: 32 * channel_multiplier,
+                1024: 16 * channel_multiplier,
+            }
+
+        self.input = ConstantInput(self.channels[4])
+        self.conv1 = StyledConv(
+            self.channels[4], self.channels[4], 3, style_dim, blur_kernel=blur_kernel
+        )
+        self.to_rgb1 = ToRGB(self.channels[4], style_dim, upsample=False)
+
+        self.log_size = int(math.log(size, 2))
+        self.num_layers = (self.log_size - 2) * 2 + 1  # conv1 plus two convs per later resolution
+
+        self.convs = nn.ModuleList()
+        self.upsamples = nn.ModuleList()
+        self.to_rgbs = nn.ModuleList()
+        self.noises = nn.Module()  # container for the fixed per-layer noise buffers
+
+        in_channel = self.channels[4]
+
+        for layer_idx in range(self.num_layers):
+            res = (layer_idx + 5) // 2
+            shape = [1, 1, 2 ** res, 2 ** res // 2]  # i.e. (2**res) // 2: width is half the height
+            self.noises.register_buffer(
+                "noise_{}".format(layer_idx), torch.randn(*shape)
+            )
+
+        for i in range(3, self.log_size + 1):
+            out_channel = self.channels[2 ** i]
+
+            self.convs.append(
+                StyledConv(
+                    in_channel,
+                    out_channel,
+                    3,
+                    style_dim,
+                    upsample=True,
+                    blur_kernel=blur_kernel,
+                )
+            )
+
+            self.convs.append(
+                StyledConv(
+                    out_channel, out_channel, 3, style_dim, blur_kernel=blur_kernel
+                )
+            )
+
+            self.to_rgbs.append(ToRGB(out_channel, style_dim))
+            in_channel = out_channel
+
+        self.n_latent = self.log_size * 2 - 2  # number of per-layer latent rows read by forward
+
+    def make_noise(self):  # fresh noise tensors, one per layer, on the generator's device
+        device = self.input.input.device
+
+        noises = [torch.randn(1, 1, 2 ** 2, 2 ** 2 // 2, device=device)]
+
+        for i in range(3, self.log_size + 1):
+            for _ in range(2):
+                noises.append(torch.randn(1, 1, 2 ** i, 2 ** i // 2, device=device))
+
+        return noises
+
+    def mean_latent(self, n_latent):  # here n_latent is the number of random samples to average
+        latent_in = torch.randn(
+            n_latent, self.style_dim, device=self.input.input.device
+        )
+        latent = self.style(latent_in).mean(0, keepdim=True)
+
+        return latent
+
+    def get_latent(self, input):  # run the mapping network only
+        return self.style(input)
+
+    def forward(
+        self,
+        styles,  # sequence of latent tensors, or a single tensor when real=True
+        return_latents=False,  # if true, second return value is the assembled latent
+        return_features=False,  # else if true, second return value is the feature dict
+        inject_index=None,  # row at which styles[1] takes over when two styles are given
+        truncation=1,  # values < 1 pull styles toward truncation_latent
+        truncation_latent=None,  # must be provided whenever truncation < 1
+        input_is_latent=False,  # if false, styles are first passed through the mapping network
+        noise=None,
+        randomize_noise=True,  # if false (and noise is None) the registered noise buffers are used
+        real=False,  # if true, styles is used directly as the per-layer latent tensor
+    ):
+        if not input_is_latent:
+            styles = [self.style(s) for s in styles]
+        if noise is None:
+            if randomize_noise:
+                noise = [None] * self.num_layers
+            else:
+                noise = [
+                    getattr(self.noises, "noise_{}".format(i))
+                    for i in range(self.num_layers)
+                ]
+
+        if truncation < 1:
+            # Truncation trick: move each style toward truncation_latent by (1 - truncation).
+            if not real:  # styles is a sequence of tensors
+                style_t = []
+                for style in styles:
+                    style_t.append(
+                        truncation_latent + truncation * (style - truncation_latent)
+                    )
+                styles = style_t
+            else:  # styles is one tensor; presumably (1, n_rows, style_dim) -- TODO confirm with callers
+                truncation_latent = truncation_latent.repeat(styles.shape[1], 1).unsqueeze(0)  # (1, D) -> (1, n_rows, D)
+                styles = torch.add(truncation_latent, torch.mul(torch.sub(styles, truncation_latent), truncation))
+                # The row count comes from styles itself instead of a hard-coded 18.
+        # Assemble `latent`, which is indexed per layer as latent[:, i] below.
+        if not real:
+            if len(styles) < 2:
+                inject_index = self.n_latent
+                if styles[0].ndim < 3:
+                    latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)  # same style for every layer
+                else:
+                    latent = styles[0]  # already has a per-layer dimension
+            elif isinstance(styles, (list, tuple)):  # style mixing of styles[0] and styles[1]
+                if inject_index is None:
+                    inject_index = 4
+
+                latent = styles[0].unsqueeze(0)  # NOTE(review): adds a leading dim; looks batch-size-1 only
+                if latent.shape[1] == 1:
+                    latent = latent.repeat(1, inject_index, 1)
+                else:
+                    latent = latent[:, :inject_index, :]
+                latent2 = styles[1].unsqueeze(1).repeat(1, self.n_latent - inject_index, 1)
+                latent = torch.cat([latent, latent2], 1)
+        else:  # real=True: styles is already the full latent tensor
+            latent = styles
+
+        # Synthesis; every intermediate tensor is also recorded in `features`.
+
+        features = {}
+        out = self.input(latent)
+        features["out_0"] = out
+        out = self.conv1(out, latent[:, 0], noise=noise[0])
+        features["conv1_0"] = out
+
+        skip = self.to_rgb1(out, latent[:, 1])
+        features["skip_0"] = skip
+        i = 1
+        for conv1, conv2, noise1, noise2, to_rgb in zip(
+            self.convs[::2], self.convs[1::2], noise[1::2], noise[2::2], self.to_rgbs
+        ):
+            out = conv1(out, latent[:, i], noise=noise1)
+            features["conv1_{}".format(i)] = out
+            out = conv2(out, latent[:, i + 1], noise=noise2)
+            features["conv2_{}".format(i)] = out
+            skip = to_rgb(out, latent[:, i + 2], skip)
+            features["skip_{}".format(i)] = skip
+
+            i += 2
+
+        image = skip
+
+        if return_latents:  # return_latents takes precedence over return_features
+            return image, latent
+        elif return_features:
+            return image, features
+        else:
+            return image, None
+
+
+class ConvLayer(nn.Sequential):  # [Blur] -> EqualConv2d -> [activation], assembled as a Sequential
+    def __init__(
+        self,
+        in_channel,
+        out_channel,
+        kernel_size,
+        downsample=False,  # if true: blur first, then stride-2 conv without padding
+        blur_kernel=[1, 3, 3, 1],
+        bias=True,
+        activate=True,
+    ):
+        layers = []
+
+        if downsample:
+            factor = 2
+            p = (len(blur_kernel) - factor) + (kernel_size - 1)  # total blur padding
+            pad0 = (p + 1) // 2
+            pad1 = p // 2
+
+            layers.append(Blur(blur_kernel, pad=(pad0, pad1)))
+
+            stride = 2
+            self.padding = 0  # plain int attribute, assigned before super().__init__()
+
+        else:
+            stride = 1
+            self.padding = kernel_size // 2
+
+        layers.append(
+            EqualConv2d(
+                in_channel,
+                out_channel,
+                kernel_size,
+                padding=self.padding,
+                stride=stride,
+                bias=bias and not activate,  # with activation, the bias lives in FusedLeakyReLU
+            )
+        )
+
+        if activate:
+            if bias:
+                layers.append(FusedLeakyReLU(out_channel))
+            else:
+                layers.append(ScaledLeakyReLU(0.2))
+
+        super().__init__(*layers)
+
+
+class ResBlock(nn.Module):  # residual block that halves resolution; blur_kernel now reaches both blurs
+    def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]):
+        super().__init__()
+
+        self.conv1 = ConvLayer(in_channel, in_channel, 3)
+        self.conv2 = ConvLayer(
+            in_channel, out_channel, 3, downsample=True, blur_kernel=blur_kernel)
+        self.skip = ConvLayer(
+            in_channel, out_channel, 1, downsample=True,
+            blur_kernel=blur_kernel, activate=False, bias=False)
+
+    def forward(self, input):  # returns (conv2(conv1(x)) + skip(x)) / sqrt(2)
+        out = self.conv1(input)
+        out = self.conv2(out)
+
+        skip = self.skip(input)
+        out = (out + skip) / math.sqrt(2)
+
+        return out
+
+
+class StyleDiscriminator(nn.Module):  # residual discriminator that also returns intermediate activations
+    def __init__(
+        self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1], small=False
+    ):
+        super().__init__()
+
+        if small:
+            channels = {4: 64, 8: 64, 16: 64, 32: 64, 64: 64}
+
+        else:
+            channels = {
+                4: 512,
+                8: 512,
+                16: 512,
+                32: 512,
+                64: 256 * channel_multiplier,
+                128: 128 * channel_multiplier,
+                256: 64 * channel_multiplier,
+                512: 32 * channel_multiplier,
+                1024: 16 * channel_multiplier,
+            }
+
+        convs = [ConvLayer(3, channels[size], 1)]  # 1x1 conv from the 3 input channels
+
+        log_size = int(math.log(size, 2))
+        in_channel = channels[size]
+
+        for i in range(log_size, 2, -1):  # one ResBlock per halving, down to resolution 4
+            out_channel = channels[2 ** (i - 1)]
+
+            convs.append(ResBlock(in_channel, out_channel, blur_kernel))
+
+            in_channel = out_channel
+
+        self.convs = nn.Sequential(*convs)
+
+        self.stddev_group = 4
+        self.stddev_feat = 1
+
+        self.final_conv = ConvLayer(in_channel + 1, channels[4], 3)  # +1 for the stddev channel
+        self.final_linear = nn.Sequential(
+            EqualLinear(channels[4] * 4 * 4, channels[4], activation="fused_lrelu"),  # expects 4*4 positions per channel
+            EqualLinear(channels[4], 1),
+        )
+
+
+    def forward(self, input):  # returns (score, list of activations after each block and final_conv)
+        h = input
+        h_list = []
+
+        for index, blocklist in enumerate(self.convs):  # index is unused
+            h = blocklist(h)
+            h_list.append(h)
+
+        out = h
+        batch, channel, height, width = out.shape
+        group = min(batch, self.stddev_group)
+        stddev = out.view(  # the view needs batch to be divisible by group
+            group, -1, self.stddev_feat, channel // self.stddev_feat, height, width
+        )
+        stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8)  # std across the members of each group
+        stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2)
+        stddev = stddev.repeat(group, 1, height, width)
+        out = torch.cat([out, stddev], 1)  # appended as an extra feature channel
+
+        out = self.final_conv(out)
+        h_list.append(out)
+
+        out = out.view(batch, -1)
+        out = self.final_linear(out)
+
+        return out, h_list
+
+
+class StyleEncoder(nn.Module):  # conv encoder mapping an image to two w_dim-sized vectors
+    def __init__(self, size, w_dim=512):
+        super().__init__()
+
+        channels = {
+            4: 512,
+            8: 512,
+            16: 512,
+            32: 512,
+            64: 256,
+            128: 128,
+            256: 64,
+            512: 32,
+            1024: 16
+        }        
+
+        self.w_dim = w_dim
+        log_size = int(math.log(size, 2))
+        convs = [ConvLayer(3, channels[size], 1)]
+
+        in_channel = channels[size]
+        for i in range(log_size, 2, -1):  # one ResBlock per halving, down to resolution 4
+            out_channel = channels[2 ** (i - 1)]
+            convs.append(ResBlock(in_channel, out_channel))
+            in_channel = out_channel
+
+        convs.append(EqualConv2d(in_channel,2*self.w_dim, 4, padding=0, bias=False))    # unpadded 4x4 conv to 2*w_dim channels
+
+        self.convs = nn.Sequential(*convs)
+
+    def forward(self, input):  # returns two (batch, w_dim) tensors
+        out = self.convs(input)
+        # The view needs exactly 2 * w_dim values per sample, i.e. a 1x1 map from the last conv.
+        reshaped =  out.view(len(input), 2*self.w_dim)
+        return reshaped[:,:self.w_dim], reshaped[:,self.w_dim:]
+
+def kaiming_init(m):  # Kaiming-normal weights and zero bias for Linear/Conv2d; weight=1, bias=0 for BatchNorm
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        init.kaiming_normal_(m.weight)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)) and m.affine:
+        m.weight.data.fill_(1)  # affine=False layers have weight None and are skipped by the guard
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+
+
+def normal_init(m):  # N(0, 0.02) weights and zero bias for Linear/Conv2d; weight=1, bias=0 for BatchNorm
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        init.normal_(m.weight, 0, 0.02)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)) and m.affine:
+        m.weight.data.fill_(1)  # affine=False layers have weight None and are skipped by the guard
+        if m.bias is not None:
+            m.bias.data.fill_(0)
\ No newline at end of file
diff --git a/model.py b/torch_utils/models_face.py
similarity index 72%
rename from model.py
rename to torch_utils/models_face.py
index 67aec32b7857fba2767c30ce31667c7dbd19091d..ce3f5d2f3c41206c18a9dba973c8e5999ddf47fd 100644
--- a/model.py
+++ b/torch_utils/models_face.py
@@ -1,15 +1,17 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
 import math
 import random
 import functools
 import operator
-import numpy as np
 
 import torch
 from torch import nn
 from torch.nn import functional as F
+import torch.nn.init as init
 from torch.autograd import Function
 
-from op import FusedLeakyReLU, fused_leaky_relu, upfirdn2d
+from .op_edit import FusedLeakyReLU, fused_leaky_relu, upfirdn2d
 
 
 class PixelNorm(nn.Module):
@@ -37,7 +39,7 @@ class Upsample(nn.Module):
 
         self.factor = factor
         kernel = make_kernel(kernel) * (factor ** 2)
-        self.register_buffer('kernel', kernel)
+        self.register_buffer("kernel", kernel)
 
         p = kernel.shape[0] - factor
 
@@ -58,7 +60,7 @@ class Downsample(nn.Module):
 
         self.factor = factor
         kernel = make_kernel(kernel)
-        self.register_buffer('kernel', kernel)
+        self.register_buffer("kernel", kernel)
 
         p = kernel.shape[0] - factor
 
@@ -82,7 +84,7 @@ class Blur(nn.Module):
         if upsample_factor > 1:
             kernel = kernel * (upsample_factor ** 2)
 
-        self.register_buffer('kernel', kernel)
+        self.register_buffer("kernel", kernel)
 
         self.pad = pad
 
@@ -125,8 +127,8 @@ class EqualConv2d(nn.Module):
 
     def __repr__(self):
         return (
-            f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},'
-            f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})'
+            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},"
+            f" {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})"
         )
 
 
@@ -163,7 +165,7 @@ class EqualLinear(nn.Module):
 
     def __repr__(self):
         return (
-            f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})'
+            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})"
         )
 
 
@@ -230,8 +232,8 @@ class ModulatedConv2d(nn.Module):
 
     def __repr__(self):
         return (
-            f'{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, '
-            f'upsample={self.upsample}, downsample={self.downsample})'
+            f"{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, "
+            f"upsample={self.upsample}, downsample={self.downsample})"
         )
 
     def forward(self, input, style):
@@ -354,7 +356,6 @@ class ToRGB(nn.Module):
 
     def forward(self, input, style, skip=None):
         out = self.conv(input, style)
-        style_modulated = out
         out = out + self.bias
 
         if skip is not None:
@@ -362,7 +363,7 @@ class ToRGB(nn.Module):
 
             out = out + skip
 
-        return out, style_modulated
+        return out
 
 
 class Generator(nn.Module):
@@ -371,14 +372,19 @@ class Generator(nn.Module):
         size,
         style_dim,
         n_mlp,
-        channel_multiplier=2,
+        channel_multiplier=1,
         blur_kernel=[1, 3, 3, 1],
         lr_mlp=0.01,
+        small=False,
+        small_isaac=False,
     ):
         super().__init__()
 
         self.size = size
 
+        if small and size > 64:
+            raise ValueError("small only works for sizes <= 64")
+
         self.style_dim = style_dim
 
         layers = [PixelNorm()]
@@ -386,23 +392,34 @@ class Generator(nn.Module):
         for i in range(n_mlp):
             layers.append(
                 EqualLinear(
-                    style_dim, style_dim, lr_mul=lr_mlp, activation='fused_lrelu'
+                    style_dim, style_dim, lr_mul=lr_mlp, activation="fused_lrelu"
                 )
             )
 
         self.style = nn.Sequential(*layers)
 
-        self.channels = {
-            4: 512,
-            8: 512,
-            16: 512,
-            32: 512,
-            64: 256 * channel_multiplier,
-            128: 128 * channel_multiplier,
-            256: 64 * channel_multiplier,
-            512: 32 * channel_multiplier,
-            1024: 16 * channel_multiplier,
-        }
+        if small:
+            self.channels = {
+                4: 64 * channel_multiplier,
+                8: 64 * channel_multiplier,
+                16: 64 * channel_multiplier,
+                32: 64 * channel_multiplier,
+                64: 64 * channel_multiplier,
+            }
+        elif small_isaac:
+            self.channels = {4: 256, 8: 256, 16: 256, 32: 256, 64: 128, 128: 128}
+        else:
+            self.channels = {
+                4: 512,
+                8: 512,
+                16: 512,
+                32: 512,
+                64: 256 * channel_multiplier,
+                128: 128 * channel_multiplier,
+                256: 64 * channel_multiplier,
+                512: 32 * channel_multiplier,
+                1024: 16 * channel_multiplier,
+            }
 
         self.input = ConstantInput(self.channels[4])
         self.conv1 = StyledConv(
@@ -423,7 +440,9 @@ class Generator(nn.Module):
         for layer_idx in range(self.num_layers):
             res = (layer_idx + 5) // 2
             shape = [1, 1, 2 ** res, 2 ** res]
-            self.noises.register_buffer(f'noise_{layer_idx}', torch.randn(*shape))
+            self.noises.register_buffer(
+                "noise_{}".format(layer_idx), torch.randn(*shape)
+            )
 
         for i in range(3, self.log_size + 1):
             out_channel = self.channels[2 ** i]
@@ -451,32 +470,17 @@ class Generator(nn.Module):
 
         self.n_latent = self.log_size * 2 - 2
 
-    @property
-    def device(self):
-        # TODO if multi-gpu is expected, could use the following more expensive version
-        #device, = list(set(p.device for p in self.parameters()))
-        return next(self.parameters()).device
-
-    @staticmethod
-    def get_latent_size(size):
-        log_size = int(math.log(size, 2))
-        return log_size * 2 - 2
+    def make_noise(self):
+        device = self.input.input.device
 
-    @staticmethod
-    def make_noise_by_size(size: int, device: torch.device):
-        log_size = int(math.log(size, 2))
         noises = [torch.randn(1, 1, 2 ** 2, 2 ** 2, device=device)]
 
-        for i in range(3, log_size + 1):
+        for i in range(3, self.log_size + 1):
             for _ in range(2):
                 noises.append(torch.randn(1, 1, 2 ** i, 2 ** i, device=device))
 
         return noises
 
-
-    def make_noise(self):
-        return self.make_noise_by_size(self.size, self.input.input.device)
-
     def mean_latent(self, n_latent):
         latent_in = torch.randn(
             n_latent, self.style_dim, device=self.input.input.device
@@ -492,6 +496,7 @@ class Generator(nn.Module):
         self,
         styles,
         return_latents=False,
+        return_features=False,
         inject_index=None,
         truncation=1,
         truncation_latent=None,
@@ -500,14 +505,15 @@ class Generator(nn.Module):
         randomize_noise=True,
     ):
         if not input_is_latent:
+            # print("haha")
             styles = [self.style(s) for s in styles]
-
         if noise is None:
             if randomize_noise:
                 noise = [None] * self.num_layers
             else:
                 noise = [
-                    getattr(self.noises, f'noise_{i}') for i in range(self.num_layers)
+                    getattr(self.noises, "noise_{}".format(i))
+                    for i in range(self.num_layers)
                 ]
 
         if truncation < 1:
@@ -519,50 +525,61 @@ class Generator(nn.Module):
                 )
 
             styles = style_t
-
+        # print(styles)
         if len(styles) < 2:
             inject_index = self.n_latent
-
+            
             if styles[0].ndim < 3:
                 latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-
+                # print("a")
             else:
+                # print(len(styles))
                 latent = styles[0]
+                # print("b", latent.shape)
 
         else:
+            # print("c")
             if inject_index is None:
-                inject_index = random.randint(1, self.n_latent - 1)
-
-            latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
+                inject_index = 4
+            
+            latent = styles[0].unsqueeze(0)
+            if latent.shape[1] == 1:
+                latent = latent.repeat(1, inject_index, 1)
+            else:
+                latent = latent[:, :inject_index, :]
             latent2 = styles[1].unsqueeze(1).repeat(1, self.n_latent - inject_index, 1)
 
             latent = torch.cat([latent, latent2], 1)
 
+        features = {}
         out = self.input(latent)
+        features["out_0"] = out
         out = self.conv1(out, latent[:, 0], noise=noise[0])
+        features["conv1_0"] = out
 
-        skip, rgb_mod = self.to_rgb1(out, latent[:, 1])
-
-
-        rgbs = [rgb_mod] # all but the last skip
+        skip = self.to_rgb1(out, latent[:, 1])
+        features["skip_0"] = skip
         i = 1
         for conv1, conv2, noise1, noise2, to_rgb in zip(
             self.convs[::2], self.convs[1::2], noise[1::2], noise[2::2], self.to_rgbs
         ):
             out = conv1(out, latent[:, i], noise=noise1)
+            features["conv1_{}".format(i)] = out
             out = conv2(out, latent[:, i + 1], noise=noise2)
-            skip, rgb_mod = to_rgb(out, latent[:, i + 2], skip)
-            rgbs.append(rgb_mod)
+            features["conv2_{}".format(i)] = out
+            skip = to_rgb(out, latent[:, i + 2], skip)
+            features["skip_{}".format(i)] = skip
 
             i += 2
 
         image = skip
 
         if return_latents:
-            return image, latent, rgbs
-
+            return image, latent
+        elif return_features:
+            return image, features
         else:
-            return image, None, rgbs
+            return image, None
 
 
 class ConvLayer(nn.Sequential):
@@ -635,21 +652,27 @@ class ResBlock(nn.Module):
         return out
 
 
-class Discriminator(nn.Module):
-    def __init__(self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1]):
+class StyleDiscriminator(nn.Module):
+    def __init__(
+        self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1], small=False
+    ):
         super().__init__()
 
-        channels = {
-            4: 512,
-            8: 512,
-            16: 512,
-            32: 512,
-            64: 256 * channel_multiplier,
-            128: 128 * channel_multiplier,
-            256: 64 * channel_multiplier,
-            512: 32 * channel_multiplier,
-            1024: 16 * channel_multiplier,
-        }
+        if small:
+            channels = {4: 64, 8: 64, 16: 64, 32: 64, 64: 64}
+
+        else:
+            channels = {
+                4: 512,
+                8: 512,
+                16: 512,
+                32: 512,
+                64: 256 * channel_multiplier,
+                128: 128 * channel_multiplier,
+                256: 64 * channel_multiplier,
+                512: 32 * channel_multiplier,
+                1024: 16 * channel_multiplier,
+            }
 
         convs = [ConvLayer(3, channels[size], 1)]
 
@@ -671,13 +694,39 @@ class Discriminator(nn.Module):
 
         self.final_conv = ConvLayer(in_channel + 1, channels[4], 3)
         self.final_linear = nn.Sequential(
-            EqualLinear(channels[4] * 4 * 4, channels[4], activation='fused_lrelu'),
+            EqualLinear(channels[4] * 4 * 4, channels[4], activation="fused_lrelu"),
             EqualLinear(channels[4], 1),
         )
 
-    def forward(self, input):
-        out = self.convs(input)
+#     def forward(self, input):
+#         out = self.convs(input)
+
+#         batch, channel, height, width = out.shape
+#         group = min(batch, self.stddev_group)
+#         stddev = out.view(
+#             group, -1, self.stddev_feat, channel // self.stddev_feat, height, width
+#         )
+#         stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8)
+#         stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2)
+#         stddev = stddev.repeat(group, 1, height, width)
+#         out = torch.cat([out, stddev], 1)
+
+#         out = self.final_conv(out)
+
+#         out = out.view(batch, -1)
+#         out = self.final_linear(out)
 
+#         return out
+    
+    def forward(self, input):
+        h = input
+        h_list = []
+        
+        for index, blocklist in enumerate(self.convs):
+            h = blocklist(h)
+            h_list.append(h)
+         
+        out = h
         batch, channel, height, width = out.shape
         group = min(batch, self.stddev_group)
         stddev = out.view(
@@ -689,9 +738,72 @@ class Discriminator(nn.Module):
         out = torch.cat([out, stddev], 1)
 
         out = self.final_conv(out)
-
+        h_list.append(out)
+        
         out = out.view(batch, -1)
         out = self.final_linear(out)
+        
+        return out, h_list
 
-        return out
 
+class StyleEncoder(nn.Module):
+    def __init__(self, size, w_dim=512):
+        super().__init__()
+        
+        channels = {
+            4: 512,
+            8: 512,
+            16: 512,
+            32: 512,
+            64: 256,
+            128: 128,
+            256: 64,
+            512: 32,
+            1024: 16
+        }        
+        
+        self.w_dim = w_dim
+        log_size = int(math.log(size, 2))
+        
+        # self.n_latents = log_size*2 - 2
+        
+        convs = [ConvLayer(3, channels[size], 1)]
+
+        in_channel = channels[size]
+        for i in range(log_size, 2, -1):
+            out_channel = channels[2 ** (i - 1)]
+            convs.append(ResBlock(in_channel, out_channel))
+            in_channel = out_channel
+
+        # convs.append(EqualConv2d(in_channel, self.n_latents*self.w_dim, 4, padding=0, bias=False))
+        convs.append(EqualConv2d(in_channel,2*self.w_dim, 4, padding=0, bias=False))    
+
+
+        self.convs = nn.Sequential(*convs)
+
+    def forward(self, input):
+        out = self.convs(input)
+        # return out.view(len(input), self.n_latents, self.w_dim)
+        reshaped =  out.view(len(input), 2*self.w_dim)
+        return reshaped[:,:self.w_dim], reshaped[:,self.w_dim:]
+
+def kaiming_init(m):
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        init.kaiming_normal_(m.weight)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
+        m.weight.data.fill_(1)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+
+
+def normal_init(m):
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        init.normal_(m.weight, 0, 0.02)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
+    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
+        m.weight.data.fill_(1)
+        if m.bias is not None:
+            m.bias.data.fill_(0)
\ No newline at end of file
diff --git a/op/__init__.py b/torch_utils/op_edit/__init__.py
similarity index 60%
rename from op/__init__.py
rename to torch_utils/op_edit/__init__.py
index d0918d92285955855be89f00096b888ee5597ce3..d2a7efe79d871852affd9de7b46f726a7942f218 100644
--- a/op/__init__.py
+++ b/torch_utils/op_edit/__init__.py
@@ -1,2 +1,4 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
 from .fused_act import FusedLeakyReLU, fused_leaky_relu
 from .upfirdn2d import upfirdn2d
diff --git a/op/fused_act.py b/torch_utils/op_edit/fused_act.py
similarity index 76%
rename from op/fused_act.py
rename to torch_utils/op_edit/fused_act.py
index 8459d510d7b79684779dfe47f5b46d81c94b4a4d..138f090bc67b94b363c346cbf405990f1bbdff68 100644
--- a/op/fused_act.py
+++ b/torch_utils/op_edit/fused_act.py
@@ -1,17 +1,20 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
 import os
 
 import torch
 from torch import nn
+from torch.nn import functional as F
 from torch.autograd import Function
 from torch.utils.cpp_extension import load
 
 
 module_path = os.path.dirname(__file__)
 fused = load(
-    'fused',
+    "fused",
     sources=[
-        os.path.join(module_path, 'fused_bias_act.cpp'),
-        os.path.join(module_path, 'fused_bias_act_kernel.cu'),
+        os.path.join(module_path, "fused_bias_act.cpp"),
+        os.path.join(module_path, "fused_bias_act_kernel.cu"),
     ],
 )
 
@@ -40,7 +43,7 @@ class FusedLeakyReLUFunctionBackward(Function):
 
     @staticmethod
     def backward(ctx, gradgrad_input, gradgrad_bias):
-        out, = ctx.saved_tensors
+        (out,) = ctx.saved_tensors
         gradgrad_out = fused.fused_bias_act(
             gradgrad_input, gradgrad_bias, out, 3, 1, ctx.negative_slope, ctx.scale
         )
@@ -61,7 +64,7 @@ class FusedLeakyReLUFunction(Function):
 
     @staticmethod
     def backward(ctx, grad_output):
-        out, = ctx.saved_tensors
+        (out,) = ctx.saved_tensors
 
         grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply(
             grad_output, out, ctx.negative_slope, ctx.scale
@@ -83,4 +86,14 @@ class FusedLeakyReLU(nn.Module):
 
 
 def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
-    return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)
+    """Add a per-channel bias, apply leaky ReLU with `negative_slope`, then multiply by `scale`.
+
+    CPU tensors use plain PyTorch ops; any other device uses FusedLeakyReLUFunction.
+    """
+    if input.device.type != "cpu":
+        return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)
+
+    # View bias as (1, C, 1, ...) so it broadcasts over input's trailing dims.
+    rest_dim = [1] * (input.ndim - bias.ndim - 1)
+    biased = input + bias.view(1, bias.shape[0], *rest_dim)
+    return F.leaky_relu(biased, negative_slope=negative_slope) * scale
diff --git a/op/fused_bias_act.cpp b/torch_utils/op_edit/fused_bias_act.cpp
similarity index 93%
rename from op/fused_bias_act.cpp
rename to torch_utils/op_edit/fused_bias_act.cpp
index 02be898f970bcc8ea297867fcaa4e71b24b3d949..a79a3d65b8fb56393c954630ae8ce5a5c8a8bb7d 100644
--- a/op/fused_bias_act.cpp
+++ b/torch_utils/op_edit/fused_bias_act.cpp
@@ -1,3 +1,5 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
 #include <torch/extension.h>
 
 
diff --git a/op/fused_bias_act_kernel.cu b/torch_utils/op_edit/fused_bias_act_kernel.cu
similarity index 97%
rename from op/fused_bias_act_kernel.cu
rename to torch_utils/op_edit/fused_bias_act_kernel.cu
index c9fa56fea7ede7072dc8925cfb0148f136eb85b8..2d72170bfbd766d7f6ccaf9bdd866833a5dad14f 100644
--- a/op/fused_bias_act_kernel.cu
+++ b/torch_utils/op_edit/fused_bias_act_kernel.cu
@@ -1,3 +1,5 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
 // Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
 //
 // This work is made available under the Nvidia Source Code License-NC.
diff --git a/op/upfirdn2d.cpp b/torch_utils/op_edit/upfirdn2d.cpp
similarity index 94%
rename from op/upfirdn2d.cpp
rename to torch_utils/op_edit/upfirdn2d.cpp
index d2e633dc896433c205e18bc3e455539192ff968e..ec39812ba6f50386a0b7f5bf95545265ec419930 100644
--- a/op/upfirdn2d.cpp
+++ b/torch_utils/op_edit/upfirdn2d.cpp
@@ -1,3 +1,5 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
 #include <torch/extension.h>
 
 
diff --git a/op/upfirdn2d.py b/torch_utils/op_edit/upfirdn2d.py
similarity index 84%
rename from op/upfirdn2d.py
rename to torch_utils/op_edit/upfirdn2d.py
index f1bbf96777f2c7267c1fef1733972014684ea22b..874c09c5e98bee1ace64408aa31ec547dfe695a4 100644
--- a/op/upfirdn2d.py
+++ b/torch_utils/op_edit/upfirdn2d.py
@@ -1,16 +1,19 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
 import os
 
 import torch
+from torch.nn import functional as F
 from torch.autograd import Function
 from torch.utils.cpp_extension import load
 
 
 module_path = os.path.dirname(__file__)
 upfirdn2d_op = load(
-    'upfirdn2d',
+    "upfirdn2d",
     sources=[
-        os.path.join(module_path, 'upfirdn2d.cpp'),
-        os.path.join(module_path, 'upfirdn2d_kernel.cu'),
+        os.path.join(module_path, "upfirdn2d.cpp"),
+        os.path.join(module_path, "upfirdn2d_kernel.cu"),
     ],
 )
 
@@ -60,7 +63,7 @@ class UpFirDn2dBackward(Function):
 
     @staticmethod
     def backward(ctx, gradgrad_input):
-        kernel, = ctx.saved_tensors
+        (kernel,) = ctx.saved_tensors
 
         gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1)
 
@@ -142,9 +145,15 @@ class UpFirDn2d(Function):
 
 
 def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
-    out = UpFirDn2d.apply(
-        input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1])
-    )
+    """Apply the upfirdn2d op, using the same `up`, `down` and `pad` pair for x and y."""
+    pads = (pad[0], pad[1], pad[0], pad[1])
+
+    if input.device.type == "cpu":
+        # CPU tensors go through upfirdn2d_native.
+        out = upfirdn2d_native(input, kernel, up, up, down, down, *pads)
+    else:
+        # Every other device type uses the UpFirDn2d autograd Function.
+        out = UpFirDn2d.apply(input, kernel, (up, up), (down, down), pads)
 
     return out
 
@@ -152,6 +161,9 @@ def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
 def upfirdn2d_native(
     input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
 ):
+    _, channel, in_h, in_w = input.shape
+    input = input.reshape(-1, in_h, in_w, 1)
+
     _, in_h, in_w, minor = input.shape
     kernel_h, kernel_w = kernel.shape
 
@@ -182,6 +194,9 @@ def upfirdn2d_native(
         in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
     )
     out = out.permute(0, 2, 3, 1)
+    out = out[:, ::down_y, ::down_x, :]
 
-    return out[:, ::down_y, ::down_x, :]
+    out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
+    out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
 
+    return out.view(-1, channel, out_h, out_w)
diff --git a/torch_utils/op_edit/upfirdn2d_kernel.cu b/torch_utils/op_edit/upfirdn2d_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f82f113bd489e86b4e5ee6bc40c9c3d75e30aead
--- /dev/null
+++ b/torch_utils/op_edit/upfirdn2d_kernel.cu
@@ -0,0 +1,371 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, visit
+// https://nvlabs.github.io/stylegan2/license.html
+
+#include <torch/types.h>
+
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+static __host__ __device__ __forceinline__ int floor_div(int a, int b) {
+  // C++ integer division truncates toward zero. When the truncated quotient
+  // times b overshoots a, step down by one; for positive b this yields
+  // floor(a / b).
+  const int truncated = a / b;
+  const bool overshoots = truncated * b > a;
+
+  return overshoots ? truncated - 1 : truncated;
+}
+
+struct UpFirDn2DKernelParams {  // parameter bundle passed by value to both upfirdn2d kernels
+  int up_x;  // up/down/pad fields are copied from upfirdn2d_op's arguments of the same name
+  int up_y;
+  int down_x;
+  int down_y;
+  int pad_x0;
+  int pad_x1;
+  int pad_y0;
+  int pad_y1;
+
+  int major_dim;   // size(0) of the contiguous input
+  int in_h;        // size(1) of the contiguous input
+  int in_w;        // size(2) of the contiguous input
+  int minor_dim;   // size(3) of the contiguous input
+  int kernel_h;    // size(0) of the contiguous kernel
+  int kernel_w;    // size(1) of the contiguous kernel
+  int out_h;       // output height, computed in upfirdn2d_op
+  int out_w;       // output width, computed in upfirdn2d_op
+  int loop_major;  // upper bound on major indices visited per blockIdx.z
+  int loop_x;      // upper bound on x-direction loop iterations inside the kernels
+};
+
+template <typename scalar_t>
+__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input,
+                                       const scalar_t *kernel,
+                                       const UpFirDn2DKernelParams p) {  // non-tiled kernel; the default case in upfirdn2d_op
+  int minor_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int out_y = minor_idx / p.minor_dim;
+  minor_idx -= out_y * p.minor_dim;  // the x index packs (out_y, minor_idx)
+  int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y;
+  int major_idx_base = blockIdx.z * p.loop_major;
+  // Threads whose starting coordinates fall outside the output do nothing.
+  if (out_x_base >= p.out_w || out_y >= p.out_h ||
+      major_idx_base >= p.major_dim) {
+    return;
+  }
+  // Input rows [in_y, in_y + h) clamped to [0, in_h], and the first kernel row used.
+  int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0;
+  int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h);
+  int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y;
+  int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y;
+  // Each thread visits up to loop_major major indices and loop_x output columns.
+  for (int loop_major = 0, major_idx = major_idx_base;
+       loop_major < p.loop_major && major_idx < p.major_dim;
+       loop_major++, major_idx++) {
+    for (int loop_x = 0, out_x = out_x_base;
+         loop_x < p.loop_x && out_x < p.out_w; loop_x++, out_x += blockDim.y) {
+      int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0;
+      int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w);
+      int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x;
+      int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x;
+      // x_p steps forward through the input while k_p steps backward through the kernel.
+      const scalar_t *x_p =
+          &input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim +
+                 minor_idx];
+      const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x];
+      int x_px = p.minor_dim;
+      int k_px = -p.up_x;
+      int x_py = p.in_w * p.minor_dim;
+      int k_py = -p.up_y * p.kernel_w;
+
+      scalar_t v = 0.0f;
+      // Accumulate input * kernel products over the h-by-w window.
+      for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+          v += static_cast<scalar_t>(*x_p) * static_cast<scalar_t>(*k_p);
+          x_p += x_px;
+          k_p += k_px;
+        }
+
+        x_p += x_py - w * x_px;  // rewind the row, then advance one row
+        k_p += k_py - w * k_px;
+      }
+
+      out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
+          minor_idx] = v;
+    }
+  }
+}
+
+template <typename scalar_t, int up_x, int up_y, int down_x, int down_y,
+          int kernel_h, int kernel_w, int tile_out_h, int tile_out_w>
+__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input,
+                                 const scalar_t *kernel,
+                                 const UpFirDn2DKernelParams p) {  // tiled kernel: a block produces tile_out_h x tile_out_w outputs per iteration
+  const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1;
+  const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1;
+  // Shared-memory copies of the filter taps (sk) and of the input tile (sx).
+  __shared__ volatile float sk[kernel_h][kernel_w];
+  __shared__ volatile float sx[tile_in_h][tile_in_w];
+
+  int minor_idx = blockIdx.x;
+  int tile_out_y = minor_idx / p.minor_dim;
+  minor_idx -= tile_out_y * p.minor_dim;  // blockIdx.x packs (tile row, minor_idx)
+  tile_out_y *= tile_out_h;
+  int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w;
+  int major_idx_base = blockIdx.z * p.loop_major;
+
+  if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h |
+      major_idx_base >= p.major_dim) {
+    return;
+  }
+  // Stage the kernel into sk flipped in both axes; taps outside p.kernel_* are 0.
+  for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w;
+       tap_idx += blockDim.x) {
+    int ky = tap_idx / kernel_w;
+    int kx = tap_idx - ky * kernel_w;
+    scalar_t v = 0.0;
+
+    if (kx < p.kernel_w & ky < p.kernel_h) {
+      v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)];
+    }
+
+    sk[ky][kx] = v;
+  }
+
+  for (int loop_major = 0, major_idx = major_idx_base;
+       loop_major < p.loop_major & major_idx < p.major_dim;
+       loop_major++, major_idx++) {
+    for (int loop_x = 0, tile_out_x = tile_out_x_base;
+         loop_x < p.loop_x & tile_out_x < p.out_w;
+         loop_x++, tile_out_x += tile_out_w) {
+      int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0;
+      int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0;
+      int tile_in_x = floor_div(tile_mid_x, up_x);
+      int tile_in_y = floor_div(tile_mid_y, up_y);
+      // Barrier before sx is overwritten (and, first time round, after sk is filled).
+      __syncthreads();
+      // Stage the input tile into sx, writing 0 for out-of-range coordinates.
+      for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w;
+           in_idx += blockDim.x) {
+        int rel_in_y = in_idx / tile_in_w;
+        int rel_in_x = in_idx - rel_in_y * tile_in_w;
+        int in_x = rel_in_x + tile_in_x;
+        int in_y = rel_in_y + tile_in_y;
+
+        scalar_t v = 0.0;
+
+        if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) {
+          v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) *
+                        p.minor_dim +
+                    minor_idx];
+        }
+
+        sx[rel_in_y][rel_in_x] = v;
+      }
+
+      __syncthreads();  // barrier between the sx writes above and the sx reads below
+      for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w;
+           out_idx += blockDim.x) {
+        int rel_out_y = out_idx / tile_out_w;
+        int rel_out_x = out_idx - rel_out_y * tile_out_w;
+        int out_x = rel_out_x + tile_out_x;
+        int out_y = rel_out_y + tile_out_y;
+
+        int mid_x = tile_mid_x + rel_out_x * down_x;
+        int mid_y = tile_mid_y + rel_out_y * down_y;
+        int in_x = floor_div(mid_x, up_x);
+        int in_y = floor_div(mid_y, up_y);
+        int rel_in_x = in_x - tile_in_x;
+        int rel_in_y = in_y - tile_in_y;
+        int kernel_x = (in_x + 1) * up_x - mid_x - 1;
+        int kernel_y = (in_y + 1) * up_y - mid_y - 1;
+
+        scalar_t v = 0.0;
+
+#pragma unroll
+        for (int y = 0; y < kernel_h / up_y; y++)
+#pragma unroll
+          for (int x = 0; x < kernel_w / up_x; x++)
+            v += sx[rel_in_y + y][rel_in_x + x] *
+                 sk[kernel_y + y * up_y][kernel_x + x * up_x];
+        // Threads past the edge of the output skip the store.
+        if (out_x < p.out_w & out_y < p.out_h) {
+          out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
+              minor_idx] = v;
+        }
+      }
+    }
+  }
+}
+
+torch::Tensor upfirdn2d_op(const torch::Tensor &input,
+                           const torch::Tensor &kernel, int up_x, int up_y,
+                           int down_x, int down_y, int pad_x0, int pad_x1,
+                           int pad_y0, int pad_y1) {
+  int curDevice = -1;
+  cudaGetDevice(&curDevice);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
+  // Host launcher: fill the kernel parameters, pick a kernel, launch it on `stream`.
+  UpFirDn2DKernelParams p;
+
+  auto x = input.contiguous();
+  auto k = kernel.contiguous();
+  // x is read as a rank-4 tensor (major, in_h, in_w, minor); k as (kernel_h, kernel_w).
+  p.major_dim = x.size(0);
+  p.in_h = x.size(1);
+  p.in_w = x.size(2);
+  p.minor_dim = x.size(3);
+  p.kernel_h = k.size(0);
+  p.kernel_w = k.size(1);
+  p.up_x = up_x;
+  p.up_y = up_y;
+  p.down_x = down_x;
+  p.down_y = down_y;
+  p.pad_x0 = pad_x0;
+  p.pad_x1 = pad_x1;
+  p.pad_y0 = pad_y0;
+  p.pad_y1 = pad_y1;
+
+  p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) /
+            p.down_y;
+  p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) /
+            p.down_x;
+
+  auto out =
+      at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options());
+  // mode stays -1 (generic kernel) unless a tiled specialisation matches below.
+  int mode = -1;
+
+  int tile_out_h = -1;
+  int tile_out_w = -1;
+  // Later matches overwrite earlier ones, so the tightest kernel-size bound wins.
+  if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
+      p.kernel_h <= 4 && p.kernel_w <= 4) {
+    mode = 1;
+    tile_out_h = 16;
+    tile_out_w = 64;
+  }
+
+  if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
+      p.kernel_h <= 3 && p.kernel_w <= 3) {
+    mode = 2;
+    tile_out_h = 16;
+    tile_out_w = 64;
+  }
+
+  if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
+      p.kernel_h <= 4 && p.kernel_w <= 4) {
+    mode = 3;
+    tile_out_h = 16;
+    tile_out_w = 64;
+  }
+
+  if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
+      p.kernel_h <= 2 && p.kernel_w <= 2) {
+    mode = 4;
+    tile_out_h = 16;
+    tile_out_w = 64;
+  }
+
+  if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
+      p.kernel_h <= 4 && p.kernel_w <= 4) {
+    mode = 5;
+    tile_out_h = 8;
+    tile_out_w = 32;
+  }
+
+  if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
+      p.kernel_h <= 2 && p.kernel_w <= 2) {
+    mode = 6;
+    tile_out_h = 8;
+    tile_out_w = 32;
+  }
+
+  dim3 block_size;
+  dim3 grid_size;
+  // Tiled kernels use 256-thread 1-D blocks; the generic kernel uses 4x32 blocks.
+  if (tile_out_h > 0 && tile_out_w > 0) {
+    p.loop_major = (p.major_dim - 1) / 16384 + 1;
+    p.loop_x = 1;
+    block_size = dim3(32 * 8, 1, 1);
+    grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim,
+                     (p.out_w - 1) / (p.loop_x * tile_out_w) + 1,
+                     (p.major_dim - 1) / p.loop_major + 1);
+  } else {
+    p.loop_major = (p.major_dim - 1) / 16384 + 1;
+    p.loop_x = 4;
+    block_size = dim3(4, 32, 1);
+    grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1,
+                     (p.out_w - 1) / (p.loop_x * block_size.y) + 1,
+                     (p.major_dim - 1) / p.loop_major + 1);
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
+    switch (mode) {
+    case 1:
+      upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 4, 4, 16, 64>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+
+    case 2:
+      upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 3, 3, 16, 64>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+
+    case 3:
+      upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 4, 4, 16, 64>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+
+    case 4:
+      upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 2, 2, 16, 64>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+
+    case 5:
+      upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 4, 4, 8, 32>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+    // Mode 6 is only selected for kernels up to 2x2, so use the 2x2 instantiation.
+    case 6:
+      upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 2, 2, 8, 32>
+          <<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                                 x.data_ptr<scalar_t>(),
+                                                 k.data_ptr<scalar_t>(), p);
+
+      break;
+
+    default:
+      upfirdn2d_kernel_large<scalar_t><<<grid_size, block_size, 0, stream>>>(
+          out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),
+          k.data_ptr<scalar_t>(), p);
+    }
+  });
+
+  return out;
+}
\ No newline at end of file
diff --git a/torch_utils/ops/__init__.py b/torch_utils/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c46c314cf2ff24fff74d7308dd8cc50767dd870
--- /dev/null
+++ b/torch_utils/ops/__init__.py
@@ -0,0 +1,3 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+
+#empty
\ No newline at end of file
diff --git a/torch_utils/ops/bias_act.cpp b/torch_utils/ops/bias_act.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aef47317a3ae018de6ea620060337bcf44b2d649
--- /dev/null
+++ b/torch_utils/ops/bias_act.cpp
@@ -0,0 +1,101 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "bias_act.h"
+
+//------------------------------------------------------------------------
+
+static bool has_same_layout(torch::Tensor x, torch::Tensor y)
+{
+    // Same rank, same sizes, and same strides wherever a dimension has >= 2 elements.
+    if (x.dim() != y.dim())
+        return false;
+    for (int64_t d = 0; d < x.dim(); d++)
+    {
+        const bool stride_matters = x.size(d) >= 2;
+        if (x.size(d) != y.size(d) || (stride_matters && x.stride(d) != y.stride(d)))
+            return false;
+    }
+    return true;
+}
+
+//------------------------------------------------------------------------
+
+static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp)
+{
+    // Launches the bias+activation CUDA kernel selected by `act` on x and returns y,
+    // a new tensor laid out like x. Empty b/xref/yref/dy tensors are passed as NULL.
+    // Validate arguments.
+    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+    TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x");
+    TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x");
+    TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x");
+    TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same shape, dtype, and device as x");
+    TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
+    TORCH_CHECK(b.dim() == 1, "b must have rank 1");
+    TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds");
+    TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements");
+    TORCH_CHECK(grad >= 0, "grad must be non-negative");
+    // Validate layout.
+    TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense");
+    TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
+    TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x");
+    TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x");
+    TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x");
+
+    // Create output tensor.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+    torch::Tensor y = torch::empty_like(x);
+    TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");
+
+    // Initialize CUDA kernel parameters.
+    bias_act_kernel_params p;
+    p.x     = x.data_ptr();
+    p.b     = (b.numel()) ? b.data_ptr() : NULL;
+    p.xref  = (xref.numel()) ? xref.data_ptr() : NULL;
+    p.yref  = (yref.numel()) ? yref.data_ptr() : NULL;
+    p.dy    = (dy.numel()) ? dy.data_ptr() : NULL;
+    p.y     = y.data_ptr();
+    p.grad  = grad;
+    p.act   = act;
+    p.alpha = alpha;
+    p.gain  = gain;
+    p.clamp = clamp;
+    p.sizeX = (int)x.numel();
+    p.sizeB = (int)b.numel();
+    p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;
+    // Choose CUDA kernel; the label names this op in unsupported-dtype errors.
+    void* kernel;
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "bias_act_cuda", [&]
+    {
+        kernel = choose_bias_act_kernel<scalar_t>(p);
+    });
+    TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func");
+
+    // Launch CUDA kernel: each thread handles up to loopX elements.
+    p.loopX = 4;
+    int blockSize = 4 * 32;
+    int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
+    void* args[] = {&p};
+    AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
+    return y;
+}
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("bias_act", &bias_act);  // expose the C++ bias_act function to Python as "bias_act"
+}
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.cu b/torch_utils/ops/bias_act.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f0fc48475dbceb3476e5a41954a9711d5ade07e1
--- /dev/null
+++ b/torch_utils/ops/bias_act.cu
@@ -0,0 +1,175 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <c10/util/Half.h>
+#include "bias_act.h"
+
+//------------------------------------------------------------------------
+// Helpers.
+
+template <class T> struct InternalType;  // maps a storage type T to the scalar_t used for arithmetic
+template <> struct InternalType<double>     { typedef double scalar_t; };
+template <> struct InternalType<float>      { typedef float  scalar_t; };
+template <> struct InternalType<c10::Half>  { typedef float  scalar_t; };  // half values are computed in float
+
+//------------------------------------------------------------------------
+// CUDA kernel.
+
+template <class T, int A>
+__global__ void bias_act_kernel(bias_act_kernel_params p)  // T: storage type, A: activation index (1..9)
+{
+    typedef typename InternalType<T>::scalar_t scalar_t;
+    int G                 = p.grad;
+    scalar_t alpha        = (scalar_t)p.alpha;
+    scalar_t gain         = (scalar_t)p.gain;
+    scalar_t clamp        = (scalar_t)p.clamp;
+    scalar_t one          = (scalar_t)1;
+    scalar_t two          = (scalar_t)2;
+    scalar_t expRange     = (scalar_t)80;
+    scalar_t halfExpRange = (scalar_t)40;
+    scalar_t seluScale    = (scalar_t)1.0507009873554804934193349852946;
+    scalar_t seluAlpha    = (scalar_t)1.6732632423543772848170429916717;
+    // G == 0 evaluates the activation on x + b; G >= 1 presumably are gradient passes (confirm with caller).
+    // Loop over elements: each thread handles up to loopX elements, blockDim.x apart.
+    int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
+    for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
+    {
+        // Load.
+        scalar_t x = (scalar_t)((const T*)p.x)[xi];
+        scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0;
+        scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0;
+        scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0;
+        scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one;
+        scalar_t yy = (gain != 0) ? yref / gain : 0;  // yref with the gain divided back out
+        scalar_t y = 0;
+
+        // Apply bias.
+        ((G == 0) ? x : xref) += b;  // bias goes into x when G == 0, otherwise into xref
+
+        // linear
+        if (A == 1)
+        {
+            if (G == 0) y = x;
+            if (G == 1) y = x;
+        }
+
+        // relu
+        if (A == 2)
+        {
+            if (G == 0) y = (x > 0) ? x : 0;
+            if (G == 1) y = (yy > 0) ? x : 0;
+        }
+
+        // lrelu
+        if (A == 3)
+        {
+            if (G == 0) y = (x > 0) ? x : x * alpha;
+            if (G == 1) y = (yy > 0) ? x : x * alpha;
+        }
+
+        // tanh
+        if (A == 4)
+        {
+            if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); }
+            if (G == 1) y = x * (one - yy * yy);
+            if (G == 2) y = x * (one - yy * yy) * (-two * yy);
+        }
+
+        // sigmoid
+        if (A == 5)
+        {
+            if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
+            if (G == 1) y = x * yy * (one - yy);
+            if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
+        }
+
+        // elu
+        if (A == 6)
+        {
+            if (G == 0) y = (x >= 0) ? x : exp(x) - one;
+            if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
+            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
+        }
+
+        // selu
+        if (A == 7)
+        {
+            if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
+            if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
+            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
+        }
+
+        // softplus
+        if (A == 8)
+        {
+            if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
+            if (G == 1) y = x * (one - exp(-yy));
+            if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); }
+        }
+
+        // swish (the G >= 1 branches work from xref and recompute yref from it)
+        if (A == 9)
+        {
+            if (G == 0)
+                y = (x < -expRange) ? 0 : x / (exp(-x) + one);
+            else
+            {
+                scalar_t c = exp(xref);
+                scalar_t d = c + one;
+                if (G == 1)
+                    y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
+                else
+                    y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d);
+                yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
+            }
+        }
+
+        // Apply gain (and dy, which is one when p.dy is NULL).
+        y *= gain * dy;
+
+        // Clamp; a negative clamp value disables this step.
+        if (clamp >= 0)
+        {
+            if (G == 0)
+                y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
+            else
+                y = (yref > -clamp & yref < clamp) ? y : 0;
+        }
+
+        // Store.
+        ((T*)p.y)[xi] = (T)y;
+    }
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p)
+{
+    // Kernel instantiations for activation indices 1..9, stored at index p.act - 1.
+    void* const kernels[9] = {
+        (void*)bias_act_kernel<T, 1>, (void*)bias_act_kernel<T, 2>, (void*)bias_act_kernel<T, 3>,
+        (void*)bias_act_kernel<T, 4>, (void*)bias_act_kernel<T, 5>, (void*)bias_act_kernel<T, 6>,
+        (void*)bias_act_kernel<T, 7>, (void*)bias_act_kernel<T, 8>, (void*)bias_act_kernel<T, 9>,
+    };
+    // Any other index has no kernel.
+    if (p.act < 1 || p.act > 9)
+        return NULL;
+    return kernels[p.act - 1];
+}
+
+//------------------------------------------------------------------------
+// Explicit template instantiations.
+
+template void* choose_bias_act_kernel<double>       (const bias_act_kernel_params& p);
+template void* choose_bias_act_kernel<float>        (const bias_act_kernel_params& p);
+template void* choose_bias_act_kernel<c10::Half>    (const bias_act_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.h b/torch_utils/ops/bias_act.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0246aa06c3dcd5919111fdc914136014b9044b5
--- /dev/null
+++ b/torch_utils/ops/bias_act.h
@@ -0,0 +1,40 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// CUDA kernel parameters.
+
+struct bias_act_kernel_params
+{
+    const void* x;      // [sizeX]
+    const void* b;      // [sizeB] or NULL
+    const void* xref;   // [sizeX] or NULL
+    const void* yref;   // [sizeX] or NULL
+    const void* dy;     // [sizeX] or NULL
+    void*       y;      // [sizeX]
+
+    int         grad;   // read as G in the kernel; selects the G == 0 / 1 / 2 branches
+    int         act;    // activation index, 1..9 (see choose_bias_act_kernel)
+    float       alpha;  // multiplies x on the non-positive side of lrelu
+    float       gain;   // multiplied into y after the activation
+    float       clamp;  // clamp magnitude; a negative value disables clamping
+
+    int         sizeX;  // x.numel()
+    int         sizeB;  // b.numel()
+    int         stepB;  // x.stride(dim) when b is non-empty, otherwise 1
+    int         loopX;  // maximum number of elements processed per thread
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.py b/torch_utils/ops/bias_act.py
new file mode 100644
index 0000000000000000000000000000000000000000..8041208be7680ddeceb1a87a9db9faae7101e7bf
--- /dev/null
+++ b/torch_utils/ops/bias_act.py
@@ -0,0 +1,214 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom PyTorch ops for efficient bias and activation."""
+
+import os
+import warnings
+import numpy as np
+import torch
+import dnnlib
+import traceback
+
+from .. import custom_ops
+from .. import misc
+
+#----------------------------------------------------------------------------
+
+activation_funcs = { # Maps activation name -> EasyDict(func, def_alpha, def_gain, cuda_idx, ref, has_2nd_grad).
+    'linear':   dnnlib.EasyDict(func=lambda x, **_:         x,                                          def_alpha=0,    def_gain=1,             cuda_idx=1, ref='',  has_2nd_grad=False),
+    'relu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.relu(x),                def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=2, ref='y', has_2nd_grad=False),
+    'lrelu':    dnnlib.EasyDict(func=lambda x, alpha, **_:  torch.nn.functional.leaky_relu(x, alpha),   def_alpha=0.2,  def_gain=np.sqrt(2),    cuda_idx=3, ref='y', has_2nd_grad=False),
+    'tanh':     dnnlib.EasyDict(func=lambda x, **_:         torch.tanh(x),                              def_alpha=0,    def_gain=1,             cuda_idx=4, ref='y', has_2nd_grad=True),
+    'sigmoid':  dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x),                           def_alpha=0,    def_gain=1,             cuda_idx=5, ref='y', has_2nd_grad=True),
+    'elu':      dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.elu(x),                 def_alpha=0,    def_gain=1,             cuda_idx=6, ref='y', has_2nd_grad=True),
+    'selu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.selu(x),                def_alpha=0,    def_gain=1,             cuda_idx=7, ref='y', has_2nd_grad=True),
+    'softplus': dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.softplus(x),            def_alpha=0,    def_gain=1,             cuda_idx=8, ref='y', has_2nd_grad=True),
+    'swish':    dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x) * x,                       def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=9, ref='x', has_2nd_grad=True),
+} # `ref` selects which of 'x' / 'y' the CUDA path saves for backward; `func` is what `_bias_act_ref()` evaluates.
+
+#----------------------------------------------------------------------------
+
+_inited = False # Set on the first `_init()` call, whether or not the plugin could be obtained.
+_plugin = None # Object returned by `custom_ops.get_plugin()`, or None if that failed / has not run yet.
+_null_tensor = torch.empty([0]) # Empty placeholder handed to the plugin in place of an absent tensor.
+
+def _init():
+    """Try once to obtain `bias_act_plugin` via `custom_ops.get_plugin()`; return True if it is usable."""
+    global _inited, _plugin
+    if not _inited:
+        _inited = True
+        sources = [os.path.join(os.path.dirname(__file__), s) for s in ['bias_act.cpp', 'bias_act.cu']]
+        try:
+            _plugin = custom_ops.get_plugin('bias_act_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math'])
+        except Exception: # Deliberately not a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
+            warnings.warn('Failed to build CUDA kernels for bias_act. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
+    return _plugin is not None
+
+#----------------------------------------------------------------------------
+
+def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'):
+    r"""Apply bias, activation, gain and clamping to `x` as a single fused op.
+
+    The bias `b` is added to `x`, the activation `act` is evaluated, the result
+    is multiplied by `gain` and finally clamped; every step is optional. The
+    custom CUDA op is used only when `impl='cuda'`, `x` resides on a CUDA device
+    and the plugin could be initialized; in every other case the reference
+    implementation `_bias_act_ref()` is used instead.
+
+    Args:
+        x:      Input activation tensor. Can be of any shape.
+        b:      Bias vector, or `None` to disable. Must be a 1D tensor whose length
+                matches the size of `x` along dimension `dim`.
+        dim:    The dimension in `x` corresponding to the elements of `b`.
+        act:    Name of the activation function to evaluate, or `"linear"` to disable.
+                Must be one of the keys of `activation_funcs`, e.g. `"relu"`, `"lrelu"`,
+                `"tanh"`, `"sigmoid"`, `"swish"`. `None` is not allowed.
+        alpha:  Shape parameter for the activation function, or `None` to use the
+                default `def_alpha` listed in `activation_funcs`.
+        gain:   Scaling factor for the output tensor, or `None` to use the default
+                `def_gain` listed in `activation_funcs`.
+                If unsure, consider specifying 1.
+        clamp:  Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
+                the clamping (default).
+        impl:   Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+                `"cuda"` falls back to the reference path as described above.
+
+    Returns:
+        Tensor of the same shape and datatype as `x`.
+    """
+    assert isinstance(x, torch.Tensor)
+    assert impl in ['ref', 'cuda']
+    op_kwargs = dict(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp)
+    use_cuda = impl == 'cuda' and x.device.type == 'cuda' and _init()
+    return _bias_act_cuda(**op_kwargs).apply(x, b) if use_cuda else _bias_act_ref(x=x, b=b, **op_kwargs)
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None):
+    """Slow reference implementation of `bias_act()` using standard PyTorch ops.
+
+    Same arguments as `bias_act()` minus `impl`; order: bias, activation, gain, clamp.
+    """
+    assert isinstance(x, torch.Tensor)
+    assert clamp is None or clamp >= 0
+    spec = activation_funcs[act]
+    alpha = float(alpha if alpha is not None else spec.def_alpha)
+    gain = float(gain if gain is not None else spec.def_gain)
+    clamp = float(clamp if clamp is not None else -1) # Negative value = clamping disabled.
+
+    # Add bias, broadcast along every dimension except `dim`.
+    if b is not None:
+        assert isinstance(b, torch.Tensor) and b.ndim == 1
+        assert 0 <= dim < x.ndim
+        assert b.shape[0] == x.shape[dim]
+        x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])
+
+    # Evaluate activation function (entries that do not take `alpha` swallow it via `**_`).
+    x = spec.func(x, alpha=alpha)
+
+    # Scale by gain; skipped when gain == 1.
+    if gain != 1:
+        x = x * gain
+
+    # Clamp.
+    if clamp >= 0:
+        x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
+    return x
+
+#----------------------------------------------------------------------------
+
+_bias_act_cuda_cache = dict() # Maps (dim, act, alpha, gain, clamp) -> BiasActCuda class built for that combination.
+
+def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):
+    """Return the `torch.autograd.Function` class that evaluates `bias_act()` via the CUDA plugin.
+    The class is specialized for the given arguments and reused from `_bias_act_cuda_cache`."""
+    # Parse arguments.
+    assert clamp is None or clamp >= 0
+    spec = activation_funcs[act]
+    alpha = float(alpha if alpha is not None else spec.def_alpha)
+    gain = float(gain if gain is not None else spec.def_gain)
+    clamp = float(clamp if clamp is not None else -1) # Negative value = clamping disabled (see the `clamp >= 0` checks below).
+
+    # Lookup from cache.
+    key = (dim, act, alpha, gain, clamp)
+    if key in _bias_act_cuda_cache:
+        return _bias_act_cuda_cache[key]
+
+    # Forward op.
+    class BiasActCuda(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, x, b): # pylint: disable=arguments-differ
+            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride()[1] == 1 else torch.contiguous_format
+            x = x.contiguous(memory_format=ctx.memory_format)
+            b = b.contiguous() if b is not None else _null_tensor
+            y = x # Returned as-is when there is no bias, activation, gain or clamp to apply.
+            if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor:
+                y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp)
+            ctx.save_for_backward( # Keep only the tensors selected by `spec.ref` / `spec.has_2nd_grad`.
+                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
+                b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
+                y if 'y' in spec.ref else _null_tensor)
+            return y
+
+        @staticmethod
+        def backward(ctx, dy): # pylint: disable=arguments-differ
+            dy = dy.contiguous(memory_format=ctx.memory_format)
+            x, b, y = ctx.saved_tensors
+            dx = None
+            db = None
+
+            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: # db is computed from dx, so dx is needed in both cases.
+                dx = dy
+                if act != 'linear' or gain != 1 or clamp >= 0: # Otherwise the op is a pure bias add and dx is dy itself.
+                    dx = BiasActCudaGrad.apply(dy, x, b, y)
+
+            if ctx.needs_input_grad[1]:
+                db = dx.sum([i for i in range(dx.ndim) if i != dim]) # Sum over every dimension except `dim`.
+
+            return dx, db
+
+    # Backward op.
+    class BiasActCudaGrad(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
+            ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride()[1] == 1 else torch.contiguous_format
+            dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp) # 6th argument: 0 in the forward op, 1 here, 2 in backward() below.
+            ctx.save_for_backward(
+                dy if spec.has_2nd_grad else _null_tensor,
+                x, b, y)
+            return dx
+
+        @staticmethod
+        def backward(ctx, d_dx): # pylint: disable=arguments-differ
+            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
+            dy, x, b, y = ctx.saved_tensors
+            d_dy = None
+            d_x = None
+            d_b = None
+            d_y = None # Never assigned below: no gradient is returned for `y`.
+
+            if ctx.needs_input_grad[0]:
+                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y) # Gradient w.r.t. dy: the same op applied to d_dx.
+
+            if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]): # d_b is computed from d_x.
+                d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp)
+
+            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
+                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) # Sum over every dimension except `dim`.
+
+            return d_dy, d_x, d_b, d_y
+
+    # Add to cache.
+    _bias_act_cuda_cache[key] = BiasActCuda
+    return BiasActCuda
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/conv2d_gradfix.py b/torch_utils/ops/conv2d_gradfix.py
new file mode 100644
index 0000000000000000000000000000000000000000..093036b728336d6f2f593aaea187054a8af8d523
--- /dev/null
+++ b/torch_utils/ops/conv2d_gradfix.py
@@ -0,0 +1,172 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom replacement for `torch.nn.functional.conv2d` that supports
+arbitrarily high order gradients with zero performance penalty."""
+
+import warnings
+import contextlib
+import torch
+
+# pylint: disable=redefined-builtin
+# pylint: disable=arguments-differ
+# pylint: disable=protected-access
+
+#----------------------------------------------------------------------------
+
+enabled = False                     # Enable the custom op by setting this to true.
+weight_gradients_disabled = False   # Forcefully disable computation of gradients with respect to the weights.
+
+@contextlib.contextmanager
+def no_weight_gradients():
+    """Temporarily force `weight_gradients_disabled = True`; the previous value is restored on exit."""
+    global weight_gradients_disabled
+    old, weight_gradients_disabled = weight_gradients_disabled, True
+    try: yield
+    finally: weight_gradients_disabled = old # Restore even if the `with` body raises.
+
+#----------------------------------------------------------------------------
+
+def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): # Takes the arguments it forwards to torch.nn.functional.conv2d().
+    if _should_use_custom_op(input): # Custom autograd.Function path built by _conv2d_gradfix().
+        return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias)
+    return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+
+def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1): # Takes the arguments it forwards to torch.nn.functional.conv_transpose2d().
+    if _should_use_custom_op(input): # Custom autograd.Function path built by _conv2d_gradfix().
+        return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias)
+    return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation)
+
+#----------------------------------------------------------------------------
+
+def _should_use_custom_op(input):
+    """Return True if the custom op should handle `input`; warns when the PyTorch version is not accepted."""
+    assert isinstance(input, torch.Tensor)
+    if not (enabled and torch.backends.cudnn.enabled) or input.device.type != 'cuda':
+        return False
+    # Only these PyTorch version prefixes are accepted; anything else triggers the warning below.
+    if torch.__version__.startswith(('1.7.', '1.8.', '1.9')):
+        return True
+    warnings.warn(f'conv2d_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.conv2d().')
+    return False
+
+def _tuple_of_ints(xs, ndim):
+    """Repeat a scalar `xs` `ndim` times (or convert a list/tuple) and check the result is `ndim` ints."""
+    values = (xs,) * ndim if not isinstance(xs, (tuple, list)) else tuple(xs)
+    assert len(values) == ndim and all(isinstance(v, int) for v in values)
+    return values
+
+#----------------------------------------------------------------------------
+
+_conv2d_gradfix_cache = dict() # Maps the `key` tuple built below -> Conv2d class generated for that configuration.
+
+def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups): # Returns a (cached) autograd.Function class specialized for these settings.
+    # Parse arguments.
+    ndim = 2 # Number of spatial dimensions; the weight must have ndim + 2 dimensions.
+    weight_shape = tuple(weight_shape)
+    stride = _tuple_of_ints(stride, ndim)
+    padding = _tuple_of_ints(padding, ndim)
+    output_padding = _tuple_of_ints(output_padding, ndim)
+    dilation = _tuple_of_ints(dilation, ndim)
+
+    # Lookup from cache.
+    key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
+    if key in _conv2d_gradfix_cache:
+        return _conv2d_gradfix_cache[key]
+
+    # Validate arguments.
+    assert groups >= 1
+    assert len(weight_shape) == ndim + 2
+    assert all(stride[i] >= 1 for i in range(ndim))
+    assert all(padding[i] >= 0 for i in range(ndim))
+    assert all(dilation[i] >= 0 for i in range(ndim))
+    if not transpose:
+        assert all(output_padding[i] == 0 for i in range(ndim))
+    else: # transpose
+        assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))
+
+    # Helpers.
+    common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
+    def calc_output_padding(input_shape, output_shape): # output_padding for the opposite-direction op used in backward.
+        if transpose:
+            return [0, 0] # The opposite op is then a regular conv, which must have zero output_padding (asserted above).
+        return [
+            input_shape[i + 2]
+            - (output_shape[i + 2] - 1) * stride[i]
+            - (1 - 2 * padding[i])
+            - dilation[i] * (weight_shape[i + 2] - 1)
+            for i in range(ndim)
+        ]
+
+    # Forward & backward.
+    class Conv2d(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, input, weight, bias):
+            assert weight.shape == weight_shape
+            if not transpose:
+                output = torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs)
+            else: # transpose
+                output = torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs)
+            ctx.save_for_backward(input, weight)
+            return output
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            input, weight = ctx.saved_tensors
+            grad_input = None
+            grad_weight = None
+            grad_bias = None
+
+            if ctx.needs_input_grad[0]: # Input gradient: opposite-direction op applied to grad_output with the same weight.
+                p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
+                grad_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, weight, None)
+                assert grad_input.shape == input.shape
+
+            if ctx.needs_input_grad[1] and not weight_gradients_disabled: # Skipped while no_weight_gradients() is active.
+                grad_weight = Conv2dGradWeight.apply(grad_output, input)
+                assert grad_weight.shape == weight_shape
+
+            if ctx.needs_input_grad[2]:
+                grad_bias = grad_output.sum([0, 2, 3]) # Sum over every dimension except dim 1.
+
+            return grad_input, grad_weight, grad_bias
+
+    # Gradient with respect to the weights.
+    class Conv2dGradWeight(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, grad_output, input):
+            op = torch._C._jit_get_operation('aten::cudnn_convolution_backward_weight' if not transpose else 'aten::cudnn_convolution_transpose_backward_weight') # Private API: looks the operator up by name.
+            flags = [torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32]
+            grad_weight = op(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags)
+            assert grad_weight.shape == weight_shape
+            ctx.save_for_backward(grad_output, input)
+            return grad_weight
+
+        @staticmethod
+        def backward(ctx, grad2_grad_weight):
+            grad_output, input = ctx.saved_tensors
+            grad2_grad_output = None
+            grad2_input = None
+
+            if ctx.needs_input_grad[0]: # Gradient w.r.t. grad_output: the forward op with grad2_grad_weight in place of the weight.
+                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None)
+                assert grad2_grad_output.shape == grad_output.shape
+
+            if ctx.needs_input_grad[1]: # Gradient w.r.t. input: opposite-direction op with grad2_grad_weight in place of the weight.
+                p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
+                grad2_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, grad2_grad_weight, None)
+                assert grad2_input.shape == input.shape
+
+            return grad2_grad_output, grad2_input
+
+    _conv2d_gradfix_cache[key] = Conv2d
+    return Conv2d
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/conv2d_resample.py b/torch_utils/ops/conv2d_resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a0883f731156af72ec19829ef0bfb8026682be
--- /dev/null
+++ b/torch_utils/ops/conv2d_resample.py
@@ -0,0 +1,158 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""2D convolution with optional up/downsampling."""
+
+import torch
+
+from .. import misc
+from . import conv2d_gradfix
+from . import upfirdn2d
+from .upfirdn2d import _parse_padding
+from .upfirdn2d import _get_filter_size
+
+#----------------------------------------------------------------------------
+
+def _get_weight_shape(w): # Returns `w.shape` as a list of Python ints.
+    with misc.suppress_tracer_warnings(): # this value will be treated as a constant
+        shape = [int(sz) for sz in w.shape]
+    misc.assert_shape(w, shape) # NOTE(review): presumably re-asserts the shape for the tracer -- confirm in misc.
+    return shape
+
+#----------------------------------------------------------------------------
+
+def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True):
+    """Dispatch to `conv2d_gradfix.conv2d()` / `conv_transpose2d()`, optionally flipping `w`
+    spatially and special-casing small 1x1 convolutions on channels_last input."""
+    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
+
+    # conv2d() performs correlation (flip_weight=True); flip the kernel to get convolution (flip_weight=False).
+    if not flip_weight:
+        w = w.flip([2, 3])
+
+    # Workaround for a performance pitfall in cuDNN 8.0.5 that is triggered by the combination
+    # 1x1 kernel + memory_format=channels_last + less than 64 channels.
+    plain_1x1 = kw == 1 and kh == 1 and stride == 1 and padding in [0, [0, 0], (0, 0)] and not transpose
+    if plain_1x1 and x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64:
+        if out_channels <= 4 and groups == 1:
+            x_shape = x.shape
+            flat = w.squeeze(3).squeeze(2) @ x.reshape([x_shape[0], in_channels_per_group, -1])
+            x = flat.reshape([x_shape[0], out_channels, x_shape[2], x_shape[3]])
+        else:
+            x_cont = x.to(memory_format=torch.contiguous_format)
+            w_cont = w.to(memory_format=torch.contiguous_format)
+            x = conv2d_gradfix.conv2d(x_cont, w_cont, groups=groups)
+        return x.to(memory_format=torch.channels_last)
+
+    # General case: run the regular or the transposed op from conv2d_gradfix.
+    conv_fn = getattr(conv2d_gradfix, 'conv_transpose2d' if transpose else 'conv2d')
+    return conv_fn(x, w, stride=stride, padding=padding, groups=groups)
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False):
+    r"""2D convolution with optional up/downsampling.
+
+    Padding is performed only once at the beginning, not between the operations.
+
+    Args:
+        x:              Input tensor of shape
+                        `[batch_size, in_channels, in_height, in_width]`.
+        w:              Weight tensor of shape
+                        `[out_channels, in_channels//groups, kernel_height, kernel_width]`.
+        f:              Low-pass filter for up/downsampling. Must be prepared beforehand by
+                        calling upfirdn2d.setup_filter(). None = identity (default).
+        up:             Integer upsampling factor (default: 1).
+        down:           Integer downsampling factor (default: 1).
+        padding:        Padding with respect to the upsampled image. Can be a single number
+                        or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                        (default: 0).
+        groups:         Split input channels into N groups (default: 1).
+        flip_weight:    False = convolution, True = correlation (default: True).
+        flip_filter:    False = convolution, True = correlation (default: False).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    # Validate arguments.
+    assert isinstance(x, torch.Tensor) and (x.ndim == 4)
+    assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
+    assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32)
+    assert isinstance(up, int) and (up >= 1)
+    assert isinstance(down, int) and (down >= 1)
+    assert isinstance(groups, int) and (groups >= 1)
+    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
+    fw, fh = _get_filter_size(f)
+    px0, px1, py0, py1 = _parse_padding(padding)
+
+    # Adjust padding to account for up/downsampling.
+    if up > 1:
+        px0 += (fw + up - 1) // 2
+        px1 += (fw - up) // 2
+        py0 += (fh + up - 1) // 2
+        py1 += (fh - up) // 2
+    if down > 1:
+        px0 += (fw - down + 1) // 2
+        px1 += (fw - down) // 2
+        py0 += (fh - down + 1) // 2
+        py1 += (fh - down) // 2
+
+    # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
+    if kw == 1 and kh == 1 and (down > 1 and up == 1):
+        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
+        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+        return x
+
+    # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
+    if kw == 1 and kh == 1 and (up > 1 and down == 1):
+        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+        x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
+        return x
+
+    # Fast path: downsampling only => use strided convolution.
+    if down > 1 and up == 1:
+        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
+        x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight)
+        return x
+
+    # Fast path: upsampling with optional downsampling => use transpose strided convolution.
+    if up > 1:
+        if groups == 1:
+            w = w.transpose(0, 1) # Swap the first two weight dims for the transposed convolution.
+        else:
+            w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
+            w = w.transpose(1, 2) # Same swap, performed within each group.
+            w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw)
+        px0 -= kw - 1
+        px1 -= kw - up
+        py0 -= kh - 1
+        py1 -= kh - up
+        pxt = max(min(-px0, -px1), 0) # Passed as `padding` to the transposed conv; added back to the upfirdn2d padding below.
+        pyt = max(min(-py0, -py1), 0)
+        x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight))
+        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter)
+        if down > 1:
+            x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+        return x
+
+    # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
+    if up == 1 and down == 1:
+        if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0: # Symmetric, non-negative padding only.
+            return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight)
+
+    # Fallback: Generic reference implementation.
+    x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter) # The filter is passed here only when up > 1.
+    x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+    if down > 1:
+        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+    return x
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/filtered_lrelu.cpp b/torch_utils/ops/filtered_lrelu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e253d1f3ffe84e54e667bf61a45dfe66264a73c
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.cpp
@@ -0,0 +1,300 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "filtered_lrelu.h"
+
+//------------------------------------------------------------------------
+// Host entry point: validates inputs, selects a CUDA kernel specialization and launches it on the current stream. Returns (y, so, 0) on success, where so is the sign tensor allocated when writeSigns is set (an undefined tensor otherwise), or (empty, empty, -1) when no kernel exists for the given up/down factors and filter shapes. Invalid arguments raise via TORCH_CHECK.
+static std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
+    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b, torch::Tensor si,
+    int up, int down, int px0, int px1, int py0, int py1, int sx, int sy, float gain, float slope, float clamp, bool flip_filters, bool writeSigns)
+{
+    // Set CUDA device to the one holding x for the duration of this call.
+    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+
+    // Validate arguments. The sign tensor si is validated further below, and only if signs are read or written.
+    TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() && b.device() == x.device(), "all input tensors must reside on the same device");
+    TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat, "fu and fd must be float32");
+    TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
+    TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat, "x and b must be float16 or float32");
+    TORCH_CHECK(x.dim() == 4, "x must be rank 4");
+    TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
+    TORCH_CHECK(x.numel() > 0, "x is empty");
+    TORCH_CHECK((fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2), "fu and fd must be rank 1 or 2");
+    TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX, "fu is too large");
+    TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX, "fd is too large");
+    TORCH_CHECK(fu.numel() > 0, "fu is empty");
+    TORCH_CHECK(fd.numel() > 0, "fd is empty");
+    TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1), "b must be a vector with the same number of channels as x");
+    TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");
+
+    // Figure out how much shared memory is available on the device (opt-in per-block maximum, converted to KB below).
+    int maxSharedBytes = 0;
+    AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, x.device().index()));
+    int sharedKB = maxSharedBytes >> 10;
+
+    // Populate enough launch parameters to check if a CUDA kernel exists.
+    filtered_lrelu_kernel_params p;
+    p.up      = up;
+    p.down    = down;
+    p.fuShape = make_int2((int)fu.size(-1), fu.dim() == 2 ? (int)fu.size(0) : 0); // shape [n, 0] indicates separable filter.
+    p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0); // Same convention as fuShape.
+    filtered_lrelu_kernel_spec test_spec = choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB); // Probe only: just the presence of test_spec.exec is used.
+    if (!test_spec.exec)
+    {
+        // No kernel found - return empty tensors and indicate missing kernel with return code of -1.
+        return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
+    }
+
+    // Input/output element size in bytes (x was checked above to be float16 or float32).
+    int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;
+
+    // Input sizes. The fut_*/fdt_* values are the filter extents minus one.
+    int64_t xw = (int)x.size(3);
+    int64_t xh = (int)x.size(2);
+    int64_t fut_w = (int)fu.size(-1) - 1;
+    int64_t fut_h = (int)fu.size(0)  - 1;
+    int64_t fdt_w = (int)fd.size(-1) - 1;
+    int64_t fdt_h = (int)fd.size(0)  - 1;
+
+    // Logical size of upsampled buffer.
+    int64_t cw = xw * up + (px0 + px1) - fut_w;
+    int64_t ch = xh * up + (py0 + py1) - fut_h;
+    TORCH_CHECK(cw > fdt_w && ch > fdt_h, "upsampled buffer must be at least the size of downsampling filter");
+    TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");
+
+    // Compute output size and allocate.
+    int64_t yw = (cw - fdt_w + (down - 1)) / down;
+    int64_t yh = (ch - fdt_h + (down - 1)) / down;
+    TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
+    TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
+    torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(), x.suggest_memory_format());
+
+    // Allocate sign tensor when writing signs; otherwise use si as given.
+    torch::Tensor so;
+    torch::Tensor s = si;
+    bool readSigns = !!s.numel(); // A non-empty si means signs are to be read.
+    int64_t sw_active = 0; // Active width of sign tensor.
+    if (writeSigns)
+    {
+        sw_active = yw * down - (down - 1) + fdt_w;     // Active width in elements.
+        int64_t sh = yh * down - (down - 1) + fdt_h;    // Height = active height.
+        int64_t sw = (sw_active + 15) & ~15;            // Width  = active width in elements, rounded up to multiple of 16.
+        TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
+        s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous); // Last dimension is in bytes, four sign elements per byte.
+    }
+    else if (readSigns)
+        sw_active = s.size(3) << 2; // Convert stored width in bytes to elements.
+
+    // Validate sign tensor if in use.
+    if (readSigns || writeSigns)
+    {
+        TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
+        TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
+        TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
+        TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
+        TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
+        TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX, "signs is too large");
+    }
+
+    // Populate rest of CUDA kernel parameters.
+    p.x         = x.data_ptr();
+    p.y         = y.data_ptr();
+    p.b         = b.data_ptr();
+    p.s         = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0; // Null when signs are not in use.
+    p.fu        = fu.data_ptr<float>();
+    p.fd        = fd.data_ptr<float>();
+    p.pad0      = make_int2(px0, py0);
+    p.gain      = gain;
+    p.slope     = slope;
+    p.clamp     = clamp;
+    p.flip      = (flip_filters) ? 1 : 0;
+    p.xShape    = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
+    p.yShape    = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
+    p.sShape    = (readSigns || writeSigns) ? make_int2((int)s.size(3), (int)s.size(2)) : make_int2(0, 0); // Width is in bytes. Contiguous.
+    p.sOfs      = make_int2(sx, sy);
+    p.swLimit   = (sw_active + 3) >> 2; // Rounded up to bytes.
+
+    // x, y, b strides are in bytes.
+    p.xStride   = make_longlong4(sz * x.stride(3), sz * x.stride(2), sz * x.stride(1), sz * x.stride(0));
+    p.yStride   = make_longlong4(sz * y.stride(3), sz * y.stride(2), sz * y.stride(1), sz * y.stride(0));
+    p.bStride   = sz * b.stride(0);
+
+    // fu, fd strides are in elements.
+    p.fuStride  = make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
+    p.fdStride  = make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);
+
+    // Determine if indices don't fit in int32. Support negative strides although Torch currently never produces those.
+    bool index64b = false;
+    if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
+    if (std::min(x.size(0) * p.xStride.w, 0ll) + std::min(x.size(1) * p.xStride.z, 0ll) + std::min(x.size(2) * p.xStride.y, 0ll) + std::min(x.size(3) * p.xStride.x, 0ll) < -INT_MAX) index64b = true;
+    if (std::max(x.size(0) * p.xStride.w, 0ll) + std::max(x.size(1) * p.xStride.z, 0ll) + std::max(x.size(2) * p.xStride.y, 0ll) + std::max(x.size(3) * p.xStride.x, 0ll) >  INT_MAX) index64b = true;
+    if (std::min(y.size(0) * p.yStride.w, 0ll) + std::min(y.size(1) * p.yStride.z, 0ll) + std::min(y.size(2) * p.yStride.y, 0ll) + std::min(y.size(3) * p.yStride.x, 0ll) < -INT_MAX) index64b = true;
+    if (std::max(y.size(0) * p.yStride.w, 0ll) + std::max(y.size(1) * p.yStride.z, 0ll) + std::max(y.size(2) * p.yStride.y, 0ll) + std::max(y.size(3) * p.yStride.x, 0ll) >  INT_MAX) index64b = true;
+    if (s.numel() > INT_MAX) index64b = true;
+
+    // Choose CUDA kernel.
+    filtered_lrelu_kernel_spec spec = { 0 };
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_cuda", [&]
+    {
+        if constexpr (sizeof(scalar_t) <= 4) // Exclude doubles. constexpr prevents template instantiation.
+        {
+            // Choose kernel based on index type, datatype and sign read/write modes. No branch matches when writeSigns and readSigns are both set, so spec stays empty in that case.
+            if      (!index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true,  false>(p, sharedKB);
+            else if (!index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true >(p, sharedKB);
+            else if (!index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(p, sharedKB);
+            else if ( index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true,  false>(p, sharedKB);
+            else if ( index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true >(p, sharedKB);
+            else if ( index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(p, sharedKB);
+        }
+    });
+    TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found") // This should not happen because we tested earlier that kernel exists.
+
+    // Launch CUDA kernel: derive block size and grid dimensions from the output shape and the kernel's output tile size.
+    void* args[] = {&p};
+    int bx = spec.numWarps * 32;
+    int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
+    int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
+    int gz = p.yShape.z * p.yShape.w; // Channels times batch size.
+
+    // Repeat multiple horizontal tiles in a CTA?
+    if (spec.xrep)
+    {
+        p.tilesXrep = spec.xrep;
+        p.tilesXdim = gx;
+
+        gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
+        std::swap(gx, gy); // Grid x now counts tile rows, grid y counts groups of repeated horizontal tiles.
+    }
+    else
+    {
+        p.tilesXrep = 0;
+        p.tilesXdim = 0;
+    }
+
+    // Launch filter setup kernel (a single block of 1024 threads).
+    AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0, at::cuda::getCurrentCUDAStream()));
+
+    // Copy filter kernels to constant memory.
+    if      ( writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<true,  false>(at::cuda::getCurrentCUDAStream())));
+    else if (!writeSigns &&  readSigns) AT_CUDA_CHECK((copy_filters<false, true >(at::cuda::getCurrentCUDAStream())));
+    else if (!writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<false, false>(at::cuda::getCurrentCUDAStream())));
+
+    // Set cache and shared memory configurations for main kernel.
+    AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
+    if (spec.dynamicSharedKB) // Need dynamically allocated shared memory?
+        AT_CUDA_CHECK(cudaFuncSetAttribute(spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize, spec.dynamicSharedKB << 10));
+    AT_CUDA_CHECK(cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));
+
+    // Launch main kernel.
+    const int maxSubGz = 65535; // CUDA maximum for grid z dimension.
+    for (int zofs=0; zofs < gz; zofs += maxSubGz) // Do multiple launches if gz is too big.
+    {
+        p.blockZofs = zofs;
+        int subGz = std::min(maxSubGz, gz - zofs);
+        AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args, spec.dynamicSharedKB << 10, at::cuda::getCurrentCUDAStream()));
+    }
+
+    // Done. The sign tensor so is undefined unless writeSigns was set.
+    return std::make_tuple(y, so, 0);
+}
+
+//------------------------------------------------------------------------
+// Activation-only entry point: validates x and the optional sign tensor, then launches the activation kernel on x (registered below as the in-place variant). Returns so, the sign tensor allocated when writeSigns is set; it is an undefined tensor otherwise.
+static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor si, int sx, int sy, float gain, float slope, float clamp, bool writeSigns)
+{
+    // Set CUDA device to the one holding x for the duration of this call.
+    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+
+    // Validate arguments. Unlike the fused op, float64 input is accepted here.
+    TORCH_CHECK(x.dim() == 4, "x must be rank 4");
+    TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
+    TORCH_CHECK(x.numel() > 0, "x is empty");
+    TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat || x.dtype() == torch::kDouble, "x must be float16, float32 or float64");
+
+    // Allocate an output sign tensor when writeSigns is set; otherwise use si as given.
+    torch::Tensor so;
+    torch::Tensor s = si;
+    bool readSigns = !!s.numel(); // A non-empty si means signs are to be read.
+    if (writeSigns)
+    {
+        int64_t sw = x.size(3);
+        sw = (sw + 15) & ~15; // Round to a multiple of 16 for coalescing.
+        s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous); // Last dimension is in bytes, four sign elements per byte.
+    }
+
+    // Validate sign tensor if in use.
+    if (readSigns || writeSigns)
+    {
+        TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
+        TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
+        TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
+        TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
+        TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
+        TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX, "signs tensor is too large");
+    }
+
+    // Initialize CUDA kernel parameters.
+    filtered_lrelu_act_kernel_params p;
+    p.x         = x.data_ptr();
+    p.s         = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0; // Null when signs are not in use.
+    p.gain      = gain;
+    p.slope     = slope;
+    p.clamp     = clamp;
+    p.xShape    = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
+    p.xStride   = make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0)); // Strides are passed in elements here, not bytes.
+    p.sShape    = (readSigns || writeSigns) ? make_int2((int)s.size(3) << 2, (int)s.size(2)) : make_int2(0, 0); // Width is in elements. Contiguous.
+    p.sOfs      = make_int2(sx, sy);
+
+    // Choose CUDA kernel. writeSigns takes precedence over readSigns when both are set.
+    void* func = 0;
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_act_cuda", [&]
+    {
+        if (writeSigns)
+            func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();
+        else if (readSigns)
+            func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();
+        else
+            func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
+    });
+    TORCH_CHECK(func, "internal error - CUDA kernel not found");
+
+    // Launch CUDA kernel.
+    void* args[] = {&p};
+    int bx = 128; // 4 warps per block.
+
+    // Logical size of launch = writeSigns ? p.s : p.x
+    uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;
+    uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;
+    uint32_t gz = p.xShape.z * p.xShape.w; // Same as in p.sShape if signs are in use.
+    gx = (gx - 1) / bx + 1; // Number of blocks of bx threads needed to cover the width.
+
+    // Make sure grid y and z dimensions are within CUDA launch limits. Kernel loops internally to do the rest.
+    const uint32_t gmax = 65535;
+    gy = std::min(gy, gmax);
+    gz = std::min(gz, gmax);
+
+    // Launch on the current CUDA stream.
+    AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0, at::cuda::getCurrentCUDAStream()));
+    return so;
+}
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    // Register the complete fused op first, then the activation-and-signs-only op, which modifies its data tensor in-place.
+    m.def("filtered_lrelu", &filtered_lrelu).def("filtered_lrelu_act_", &filtered_lrelu_act);
+}
+
+//------------------------------------------------------------------------
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu.cu b/torch_utils/ops/filtered_lrelu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..50bad61678bec055dade2b337f12ea395aeb382e
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.cu
@@ -0,0 +1,1284 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <c10/util/Half.h>
+#include "filtered_lrelu.h"
+#include <cstdint>
+
+//------------------------------------------------------------------------
+// Helpers.
+
+enum // Filter modes: each of the upsampling and downsampling filters is either separable or full.
+{
+    MODE_SUSD = 0,  // Separable upsampling, separable downsampling.
+    MODE_FUSD = 1,  // Full upsampling, separable downsampling.
+    MODE_SUFD = 2,  // Separable upsampling, full downsampling.
+    MODE_FUFD = 3,  // Full upsampling, full downsampling.
+};
+
+// Arithmetic types used inside the kernels for data stored as T.
+template <class T> struct InternalType;
+template <> struct InternalType<float>
+{
+    using scalar_t = float;
+    using vec2_t   = float2;
+    using vec4_t   = float4;
+    __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
+    __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
+    __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
+};
+// Half-precision data is processed with the same float types and helpers as float data.
+template <> struct InternalType<c10::Half> : InternalType<float> {};
+template <> struct InternalType<double>
+{
+    using scalar_t = double;
+    using vec2_t   = double2;
+    using vec4_t   = double4;
+    __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_double2(0, 0); }
+    __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_double4(0, 0, 0, 0); }
+    __device__ __forceinline__ static double clamp(double x, double c) { return fmin(fmax(x, -c), c); }
+};
+
+#define MIN(A, B)       ((A) < (B) ? (A) : (B)) // Note: arguments are evaluated more than once.
+#define MAX(A, B)       ((A) > (B) ? (A) : (B)) // Note: arguments are evaluated more than once.
+#define CEIL_DIV(A, B) (((B)==1) ? (A) : \
+                        ((B)==2) ? ((int)((A)+1) >> 1) : \
+                        ((B)==4) ? ((int)((A)+3) >> 2) : \
+                        (((A) + ((A) > 0 ? (B) - 1 : 0)) / (B))) // Division rounding up; B of 1, 2 and 4 get dedicated forms, and the general form adds B-1 only for positive A.
+
+// Computes y = i / N and x = i - y * N. Non-power-of-two N up to 256 use a multiply-and-shift in place of the division (valid only for i < N*256); every other N uses plain division.
+template <int N> __device__ __forceinline__ void fast_div_mod(int& x, int& y, unsigned int i)
+{
+    const bool isPow2 = !(N & (N-1));
+    const unsigned int quot = (!isPow2 && N <= 256)
+        ? (i * ((1<<24)/N + 1)) >> 24 // Assumes N <= 256, i < N*256.
+        : i/N;
+    y = quot;
+    x = i - y*N;
+}
+
+// Reads the stored 64-bit stride through a reference of type T, i.e. the cast happens before the value is loaded.
+template <class T> __device__ __forceinline__ T get_stride(const int64_t& x)
+{
+    return reinterpret_cast<const T&>(x);
+}
+
+//------------------------------------------------------------------------
+// Filters, setup kernel, copying function.
+
+#define MAX_FILTER_SIZE 32 // Largest supported filter extent per axis; each filter gets MAX_FILTER_SIZE * MAX_FILTER_SIZE floats of buffer space.
+
+// Combined up/down filter buffers so that transfer can be done with one copy.
+__device__              float g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in global memory, written by setup kernel.
+__device__ __constant__ float c_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in constant memory, read by main kernel.
+
+// Accessors to combined buffers to index up/down filters individually: the upsampling filter occupies the first half, the downsampling filter the second half.
+#define c_fu (c_fbuf)
+#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
+#define g_fu (g_fbuf)
+#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
+
+// Writes both filters into the global buffers g_fu / g_fd, laid out on a MAX_FILTER_SIZE x MAX_FILTER_SIZE grid with zeros outside the filter extent. Taps are read in reversed order unless p.flip is set.
+static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p)
+{
+    for (int tap = threadIdx.x; tap < MAX_FILTER_SIZE * MAX_FILTER_SIZE; tap += blockDim.x)
+    {
+        int col, row; // Position of this tap within the padded grid.
+        fast_div_mod<MAX_FILTER_SIZE>(col, row, tap);
+
+        // Upsampling filter. A shape with y == 0 marks a separable filter, which only occupies row 0.
+        const bool fuFull   = (p.fuShape.y > 0);
+        const bool fuInside = (col < p.fuShape.x) && (fuFull ? (row < p.fuShape.y) : (row <= 0));
+        const int  fuSrcX   = p.flip ? col : (p.fuShape.x - 1 - col);
+        const int  fuSrcY   = p.flip ? row : (p.fuShape.y - 1 - row);
+        g_fu[tap] = !fuInside ? 0.0f : fuFull ? p.fu[fuSrcX * p.fuStride.x + fuSrcY * p.fuStride.y] : p.fu[fuSrcX * p.fuStride.x];
+
+        // Downsampling filter, handled the same way.
+        const bool fdFull   = (p.fdShape.y > 0);
+        const bool fdInside = (col < p.fdShape.x) && (fdFull ? (row < p.fdShape.y) : (row <= 0));
+        const int  fdSrcX   = p.flip ? col : (p.fdShape.x - 1 - col);
+        const int  fdSrcY   = p.flip ? row : (p.fdShape.y - 1 - row);
+        g_fd[tap] = !fdInside ? 0.0f : fdFull ? p.fd[fdSrcX * p.fdStride.x + fdSrcY * p.fdStride.y] : p.fd[fdSrcX * p.fdStride.x];
+    }
+}
+
+// Host function to copy filters written by setup kernel into constant buffer for main kernel.
+template <bool, bool> static cudaError_t copy_filters(cudaStream_t stream)
+{
+    void* src = 0;
+    cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf);
+    if (err) return err;
+    return cudaMemcpyToSymbolAsync(c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream);
+}
+
+//------------------------------------------------------------------------
+// Coordinate spaces:
+// - Relative to input tensor:      inX, inY, tileInX, tileInY
+// - Relative to input tile:        relInX, relInY, tileInW, tileInH
+// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH
+// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH
+// - Relative to output tensor:     outX, outY, tileOutX, tileOutY
+//
+// Relationships between coordinate spaces:
+// - inX = tileInX + relInX
+// - inY = tileInY + relInY
+// - relUpX = relInX * up + phaseInX
+// - relUpY = relInY * up + phaseInY
+// - relUpX = relOutX * down
+// - relUpY = relOutY * down
+// - outX = tileOutX + relOutX
+// - outY = tileOutY + relOutY
+
+extern __shared__ char s_buf_raw[]; // Externally (dynamically) allocated shared memory buffer. The main kernel uses it only when sharedKB > 48; when sharedKB <= 48 it allocates its shared memory statically inside the kernel instead.
+
+template <class T, class index_t, int sharedKB, bool signWrite, bool signRead, int filterMode, int up, int fuSize, int down, int fdSize, int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep, bool enableWriteSkip>
+static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p)
+{
+    // Check that we don't try to support non-existing filter modes.
+    static_assert(up   == 1 || up   == 2 || up   == 4, "only up=1, up=2, up=4 scales supported");
+    static_assert(down == 1 || down == 2 || down == 4, "only down=1, down=2, down=4 scales supported");
+    static_assert(fuSize >= up,   "upsampling filter size must be at least upsampling factor");
+    static_assert(fdSize >= down, "downsampling filter size must be at least downsampling factor");
+    static_assert(fuSize % up   == 0, "upsampling filter size must be divisible with upsampling factor");
+    static_assert(fdSize % down == 0, "downsampling filter size must be divisible with downsampling factor");
+    static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE, "filter size greater than MAX_FILTER_SIZE");
+    static_assert(up   != 1 || (fuSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "up=1 supported only for 1x1 full filters");
+    static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "down=1 supported only for 1x1 full filters");
+    static_assert(!(up   == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "full filters not supported for up=4");
+    static_assert(!(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "full filters not supported for down=4");
+
+    // Static definitions.
+    typedef typename InternalType<T>::scalar_t scalar_t;
+    typedef typename InternalType<T>::vec2_t vec2_t;
+    typedef typename InternalType<T>::vec4_t vec4_t;
+    const int tileUpW    = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) & ~3;  // Upsampled tile width, rounded up to multiple of 4.
+    const int tileUpH    = tileOutH * down + (fdSize - 1) - (down - 1);             // Upsampled tile height.
+    const int tileInW    = CEIL_DIV(tileUpW  + (fuSize - 1), up);                   // Input tile width.
+    const int tileInH    = CEIL_DIV(tileUpH  + (fuSize - 1), up);                   // Input tile height.
+    const int tileUpH_up = CEIL_DIV(tileUpH, up) * up;                              // Upsampled tile height rounded up to a multiple of up.
+    const int tileInH_up = CEIL_DIV(tileUpH_up + (fuSize - 1), up);                 // For allocations only, to avoid shared memory read overruns with up=2 and up=4.
+
+    // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
+    const bool downInline = (down == 1) && ((up == 1 && filterMode == MODE_FUFD) || (up == 2 && filterMode == MODE_SUFD));
+
+    // Sizes of logical buffers.
+    const int szIn    = tileInH_up * tileInW;
+    const int szUpX   = tileInH_up * tileUpW;
+    const int szUpXY  = downInline ? 0 : (tileUpH * tileUpW);
+    const int szDownX = tileUpH * tileOutW;
+
+    // Sizes for shared memory arrays.
+    const int s_buf0_size_base =
+        (filterMode == MODE_SUSD) ? MAX(szIn, szUpXY) :
+        (filterMode == MODE_FUSD) ? MAX(szIn, szDownX) :
+        (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY) :
+        (filterMode == MODE_FUFD) ? szIn :
+        -1;
+    const int s_buf1_size_base =
+        (filterMode == MODE_SUSD) ? MAX(szUpX, szDownX) :
+        (filterMode == MODE_FUSD) ? szUpXY :
+        (filterMode == MODE_SUFD) ? szUpX  :
+        (filterMode == MODE_FUFD) ? szUpXY :
+        -1;
+
+    // Ensure U128 alignment.
+    const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
+    const int s_buf1_size = (s_buf1_size_base + 3) & ~3;
+
+    // Check at compile time that we don't use too much shared memory.
+    static_assert((s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10), "shared memory overflow");
+
+    // Declare shared memory arrays.
+    scalar_t* s_buf0;
+    scalar_t* s_buf1;
+    if (sharedKB <= 48)
+    {
+        // Allocate shared memory arrays here.
+        __shared__ scalar_t s_buf0_st[(sharedKB > 48) ? (1<<24) : (s_buf0_size + s_buf1_size)]; // Prevent launching if this isn't optimized away when unused.
+        s_buf0 = s_buf0_st;
+        s_buf1 = s_buf0 + s_buf0_size;
+    }
+    else
+    {
+        // Use the dynamically allocated shared memory array.
+        s_buf0 = (scalar_t*)s_buf_raw;
+        s_buf1 = s_buf0 + s_buf0_size;
+    }
+
+    // Pointers to the buffers.
+    scalar_t* s_tileIn;       // Input tile:                      [relInX * tileInH + relInY]
+    scalar_t* s_tileUpX;      // After horizontal upsampling:     [relInY * tileUpW + relUpX]
+    scalar_t* s_tileUpXY;     // After upsampling:                [relUpY * tileUpW + relUpX]
+    scalar_t* s_tileDownX;    // After horizontal downsampling:   [relUpY * tileOutW + relOutX]
+    if (filterMode == MODE_SUSD)
+    {
+        s_tileIn    = s_buf0;
+        s_tileUpX   = s_buf1;
+        s_tileUpXY  = s_buf0;
+        s_tileDownX = s_buf1;
+    }
+    else if (filterMode == MODE_FUSD)
+    {
+        s_tileIn    = s_buf0;
+        s_tileUpXY  = s_buf1;
+        s_tileDownX = s_buf0;
+    }
+    else if (filterMode == MODE_SUFD)
+    {
+        s_tileIn    = s_buf0;
+        s_tileUpX   = s_buf1;
+        s_tileUpXY  = s_buf0;
+    }
+    else if (filterMode == MODE_FUFD)
+    {
+        s_tileIn    = s_buf0;
+        s_tileUpXY  = s_buf1;
+    }
+
+    // Allow large grids in z direction via per-launch offset.
+    int channelIdx = blockIdx.z + p.blockZofs;
+    int batchIdx = channelIdx / p.yShape.z;
+    channelIdx -= batchIdx * p.yShape.z;
+
+    // Offset to output feature map. In bytes.
+    index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) + batchIdx * get_stride<index_t>(p.yStride.w);
+
+    // Sign shift amount.
+    uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;
+
+    // Inner tile loop.
+    #pragma unroll 1
+    for (int tileIdx = 0; !enableXrep || (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y)); tileIdx++)
+    {
+        // Locate output tile.
+        int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
+        int tileOutX = tileX * tileOutW;
+        int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;
+
+        // Locate input tile.
+        int tmpX = tileOutX * down - p.pad0.x;
+        int tmpY = tileOutY * down - p.pad0.y;
+        int tileInX = CEIL_DIV(tmpX, up);
+        int tileInY = CEIL_DIV(tmpY, up);
+        const int phaseInX = tileInX * up - tmpX;
+        const int phaseInY = tileInY * up - tmpY;
+
+        // Extra sync if input and output buffers are the same and we are not on first tile.
+        if (enableXrep && tileIdx > 0 && (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) || (filterMode == MODE_FUFD && downInline)))
+            __syncthreads();
+
+        // Load input tile & apply bias. Unrolled.
+        scalar_t b = (scalar_t)*(const T*)((const char*)p.b + (channelIdx * get_stride<index_t>(p.bStride)));
+        index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) + batchIdx * get_stride<index_t>(p.xStride.w);
+        int idx = threadIdx.x;
+        const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
+        #pragma unroll
+        for (int loop = 0; loop < loopCountIN; loop++)
+        {
+            int relInX, relInY;
+            fast_div_mod<tileInW>(relInX, relInY, idx);
+            int inX = tileInX + relInX;
+            int inY = tileInY + relInY;
+            scalar_t v = 0;
+
+            if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
+                v = (scalar_t)*((const T*)((const char*)p.x + (inX * get_stride<index_t>(p.xStride.x) + inY * get_stride<index_t>(p.xStride.y) + mapOfsIn))) + b;
+
+            bool skip = (loop == loopCountIN-1) && (idx >= tileInW * tileInH);
+            if (!skip)
+                s_tileIn[idx] = v;
+
+            idx += threadsPerBlock;
+        }
+
+        if (filterMode == MODE_SUSD || filterMode == MODE_SUFD) // Separable upsampling filter.
+        {
+            // Horizontal upsampling.
+            __syncthreads();
+            if (up == 4)
+            {
+                for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
+                {
+                    int relUpX0, relInY;
+                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
+                    int relInX0 = relUpX0 / up;
+                    int src0 = relInX0 + tileInW * relInY;
+                    int dst = relInY * tileUpW + relUpX0;
+                    vec4_t v = InternalType<T>::zero_vec4();
+                    scalar_t a = s_tileIn[src0];
+                    if (phaseInX == 0)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 3];
+                            v.z += a * (scalar_t)c_fu[step * up + 2];
+                            v.w += a * (scalar_t)c_fu[step * up + 1];
+                        }
+                    }
+                    else if (phaseInX == 1)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                            v.z += a * (scalar_t)c_fu[step * up + 3];
+                            v.w += a * (scalar_t)c_fu[step * up + 2];
+                        }
+                    }
+                    else if (phaseInX == 2)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 2];
+                            v.y += a * (scalar_t)c_fu[step * up + 1];
+                            v.z += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                            v.w += a * (scalar_t)c_fu[step * up + 3];
+                        }
+                    }
+                    else // (phaseInX == 3)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 3];
+                            v.y += a * (scalar_t)c_fu[step * up + 2];
+                            v.z += a * (scalar_t)c_fu[step * up + 1];
+                            v.w += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                        }
+                    }
+                    s_tileUpX[dst+0] = v.x;
+                    s_tileUpX[dst+1] = v.y;
+                    s_tileUpX[dst+2] = v.z;
+                    s_tileUpX[dst+3] = v.w;
+                }
+            }
+            else if (up == 2)
+            {
+                bool p0 = (phaseInX == 0);
+                for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
+                {
+                    int relUpX0, relInY;
+                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
+                    int relInX0 = relUpX0 / up;
+                    int src0 = relInX0 + tileInW * relInY;
+                    int dst = relInY * tileUpW + relUpX0;
+                    vec2_t v = InternalType<T>::zero_vec2();
+                    scalar_t a = s_tileIn[src0];
+                    if (p0) // (phaseInX == 0)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 1];
+                        }
+                    }
+                    else // (phaseInX == 1)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileIn[src0 + step + 1];
+                        }
+                    }
+                    s_tileUpX[dst+0] = v.x;
+                    s_tileUpX[dst+1] = v.y;
+                }
+            }
+
+            // Vertical upsampling & nonlinearity.
+
+            __syncthreads();
+            int groupMask = 15 << ((threadIdx.x & 31) & ~3);
+            int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
+            int sShapeMaxY = MIN(p.sShape.y, tileOutY * down + tileUpH); // Avoid out-of-tile sign writes.
+            if (up == 4)
+            {
+                minY -= 3; // Adjust according to block height.
+                for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
+                {
+                    int relUpX, relInY0;
+                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
+                    int relUpY0 = relInY0 * up;
+                    int src0 = relInY0 * tileUpW + relUpX;
+                    int dst = relUpY0 * tileUpW + relUpX;
+                    vec4_t v = InternalType<T>::zero_vec4();
+
+                    scalar_t a = s_tileUpX[src0];
+                    if (phaseInY == 0)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                            v.y += a * (scalar_t)c_fu[step * up + 3];
+                            v.z += a * (scalar_t)c_fu[step * up + 2];
+                            v.w += a * (scalar_t)c_fu[step * up + 1];
+                        }
+                    }
+                    else if (phaseInY == 1)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                            v.z += a * (scalar_t)c_fu[step * up + 3];
+                            v.w += a * (scalar_t)c_fu[step * up + 2];
+                        }
+                    }
+                    else if (phaseInY == 2)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 2];
+                            v.y += a * (scalar_t)c_fu[step * up + 1];
+                            v.z += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                            v.w += a * (scalar_t)c_fu[step * up + 3];
+                        }
+                    }
+                    else // (phaseInY == 3)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 3];
+                            v.y += a * (scalar_t)c_fu[step * up + 2];
+                            v.z += a * (scalar_t)c_fu[step * up + 1];
+                            v.w += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                        }
+                    }
+
+                    int x = tileOutX * down + relUpX;
+                    int y = tileOutY * down + relUpY0;
+                    int signX = x + p.sOfs.x;
+                    int signY = y + p.sOfs.y;
+                    int signZ = blockIdx.z + p.blockZofs;
+                    int signXb = signX >> 2;
+                    index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+                    index_t si1 = si0 + p.sShape.x;
+                    index_t si2 = si0 + p.sShape.x * 2;
+                    index_t si3 = si0 + p.sShape.x * 3;
+
+                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.y *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.z *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.w *= (scalar_t)((float)up * (float)up * p.gain);
+
+                    if (signWrite)
+                    {
+                        if (!enableWriteSkip)
+                        {
+                            // Determine and write signs.
+                            int sx = __float_as_uint(v.x) >> 31 <<  0;
+                            int sy = __float_as_uint(v.y) >> 31 <<  8;
+                            int sz = __float_as_uint(v.z) >> 31 << 16;
+                            int sw = __float_as_uint(v.w) >> 31 << 24;
+                            if (sx) v.x *= p.slope;
+                            if (sy) v.y *= p.slope;
+                            if (sz) v.z *= p.slope;
+                            if (sw) v.w *= p.slope;
+                            if (fabsf(v.x) > p.clamp) { sx = 2 <<  0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (fabsf(v.y) > p.clamp) { sy = 2 <<  8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                            if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                            if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
+                            {
+                                // Combine signs.
+                                uint32_t s = sx + sy + sw + sz;
+                                s <<= (signX & 3) << 1;
+                                s |= __shfl_xor_sync(groupMask, s, 1);
+                                s |= __shfl_xor_sync(groupMask, s, 2);
+
+                                // Write signs.
+                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
+                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
+                                if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
+                                if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
+                            }
+                        }
+                        else
+                        {
+                            // Determine and write signs.
+                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
+                            {
+                                int sx = __float_as_uint(v.x) >> 31 <<  0;
+                                int sy = __float_as_uint(v.y) >> 31 <<  8;
+                                int sz = __float_as_uint(v.z) >> 31 << 16;
+                                int sw = __float_as_uint(v.w) >> 31 << 24;
+                                if (sx) v.x *= p.slope;
+                                if (sy) v.y *= p.slope;
+                                if (sz) v.z *= p.slope;
+                                if (sw) v.w *= p.slope;
+                                if (fabsf(v.x) > p.clamp) { sx = 2 <<  0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                                if (fabsf(v.y) > p.clamp) { sy = 2 <<  8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                                if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                                if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+                                // Combine signs.
+                                uint32_t s = sx + sy + sw + sz;
+                                s <<= (signX & 3) << 1;
+                                s |= __shfl_xor_sync(groupMask, s, 1);
+                                s |= __shfl_xor_sync(groupMask, s, 2);
+
+                                // Write signs.
+                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
+                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
+                                if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
+                                if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
+                            }
+                            else
+                            {
+                                // Just compute the values.
+                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                                if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                                if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+                            }
+                        }
+                    }
+                    else if (signRead) // Read signs and apply.
+                    {
+                        if ((uint32_t)signXb < p.swLimit)
+                        {
+                            int ss = (signX & 3) << 1;
+                            if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> ss; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
+                            if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> ss; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
+                            if ((uint32_t)(signY + 2) < p.sShape.y) { int s = p.s[si2] >> ss; if (s & 1) v.z *= p.slope; if (s & 2) v.z = 0.f; }
+                            if ((uint32_t)(signY + 3) < p.sShape.y) { int s = p.s[si3] >> ss; if (s & 1) v.w *= p.slope; if (s & 2) v.w = 0.f; }
+                        }
+                    }
+                    else // Forward pass with no sign write.
+                    {
+                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                        if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                        if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+                    }
+
+                    s_tileUpXY[dst + 0 * tileUpW] = v.x;
+                    if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
+                    if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
+                    if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
+                }
+            }
+            else if (up == 2)
+            {
+                minY -= 1; // Adjust according to block height.
+                for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
+                {
+                    int relUpX, relInY0;
+                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
+                    int relUpY0 = relInY0 * up;
+                    int src0 = relInY0 * tileUpW + relUpX;
+                    int dst = relUpY0 * tileUpW + relUpX;
+                    vec2_t v = InternalType<T>::zero_vec2();
+
+                    scalar_t a = s_tileUpX[src0];
+                    if (phaseInY == 0)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                            v.y += a * (scalar_t)c_fu[step * up + 1];
+                        }
+                    }
+                    else // (phaseInY == 1)
+                    {
+                        #pragma unroll
+                        for (int step = 0; step < fuSize / up; step++)
+                        {
+                            v.x += a * (scalar_t)c_fu[step * up + 1];
+                            v.y += a * (scalar_t)c_fu[step * up + 0];
+                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
+                        }
+                    }
+
+                    int x = tileOutX * down + relUpX;
+                    int y = tileOutY * down + relUpY0;
+                    int signX = x + p.sOfs.x;
+                    int signY = y + p.sOfs.y;
+                    int signZ = blockIdx.z + p.blockZofs;
+                    int signXb = signX >> 2;
+                    index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+                    index_t si1 = si0 + p.sShape.x;
+
+                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.y *= (scalar_t)((float)up * (float)up * p.gain);
+
+                    if (signWrite)
+                    {
+                        if (!enableWriteSkip)
+                        {
+                            // Determine and write signs.
+                            int sx = __float_as_uint(v.x) >> 31 << 0;
+                            int sy = __float_as_uint(v.y) >> 31 << 8;
+                            if (sx) v.x *= p.slope;
+                            if (sy) v.y *= p.slope;
+                            if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+
+                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
+                            {
+                                // Combine signs.
+                                int s = sx + sy;
+                                s <<= signXo;
+                                s |= __shfl_xor_sync(groupMask, s, 1);
+                                s |= __shfl_xor_sync(groupMask, s, 2);
+
+                                // Write signs.
+                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
+                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
+                            }
+                        }
+                        else
+                        {
+                            // Determine and write signs.
+                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
+                            {
+                                int sx = __float_as_uint(v.x) >> 31 << 0;
+                                int sy = __float_as_uint(v.y) >> 31 << 8;
+                                if (sx) v.x *= p.slope;
+                                if (sy) v.y *= p.slope;
+                                if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                                if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+
+                                // Combine signs.
+                                int s = sx + sy;
+                                s <<= signXo;
+                                s |= __shfl_xor_sync(groupMask, s, 1);
+                                s |= __shfl_xor_sync(groupMask, s, 2);
+
+                                // Write signs.
+                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
+                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
+                            }
+                            else
+                            {
+                                // Just compute the values.
+                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                            }
+                        }
+                    }
+                    else if (signRead) // Read signs and apply.
+                    {
+                        if ((uint32_t)signXb < p.swLimit)
+                        {
+                            if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> signXo; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
+                            if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> signXo; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
+                        }
+                    }
+                    else // Forward pass with no sign write.
+                    {
+                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                    }
+
+                    if (!downInline)
+                    {
+                        // Write into temporary buffer.
+                        s_tileUpXY[dst] = v.x;
+                        if (relUpY0 < tileUpH - 1)
+                            s_tileUpXY[dst + tileUpW] = v.y;
+                    }
+                    else
+                    {
+                        // Write directly into output buffer.
+                        if ((uint32_t)x < p.yShape.x)
+                        {
+                            int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
+                            index_t ofs = x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
+                            if ((uint32_t)y + 0 < p.yShape.y) *((T*)((char*)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
+                            if ((uint32_t)y + 1 < ymax) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.y))) = (T)(v.y * (scalar_t)c_fd[0]);
+                        }
+                    }
+                }
+            }
+        }
+        else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD)
+        {
+            // Full upsampling filter.
+
+            if (up == 2)
+            {
+                // 2 x 2-wide.
+                __syncthreads();
+                int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y : 0; // Skip already written signs.
+                for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH; idx += blockDim.x * 4)
+                {
+                    int relUpX0, relUpY0;
+                    fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
+                    int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
+                    int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
+                    int src0 = relInX0 + tileInW * relInY0;
+                    int tap0y = (relInY0 * up + phaseInY - relUpY0);
+
+                    #define X_LOOP(TAPY, PX) \
+                        for (int sx = 0; sx < fuSize / up; sx++) \
+                        { \
+                            v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
+                            v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 0) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
+                            v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
+                            v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 1) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
+                        }
+
+                    vec4_t v = InternalType<T>::zero_vec4();
+                    if (tap0y == 0 && phaseInX == 0)
+                        #pragma unroll
+                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+                            #pragma unroll
+                            X_LOOP(0, 0) }
+                    if (tap0y == 0 && phaseInX == 1)
+                        #pragma unroll
+                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+                            #pragma unroll
+                            X_LOOP(0, 1) }
+                    if (tap0y == 1 && phaseInX == 0)
+                        #pragma unroll
+                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+                            #pragma unroll
+                            X_LOOP(1, 0) }
+                    if (tap0y == 1 && phaseInX == 1)
+                        #pragma unroll
+                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+                            #pragma unroll
+                            X_LOOP(1, 1) }
+
+                    #undef X_LOOP
+
+                    int x = tileOutX * down + relUpX0;
+                    int y = tileOutY * down + relUpY0;
+                    int signX = x + p.sOfs.x;
+                    int signY = y + p.sOfs.y;
+                    int signZ = blockIdx.z + p.blockZofs;
+                    int signXb = signX >> 2;
+                    index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+
+                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.y *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.z *= (scalar_t)((float)up * (float)up * p.gain);
+                    v.w *= (scalar_t)((float)up * (float)up * p.gain);
+
+                    if (signWrite)
+                    {
+                        if (!enableWriteSkip)
+                        {
+                            // Determine and write signs.
+                            int sx = __float_as_uint(v.x) >> 31;
+                            int sy = __float_as_uint(v.y) >> 31;
+                            int sz = __float_as_uint(v.z) >> 31;
+                            int sw = __float_as_uint(v.w) >> 31;
+                            if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                            if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                            if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+                            {
+                                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
+                            }
+                        }
+                        else
+                        {
+                            // Determine and write signs.
+                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+                            {
+                                int sx = __float_as_uint(v.x) >> 31;
+                                int sy = __float_as_uint(v.y) >> 31;
+                                int sz = __float_as_uint(v.z) >> 31;
+                                int sw = __float_as_uint(v.w) >> 31;
+                                if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                                if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                                if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                                if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+                                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
+                            }
+                            else
+                            {
+                                // Just compute the values.
+                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                                if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                                if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+                            }
+                        }
+                    }
+                    else if (signRead) // Read sign and apply.
+                    {
+                        if ((uint32_t)signY < p.sShape.y)
+                        {
+                            int s = 0;
+                            if ((uint32_t)signXb     < p.swLimit) s  = p.s[si];
+                            if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
+                            s >>= (signX & 3) << 1;
+                            if (s & 0x01) v.x *= p.slope; if (s & 0x02) v.x = 0.f;
+                            if (s & 0x04) v.y *= p.slope; if (s & 0x08) v.y = 0.f;
+                            if (s & 0x10) v.z *= p.slope; if (s & 0x20) v.z = 0.f;
+                            if (s & 0x40) v.w *= p.slope; if (s & 0x80) v.w = 0.f;
+                        }
+                    }
+                    else // Forward pass with no sign write.
+                    {
+                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                        if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                        if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+                    }
+
+                    s_tileUpXY[idx + 0] = v.x;
+                    s_tileUpXY[idx + 1] = v.y;
+                    s_tileUpXY[idx + 2] = v.z;
+                    s_tileUpXY[idx + 3] = v.w;
+                }
+            }
+            else if (up == 1)
+            {
+                __syncthreads();
+                uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
+                int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
+                for (int idx = threadIdx.x; idx < tileUpW * tileUpH; idx += blockDim.x)
+                {
+                    int relUpX0, relUpY0;
+                    fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
+                    scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0]; // 1x1 filter.
+
+                    int x = tileOutX * down + relUpX0;
+                    int y = tileOutY * down + relUpY0;
+                    int signX = x + p.sOfs.x;
+                    int signY = y + p.sOfs.y;
+                    int signZ = blockIdx.z + p.blockZofs;
+                    int signXb = signX >> 2;
+                    index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+                    v *= (scalar_t)((float)up * (float)up * p.gain);
+
+                    if (signWrite)
+                    {
+                        if (!enableWriteSkip)
+                        {
+                            // Determine and write sign.
+                            uint32_t s = 0;
+                            uint32_t signXbit = (1u << signXo);
+                            if (v < 0.f)
+                            {
+                                s = signXbit;
+                                v *= p.slope;
+                            }
+                            if (fabsf(v) > p.clamp)
+                            {
+                                s = signXbit * 2;
+                                v = InternalType<T>::clamp(v, p.clamp);
+                            }
+                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+                            {
+                                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
+                                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
+                                p.s[si] = s;                            // Write.
+                            }
+                        }
+                        else
+                        {
+                            // Determine and write sign.
+                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+                            {
+                                uint32_t s = 0;
+                                uint32_t signXbit = (1u << signXo);
+                                if (v < 0.f)
+                                {
+                                    s = signXbit;
+                                    v *= p.slope;
+                                }
+                                if (fabsf(v) > p.clamp)
+                                {
+                                    s = signXbit * 2;
+                                    v = InternalType<T>::clamp(v, p.clamp);
+                                }
+                                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
+                                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
+                                p.s[si] = s;                            // Write.
+                            }
+                            else
+                            {
+                                // Just compute the value.
+                                if (v < 0.f) v *= p.slope;
+                                v = InternalType<T>::clamp(v, p.clamp);
+                            }
+                        }
+                    }
+                    else if (signRead)
+                    {
+                        // Read sign and apply if within sign tensor bounds.
+                        if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y)
+                        {
+                            int s = p.s[si];
+                            s >>= signXo;
+                            if (s & 1) v *= p.slope;
+                            if (s & 2) v = 0.f;
+                        }
+                    }
+                    else // Forward pass with no sign write.
+                    {
+                        if (v < 0.f) v *= p.slope;
+                        v = InternalType<T>::clamp(v, p.clamp);
+                    }
+
+                    if (!downInline) // Write into temporary buffer.
+                        s_tileUpXY[idx] = v;
+                    else if ((uint32_t)x < p.yShape.x && (uint32_t)y < p.yShape.y) // Write directly into output buffer
+                        *((T*)((char*)p.y + (x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
+                }
+            }
+        }
+
+        // Downsampling.
+        if (filterMode == MODE_SUSD || filterMode == MODE_FUSD)
+        {
+            // Horizontal downsampling.
+            __syncthreads();
+            if (down == 4 && tileOutW % 4 == 0)
+            {
+                // Calculate 4 pixels at a time.
+                for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH; idx += blockDim.x * 4)
+                {
+                    int relOutX0, relUpY;
+                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+                    int relUpX0 = relOutX0 * down;
+                    int src0 = relUpY * tileUpW + relUpX0;
+                    vec4_t v = InternalType<T>::zero_vec4();
+                    #pragma unroll
+                    for (int step = 0; step < fdSize; step++)
+                    {
+                        v.x += s_tileUpXY[src0 +  0 + step] * (scalar_t)c_fd[step];
+                        v.y += s_tileUpXY[src0 +  4 + step] * (scalar_t)c_fd[step];
+                        v.z += s_tileUpXY[src0 +  8 + step] * (scalar_t)c_fd[step];
+                        v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
+                    }
+                    s_tileDownX[idx+0] = v.x;
+                    s_tileDownX[idx+1] = v.y;
+                    s_tileDownX[idx+2] = v.z;
+                    s_tileDownX[idx+3] = v.w;
+                }
+            }
+            else if ((down == 2 || down == 4) && (tileOutW % 2 == 0))
+            {
+                // Calculate 2 pixels at a time.
+                for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH; idx += blockDim.x * 2)
+                {
+                    int relOutX0, relUpY;
+                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+                    int relUpX0 = relOutX0 * down;
+                    int src0 = relUpY * tileUpW + relUpX0;
+                    vec2_t v = InternalType<T>::zero_vec2();
+                    #pragma unroll
+                    for (int step = 0; step < fdSize; step++)
+                    {
+                        v.x += s_tileUpXY[src0 +    0 + step] * (scalar_t)c_fd[step];
+                        v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
+                    }
+                    s_tileDownX[idx+0] = v.x;
+                    s_tileDownX[idx+1] = v.y;
+                }
+            }
+            else
+            {
+                // Calculate 1 pixel at a time.
+                for (int idx = threadIdx.x; idx < tileOutW * tileUpH; idx += blockDim.x)
+                {
+                    int relOutX0, relUpY;
+                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+                    int relUpX0 = relOutX0 * down;
+                    int src = relUpY * tileUpW + relUpX0;
+                    scalar_t v = 0.f;
+                    #pragma unroll
+                    for (int step = 0; step < fdSize; step++)
+                        v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
+                    s_tileDownX[idx] = v;
+                }
+            }
+
+            // Vertical downsampling & store output tile.
+            __syncthreads();
+            for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
+            {
+                int relOutX, relOutY0;
+                fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
+                int relUpY0 = relOutY0 * down;
+                int src0 = relUpY0 * tileOutW + relOutX;
+                scalar_t v = 0;
+                #pragma unroll
+                for (int step = 0; step < fdSize; step++)
+                    v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];
+
+                int outX = tileOutX + relOutX;
+                int outY = tileOutY + relOutY0;
+
+                if (outX < p.yShape.x & outY < p.yShape.y)
+                    *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
+            }
+        }
+        else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD)
+        {
+            // Full downsampling filter.
+            if (down == 2)
+            {
+                // 2-wide.
+                __syncthreads();
+                for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH; idx += blockDim.x * 2)
+                {
+                    int relOutX0, relOutY0;
+                    fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
+                    int relUpX0 = relOutX0 * down;
+                    int relUpY0 = relOutY0 * down;
+                    int src0 = relUpY0 * tileUpW + relUpX0;
+                    vec2_t v = InternalType<T>::zero_vec2();
+                    #pragma unroll
+                    for (int sy = 0; sy < fdSize; sy++)
+                    #pragma unroll
+                    for (int sx = 0; sx < fdSize; sx++)
+                    {
+                        v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
+                        v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
+                    }
+
+                    int outX = tileOutX + relOutX0;
+                    int outY = tileOutY + relOutY0;
+                    if ((uint32_t)outY < p.yShape.y)
+                    {
+                        index_t ofs = outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
+                        if (outX + 0 < p.yShape.x) *((T*)((char*)p.y + ofs)) = (T)v.x;
+                        if (outX + 1 < p.yShape.x) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.x))) = (T)v.y;
+                    }
+                }
+            }
+            else if (down == 1 && !downInline)
+            {
+                // Thread per pixel.
+                __syncthreads();
+                for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
+                {
+                    int relOutX0, relOutY0;
+                    fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
+                    scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0]; // 1x1 filter.
+
+                    int outX = tileOutX + relOutX0;
+                    int outY = tileOutY + relOutY0;
+                    if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
+                        *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
+                }
+            }
+        }
+
+        if (!enableXrep)
+            break;
+    }
+}
+
+//------------------------------------------------------------------------
+// Compute activation function and signs for upsampled data tensor, modifying data tensor in-place. Used for accelerating the generic variant.
+// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w dimensions. 64-bit indexing is always used.
+
+template <class T, bool signWrite, bool signRead>
+static __global__ void filtered_lrelu_act_kernel(filtered_lrelu_act_kernel_params p)
+{
+    typedef typename InternalType<T>::scalar_t scalar_t;
+
+    // Indexing: one thread per x position; y and the combined z*w index q start from blockIdx.y / blockIdx.z.
+    int32_t x = threadIdx.x + blockIdx.x * blockDim.x;
+    int32_t ymax = signWrite ? p.sShape.y : p.xShape.y; // Sign-write mode walks the rows of the sign tensor, otherwise the rows of p.x.
+    int32_t qmax = p.xShape.z * p.xShape.w; // Combined minibatch*channel maximum index.
+
+    // Loop to accommodate oversized tensors: q and y advance by gridDim.z / gridDim.y until the whole range is covered.
+    for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)
+    for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y)
+    {
+        // Extract z and w (channel, minibatch index).
+        int32_t w = q / p.xShape.z;
+        int32_t z = q - w * p.xShape.z;
+
+        // Choose behavior based on sign read/write mode.
+        if (signWrite)
+        {
+            // Process value if in p.x. Threads outside p.x keep s == 0 but still take part in the shuffles below.
+            uint32_t s = 0;
+            if (x < p.xShape.x && y < p.xShape.y)
+            {
+                int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+                T* pv = ((T*)p.x) + ix;
+                scalar_t v = (scalar_t)(*pv);
+
+                // Gain, LReLU, clamp.
+                v *= p.gain;
+                if (v < 0.f)
+                {
+                    v *= p.slope;
+                    s = 1; // Sign: the value was negative and got scaled by slope.
+                }
+                if (fabsf(v) > p.clamp)
+                {
+                    v = InternalType<T>::clamp(v, p.clamp);
+                    s = 2; // Clamp: overwrites the sign code set above.
+                }
+
+                *pv = (T)v; // Write value.
+            }
+
+            // Coalesce into threads 0 and 16 of warp: each group of 16 threads ORs its 2-bit codes into one 32-bit word.
+            uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu; // Shuffle mask covering this thread's group of 16.
+            s <<= ((threadIdx.x & 15) << 1); // Shift into place: two bits per thread.
+            s |= __shfl_xor_sync(m, s, 1); // Distribute.
+            s |= __shfl_xor_sync(m, s, 2);
+            s |= __shfl_xor_sync(m, s, 4);
+            s |= __shfl_xor_sync(m, s, 8);
+
+            // Write signs if leader and in p.s.
+            if (!(threadIdx.x & 15) && x < p.sShape.x) // y is always in, because ymax == p.sShape.y in this mode.
+            {
+                uint64_t is = x + p.sShape.x * (y + (int64_t)p.sShape.y * q); // Contiguous.
+                ((uint32_t*)p.s)[is >> 4] = s; // One 32-bit word stores the codes of 16 consecutive x positions.
+            }
+        }
+        else if (signRead)
+        {
+            // Process value if in p.x.
+            if (x < p.xShape.x) // y is always in, because ymax == p.xShape.y in this mode.
+            {
+                int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+                T* pv = ((T*)p.x) + ix;
+                scalar_t v = (scalar_t)(*pv);
+                v *= p.gain;
+
+                // Apply sign buffer offset. Unsigned, so a negative result wraps and fails the bounds test below.
+                uint32_t sx = x + p.sOfs.x;
+                uint32_t sy = y + p.sOfs.y;
+
+                // Read and apply signs if we land inside valid region of sign buffer.
+                if (sx < p.sShape.x && sy < p.sShape.y)
+                {
+                    uint64_t is = (sx >> 2) + (p.sShape.x >> 2) * (sy + (uint64_t)p.sShape.y * q); // Contiguous; byte index, four x positions per byte.
+                    unsigned char s = p.s[is];
+                    s >>= (sx & 3) << 1; // Shift into place.
+                    if (s & 1) // Sign?
+                        v *= p.slope;
+                    if (s & 2) // Clamp? The value is zeroed.
+                        v = 0.f;
+                }
+
+                *pv = (T)v; // Write value.
+            }
+        }
+        else
+        {
+            // Forward pass with no sign write. Process value if in p.x.
+            if (x < p.xShape.x) // y is always in, because ymax == p.xShape.y in this mode.
+            {
+                int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+                T* pv = ((T*)p.x) + ix;
+                scalar_t v = (scalar_t)(*pv);
+                v *= p.gain;
+                if (v < 0.f)
+                    v *= p.slope;
+                if (fabsf(v) > p.clamp)
+                    v = InternalType<T>::clamp(v, p.clamp);
+                *pv = (T)v; // Write value.
+            }
+        }
+    }
+}
+
+template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void)
+{
+    return (void*)filtered_lrelu_act_kernel<T, signWrite, signRead>; // Untyped pointer to the kernel instantiation for this T / sign mode.
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB)
+{
+    filtered_lrelu_kernel_spec s = { 0 }; // Zero-initialized; returned unchanged when no CASE below matches.
+
+    // Return the first matching kernel. Each CASE expands to a chain of ifs that fills in s and returns from this function on a match.
+#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS) \
+    if (sharedKB >= SH) \
+    if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) || (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD))) \
+    if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) || (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD))) \
+    if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU && p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) \
+    { \
+        static_assert((D*TW % 4) == 0, "down * tileWidth must be divisible by 4"); \
+        static_assert(FU % U == 0, "upscaling filter size must be multiple of upscaling factor"); \
+        static_assert(FD % D == 0, "downscaling filter size must be multiple of downscaling factor"); \
+        s.setup = (void*)setup_filters_kernel; \
+        s.exec = (void*)filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, U, FU, D, FD, TW, TH, W*32, !!XR, !!WS>; \
+        s.tileOut = make_int2(TW, TH); \
+        s.numWarps = W; \
+        s.xrep = XR; \
+        s.dynamicSharedKB = (SH == 48) ? 0 : SH; \
+        return s; \
+    }
+
+    // Launch parameters for various kernel specializations.
+    // Small filters must be listed before large filters, otherwise the kernel for larger filter will always match first.
+    // Kernels that use more shared memory must be listed before those that use less, for the same reason.
+
+    CASE(/*sharedKB*/48, /*up,fu*/1,1,  /*down,fd*/1,1,  /*mode*/MODE_FUFD, /*tw,th,warps,xrep,wskip*/64,  178, 32,  0,  0) // 1t-upf1-downf1
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/1,1,  /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/152, 95,  16,  0,  0) // 4t-ups2-downf1
+    CASE(/*sharedKB*/48, /*up,fu*/1,1,  /*down,fd*/2,8,  /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56,  22,  16,  0,  0) // 4t-upf1-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/2,8,  /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56,  29,  16,  11, 0) // 4t-ups2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/2,8,  /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/60,  28,  16,  0,  0) // 4t-upf2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/2,8,  /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56,  28,  16,  0,  0) // 4t-ups2-downf2
+    CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8,  /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56,  31,  16,  11, 0) // 4t-ups4-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8,  /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56,  36,  16,  0,  0) // 4t-ups4-downf2
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/4,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16,  22,  16,  12, 0) // 4t-ups2-downs4
+    CASE(/*sharedKB*/48, /*up,fu*/2,8,  /*down,fd*/4,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/29,  15,  16,  0,  0) // 4t-upf2-downs4
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/1,1,  /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/96,  150, 28,  0,  0) // 6t-ups2-downf1
+    CASE(/*sharedKB*/48, /*up,fu*/1,1,  /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32,  35,  24,  0,  0) // 6t-upf1-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32,  46,  16,  10, 0) // 6t-ups2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/58,  28,  24,  8,  0) // 6t-upf2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/52,  28,  16,  0,  0) // 6t-ups2-downf2
+    CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32,  51,  16,  5,  0) // 6t-ups4-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32,  56,  16,  6,  0) // 6t-ups4-downf2
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16,  18,  16,  12, 0) // 6t-ups2-downs4
+    CASE(/*sharedKB*/96, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27,  31,  32,  6,  0) // 6t-upf2-downs4 96kB
+    CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27,  13,  24,  0,  0) // 6t-upf2-downs4
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/1,1,  /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/148, 89,  24,  0,  0) // 8t-ups2-downf1
+    CASE(/*sharedKB*/48, /*up,fu*/1,1,  /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32,  31,  16,  5,  0) // 8t-upf1-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32,  41,  16,  9,  0) // 8t-ups2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56,  26,  24,  0,  0) // 8t-upf2-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32,  40,  16,  0,  0) // 8t-ups2-downf2
+    CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32,  46,  24,  5,  0) // 8t-ups4-downs2
+    CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32,  50,  16,  0,  0) // 8t-ups4-downf2
+    CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/24,  24,  32,  12, 1) // 8t-ups2-downs4 96kB
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16,  13,  16,  10, 1) // 8t-ups2-downs4
+    CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25,  28,  28,  4,  0) // 8t-upf2-downs4 96kB
+    CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25,  10,  24,  0,  0) // 8t-upf2-downs4
+
+    #undef CASE
+    return s; // No kernel found: s is still all zeros, so s.exec is NULL.
+}
+
+//------------------------------------------------------------------------
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu.h b/torch_utils/ops/filtered_lrelu.h
new file mode 100644
index 0000000000000000000000000000000000000000..524c804122a2582e20e2e4e9c49267e1a1b6db60
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <cuda_runtime.h>
+
+//------------------------------------------------------------------------
+// CUDA kernel parameters.
+
+struct filtered_lrelu_kernel_params
+{
+    // These parameters decide which kernel to use.
+    int             up;         // upsampling ratio (1, 2, 4)
+    int             down;       // downsampling ratio (1, 2, 4)
+    int2            fuShape;    // [size, 1] | [size, size]. NOTE(review): kernel selection picks MODE_SU* only when y == 0, so the "1" looks stale - confirm against the host code.
+    int2            fdShape;    // [size, 1] | [size, size]. NOTE(review): kernel selection picks MODE_*SD only when y == 0, so the "1" looks stale - confirm against the host code.
+
+    int             _dummy;     // Alignment.
+
+    // Rest of the parameters.
+    const void*     x;          // Input tensor.
+    void*           y;          // Output tensor.
+    const void*     b;          // Bias tensor.
+    unsigned char*  s;          // Sign tensor in/out. NULL if unused.
+    const float*    fu;         // Upsampling filter.
+    const float*    fd;         // Downsampling filter.
+
+    int2            pad0;       // Left/top padding.
+    float           gain;       // Additional gain factor.
+    float           slope;      // Leaky ReLU slope on negative side.
+    float           clamp;      // Clamp after nonlinearity.
+    int             flip;       // Filter kernel flip for gradient computation.
+
+    int             tilesXdim;  // Original number of horizontal output tiles.
+    int             tilesXrep;  // Number of horizontal tiles per CTA.
+    int             blockZofs;  // Block z offset to support large minibatch, channel dimensions.
+
+    int4            xShape;     // [width, height, channel, batch]
+    int4            yShape;     // [width, height, channel, batch]
+    int2            sShape;     // [width, height] - width is in bytes. Contiguous. Zeros if unused.
+    int2            sOfs;       // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
+    int             swLimit;    // Active width of sign tensor in bytes.
+
+    longlong4       xStride;    // Strides of all tensors except signs, same component order as shapes.
+    longlong4       yStride;    // Output strides; filtered_lrelu_kernel adds them to (char*)p.y, so they appear to be byte strides - confirm.
+    int64_t         bStride;    // Bias tensor stride.
+    longlong3       fuStride;   // Upsampling filter strides.
+    longlong3       fdStride;   // Downsampling filter strides.
+};
+
+struct filtered_lrelu_act_kernel_params
+{
+    void*           x;          // Input/output, modified in-place.
+    unsigned char*  s;          // Sign tensor in/out. NULL if unused. Holds 2-bit codes, four per byte.
+
+    float           gain;       // Additional gain factor.
+    float           slope;      // Leaky ReLU slope on negative side.
+    float           clamp;      // Clamp after nonlinearity.
+
+    int4            xShape;     // [width, height, channel, batch]
+    longlong4       xStride;    // Input/output tensor strides, same order as in shape. In elements of T: the kernel adds them to (T*)x.
+    int2            sShape;     // [width, height] - width is in elements (filtered_lrelu_kernel_params::sShape uses bytes). Contiguous. Zeros if unused.
+    int2            sOfs;       // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel specialization.
+
+struct filtered_lrelu_kernel_spec
+{
+    void*   setup;              // Function for filter kernel setup.
+    void*   exec;               // Function for main operation. NULL when choose_filtered_lrelu_kernel() found no match.
+    int2    tileOut;            // Width/height of launch tile.
+    int     numWarps;           // Number of warps per thread block, determines launch block size.
+    int     xrep;               // For processing multiple horizontal tiles per thread block.
+    int     dynamicSharedKB;    // How much dynamic shared memory the exec kernel wants, in KB. 0 for the 48 KB kernel variants.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB);
+template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void);
+template <bool signWrite, bool signRead> cudaError_t copy_filters(cudaStream_t stream);
+
+//------------------------------------------------------------------------
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu.py b/torch_utils/ops/filtered_lrelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e3748fb725884b18b7e8119f569722b5bbe67f
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import numpy as np
+import torch
+import warnings
+
+from .. import custom_ops
+from .. import misc
+from . import upfirdn2d
+from . import bias_act
+
+#----------------------------------------------------------------------------
+
+_plugin = None
+
+def _init():
+    global _plugin
+    if _plugin is None:
+
+        # sources=['filtered_lrelu.h', 'filtered_lrelu.cu', 'filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu']
+        # sources = [os.path.join(os.path.dirname(__file__), s) for s in sources]
+        # try:
+        #     _plugin = custom_ops.get_plugin('filtered_lrelu_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math', '--allow-unsupported-compiler'])
+        # except:
+        #     warnings.warn('Failed to build CUDA kernels for filtered_lrelu_plugin. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
+
+        _plugin = custom_ops.get_plugin_v3(
+            module_name='filtered_lrelu_plugin',
+            sources=['filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu'],
+            headers=['filtered_lrelu.h', 'filtered_lrelu.cu'],
+            source_dir=os.path.dirname(__file__),
+            extra_cuda_cflags=['--use_fast_math', '--allow-unsupported-compiler'],
+        )
+    return True
+
+def _get_filter_size(f):
+    if f is None:
+        return 1, 1
+    assert isinstance(f, torch.Tensor)
+    assert 1 <= f.ndim <= 2
+    return f.shape[-1], f.shape[0] # width, height
+
+def _parse_padding(padding):
+    if isinstance(padding, int):
+        padding = [padding, padding]
+    assert isinstance(padding, (list, tuple))
+    assert all(isinstance(x, (int, np.integer)) for x in padding)
+    padding = [int(x) for x in padding]
+    if len(padding) == 2:
+        px, py = padding
+        padding = [px, px, py, py]
+    px0, px1, py0, py1 = padding
+    return px0, px1, py0, py1
+
+#----------------------------------------------------------------------------
+
+def filtered_lrelu(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False, impl='cuda'):
+    r"""Filtered leaky ReLU for a batch of 2D images.
+
+    Performs the following sequence of operations for each channel:
+
+    1. Add channel-specific bias if provided (`b`).
+
+    2. Upsample the image by inserting N-1 zeros after each pixel (`up`).
+
+    3. Pad the image with the specified number of zeros on each side (`padding`).
+       Negative padding corresponds to cropping the image.
+
+    4. Convolve the image with the specified upsampling FIR filter (`fu`), shrinking it
+       so that the footprint of all output pixels lies within the input image.
+
+    5. Multiply each value by the provided gain factor (`gain`).
+
+    6. Apply leaky ReLU activation function to each value.
+
+    7. Clamp each value between -clamp and +clamp, if `clamp` parameter is provided.
+
+    8. Convolve the image with the specified downsampling FIR filter (`fd`), shrinking
+       it so that the footprint of all output pixels lies within the input image.
+
+    9. Downsample the image by keeping every Nth pixel (`down`).
+
+    The fused op is considerably more efficient than performing the same calculation
+    using standard PyTorch ops. It supports gradients of arbitrary order.
+
+    Args:
+        x:           Float32/float16/float64 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        fu:          Float32 upsampling FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        fd:          Float32 downsampling FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        b:           Bias vector, or `None` to disable. Must be a 1D tensor of the same type
+                     as `x`. The length of vector must must match the channel dimension of `x`.
+        up:          Integer upsampling factor (default: 1).
+        down:        Integer downsampling factor. (default: 1).
+        padding:     Padding with respect to the upsampled image. Can be a single number
+                     or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        gain:        Overall scaling factor for signal magnitude (default: sqrt(2)).
+        slope:       Slope on the negative side of leaky ReLU (default: 0.2).
+        clamp:       Maximum magnitude for leaky ReLU output (default: None).
+        flip_filter: False = convolution, True = correlation (default: False).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    assert isinstance(x, torch.Tensor)
+    assert impl in ['ref', 'cuda']
+    if impl == 'cuda' and x.device.type == 'cuda' and _init():
+        return _filtered_lrelu_cuda(up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter).apply(x, fu, fd, b, None, 0, 0)
+    return _filtered_lrelu_ref(x, fu=fu, fd=fd, b=b, up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter)
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def _filtered_lrelu_ref(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
+    """Slow and memory-inefficient reference implementation of `filtered_lrelu()` using
+    existing `upfirdn2n()` and `bias_act()` ops.
+    """
+    assert isinstance(x, torch.Tensor) and x.ndim == 4
+    fu_w, fu_h = _get_filter_size(fu)
+    fd_w, fd_h = _get_filter_size(fd)
+    if b is not None:
+        assert isinstance(b, torch.Tensor) and b.dtype == x.dtype
+        misc.assert_shape(b, [x.shape[1]])
+    assert isinstance(up, int) and up >= 1
+    assert isinstance(down, int) and down >= 1
+    px0, px1, py0, py1 = _parse_padding(padding)
+    assert gain == float(gain) and gain > 0
+    assert slope == float(slope) and slope >= 0
+    assert clamp is None or (clamp == float(clamp) and clamp >= 0)
+
+    # Calculate output size.
+    batch_size, channels, in_h, in_w = x.shape
+    in_dtype = x.dtype
+    out_w = (in_w * up + (px0 + px1) - (fu_w - 1) - (fd_w - 1) + (down - 1)) // down
+    out_h = (in_h * up + (py0 + py1) - (fu_h - 1) - (fd_h - 1) + (down - 1)) // down
+
+    # Compute using existing ops.
+    x = bias_act.bias_act(x=x, b=b) # Apply bias.
+    x = upfirdn2d.upfirdn2d(x=x, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
+    x = bias_act.bias_act(x=x, act='lrelu', alpha=slope, gain=gain, clamp=clamp) # Bias, leaky ReLU, clamp.
+    x = upfirdn2d.upfirdn2d(x=x, f=fd, down=down, flip_filter=flip_filter) # Downsample.
+
+    # Check output shape & dtype.
+    misc.assert_shape(x, [batch_size, channels, out_h, out_w])
+    assert x.dtype == in_dtype
+    return x
+
+#----------------------------------------------------------------------------
+
+# Cache of generated autograd Function classes, keyed by the op configuration
+# (see `key` inside `_filtered_lrelu_cuda()`).
+_filtered_lrelu_cuda_cache = dict()
+
+def _filtered_lrelu_cuda(up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
+    """Fast CUDA implementation of `filtered_lrelu()` using custom ops.
+
+    Returns a `torch.autograd.Function` subclass specialized for the given
+    configuration, not a result tensor. The class is created once per distinct
+    configuration and afterwards served from `_filtered_lrelu_cuda_cache`.
+
+    Call `.apply(x, fu, fd, b, si, sx, sy)` on the returned class:
+        x, fu, fd, b: Same meaning as in `filtered_lrelu()`.
+        si:           Existing sign tensor to reuse, or `None` when there is none.
+        sx, sy:       Sign tensor offsets handed to the plugin alongside `si`
+                      (`0, 0` for a regular forward call).
+    """
+    assert isinstance(up, int) and up >= 1
+    assert isinstance(down, int) and down >= 1
+    px0, px1, py0, py1 = _parse_padding(padding)
+    assert gain == float(gain) and gain > 0
+    gain = float(gain)
+    assert slope == float(slope) and slope >= 0
+    slope = float(slope)
+    assert clamp is None or (clamp == float(clamp) and clamp >= 0)
+    # "No clamp" is represented as +inf from here on, so `clamp` is always a float.
+    clamp = float(clamp if clamp is not None else 'inf')
+
+    # Lookup from cache.
+    key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)
+    if key in _filtered_lrelu_cuda_cache:
+        return _filtered_lrelu_cuda_cache[key]
+
+    # Forward op.
+    class FilteredLReluCuda(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, x, fu, fd, b, si, sx, sy): # pylint: disable=arguments-differ
+            assert isinstance(x, torch.Tensor) and x.ndim == 4
+
+            # Replace empty up/downsample kernels with full 1x1 kernels (faster than separable).
+            if fu is None:
+                fu = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+            if fd is None:
+                fd = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+            assert 1 <= fu.ndim <= 2
+            assert 1 <= fd.ndim <= 2
+
+            # Replace separable 1x1 kernels with full 1x1 kernels when scale factor is 1.
+            if up == 1 and fu.ndim == 1 and fu.shape[0] == 1:
+                fu = fu.square()[None]
+            if down == 1 and fd.ndim == 1 and fd.shape[0] == 1:
+                fd = fd.square()[None]
+
+            # Missing sign input tensor. An empty tensor stands for "no sign input";
+            # the checks below test `si.numel()`.
+            if si is None:
+                si = torch.empty([0])
+
+            # Missing bias tensor.
+            if b is None:
+                b = torch.zeros([x.shape[1]], dtype=x.dtype, device=x.device)
+
+            # Construct internal sign tensor only if gradients are needed.
+            write_signs = (si.numel() == 0) and (x.requires_grad or b.requires_grad)
+
+            # Warn if input storage strides are not in decreasing order due to e.g. channels-last layout.
+            # Dimensions of size 1 are left out of the comparison.
+            strides = [x.stride(i) for i in range(x.ndim) if x.size(i) > 1]
+            if any(a < b for a, b in zip(strides[:-1], strides[1:])):
+                warnings.warn("low-performance memory layout detected in filtered_lrelu input", RuntimeWarning)
+
+            # Call C++/Cuda plugin if datatype is supported.
+            if x.dtype in [torch.float16, torch.float32]:
+                if torch.cuda.current_stream(x.device) != torch.cuda.default_stream(x.device):
+                    warnings.warn("filtered_lrelu called with non-default cuda stream but concurrent execution is not supported", RuntimeWarning)
+                y, so, return_code = _plugin.filtered_lrelu(x, fu, fd, b, si, up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp, flip_filter, write_signs)
+            else:
+                # Other dtypes (e.g. float64) skip the fused kernel and take the generic path below.
+                return_code = -1
+
+            # No Cuda kernel found? Fall back to generic implementation. Still more memory efficient than the reference implementation because
+            # only the bit-packed sign tensor is retained for gradient computation.
+            if return_code < 0:
+                warnings.warn("filtered_lrelu called with parameters that have no optimized CUDA kernel, using generic fallback", RuntimeWarning)
+
+                y = x.add(b.unsqueeze(-1).unsqueeze(-1)) # Add bias.
+                y = upfirdn2d.upfirdn2d(x=y, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
+                so = _plugin.filtered_lrelu_act_(y, si, sx, sy, gain, slope, clamp, write_signs) # Activation function and sign handling. Modifies y in-place.
+                y = upfirdn2d.upfirdn2d(x=y, f=fd, down=down, flip_filter=flip_filter) # Downsample.
+
+            # Prepare for gradient computation. Keep the caller's sign tensor when one
+            # was given, otherwise the sign tensor returned by the plugin.
+            ctx.save_for_backward(fu, fd, (si if si.numel() else so))
+            ctx.x_shape = x.shape
+            ctx.y_shape = y.shape
+            ctx.s_ofs = sx, sy
+            return y
+
+        @staticmethod
+        def backward(ctx, dy): # pylint: disable=arguments-differ
+            fu, fd, si = ctx.saved_tensors
+            _, _, xh, xw = ctx.x_shape
+            _, _, yh, yw = ctx.y_shape
+            sx, sy = ctx.s_ofs
+            # One gradient slot per forward input; the trailing comments give the input index.
+            # Only x (0) and b (3) are differentiable.
+            dx  = None # 0
+            dfu = None; assert not ctx.needs_input_grad[1]
+            dfd = None; assert not ctx.needs_input_grad[2]
+            db  = None # 3
+            dsi = None; assert not ctx.needs_input_grad[4]
+            dsx = None; assert not ctx.needs_input_grad[5]
+            dsy = None; assert not ctx.needs_input_grad[6]
+
+            if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:
+                # Padding [x_before, x_after, y_before, y_after] for the gradient op. Under the
+                # output-size formula of the reference implementation this gives dx the
+                # spatial size of x.
+                pp = [
+                    (fu.shape[-1] - 1) + (fd.shape[-1] - 1) - px0,
+                    xw * up - yw * down + px0 - (up - 1),
+                    (fu.shape[0] - 1) + (fd.shape[0] - 1) - py0,
+                    xh * up - yh * down + py0 - (up - 1),
+                ]
+                gg = gain * (up ** 2) / (down ** 2)
+                ff = (not flip_filter)
+                sx = sx - (fu.shape[-1] - 1) + px0
+                sy = sy - (fu.shape[0]  - 1) + py0
+                # The gradient is the same op with up/down and the two filters swapped, the
+                # filter flip inverted, no bias, no clamp, and the saved sign tensor as input.
+                dx = _filtered_lrelu_cuda(up=down, down=up, padding=pp, gain=gg, slope=slope, clamp=None, flip_filter=ff).apply(dy, fd, fu, None, si, sx, sy)
+
+            if ctx.needs_input_grad[3]:
+                # Bias gradient: reduce dx over batch and spatial dimensions, one value per channel.
+                db = dx.sum([0, 2, 3])
+
+            return dx, dfu, dfd, db, dsi, dsx, dsy
+
+    # Add to cache.
+    _filtered_lrelu_cuda_cache[key] = FilteredLReluCuda
+    return FilteredLReluCuda
+
+#----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu_ns.cu b/torch_utils/ops/filtered_lrelu_ns.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a65e743bc4a4c760a6f0605249041fdd52ec264d
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_ns.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for no signs mode (no gradients required).
+// Every explicit instantiation below passes signWrite=false, signRead=false as its
+// last two template arguments.
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+// Unlike the full op, this one is also instantiated for double.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, false, false>(void);
+template void* choose_filtered_lrelu_act_kernel<float,     false, false>(void);
+template void* choose_filtered_lrelu_act_kernel<double,    false, false>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<false, false>(cudaStream_t stream);
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu_rd.cu b/torch_utils/ops/filtered_lrelu_rd.cu
new file mode 100644
index 0000000000000000000000000000000000000000..79c75b0c22b6b09476782166e30e74d00f2c7d61
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_rd.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for sign read mode.
+// Every explicit instantiation below passes signWrite=false, signRead=true as its
+// last two template arguments.
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+// Unlike the full op, this one is also instantiated for double.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, false, true>(void);
+template void* choose_filtered_lrelu_act_kernel<float,     false, true>(void);
+template void* choose_filtered_lrelu_act_kernel<double,    false, true>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<false, true>(cudaStream_t stream);
\ No newline at end of file
diff --git a/torch_utils/ops/filtered_lrelu_wr.cu b/torch_utils/ops/filtered_lrelu_wr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7c3e82eaf5ba7df13adade0e77a7796f57bfdf10
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_wr.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for sign write mode.
+// Every explicit instantiation below passes signWrite=true, signRead=false as its
+// last two template arguments.
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int32_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float,     int64_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+// Unlike the full op, this one is also instantiated for double.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, true, false>(void);
+template void* choose_filtered_lrelu_act_kernel<float,     true, false>(void);
+template void* choose_filtered_lrelu_act_kernel<double,    true, false>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<true, false>(cudaStream_t stream);
\ No newline at end of file
diff --git a/torch_utils/ops/fma.py b/torch_utils/ops/fma.py
new file mode 100644
index 0000000000000000000000000000000000000000..06530ed5e0731b1355b18c7fe1526786dc683d26
--- /dev/null
+++ b/torch_utils/ops/fma.py
@@ -0,0 +1,62 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Fused multiply-add, with slightly faster gradients than `torch.addcmul()`."""
+
+import torch
+
+#----------------------------------------------------------------------------
+
+def fma(a, b, c): # => a * b + c
+    return _FusedMultiplyAdd.apply(a, b, c)
+
+#----------------------------------------------------------------------------
+
+class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c
+    @staticmethod
+    def forward(ctx, a, b, c): # pylint: disable=arguments-differ
+        out = torch.addcmul(c, a, b)
+        ctx.save_for_backward(a, b)
+        ctx.c_shape = c.shape
+        return out
+
+    @staticmethod
+    def backward(ctx, dout): # pylint: disable=arguments-differ
+        a, b = ctx.saved_tensors
+        c_shape = ctx.c_shape
+        da = None
+        db = None
+        dc = None
+
+        if ctx.needs_input_grad[0]:
+            da = _unbroadcast(dout * b, a.shape)
+
+        if ctx.needs_input_grad[1]:
+            db = _unbroadcast(dout * a, b.shape)
+
+        if ctx.needs_input_grad[2]:
+            dc = _unbroadcast(dout, c_shape)
+
+        return da, db, dc
+
+#----------------------------------------------------------------------------
+
+def _unbroadcast(x, shape):
+    extra_dims = x.ndim - len(shape)
+    assert extra_dims >= 0
+    dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)]
+    if len(dim):
+        x = x.sum(dim=dim, keepdim=True)
+    if extra_dims:
+        x = x.reshape(-1, *x.shape[extra_dims+1:])
+    assert x.shape == shape
+    return x
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/grid_sample_gradfix.py b/torch_utils/ops/grid_sample_gradfix.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f69aad7510d49d55cd865b5e2554703f979b185
--- /dev/null
+++ b/torch_utils/ops/grid_sample_gradfix.py
@@ -0,0 +1,85 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom replacement for `torch.nn.functional.grid_sample` that
+supports arbitrarily high order gradients between the input and output.
+Only works on 2D images and assumes
+`mode='bilinear'`, `padding_mode='zeros'`, `align_corners=False`."""
+
+import warnings
+import torch
+
+# pylint: disable=redefined-builtin
+# pylint: disable=arguments-differ
+# pylint: disable=protected-access
+
+#----------------------------------------------------------------------------
+
+# Module-level switch read by `_should_use_custom_op()`. Even when True, the custom op
+# is used only on the PyTorch versions listed there.
+enabled = False  # Enable the custom op by setting this to true.
+
+#----------------------------------------------------------------------------
+
+def grid_sample(input, grid):
+    if _should_use_custom_op():
+        return _GridSample2dForward.apply(input, grid)
+    return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
+
+#----------------------------------------------------------------------------
+
+def _should_use_custom_op():
+    if not enabled:
+        return False
+    if any(torch.__version__.startswith(x) for x in ['1.7.', '1.8.', '1.9']):
+        return True
+    warnings.warn(f'grid_sample_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.grid_sample().')
+    return False
+
+#----------------------------------------------------------------------------
+
+class _GridSample2dForward(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, grid):
+        assert input.ndim == 4
+        assert grid.ndim == 4
+        output = torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
+        ctx.save_for_backward(input, grid)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, grid = ctx.saved_tensors
+        grad_input, grad_grid = _GridSample2dBackward.apply(grad_output, input, grid)
+        return grad_input, grad_grid
+
+#----------------------------------------------------------------------------
+
+class _GridSample2dBackward(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, grad_output, input, grid):
+        op = torch._C._jit_get_operation('aten::grid_sampler_2d_backward')
+        grad_input, grad_grid = op(grad_output, input, grid, 0, 0, False)
+        ctx.save_for_backward(grid)
+        return grad_input, grad_grid
+
+    @staticmethod
+    def backward(ctx, grad2_grad_input, grad2_grad_grid):
+        _ = grad2_grad_grid # unused
+        grid, = ctx.saved_tensors
+        grad2_grad_output = None
+        grad2_input = None
+        grad2_grid = None
+
+        if ctx.needs_input_grad[0]:
+            grad2_grad_output = _GridSample2dForward.apply(grad2_grad_input, grid)
+
+        assert not ctx.needs_input_grad[2]
+        return grad2_grad_output, grad2_input, grad2_grid
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/upfirdn2d.cpp b/torch_utils/ops/upfirdn2d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42bdd483490a555266c8f9b9dd6684464b2088bc
--- /dev/null
+++ b/torch_utils/ops/upfirdn2d.cpp
@@ -0,0 +1,105 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "upfirdn2d.h"
+
+//------------------------------------------------------------------------
+
+// Host-side entry point of the upfirdn2d op: validates the arguments, allocates the
+// output tensor, fills in the kernel parameter struct, picks a kernel specialization
+// and launches it on the current CUDA stream.
+//
+//   x               Rank-4 CUDA tensor, indexed as [batch, channel, height, width] below.
+//   f               Rank-2 float32 filter on the same device as x.
+//   upx, upy        Upsampling factors (>= 1).
+//   downx, downy    Downsampling factors (>= 1).
+//   padx0 .. pady1  Padding amounts that enter the output size computation.
+//   flip            Stored into the kernel params as 1/0.
+//   gain            Stored into the kernel params unchanged.
+//
+// Returns the newly allocated output tensor y.
+static torch::Tensor upfirdn2d(torch::Tensor x, torch::Tensor f, int upx, int upy, int downx, int downy, int padx0, int padx1, int pady0, int pady1, bool flip, float gain)
+{
+    // Validate arguments.
+    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+    TORCH_CHECK(f.device() == x.device(), "f must reside on the same device as x");
+    TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32");
+    TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
+    TORCH_CHECK(f.numel() <= INT_MAX, "f is too large");
+    TORCH_CHECK(x.dim() == 4, "x must be rank 4");
+    TORCH_CHECK(f.dim() == 2, "f must be rank 2");
+    TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1");
+    TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1");
+    TORCH_CHECK(downx >= 1 && downy >= 1, "downsampling factor must be at least 1");
+
+    // Create output tensor.
+    // Output extent per axis: (in * up + pad0 + pad1 - filter + down) / down, using integer division.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+    int outW = ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx;
+    int outH = ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy;
+    TORCH_CHECK(outW >= 1 && outH >= 1, "output must be at least 1x1");
+    torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW}, x.options(), x.suggest_memory_format());
+    TORCH_CHECK(y.numel() <= INT_MAX, "output is too large");
+
+    // Initialize CUDA kernel parameters.
+    // Sizes and strides are packed as (x, y, z, w) = (width, height, channel, batch).
+    upfirdn2d_kernel_params p;
+    p.x             = x.data_ptr();
+    p.f             = f.data_ptr<float>();
+    p.y             = y.data_ptr();
+    p.up            = make_int2(upx, upy);
+    p.down          = make_int2(downx, downy);
+    p.pad0          = make_int2(padx0, pady0);
+    p.flip          = (flip) ? 1 : 0;
+    p.gain          = gain;
+    p.inSize        = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
+    p.inStride      = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1), (int)x.stride(0));
+    p.filterSize    = make_int2((int)f.size(1), (int)f.size(0));
+    p.filterStride  = make_int2((int)f.stride(1), (int)f.stride(0));
+    p.outSize       = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
+    p.outStride     = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1), (int)y.stride(0));
+    // With a channel stride of 1, channels form the minor dimension and batch the major one;
+    // otherwise batch * channels is the major dimension and the minor dimension has size 1.
+    p.sizeMajor     = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z;
+    p.sizeMinor     = (p.inStride.z == 1) ? p.inSize.z : 1;
+
+    // Choose CUDA kernel.
+    // The macro dispatches on the element type of x (floating point types and half).
+    upfirdn2d_kernel_spec spec;
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&]
+    {
+        spec = choose_upfirdn2d_kernel<scalar_t>(p);
+    });
+
+    // Set looping options.
+    // loopMajor = ceil(sizeMajor / 16384) and launchMajor = ceil(sizeMajor / loopMajor), so
+    // launchMajor (used as gridSize.z below) does not exceed 16384. NOTE(review): 16384 is
+    // presumably chosen to stay within the grid z limit - confirm.
+    p.loopMajor     = (p.sizeMajor - 1) / 16384 + 1;
+    p.loopMinor     = spec.loopMinor;
+    p.loopX         = spec.loopX;
+    p.launchMinor   = (p.sizeMinor - 1) / p.loopMinor + 1;
+    p.launchMajor   = (p.sizeMajor - 1) / p.loopMajor + 1;
+
+    // Compute grid size.
+    // A negative tileOutW in the spec marks the "large" kernel, which uses a fixed 4x32 block;
+    // the "small" kernels use 256 threads per block and the tile size from the spec.
+    dim3 blockSize, gridSize;
+    if (spec.tileOutW < 0) // large
+    {
+        blockSize = dim3(4, 32, 1);
+        gridSize = dim3(
+            ((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor,
+            (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1,
+            p.launchMajor);
+    }
+    else // small
+    {
+        blockSize = dim3(256, 1, 1);
+        gridSize = dim3(
+            ((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor,
+            (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1,
+            p.launchMajor);
+    }
+
+    // Launch CUDA kernel.
+    // The parameter struct is the kernel's only argument; no dynamic shared memory is requested.
+    void* args[] = {&p};
+    AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
+    return y;
+}
+
+//------------------------------------------------------------------------
+
+// Python bindings: exposes the host function above as `upfirdn2d` on the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("upfirdn2d", &upfirdn2d);
+}
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/upfirdn2d.cu b/torch_utils/ops/upfirdn2d.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2126f450047bb4a7f2e77b27d105207d02acffcd
--- /dev/null
+++ b/torch_utils/ops/upfirdn2d.cu
@@ -0,0 +1,352 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <c10/util/Half.h>
+#include "upfirdn2d.h"
+
+//------------------------------------------------------------------------
+// Helpers.
+
+template <class T> struct InternalType; // Maps an element type T to the scalar type the kernels compute with.
+template <> struct InternalType<double>     { typedef double scalar_t; };
+template <> struct InternalType<float>      { typedef float  scalar_t; };
+template <> struct InternalType<c10::Half>  { typedef float  scalar_t; }; // Half elements are processed as float.
+
+static __device__ __forceinline__ int floor_div(int a, int b) // Integer division rounding toward negative infinity (callers here pass an upsampling factor as b).
+{
+    int t = 1 - a / b; // For b > 0, adding t * b makes the numerator positive, so the truncating division below acts as floor.
+    return (a + t * b) / b - t; // Subtracting t undoes the shift.
+}
+
+//------------------------------------------------------------------------
+// Generic CUDA implementation for large filters. No shared-memory staging: each thread accumulates its output pixels directly from p.x and p.f.
+
+template <class T> static __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p)
+{
+    typedef typename InternalType<T>::scalar_t scalar_t; // Accumulation type for element type T.
+
+    // Calculate thread index. The x index packs (outY, minor), y strides over output X, and z selects a group of loopMajor major indices.
+    int minorBase = blockIdx.x * blockDim.x + threadIdx.x;
+    int outY = minorBase / p.launchMinor;
+    minorBase -= outY * p.launchMinor;
+    int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;
+    int majorBase = blockIdx.z * p.loopMajor;
+    if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor) // Bitwise |: all three tests are evaluated, no short-circuit.
+        return;
+
+    // Setup Y receptive field, clamped to the input rows [0, inSize.y].
+    int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y;
+    int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y); // First contributing input row.
+    int h = min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY; // Number of contributing input rows.
+    int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y; // Filter row paired with input row inY.
+    if (p.flip)
+        filterY = p.filterSize.y - 1 - filterY;
+
+    // Loop over major, minor, and X. Minor indices advance by launchMinor, output X by blockDim.y.
+    for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)
+    for (int minorIdx = 0, minor = minorBase; minorIdx < p.loopMinor & minor < p.sizeMinor; minorIdx++, minor += p.launchMinor)
+    {
+        int nc = major * p.sizeMinor + minor; // Flattened batch/channel index.
+        int n = nc / p.inSize.z; // Batch index (inSize.z is the channel count).
+        int c = nc - n * p.inSize.z; // Channel index.
+        for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x; loopX++, outX += blockDim.y)
+        {
+            // Setup X receptive field (same scheme as for Y above).
+            int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x;
+            int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x);
+            int w = min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) - inX;
+            int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x;
+            if (p.flip)
+                filterX = p.filterSize.x - 1 - filterX;
+
+            // Initialize pointers to the first contributing input pixel and the filter tap paired with it.
+            const T* xp = &((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w];
+            const float* fp = &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y];
+            int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x; // Neighbouring input pixels are up.x taps apart; the sign follows p.flip.
+            int filterStepY = ((p.flip) ? p.up.y : -p.up.y) * p.filterStride.y;
+
+            // Inner loop: accumulate input * tap over the w x h receptive field.
+            scalar_t v = 0;
+            for (int y = 0; y < h; y++)
+            {
+                for (int x = 0; x < w; x++)
+                {
+                    v += (scalar_t)(*xp) * (scalar_t)(*fp);
+                    xp += p.inStride.x;
+                    fp += filterStepX;
+                }
+                xp += p.inStride.y - w * p.inStride.x; // Rewind the w steps taken in X and move to the next input row.
+                fp += filterStepY - w * filterStepX; // Same for the filter pointer.
+            }
+
+            // Store result, scaled by the gain and converted back to T.
+            v *= p.gain;
+            ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v;
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// Specialized CUDA implementation for small filters. Factors, filter size and tile shape are template parameters; the filter and an input tile are staged in shared memory.
+
+template <class T, int upx, int upy, int downx, int downy, int filterW, int filterH, int tileOutW, int tileOutH, int loopMinor>
+static __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p)
+{
+    typedef typename InternalType<T>::scalar_t scalar_t; // Accumulation type for element type T.
+    const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1; // Input tile width staged for one output tile.
+    const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1; // Input tile height staged for one output tile.
+    __shared__ volatile scalar_t sf[filterH][filterW]; // Filter taps.
+    __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor]; // Input tile for loopMinor consecutive channels.
+
+    // Calculate tile index. blockIdx.x packs (tile row, minor group), blockIdx.y covers tiles along X, blockIdx.z covers major.
+    int minorBase = blockIdx.x;
+    int tileOutY = minorBase / p.launchMinor;
+    minorBase -= tileOutY * p.launchMinor;
+    minorBase *= loopMinor;
+    tileOutY *= tileOutH;
+    int tileOutXBase = blockIdx.y * p.loopX * tileOutW;
+    int majorBase = blockIdx.z * p.loopMajor;
+    if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y | majorBase >= p.sizeMajor) // Depends only on blockIdx and p, so the whole block exits together.
+        return;
+
+    // Load filter (flipped). Taps outside the actual filter size are stored as zero, so smaller filters also work.
+    for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW; tapIdx += blockDim.x)
+    {
+        int fy = tapIdx / filterW;
+        int fx = tapIdx - fy * filterW;
+        scalar_t v = 0;
+        if (fx < p.filterSize.x & fy < p.filterSize.y)
+        {
+            int ffx = (p.flip) ? fx : p.filterSize.x - 1 - fx; // Source tap index: reversed unless p.flip is set.
+            int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy;
+            v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y];
+        }
+        sf[fy][fx] = v;
+    }
+
+    // Loop over major and X.
+    for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)
+    {
+        int baseNC = major * p.sizeMinor + minorBase; // Flattened batch/channel index of the first channel handled here.
+        int n = baseNC / p.inSize.z;
+        int baseC = baseNC - n * p.inSize.z;
+        for (int loopX = 0, tileOutX = tileOutXBase; loopX < p.loopX & tileOutX < p.outSize.x; loopX++, tileOutX += tileOutW)
+        {
+            // Load input pixels. Positions outside the image or beyond the last channel are stored as zero.
+            int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x;
+            int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y;
+            int tileInX = floor_div(tileMidX, upx);
+            int tileInY = floor_div(tileMidY, upy);
+            __syncthreads(); // Barrier before sx is overwritten for this tile.
+            for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor; inIdx += blockDim.x)
+            {
+                int relC = inIdx; // Decompose the flat index into (relInY, relInX, relC), with relC varying fastest.
+                int relInX = relC / loopMinor;
+                int relInY = relInX / tileInW;
+                relC -= relInX * loopMinor;
+                relInX -= relInY * tileInW;
+                int c = baseC + relC;
+                int inX = tileInX + relInX;
+                int inY = tileInY + relInY;
+                scalar_t v = 0;
+                if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y & c < p.inSize.z)
+                    v = (scalar_t)((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w];
+                sx[relInY][relInX][relC] = v;
+            }
+
+            // Loop over output pixels.
+            __syncthreads(); // Barrier between filling sf/sx above and reading them below.
+            for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor; outIdx += blockDim.x)
+            {
+                int relC = outIdx; // Decompose the flat index into (relOutY, relOutX, relC), with relC varying fastest.
+                int relOutX = relC / loopMinor;
+                int relOutY = relOutX / tileOutW;
+                relC -= relOutX * loopMinor;
+                relOutX -= relOutY * tileOutW;
+                int c = baseC + relC;
+                int outX = tileOutX + relOutX;
+                int outY = tileOutY + relOutY;
+
+                // Setup receptive field, expressed relative to the staged input tile.
+                int midX = tileMidX + relOutX * downx;
+                int midY = tileMidY + relOutY * downy;
+                int inX = floor_div(midX, upx);
+                int inY = floor_div(midY, upy);
+                int relInX = inX - tileInX;
+                int relInY = inY - tileInY;
+                int filterX = (inX + 1) * upx - midX - 1; // flipped
+                int filterY = (inY + 1) * upy - midY - 1; // flipped
+
+                // Inner loop. Taps are visited in steps of upx / upy, one per staged input pixel.
+                if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z)
+                {
+                    scalar_t v = 0;
+                    #pragma unroll
+                    for (int y = 0; y < filterH / upy; y++)
+                        #pragma unroll
+                        for (int x = 0; x < filterW / upx; x++)
+                            v += sx[relInY + y][relInX + x][relC] * sf[filterY + y * upy][filterX + x * upx];
+                    v *= p.gain;
+                    ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v;
+                }
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel selection. Returns {kernel, tileOutW, tileOutH, loopMinor, loopX} for the layout, up/down factors and filter size in p; later matches override earlier ones, so the tightest fit wins.
+
+template <class T> upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p)
+{
+    int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y; // s == 1 (unit channel stride) is treated as channels_last.
+
+    upfirdn2d_kernel_spec spec = {(void*)upfirdn2d_kernel_large<T>, -1,-1,1, 4}; // contiguous (generic fallback; negative tile size marks the large kernel)
+    if (s == 1)           spec = {(void*)upfirdn2d_kernel_large<T>, -1,-1,4, 1}; // channels_last
+
+    if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // contiguous
+    {
+        if (fx <= 7  && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 7,7,  64,16,1>, 64,16,1, 1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 6,6,  64,16,1>, 64,16,1, 1};
+        if (fx <= 5  && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 5,5,  64,16,1>, 64,16,1, 1};
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 4,4,  64,16,1>, 64,16,1, 1};
+        if (fx <= 3  && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 3,3,  64,16,1>, 64,16,1, 1};
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 24,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 20,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 16,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 12,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 8,1,  128,8,1>, 128,8,1, 1};
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,24, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,20, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,16, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,12, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,8,  32,32,1>, 32,32,1, 1};
+    }
+    if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // channels_last
+    {
+        if (fx <= 7  && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 7,7,  16,16,8>,  16,16,8,  1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 6,6,  16,16,8>,  16,16,8,  1}; // Must be instantiated for >= 6x6: the kernel only loads filterW x filterH taps.
+        if (fx <= 5  && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 5,5,  16,16,8>,  16,16,8,  1}; // Likewise >= 5x5; a 4x4 instantiation would drop taps.
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 4,4,  16,16,8>,  16,16,8,  1};
+        if (fx <= 3  && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 3,3,  16,16,8>,  16,16,8,  1}; // Tight fit, matching the contiguous table.
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 24,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 20,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 16,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 12,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 8,1,  128,1,16>, 128,1,16, 1};
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,24, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,20, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,16, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,12, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,1, 1,8,  1,128,16>, 1,128,16, 1};
+    }
+    if (s != 1 && p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // contiguous
+    {
+        if (fx <= 8  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 8,8, 64,16,1>, 64,16,1, 1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 6,6, 64,16,1>, 64,16,1, 1};
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 4,4, 64,16,1>, 64,16,1, 1};
+        if (fx <= 2  && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 2,2, 64,16,1>, 64,16,1, 1};
+    }
+    if (s == 1 && p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // channels_last
+    {
+        if (fx <= 8  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 8,8, 16,16,8>, 16,16,8, 1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 6,6, 16,16,8>, 16,16,8, 1};
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 4,4, 16,16,8>, 16,16,8, 1};
+        if (fx <= 2  && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,2, 1,1, 2,2, 16,16,8>, 16,16,8, 1};
+    }
+    if (s != 1 && p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // contiguous
+    {
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 24,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 20,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 16,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 12,1, 128,8,1>, 128,8,1, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 8,1,  128,8,1>, 128,8,1, 1};
+    }
+    if (s == 1 && p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // channels_last
+    {
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 24,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 20,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 16,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 12,1, 128,1,16>, 128,1,16, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 2,1, 1,1, 8,1,  128,1,16>, 128,1,16, 1};
+    }
+    if (s != 1 && p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // contiguous
+    {
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,24, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,20, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,16, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,12, 32,32,1>, 32,32,1, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,8,  32,32,1>, 32,32,1, 1};
+    }
+    if (s == 1 && p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // channels_last
+    {
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,24, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,20, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,16, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,12, 1,128,16>, 1,128,16, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,2, 1,1, 1,8,  1,128,16>, 1,128,16, 1};
+    }
+    if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) // contiguous
+    {
+        if (fx <= 8  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 8,8,  32,8,1>, 32,8,1, 1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 6,6,  32,8,1>, 32,8,1, 1};
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 4,4,  32,8,1>, 32,8,1, 1};
+        if (fx <= 2  && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 2,2,  32,8,1>, 32,8,1, 1};
+    }
+    if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) // channels_last
+    {
+        if (fx <= 8  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 8,8,  8,8,8>, 8,8,8, 1};
+        if (fx <= 6  && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 6,6,  8,8,8>, 8,8,8, 1};
+        if (fx <= 4  && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 4,4,  8,8,8>, 8,8,8, 1};
+        if (fx <= 2  && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,2, 2,2,  8,8,8>, 8,8,8, 1};
+    }
+    if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) // contiguous
+    {
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 24,1, 64,8,1>, 64,8,1, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 20,1, 64,8,1>, 64,8,1, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 16,1, 64,8,1>, 64,8,1, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 12,1, 64,8,1>, 64,8,1, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 8,1,  64,8,1>, 64,8,1, 1};
+    }
+    if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) // channels_last
+    {
+        if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 24,1, 64,1,8>, 64,1,8, 1};
+        if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 20,1, 64,1,8>, 64,1,8, 1};
+        if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 16,1, 64,1,8>, 64,1,8, 1};
+        if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 12,1, 64,1,8>, 64,1,8, 1};
+        if (fx <= 8  && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 2,1, 8,1,  64,1,8>, 64,1,8, 1};
+    }
+    if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) // contiguous
+    {
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,24, 32,16,1>, 32,16,1, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,20, 32,16,1>, 32,16,1, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,16, 32,16,1>, 32,16,1, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,12, 32,16,1>, 32,16,1, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,8,  32,16,1>, 32,16,1, 1};
+    }
+    if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) // channels_last
+    {
+        if (fx <= 1  && fy <= 24) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,24, 1,64,8>, 1,64,8, 1};
+        if (fx <= 1  && fy <= 20) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,20, 1,64,8>, 1,64,8, 1};
+        if (fx <= 1  && fy <= 16) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,16, 1,64,8>, 1,64,8, 1};
+        if (fx <= 1  && fy <= 12) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,12, 1,64,8>, 1,64,8, 1};
+        if (fx <= 1  && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small<T, 1,1, 1,2, 1,8,  1,64,8>, 1,64,8, 1};
+    }
+    return spec;
+}
+
+//------------------------------------------------------------------------
+// Explicit template instantiations for the element types the host code dispatches on (double, float, half).
+
+template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<double>   (const upfirdn2d_kernel_params& p);
+template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<float>    (const upfirdn2d_kernel_params& p);
+template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<c10::Half>(const upfirdn2d_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/upfirdn2d.h b/torch_utils/ops/upfirdn2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc6e713694d3fcca0e06cecfb9437ffb4932ffe6
--- /dev/null
+++ b/torch_utils/ops/upfirdn2d.h
@@ -0,0 +1,61 @@
+// Copyright (c) SenseTime Research. All rights reserved.
+
+// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <cuda_runtime.h>
+
+//------------------------------------------------------------------------
+// CUDA kernel parameters.
+
+struct upfirdn2d_kernel_params
+{
+    const void*     x;              // Input data; the kernels cast it to const T*.
+    const float*    f;              // Filter taps.
+    void*           y;              // Output data; the kernels cast it to T*.
+
+    int2            up;             // Upsampling factors (x, y).
+    int2            down;           // Downsampling factors (x, y).
+    int2            pad0;           // Padding offsets (x, y) used when locating each output pixel's receptive field.
+    int             flip;           // Nonzero selects the opposite filter traversal direction.
+    float           gain;           // Multiplier applied to every output value.
+
+    int4            inSize;         // [width, height, channel, batch]
+    int4            inStride;       // Element strides, indexed like inSize.
+    int2            filterSize;     // [width, height]
+    int2            filterStride;   // Element strides, indexed like filterSize.
+    int4            outSize;        // [width, height, channel, batch]
+    int4            outStride;      // Element strides, indexed like outSize.
+    int             sizeMinor;      // Channel count when the channel stride is 1, otherwise 1.
+    int             sizeMajor;      // Batch size when the channel stride is 1, otherwise batch * channels.
+
+    int             loopMinor;      // Minor indices processed per thread/block (taken from the kernel spec).
+    int             loopMajor;      // Major indices processed per blockIdx.z.
+    int             loopX;          // Output X iterations per thread/block (taken from the kernel spec).
+    int             launchMinor;    // ceil(sizeMinor / loopMinor).
+    int             launchMajor;    // ceil(sizeMajor / loopMajor); used as the grid's z extent.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel specialization.
+
+struct upfirdn2d_kernel_spec
+{
+    void*   kernel;     // Kernel function pointer handed to cudaLaunchKernel().
+    int     tileOutW;   // Output tile width of the small kernel; negative for the generic large kernel.
+    int     tileOutH;   // Output tile height of the small kernel.
+    int     loopMinor;  // Copied into upfirdn2d_kernel_params::loopMinor by the host code.
+    int     loopX;      // Copied into upfirdn2d_kernel_params::loopX by the host code.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel selection. Defined and explicitly instantiated in upfirdn2d.cu for double, float and c10::Half.
+
+template <class T> upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/upfirdn2d.py b/torch_utils/ops/upfirdn2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a14c15fe8737cf047338d2de795d7e40a1f4e9cc
--- /dev/null
+++ b/torch_utils/ops/upfirdn2d.py
@@ -0,0 +1,386 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom PyTorch ops for efficient resampling of 2D images."""
+
+import os
+import warnings
+import numpy as np
+import torch
+import traceback
+
+from .. import custom_ops
+from .. import misc
+from . import conv2d_gradfix
+
+#----------------------------------------------------------------------------
+
+_inited = False # Flag that _init() checks before trying to load the plugin.
+_plugin = None # Extension module returned by custom_ops.get_plugin(); stays None if loading failed.
+
+def _init():
+    """Build/load the CUDA plugin on the first call; return True if it is available, False if the reference path must be used."""
+    global _inited, _plugin
+    if not _inited:
+        _inited = True # Set up front so a failed build is not retried (and re-warned) on every call.
+        try:
+            _plugin = custom_ops.get_plugin('upfirdn2d_plugin', sources=[os.path.join(os.path.dirname(__file__), s) for s in ['upfirdn2d.cpp', 'upfirdn2d.cu']], extra_cuda_cflags=['--use_fast_math'])
+        except Exception: # Best-effort fallback on any build error; KeyboardInterrupt/SystemExit still propagate.
+            warnings.warn('Failed to build CUDA kernels for upfirdn2d. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
+    return _plugin is not None
+
+def _parse_scaling(scaling):
+    """Normalize an int or `[x, y]` scaling spec into a validated `(sx, sy)` pair of ints >= 1."""
+    factors = [scaling] * 2 if isinstance(scaling, int) else scaling
+    assert isinstance(factors, (list, tuple))
+    assert all(isinstance(v, int) for v in factors)
+    sx, sy = factors
+    assert sx >= 1 and sy >= 1
+    return sx, sy
+
+def _parse_padding(padding):
+    """Normalize an int, `[x, y]`, or `[x0, x1, y0, y1]` padding spec into `(padx0, padx1, pady0, pady1)`."""
+    pads = [padding] * 2 if isinstance(padding, int) else padding
+    assert isinstance(pads, (list, tuple))
+    assert all(isinstance(v, int) for v in pads)
+    if len(pads) == 2:
+        # Symmetric form: the same amount before and after along each axis.
+        pads = [pads[0], pads[0], pads[1], pads[1]]
+    padx0, padx1, pady0, pady1 = pads
+    return padx0, padx1, pady0, pady1
+
+def _get_filter_size(f):
+    """Return `(width, height)` of filter `f`; `None` counts as 1x1 and a 1D filter reports its length for both."""
+    if f is None:
+        return 1, 1
+    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
+    with misc.suppress_tracer_warnings():
+        # Convert to plain Python ints; the context manager presumably silences tracer warnings raised by int().
+        fh, fw = int(f.shape[0]), int(f.shape[-1])
+    expected_shape = [fh, fw][:f.ndim]
+    misc.assert_shape(f, expected_shape)
+    assert fw >= 1 and fh >= 1
+    return fw, fh
+
+#----------------------------------------------------------------------------
+
+def setup_filter(f, device=torch.device('cpu'), normalize=True, flip_filter=False, gain=1, separable=None):
+    r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`.
+
+    Args:
+        f:           Torch tensor, numpy array, or python list of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable),
+                     `[]` (impulse), or
+                     `None` (identity). Never modified in place.
+        device:      Result device (default: cpu).
+        normalize:   Normalize the filter so that it retains the magnitude
+                     for constant input signal (DC)? (default: True).
+        flip_filter: Flip the filter? (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        separable:   Return a separable filter? (default: select automatically).
+
+    Returns:
+        Float32 tensor of the shape
+        `[filter_height, filter_width]` (non-separable) or
+        `[filter_taps]` (separable).
+    """
+    # Validate. `None` becomes a scalar 1, and scalars become a single-tap 1D filter.
+    if f is None:
+        f = 1
+    f = torch.as_tensor(f, dtype=torch.float32)
+    assert f.ndim in [0, 1, 2]
+    assert f.numel() > 0
+    if f.ndim == 0:
+        f = f[np.newaxis]
+
+    # Separable? By default only 1D filters with at least 8 taps stay separable; shorter ones become an outer product.
+    if separable is None:
+        separable = (f.ndim == 1 and f.numel() >= 8)
+    if f.ndim == 1 and not separable:
+        f = f.ger(f)
+    assert f.ndim == (1 if separable else 2)
+
+    # Apply normalize, flip, gain, and device.
+    if normalize:
+        f = f / f.sum() # Out of place: as_tensor() may alias the caller's tensor, which `/=` would overwrite.
+    if flip_filter:
+        f = f.flip(list(range(f.ndim)))
+    f = f * (gain ** (f.ndim / 2)) # A 1D (separable) filter is applied once per axis, so it carries sqrt(gain).
+    f = f.to(device=device)
+    return f
+
+#----------------------------------------------------------------------------
+
+def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl='cuda'):
+    r"""Upsample, pad, FIR-filter, and downsample a batch of 2D images.
+
+    Each channel goes through the following steps independently:
+
+    1. Upsample by inserting N-1 zeros after every pixel (`up`).
+
+    2. Pad with the requested number of zeros on each side (`padding`);
+       a negative amount crops the image instead.
+
+    3. Convolve with the 2D FIR filter `f`. The result shrinks so that the
+       footprint of every output pixel lies within the input image.
+
+    4. Downsample by keeping every Nth pixel (`down`).
+
+    This mirrors scipy.signal.upfirdn(). The fused CUDA op (documented upstream as
+    supporting gradients of arbitrary order) runs when `impl='cuda'`, `x` is on a
+    CUDA device, and the plugin loads; otherwise the reference implementation runs.
+
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        up:          Integer upsampling factor, either one int or a list/tuple
+                     `[x, y]` (default: 1).
+        down:        Integer downsampling factor, either one int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding relative to the upsampled image: a single number,
+                     a list/tuple `[x, y]`, or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    assert isinstance(x, torch.Tensor)
+    assert impl in ['ref', 'cuda']
+    opts = dict(up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain)
+    use_cuda = impl == 'cuda' and x.device.type == 'cuda' and _init()
+    return _upfirdn2d_cuda(**opts).apply(x, f) if use_cuda else _upfirdn2d_ref(x, f, **opts)
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
+    """Slow reference implementation of `upfirdn2d()` using standard PyTorch ops; same arguments as `upfirdn2d()` minus `impl`.
+    """
+    # Validate arguments. A missing filter becomes a 1x1 filter of ones.
+    assert isinstance(x, torch.Tensor) and x.ndim == 4
+    if f is None:
+        f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
+    assert f.dtype == torch.float32 and not f.requires_grad
+    batch_size, num_channels, in_height, in_width = x.shape
+    upx, upy = _parse_scaling(up)
+    downx, downy = _parse_scaling(down)
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+
+    # Upsample by inserting zeros: add singleton dims after H and W, zero-pad them to upy/upx, then fold them back in.
+    x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1])
+    x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
+    x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])
+
+    # Pad or crop: positive amounts are zero-padded, negative amounts are sliced away.
+    x = torch.nn.functional.pad(x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)])
+    x = x[:, :, max(-pady0, 0) : x.shape[2] - max(-pady1, 0), max(-padx0, 0) : x.shape[3] - max(-padx1, 0)]
+
+    # Setup filter.
+    f = f * (gain ** (f.ndim / 2)) # A 1D filter is applied twice below, so each pass carries sqrt(gain).
+    f = f.to(x.dtype)
+    if not flip_filter:
+        f = f.flip(list(range(f.ndim))) # Flipped unless flip_filter is set (documented as False = convolution).
+
+    # Convolve with the filter, one copy per channel (groups=num_channels).
+    f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim)
+    if f.ndim == 4:
+        x = conv2d_gradfix.conv2d(input=x, weight=f, groups=num_channels)
+    else:
+        x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels) # 1 x K kernel: pass along the width.
+        x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels) # K x 1 kernel: pass along the height.
+
+    # Downsample by throwing away pixels.
+    x = x[:, :, ::downy, ::downx]
+    return x
+
+#----------------------------------------------------------------------------
+
+_upfirdn2d_cuda_cache = dict() # Maps the parsed argument tuple to its generated autograd Function class.
+
+def _upfirdn2d_cuda(up=1, down=1, padding=0, flip_filter=False, gain=1):
+    """Fast CUDA implementation of `upfirdn2d()` using custom ops. Returns an autograd
+    Function class specialized for the given arguments; invoke it via `.apply(x, f)`."""
+    # Parse arguments.
+    upx, upy = _parse_scaling(up)
+    downx, downy = _parse_scaling(down)
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+
+    # Reuse the class built earlier for the same parsed arguments, if any.
+    key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter, gain)
+    if key in _upfirdn2d_cuda_cache:
+        return _upfirdn2d_cuda_cache[key]
+
+    # Forward op. The class closes over the parsed arguments above.
+    class Upfirdn2dCuda(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, x, f): # pylint: disable=arguments-differ
+            assert isinstance(x, torch.Tensor) and x.ndim == 4
+            if f is None:
+                f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+            assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
+            y = x
+            if f.ndim == 2:
+                y = _plugin.upfirdn2d(y, f, upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter, gain)
+            else: # 1D filter: one pass along x, then one along y, each carrying sqrt(gain).
+                y = _plugin.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1, padx0, padx1, 0, 0, flip_filter, np.sqrt(gain))
+                y = _plugin.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy, 0, 0, pady0, pady1, flip_filter, np.sqrt(gain))
+            ctx.save_for_backward(f)
+            ctx.x_shape = x.shape # Needed by backward() to derive its padding.
+            return y
+
+        @staticmethod
+        def backward(ctx, dy): # pylint: disable=arguments-differ
+            f, = ctx.saved_tensors
+            _, _, ih, iw = ctx.x_shape
+            _, _, oh, ow = dy.shape
+            fw, fh = _get_filter_size(f)
+            p = [ # Padding [x_before, x_after, y_before, y_after] for the op that computes dx.
+                fw - padx0 - 1,
+                iw * upx - ow * downx + padx0 - upx + 1,
+                fh - pady0 - 1,
+                ih * upy - oh * downy + pady0 - upy + 1,
+            ]
+            dx = None
+            df = None # Never computed; see the assert below.
+
+            if ctx.needs_input_grad[0]: # Same op with up/down swapped and the filter flip inverted.
+                dx = _upfirdn2d_cuda(up=down, down=up, padding=p, flip_filter=(not flip_filter), gain=gain).apply(dy, f)
+
+            assert not ctx.needs_input_grad[1] # A gradient w.r.t. the filter is not supported.
+            return dx, df
+
+    # Add to cache.
+    _upfirdn2d_cuda_cache[key] = Upfirdn2dCuda
+    return Upfirdn2dCuda
+
+#----------------------------------------------------------------------------
+
+def filter2d(x, f, padding=0, flip_filter=False, gain=1, impl='cuda'):
+    r"""Filter a batch of 2D images using the given 2D FIR filter.
+
+    By default, the result is padded so that its shape matches the input.
+    User-specified padding is applied on top of that, with negative values
+    indicating cropping. Pixels outside the image are assumed to be zero.
+
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        padding:     Padding with respect to the output. Can be a single number or a
+                     list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    fw, fh = _get_filter_size(f)
+    p = [
+        padx0 + fw // 2,
+        padx1 + (fw - 1) // 2,
+        pady0 + fh // 2,
+        pady1 + (fh - 1) // 2,
+    ]
+    return upfirdn2d(x, f, padding=p, flip_filter=flip_filter, gain=gain, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl='cuda'):
+    r"""Upsample a batch of 2D images using the given 2D FIR filter.
+
+    By default, the result is padded so that its shape is a multiple of the input.
+    User-specified padding is applied on top of that, with negative values
+    indicating cropping. Pixels outside the image are assumed to be zero.
+
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        up:          Integer upsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding with respect to the output. Can be a single number or a
+                     list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    upx, upy = _parse_scaling(up)
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    fw, fh = _get_filter_size(f)
+    p = [
+        padx0 + (fw + upx - 1) // 2,
+        padx1 + (fw - upx) // 2,
+        pady0 + (fh + upy - 1) // 2,
+        pady1 + (fh - upy) // 2,
+    ]
+    return upfirdn2d(x, f, up=up, padding=p, flip_filter=flip_filter, gain=gain*upx*upy, impl=impl)
+
+#----------------------------------------------------------------------------
+
+def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl='cuda'):
+    r"""Downsample a batch of 2D images using the given 2D FIR filter.
+
+    By default, the result is padded so that its shape is a fraction of the input.
+    User-specified padding is applied on top of that, with negative values
+    indicating cropping. Pixels outside the image are assumed to be zero.
+
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        down:        Integer downsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding with respect to the input. Can be a single number or a
+                     list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    downx, downy = _parse_scaling(down)
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    fw, fh = _get_filter_size(f)
+    p = [
+        padx0 + (fw - downx + 1) // 2,
+        padx1 + (fw - downx) // 2,
+        pady0 + (fh - downy + 1) // 2,
+        pady1 + (fh - downy) // 2,
+    ]
+    return upfirdn2d(x, f, down=down, padding=p, flip_filter=flip_filter, gain=gain, impl=impl)
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/persistence.py b/torch_utils/persistence.py
new file mode 100644
index 0000000000000000000000000000000000000000..50269409c8d9f7c38d7870ee7c8e4660bfb4115c
--- /dev/null
+++ b/torch_utils/persistence.py
@@ -0,0 +1,253 @@
+﻿# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Facilities for pickling Python code alongside other data.
+
+The pickled code is automatically imported into a separate Python module
+during unpickling. This way, any previously exported pickles will remain
+usable even if the original code is no longer available, or if the current
+version of the code is not consistent with what was originally pickled."""
+
+import sys
+import pickle
+import io
+import inspect
+import copy
+import uuid
+import types
+import dnnlib
+
+#----------------------------------------------------------------------------
+
+_version            = 6         # internal version number
+_decorators         = set()     # {decorator_class, ...}
+_import_hooks       = []        # [hook_function, ...]
+_module_to_src_dict = dict()    # {module: src, ...}
+_src_to_module_dict = dict()    # {src: module, ...}
+
+#----------------------------------------------------------------------------
+
+def persistent_class(orig_class):
+    r"""Class decorator that extends a given class to save its source code
+    when pickled.
+
+    Example:
+
+        from torch_utils import persistence
+
+        @persistence.persistent_class
+        class MyNetwork(torch.nn.Module):
+            def __init__(self, num_inputs, num_outputs):
+                super().__init__()
+                self.fc = MyLayer(num_inputs, num_outputs)
+                ...
+
+        @persistence.persistent_class
+        class MyLayer(torch.nn.Module):
+            ...
+
+    When pickled, any instance of `MyNetwork` and `MyLayer` will save its
+    source code alongside other internal state (e.g., parameters, buffers,
+    and submodules). This way, any previously exported pickle will remain
+    usable even if the class definitions have been modified or are no
+    longer available.
+
+    The decorator saves the source code of the entire Python module
+    containing the decorated class. It does *not* save the source code of
+    any imported modules. Thus, the imported modules must be available
+    during unpickling, also including `torch_utils.persistence` itself.
+
+    It is ok to call functions defined in the same module from the
+    decorated class. However, if the decorated class depends on other
+    classes defined in the same module, they must be decorated as well.
+    This is illustrated in the above example in the case of `MyLayer`.
+
+    It is also possible to employ the decorator just-in-time before
+    calling the constructor. For example:
+
+        cls = MyLayer
+        if want_to_make_it_persistent:
+            cls = persistence.persistent_class(cls)
+        layer = cls(num_inputs, num_outputs)
+
+    As an additional feature, the decorator also keeps track of the
+    arguments that were used to construct each instance of the decorated
+    class. The arguments can be queried via `obj.init_args` and
+    `obj.init_kwargs`, and they are automatically pickled alongside other
+    object state. A typical use case is to first unpickle a previous
+    instance of a persistent class, and then upgrade it to use the latest
+    version of the source code:
+
+        with open('old_pickle.pkl', 'rb') as f:
+            old_net = pickle.load(f)
+        new_net = MyNetwork(*old_net.init_args, **old_net.init_kwargs)
+        misc.copy_params_and_buffers(old_net, new_net, require_all=True)
+    """
+    assert isinstance(orig_class, type)
+    if is_persistent(orig_class): # Already decorated; return it unchanged.
+        return orig_class
+
+    assert orig_class.__module__ in sys.modules
+    orig_module = sys.modules[orig_class.__module__]
+    orig_module_src = _module_to_src(orig_module) # Source text of the whole defining module.
+
+    class Decorator(orig_class):
+        _orig_module_src = orig_module_src
+        _orig_class_name = orig_class.__name__
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self._init_args = copy.deepcopy(args) # Deep copies, independent of the caller's objects.
+            self._init_kwargs = copy.deepcopy(kwargs)
+            assert orig_class.__name__ in orig_module.__dict__
+            _check_pickleable(self.__reduce__()) # Fail at construction time if pickling would fail.
+
+        @property
+        def init_args(self):
+            return copy.deepcopy(self._init_args)
+
+        @property
+        def init_kwargs(self):
+            return dnnlib.EasyDict(copy.deepcopy(self._init_kwargs))
+
+        def __reduce__(self):
+            fields = list(super().__reduce__())
+            fields += [None] * max(3 - len(fields), 0) # Make sure the func, args and state slots exist.
+            if fields[0] is not _reconstruct_persistent_obj: # Skip if the tuple was already rewritten.
+                meta = dict(type='class', version=_version, module_src=self._orig_module_src, class_name=self._orig_class_name, state=fields[2])
+                fields[0] = _reconstruct_persistent_obj # reconstruct func
+                fields[1] = (meta,) # reconstruct args
+                fields[2] = None # state dict
+            return tuple(fields)
+
+    Decorator.__name__ = orig_class.__name__ # The subclass keeps the original class name.
+    _decorators.add(Decorator) # Registry consulted by is_persistent().
+    return Decorator
+
+#----------------------------------------------------------------------------
+
+def is_persistent(obj):
+    r"""Test whether the given object or class is persistent, i.e.,
+    whether it will save its source code when pickled.
+    """
+    try:
+        if obj in _decorators:
+            return True
+    except TypeError:
+        pass
+    return type(obj) in _decorators # pylint: disable=unidiomatic-typecheck
+
+#----------------------------------------------------------------------------
+
+def import_hook(hook):
+    r"""Register an import hook that is called whenever a persistent object
+    is being unpickled. A typical use case is to patch the pickled source
+    code to avoid errors and inconsistencies when the API of some imported
+    module has changed.
+
+    The hook should have the following signature:
+
+        hook(meta) -> modified meta
+
+    `meta` is an instance of `dnnlib.EasyDict` with the following fields:
+
+        type:       Type of the persistent object, e.g. `'class'`.
+        version:    Internal version number of `torch_utils.persistence`.
+        module_src  Original source code of the Python module.
+        class_name: Class name in the original Python module.
+        state:      Internal state of the object.
+
+    Example:
+
+        @persistence.import_hook
+        def wreck_my_network(meta):
+            if meta.class_name == 'MyNetwork':
+                print('MyNetwork is being imported. I will wreck it!')
+                meta.module_src = meta.module_src.replace("True", "False")
+            return meta
+    """
+    assert callable(hook)
+    _import_hooks.append(hook)
+
+#----------------------------------------------------------------------------
+
+def _reconstruct_persistent_obj(meta):
+    r"""Hook that is called internally by the `pickle` module to unpickle
+    a persistent object.
+    """
+    meta = dnnlib.EasyDict(meta)
+    meta.state = dnnlib.EasyDict(meta.state)
+    for hook in _import_hooks:
+        meta = hook(meta)
+        assert meta is not None
+
+    assert meta.version == _version
+    module = _src_to_module(meta.module_src)
+
+    assert meta.type == 'class'
+    orig_class = module.__dict__[meta.class_name]
+    decorator_class = persistent_class(orig_class)
+    obj = decorator_class.__new__(decorator_class)
+
+    setstate = getattr(obj, '__setstate__', None)
+    if callable(setstate):
+        setstate(meta.state) # pylint: disable=not-callable
+    else:
+        obj.__dict__.update(meta.state)
+    return obj
+
+#----------------------------------------------------------------------------
+
+def _module_to_src(module):
+    r"""Query the source code of a given Python module.
+    """
+    src = _module_to_src_dict.get(module, None)
+    if src is None:
+        src = inspect.getsource(module)
+        _module_to_src_dict[module] = src
+        _src_to_module_dict[src] = module
+    return src
+
+def _src_to_module(src):
+    r"""Get or create a Python module for the given source code.
+    """
+    module = _src_to_module_dict.get(src, None)
+    if module is None:
+        module_name = "_imported_module_" + uuid.uuid4().hex
+        module = types.ModuleType(module_name)
+        sys.modules[module_name] = module
+        _module_to_src_dict[module] = src
+        _src_to_module_dict[src] = module
+        exec(src, module.__dict__) # pylint: disable=exec-used
+    return module
+
+#----------------------------------------------------------------------------
+
+def _check_pickleable(obj):
+    r"""Check that the given object is pickleable, raising an exception if
+    it is not. This function is expected to be considerably more efficient
+    than actually pickling the object.
+    """
+    def recurse(obj):
+        if isinstance(obj, (list, tuple, set)):
+            return [recurse(x) for x in obj]
+        if isinstance(obj, dict):
+            return [[recurse(x), recurse(y)] for x, y in obj.items()]
+        if isinstance(obj, (str, int, float, bool, bytes, bytearray)):
+            return None # Python primitive types are pickleable.
+        if f'{type(obj).__module__}.{type(obj).__name__}' in ['numpy.ndarray', 'torch.Tensor']:
+            return None # NumPy arrays and PyTorch tensors are pickleable.
+        if is_persistent(obj):
+            return None # Persistent objects are pickleable, by virtue of the constructor check.
+        return obj
+    with io.BytesIO() as f:
+        pickle.dump(recurse(obj), f)
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/training_stats.py b/torch_utils/training_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb94d95286d8aeffe40ad32ca667e53b4622c4f
--- /dev/null
+++ b/torch_utils/training_stats.py
@@ -0,0 +1,270 @@
+# Copyright (c) SenseTime Research. All rights reserved.
+
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Facilities for reporting and collecting training statistics across
+multiple processes and devices. The interface is designed to minimize
+synchronization overhead as well as the amount of boilerplate in user
+code."""
+
+import re
+import numpy as np
+import torch
+import dnnlib
+
+from . import misc
+
+#----------------------------------------------------------------------------
+
+_num_moments    = 3             # [num_scalars, sum_of_scalars, sum_of_squares]
+_reduce_dtype   = torch.float32 # Data type to use for initial per-tensor reduction.
+_counter_dtype  = torch.float64 # Data type to use for the internal counters.
+_rank           = 0             # Rank of the current process.
+_sync_device    = None          # Device to use for multiprocess communication. None = single-process.
+_sync_called    = False         # Has _sync() been called yet?
+_counters       = dict()        # Running counters on each device, updated by report(): name => device => torch.Tensor
+_cumulative     = dict()        # Cumulative counters on the CPU, updated by _sync(): name => torch.Tensor
+
+#----------------------------------------------------------------------------
+
+def init_multiprocessing(rank, sync_device):
+    r"""Initializes `torch_utils.training_stats` for collecting statistics
+    across multiple processes.
+
+    This function must be called after
+    `torch.distributed.init_process_group()` and before `Collector.update()`.
+    The call is not necessary if multi-process collection is not needed.
+
+    Args:
+        rank:           Rank of the current process.
+        sync_device:    PyTorch device to use for inter-process
+                        communication, or None to disable multi-process
+                        collection. Typically `torch.device('cuda', rank)`.
+    """
+    global _rank, _sync_device
+    assert not _sync_called
+    _rank = rank
+    _sync_device = sync_device
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def report(name, value):
+    r"""Broadcasts the given set of scalars to all interested instances of
+    `Collector`, across device and process boundaries.
+
+    This function is expected to be extremely cheap and can be safely
+    called from anywhere in the training loop, loss function, or inside a
+    `torch.nn.Module`.
+
+    Warning: The current implementation expects the set of unique names to
+    be consistent across processes. Please make sure that `report()` is
+    called at least once for each unique name by each process, and in the
+    same order. If a given process has no scalars to broadcast, it can do
+    `report(name, [])` (empty list).
+
+    Args:
+        name:   Arbitrary string specifying the name of the statistic.
+                Averages are accumulated separately for each unique name.
+        value:  Arbitrary set of scalars. Can be a list, tuple,
+                NumPy array, PyTorch tensor, or Python scalar.
+
+    Returns:
+        The same `value` that was passed in.
+    """
+    if name not in _counters:
+        _counters[name] = dict()
+
+    elems = torch.as_tensor(value)
+    if elems.numel() == 0:
+        return value
+
+    elems = elems.detach().flatten().to(_reduce_dtype)
+    moments = torch.stack([
+        torch.ones_like(elems).sum(),
+        elems.sum(),
+        elems.square().sum(),
+    ])
+    assert moments.ndim == 1 and moments.shape[0] == _num_moments
+    moments = moments.to(_counter_dtype)
+
+    device = moments.device
+    if device not in _counters[name]:
+        _counters[name][device] = torch.zeros_like(moments)
+    _counters[name][device].add_(moments)
+    return value
+
+#----------------------------------------------------------------------------
+
+def report0(name, value):
+    r"""Broadcasts the given set of scalars by the first process (`rank = 0`),
+    but ignores any scalars provided by the other processes.
+    See `report()` for further details.
+    """
+    report(name, value if _rank == 0 else [])
+    return value
+
+#----------------------------------------------------------------------------
+
+class Collector:
+    r"""Collects the scalars broadcasted by `report()` and `report0()` and
+    computes their long-term averages (mean and standard deviation) over
+    user-defined periods of time.
+
+    The averages are first collected into internal counters that are not
+    directly visible to the user. They are then copied to the user-visible
+    state as a result of calling `update()` and can then be queried using
+    `mean()`, `std()`, `as_dict()`, etc. Calling `update()` also resets the
+    internal counters for the next round, so that the user-visible state
+    effectively reflects averages collected between the last two calls to
+    `update()`.
+
+    Args:
+        regex:          Regular expression defining which statistics to
+                        collect. The default is to collect everything.
+        keep_previous:  Whether to retain the previous averages if no
+                        scalars were collected on a given round
+                        (default: True).
+    """
+    def __init__(self, regex='.*', keep_previous=True):
+        self._regex = re.compile(regex)
+        self._keep_previous = keep_previous
+        self._cumulative = dict() # name => cumulative moments seen at the last update()
+        self._moments = dict() # name => moments gathered between the last two update() calls
+        self.update() # Adopt the current cumulative totals as the baseline.
+        self._moments.clear() # Discard whatever was reported before construction.
+
+    def names(self):
+        r"""Returns the names of all statistics broadcasted so far that
+        match the regular expression specified at construction time.
+        """
+        return [name for name in _counters if self._regex.fullmatch(name)]
+
+    def update(self):
+        r"""Copies current values of the internal counters to the
+        user-visible state and resets them for the next round.
+
+        If `keep_previous=True` was specified at construction time, the
+        operation is skipped for statistics that have received no scalars
+        since the last update, retaining their previous averages.
+
+        This method performs a number of GPU-to-CPU transfers and one
+        `torch.distributed.all_reduce()`. It is intended to be called
+        periodically in the main training loop, typically once every
+        N training steps.
+        """
+        if not self._keep_previous:
+            self._moments.clear()
+        for name, cumulative in _sync(self.names()):
+            if name not in self._cumulative:
+                self._cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype)
+            delta = cumulative - self._cumulative[name] # New tensor: what arrived since the last update.
+            self._cumulative[name].copy_(cumulative) # Copy, since `cumulative` is the shared global tensor.
+            if float(delta[0]) != 0: # Only overwrite when new scalars arrived.
+                self._moments[name] = delta
+
+    def _get_delta(self, name):
+        r"""Returns the raw moments that were accumulated for the given
+        statistic between the last two calls to `update()`, or zero if
+        no scalars were collected.
+        """
+        assert self._regex.fullmatch(name)
+        if name not in self._moments: # Note: the zero entry is stored, not just returned.
+            self._moments[name] = torch.zeros([_num_moments], dtype=_counter_dtype)
+        return self._moments[name]
+
+    def num(self, name):
+        r"""Returns the number of scalars that were accumulated for the given
+        statistic between the last two calls to `update()`, or zero if
+        no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        return int(delta[0])
+
+    def mean(self, name):
+        r"""Returns the mean of the scalars that were accumulated for the
+        given statistic between the last two calls to `update()`, or NaN if
+        no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        if int(delta[0]) == 0:
+            return float('nan')
+        return float(delta[1] / delta[0])
+
+    def std(self, name):
+        r"""Returns the standard deviation of the scalars that were
+        accumulated for the given statistic between the last two calls to
+        `update()`, or NaN if no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        if int(delta[0]) == 0 or not np.isfinite(float(delta[1])): # Also NaN when the sum is not finite.
+            return float('nan')
+        if int(delta[0]) == 1:
+            return float(0)
+        mean = float(delta[1] / delta[0])
+        raw_var = float(delta[2] / delta[0])
+        return np.sqrt(max(raw_var - np.square(mean), 0)) # Clamped at zero so sqrt never sees a negative value.
+
+    def as_dict(self):
+        r"""Returns the averages accumulated between the last two calls to
+        `update()` as a `dnnlib.EasyDict`. The contents are as follows:
+
+            dnnlib.EasyDict(
+                NAME = dnnlib.EasyDict(num=INT, mean=FLOAT, std=FLOAT),
+                ...
+            )
+        """
+        stats = dnnlib.EasyDict()
+        for name in self.names():
+            stats[name] = dnnlib.EasyDict(num=self.num(name), mean=self.mean(name), std=self.std(name))
+        return stats
+
+    def __getitem__(self, name):
+        r"""Convenience getter.
+        `collector[name]` is a synonym for `collector.mean(name)`.
+        """
+        return self.mean(name)
+
+#----------------------------------------------------------------------------
+
+def _sync(names):
+    r"""Synchronize the global cumulative counters across devices and
+    processes. Called internally by `Collector.update()`.
+    """
+    if len(names) == 0:
+        return []
+    global _sync_called
+    _sync_called = True
+
+    # Collect deltas within current rank.
+    deltas = []
+    device = _sync_device if _sync_device is not None else torch.device('cpu')
+    for name in names:
+        delta = torch.zeros([_num_moments], dtype=_counter_dtype, device=device)
+        for counter in _counters[name].values():
+            delta.add_(counter.to(device))
+            counter.copy_(torch.zeros_like(counter))
+        deltas.append(delta)
+    deltas = torch.stack(deltas)
+
+    # Sum deltas across ranks.
+    if _sync_device is not None:
+        torch.distributed.all_reduce(deltas)
+
+    # Update cumulative values.
+    deltas = deltas.cpu()
+    for idx, name in enumerate(names):
+        if name not in _cumulative:
+            _cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype)
+        _cumulative[name].add_(deltas[idx])
+
+    # Return name-value pairs.
+    return [(name, _cumulative[name]) for name in names]
+
+#----------------------------------------------------------------------------
diff --git a/utils/__init__.py b/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/utils/ffhq_dataset/__init__.py b/utils/ffhq_dataset/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/utils/ffhq_dataset/face_alignment.py b/utils/ffhq_dataset/face_alignment.py
deleted file mode 100644
index 9f62666dc83a83d4e95446b445b9145dfe11f77c..0000000000000000000000000000000000000000
--- a/utils/ffhq_dataset/face_alignment.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import numpy as np
-import scipy.ndimage
-import os
-import PIL.Image
-
-
-def image_align(src_file, dst_file, face_landmarks, resize=True, output_size=1024, transform_size=4096, enable_padding=True):
-        # Align function from FFHQ dataset pre-processing step
-        # https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py
-
-        lm = np.array(face_landmarks)
-        lm_chin          = lm[0  : 17]  # left-right
-        lm_eyebrow_left  = lm[17 : 22]  # left-right
-        lm_eyebrow_right = lm[22 : 27]  # left-right
-        lm_nose          = lm[27 : 31]  # top-down
-        lm_nostrils      = lm[31 : 36]  # top-down
-        lm_eye_left      = lm[36 : 42]  # left-clockwise
-        lm_eye_right     = lm[42 : 48]  # left-clockwise
-        lm_mouth_outer   = lm[48 : 60]  # left-clockwise
-        lm_mouth_inner   = lm[60 : 68]  # left-clockwise
-
-        # Calculate auxiliary vectors.
-        eye_left     = np.mean(lm_eye_left, axis=0)
-        eye_right    = np.mean(lm_eye_right, axis=0)
-        eye_avg      = (eye_left + eye_right) * 0.5
-        eye_to_eye   = eye_right - eye_left
-        mouth_left   = lm_mouth_outer[0]
-        mouth_right  = lm_mouth_outer[6]
-        mouth_avg    = (mouth_left + mouth_right) * 0.5
-        eye_to_mouth = mouth_avg - eye_avg
-
-        # Choose oriented crop rectangle.
-        x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
-        x /= np.hypot(*x)
-        x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
-        y = np.flipud(x) * [-1, 1]
-        c = eye_avg + eye_to_mouth * 0.1
-        quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
-        qsize = np.hypot(*x) * 2
-
-        # Load in-the-wild image.
-        if not os.path.isfile(src_file):
-            print('\nCannot find source image. Please run "--wilds" before "--align".')
-            return
-        #img = cv2.imread(src_file)
-        #img = PIL.Image.fromarray(img)
-        img = PIL.Image.open(src_file)
-
-        # Shrink.
-        shrink = int(np.floor(qsize / output_size * 0.5))
-        if shrink > 1:
-            rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
-            img = img.resize(rsize, PIL.Image.ANTIALIAS)
-            quad /= shrink
-            qsize /= shrink
-
-        # Crop.
-        border = max(int(np.rint(qsize * 0.1)), 3)
-        crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
-        crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
-        if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
-            img = img.crop(crop)
-            quad -= crop[0:2]
-
-        # Pad.
-        pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
-        pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
-        if enable_padding and max(pad) > border - 4:
-            img = np.float32(img)
-            if img.ndim == 2:
-                img = np.stack((img,)*3, axis=-1)
-            pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
-            img = np.pad(img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
-            h, w, _ = img.shape
-            y, x, _ = np.ogrid[:h, :w, :1]
-            mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w-1-x) / pad[2]), 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h-1-y) / pad[3]))
-            blur = qsize * 0.02
-            img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
-            img += (np.median(img, axis=(0,1)) - img) * np.clip(mask, 0.0, 1.0)
-            img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
-            quad += pad[:2]
-
-        xmin, xmax = np.amin(quad[:,0]), np.amax(quad[:,0])
-        ymin, ymax = np.amin(quad[:,1]), np.amax(quad[:,1])
-        quad_size = int(max(xmax-xmin, ymax-ymin)+0.5)
-
-        if not resize:
-            transform_size = output_size = quad_size
-
-
-        # Transform.
-        img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
-        if output_size < transform_size:
-            img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS)
-
-        # Save aligned image.
-        os.makedirs(os.path.dirname(dst_file), exist_ok=True)
-        img.save(dst_file, 'PNG')
-        return quad_size
diff --git a/utils/ffhq_dataset/landmarks_detector.py b/utils/ffhq_dataset/landmarks_detector.py
deleted file mode 100644
index 824dae9314d41eabe7091bce095bca1c0ce61ad0..0000000000000000000000000000000000000000
--- a/utils/ffhq_dataset/landmarks_detector.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import dlib
-import cv2
-
-
-class LandmarksDetector:
-    def __init__(self, predictor_model_path):
-        """
-        :param predictor_model_path: path to shape_predictor_68_face_landmarks.dat file
-        """
-        self.detector = dlib.get_frontal_face_detector() # cnn_face_detection_model_v1 also can be used
-        self.shape_predictor = dlib.shape_predictor(predictor_model_path)
-
-    def get_landmarks(self, image):
-        img = dlib.load_rgb_image(image)
-        dets = self.detector(img, 1)
-        #print('face bounding boxes', dets)
-
-        for detection in dets:
-            face_landmarks = [(item.x, item.y) for item in self.shape_predictor(img, detection).parts()]
-            #print('face landmarks', face_landmarks)
-            yield face_landmarks
-
-    def draw(img, landmarks):
-        for (x, y) in landmarks:
-            cv2.circle(img, (x, y), 1, (0, 0, 255), -1)
-        return img
-
-
-class DNNLandmarksDetector:
-    def __init__(self, predictor_model_path, DNN='TF'):
-        """
-        :param
-        DNN: "TF" or "CAFFE"
-        predictor_model_path: path to shape_predictor_68_face_landmarks.dat file
-        """
-        if DNN == "CAFFE":
-            modelFile = "res10_300x300_ssd_iter_140000_fp16.caffemodel"
-            configFile = "deploy.prototxt"
-            net = cv2.dnn.readNetFromCaffe(configFile, modelFile)
-        else:
-            modelFile = "opencv_face_detector_uint8.pb"
-            configFile = "opencv_face_detector.pbtxt"
-            net = cv2.dnn.readNetFromTensorflow(modelFile, configFile)
-
-        self.shape_predictor = dlib.shape_predictor(predictor_model_path)
-
-    def detect_faces(self, image, conf_threshold=0):
-        H, W = image.shape[:2]
-        blob = cv2.dnn.blobFromImage(image, 1.0, (300, 300), [104, 117, 123], False, False)
-        net.setInput(blob)
-        detections = net.forward()
-        bboxes = []
-        for i in range(detections.shape[2]):
-            confidence = detections[0, 0, i, 2]
-            if confidence > conf_threshold:
-                x1 = int(detections[0, 0, i, 3] * W)
-                y1 = int(detections[0, 0, i, 4] * H)
-                x2 = int(detections[0, 0, i, 5] * W)
-                y2 = int(detections[0, 0, i, 6] * H)
-                bboxes.append(dlib.rectangle(x1, y1, x2, y2))
-        return bboxes
-
-    def get_landmarks(self, image):
-        img = cv2.imread(image)
-        dets = self.detect_faces(img, 0)
-        print('face bounding boxes', dets)
-
-        for detection in dets:
-            face_landmarks = [(item.x, item.y) for item in self.shape_predictor(img, detection).parts()]
-            print('face landmarks', face_landmarks)
-            yield face_landmarks
diff --git a/utils/misc.py b/utils/misc.py
deleted file mode 100644
index e2f772285c79db97a41a662d40f7361aed806448..0000000000000000000000000000000000000000
--- a/utils/misc.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import os
-from typing import Iterable
-
-
-def optional_string(condition: bool, string: str):
-    return string if condition else ""
-
-
-def parent_dir(path: str) -> str:
-    return os.path.basename(os.path.dirname(path))
-
-
-def stem(path: str) -> str:
-    return os.path.splitext(os.path.basename(path))[0]
-
-
-def iterable_to_str(iterable: Iterable) -> str:
-    return ','.join([str(x) for x in iterable])
diff --git a/utils/optimize.py b/utils/optimize.py
deleted file mode 100644
index b2e5518d7bd687ab2ef0106c1e3a40fd40f1531c..0000000000000000000000000000000000000000
--- a/utils/optimize.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import math
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-from typing import (
-    Dict,
-    Iterable,
-    Optional,
-    Tuple,
-)
-
-import numpy as np
-from tqdm import tqdm
-import torch
-from torch import nn
-import torch.nn.functional as F
-from torch.utils.tensorboard import SummaryWriter
-from torchvision.utils import make_grid
-from torchvision.transforms import Resize
-
-#from optim import get_optimizer_class, OPTIMIZER_MAP
-from losses.regularize_noise import NoiseRegularizer
-from optim import RAdam
-from utils.misc import (
-    iterable_to_str,
-    optional_string,
-)
-
-
-class OptimizerArguments:
-    @staticmethod
-    def add_arguments(parser: ArgumentParser):
-        parser.add_argument('--coarse_min', type=int, default=32)
-        parser.add_argument('--wplus_step', type=int, nargs="+", default=[250, 750], help="#step for optimizing w_plus")
-        #parser.add_argument('--lr_rampup', type=float, default=0.05)
-        #parser.add_argument('--lr_rampdown', type=float, default=0.25)
-        parser.add_argument('--lr', type=float, default=0.1)
-        parser.add_argument('--noise_strength', type=float, default=.0)
-        parser.add_argument('--noise_ramp', type=float, default=0.75)
-        #parser.add_argument('--optimize_noise', action="store_true")
-        parser.add_argument('--camera_lr', type=float, default=0.01)
-
-        parser.add_argument("--log_dir", default="log/projector", help="tensorboard log directory")
-        parser.add_argument("--log_freq", type=int, default=10, help="log frequency")
-        parser.add_argument("--log_visual_freq", type=int, default=50, help="log frequency")
-
-    @staticmethod
-    def to_string(args: Namespace) -> str:
-        return (
-            f"lr{args.lr}_{args.camera_lr}-c{args.coarse_min}"
-            + f"-wp({iterable_to_str(args.wplus_step)})"
-            + optional_string(args.noise_strength, f"-n{args.noise_strength}")
-        )
-
-
-class LatentNoiser(nn.Module):
-    def __init__(
-            self, generator: torch.nn,
-            noise_ramp: float = 0.75, noise_strength: float = 0.05,
-            n_mean_latent: int = 10000
-    ):
-        super().__init__()
-
-        self.noise_ramp = noise_ramp
-        self.noise_strength = noise_strength
-
-        with torch.no_grad():
-            # TODO: get 512 from generator
-            noise_sample = torch.randn(n_mean_latent, 512, device=generator.device)
-            latent_out = generator.style(noise_sample)
-
-            latent_mean = latent_out.mean(0)
-            self.latent_std = ((latent_out - latent_mean).pow(2).sum() / n_mean_latent) ** 0.5
-
-    def forward(self, latent: torch.Tensor, t: float) -> torch.Tensor:
-        strength = self.latent_std * self.noise_strength * max(0, 1 - t / self.noise_ramp) ** 2
-        noise = torch.randn_like(latent) * strength
-        return latent + noise
-
-
-class Optimizer:
-    @classmethod
-    def optimize(
-            cls,
-            generator: torch.nn,
-            criterion: torch.nn,
-            degrade: torch.nn,
-            target: torch.Tensor,  # only used in writer since it's mostly baked in criterion
-            latent_init: torch.Tensor,
-            noise_init: torch.Tensor,
-            args: Namespace,
-            writer: Optional[SummaryWriter] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # do not optimize generator
-        generator = generator.eval()
-        target = target.detach()
-        # prepare parameters
-        noises = []
-        for n in noise_init:
-            noise = n.detach().clone()
-            noise.requires_grad = True
-            noises.append(noise)
-
-
-        def create_parameters(latent_coarse):
-            parameters = [
-                {'params': [latent_coarse], 'lr': args.lr},
-                {'params': noises, 'lr': args.lr},
-                {'params': degrade.parameters(), 'lr': args.camera_lr},
-            ]
-            return parameters
-
-
-        device = target.device
-
-        # start optimize
-        total_steps = np.sum(args.wplus_step)
-        max_coarse_size = (2 ** (len(args.wplus_step) - 1)) * args.coarse_min
-        noiser = LatentNoiser(generator, noise_ramp=args.noise_ramp, noise_strength=args.noise_strength).to(device)
-        latent = latent_init.detach().clone()
-        for coarse_level, steps in enumerate(args.wplus_step):
-            if criterion.weights["contextual"] > 0:
-                with torch.no_grad():
-                    # synthesize new sibling image using the current optimization results
-                    # FIXME: update rgbs sibling
-                    sibling, _, _ = generator([latent], input_is_latent=True, randomize_noise=True)
-                    criterion.update_sibling(sibling)
-
-            coarse_size = (2 ** coarse_level) * args.coarse_min
-            latent_coarse, latent_fine = cls.split_latent(
-                    latent, generator.get_latent_size(coarse_size))
-            parameters = create_parameters(latent_coarse)
-            optimizer = RAdam(parameters)
-
-            print(f"Optimizing {coarse_size}x{coarse_size}")
-            pbar = tqdm(range(steps))
-            for si in pbar:
-                latent = torch.cat((latent_coarse, latent_fine), dim=1)
-                niters = si + np.sum(args.wplus_step[:coarse_level])
-                latent_noisy = noiser(latent, niters / total_steps)
-                img_gen, _, rgbs = generator([latent_noisy], input_is_latent=True, noise=noises)
-                # TODO: use coarse_size instead of args.coarse_size for rgb_level
-                loss, losses = criterion(img_gen, degrade=degrade, noises=noises, rgbs=rgbs)
-
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-
-                NoiseRegularizer.normalize(noises)
-
-                # log
-                pbar.set_description("; ".join([f"{k}: {v.item(): .3e}" for k, v in losses.items()]))
-
-                if writer is not None and niters % args.log_freq == 0:
-                    cls.log_losses(writer, niters, loss, losses, criterion.weights)
-                    cls.log_parameters(writer, niters, degrade.named_parameters())
-                if writer is not None and niters % args.log_visual_freq == 0:
-                    cls.log_visuals(writer, niters, img_gen, target, degraded=degrade(img_gen), rgbs=rgbs)
-
-            latent = torch.cat((latent_coarse, latent_fine), dim=1).detach()
-
-        return latent, noises
-
-    @staticmethod
-    def split_latent(latent: torch.Tensor, coarse_latent_size: int):
-        latent_coarse = latent[:, :coarse_latent_size]
-        latent_coarse.requires_grad = True
-        latent_fine = latent[:, coarse_latent_size:]
-        latent_fine.requires_grad = False
-        return latent_coarse, latent_fine
-
-    @staticmethod
-    def log_losses(
-            writer: SummaryWriter,
-            niters: int,
-            loss_total: torch.Tensor,
-            losses: Dict[str, torch.Tensor],
-            weights: Optional[Dict[str, torch.Tensor]] = None
-    ):
-        writer.add_scalar("loss", loss_total.item(), niters)
-
-        for name, loss in losses.items():
-            writer.add_scalar(name, loss.item(), niters)
-            if weights is not None:
-                writer.add_scalar(f"weighted_{name}", weights[name] * loss.item(), niters)
-
-    @staticmethod
-    def log_parameters(
-            writer: SummaryWriter,
-            niters: int,
-            named_parameters: Iterable[Tuple[str, torch.nn.Parameter]],
-    ):
-        for name, para in named_parameters:
-            writer.add_scalar(name, para.item(), niters)
-
-    @classmethod
-    def log_visuals(
-            cls,
-            writer: SummaryWriter,
-            niters: int,
-            img: torch.Tensor,
-            target: torch.Tensor,
-            degraded=None,
-            rgbs=None,
-    ):
-        if target.shape[-1] != img.shape[-1]:
-            visual = make_grid(img, nrow=1, normalize=True, range=(-1, 1))
-            writer.add_image("pred", visual, niters)
-
-        def resize(img):
-            return F.interpolate(img, size=target.shape[2:], mode="area")
-
-        vis = resize(img)
-        if degraded is not None:
-            vis = torch.cat((resize(degraded), vis), dim=-1)
-        visual = make_grid(torch.cat((target.repeat(1, vis.shape[1] // target.shape[1], 1, 1), vis), dim=-1), nrow=1, normalize=True, range=(-1, 1))
-        writer.add_image("gnd[-degraded]-pred", visual, niters)
-
-        # log to rgbs
-        if rgbs is not None:
-            cls.log_torgbs(writer, niters, rgbs)
-
-    @staticmethod
-    def log_torgbs(writer: SummaryWriter, niters: int, rgbs: Iterable[torch.Tensor], prefix: str = ""):
-        for ri, rgb in enumerate(rgbs):
-            scale = 2 ** (-(len(rgbs) - ri))
-            visual = make_grid(torch.cat((rgb, rgb / scale), dim=-1), nrow=1, normalize=True, range=(-1, 1))
-            writer.add_image(f"{prefix}to_rbg_{2 ** (ri + 2)}", visual, niters)
-
diff --git a/utils/projector_arguments.py b/utils/projector_arguments.py
deleted file mode 100644
index 5fdf92897177fab9040abf666cbf6f4c7153ad78..0000000000000000000000000000000000000000
--- a/utils/projector_arguments.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-from argparse import (
-    ArgumentParser,
-    Namespace,
-)
-
-from models.degrade import DegradeArguments
-from tools.initialize import InitializerArguments
-from losses.joint_loss import LossArguments
-from utils.optimize import OptimizerArguments
-from .misc import (
-    optional_string,
-    iterable_to_str,
-)
-
-
-class ProjectorArguments:
-    def __init__(self):
-        parser = ArgumentParser("Project image into stylegan2")
-        self.add_arguments(parser)
-        self.parser = parser
-
-    @classmethod
-    def add_arguments(cls, parser: ArgumentParser):
-        parser.add_argument('--rand_seed', type=int, default=None,
-                            help="random seed")
-        cls.add_io_args(parser)
-        cls.add_preprocess_args(parser)
-        cls.add_stylegan_args(parser)
-
-        InitializerArguments.add_arguments(parser)
-        LossArguments.add_arguments(parser)
-        OptimizerArguments.add_arguments(parser)
-        DegradeArguments.add_arguments(parser)
-
-    @staticmethod
-    def add_stylegan_args(parser: ArgumentParser):
-        parser.add_argument('--ckpt', type=str, default="checkpoint/stylegan2-ffhq-config-f.pt",
-                            help="stylegan2 checkpoint")
-        parser.add_argument('--generator_size', type=int, default=1024,
-                            help="output size of the generator")
-
-    @staticmethod
-    def add_io_args(parser: ArgumentParser) -> ArgumentParser:
-        parser.add_argument('input', type=str, help="input image path")
-        parser.add_argument('--results_dir', default="results/projector", help="directory to save results.")
-
-    @staticmethod
-    def add_preprocess_args(parser: ArgumentParser):
-       # parser.add_argument("--match_histogram", action='store_true', help="match the histogram of the input image to the sibling")
-       pass
-
-    def parse(self, args=None, namespace=None) -> Namespace:
-        args = self.parser.parse_args(args, namespace=namespace)
-        self.print(args)
-        return args
-
-    @staticmethod
-    def print(args: Namespace):
-        print("------------ Parameters -------------")
-        args = vars(args)
-        for k, v in sorted(args.items()):
-            print(f"{k}: {v}")
-        print("-------------------------------------")
-
-    @staticmethod
-    def to_string(args: Namespace) -> str:
-        return "-".join([
-            #+ optional_string(args.no_camera_response, "-noCR")
-            #+ optional_string(args.match_histogram, "-MH")
-            DegradeArguments.to_string(args),
-            InitializerArguments.to_string(args),
-            LossArguments.to_string(args),
-            OptimizerArguments.to_string(args),
-        ]) + optional_string(args.rand_seed is not None, f"-S{args.rand_seed}")
-
diff --git a/utils/torch_helpers.py b/utils/torch_helpers.py
deleted file mode 100644
index 9aa728ce97c7ac3a73e0e66986cccbb16d5adacc..0000000000000000000000000000000000000000
--- a/utils/torch_helpers.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch
-from torch import nn
-
-
-def device(gpu_id=0):
-    if torch.cuda.is_available():
-        return torch.device(f"cuda:{gpu_id}")
-    return torch.device("cpu")
-
-
-def load_matching_state_dict(model: nn.Module, state_dict):
-    model_dict = model.state_dict()
-    filtered_dict = {k: v for k, v in state_dict.items() if k in model_dict}
-    model.load_state_dict(filtered_dict)
-
-
-def resize(t: torch.Tensor, size: int) -> torch.Tensor:
-    B, C, H, W = t.shape
-    t = t.reshape(B, C, size, H // size, size, W // size)
-    return t.mean([3, 5])
-
-
-def make_image(tensor):
-    return (
-        tensor.detach()
-            .clamp_(min=-1, max=1)
-            .add(1)
-            .div_(2)
-            .mul(255)
-            .type(torch.uint8)
-            .permute(0, 2, 3, 1)
-            .to('cpu')
-            .numpy()
-    )
-
-