diff --git a/README.md b/README.md index 4eb561151aa35a6dc082122d8243617992010761..ae894b6915e1bdb4c435bec1a60a97be2d3efc71 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@ --- -title: DATID 3D -emoji: 📉 -colorFrom: red -colorTo: red +title: DATID-3D +emoji: 🛋 +colorFrom: gray +colorTo: yellow sdk: gradio -sdk_version: 3.35.2 -app_file: app.py +sdk_version: 3.28.3 +app_file: datid3d_gradio_app.py pinned: false -license: apache-2.0 +license: mit --- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/datid3d_gradio_app.py b/datid3d_gradio_app.py new file mode 100644 index 0000000000000000000000000000000000000000..039a99b0b22d7a07f2a49615ccd32c6358f71c29 --- /dev/null +++ b/datid3d_gradio_app.py @@ -0,0 +1,357 @@ +import argparse +import gradio as gr +import os +import shutil +from glob import glob +from PIL import Image +import numpy as np +import matplotlib.pyplot as plt +from torchvision.utils import make_grid, save_image +from torchvision.io import read_image +import torchvision.transforms.functional as F +from functools import partial +from datetime import datetime + + + +plt.rcParams["savefig.bbox"] = 'tight' + +def show(imgs): + if not isinstance(imgs, list): + imgs = [imgs] + fig, axs = plt.subplots(ncols=len(imgs), squeeze=False) + for i, img in enumerate(imgs): + img = F.to_pil_image(img.detach()) + axs[0, i].imshow(np.asarray(img)) + axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + +class Intermediate: + def __init__(self): + self.input_img = None + self.input_img_time = 0 + + +model_ckpts = {"elf": "ffhq-elf.pkl", + "greek_statue": "ffhq-greek_statue.pkl", + "hobbit": "ffhq-hobbit.pkl", + "lego": "ffhq-lego.pkl", + "masquerade": "ffhq-masquerade.pkl", + "neanderthal": "ffhq-neanderthal.pkl", + "orc": "ffhq-orc.pkl", + "pixar": "ffhq-pixar.pkl", + "skeleton": "ffhq-skeleton.pkl", + "stone_golem": "ffhq-stone_golem.pkl", + "super_mario": "ffhq-super_mario.pkl", + "tekken": "ffhq-tekken.pkl", + "yoda": "ffhq-yoda.pkl", + "zombie": "ffhq-zombie.pkl", + "cat_in_Zootopia": "cat-cat_in_Zootopia.pkl", + "fox_in_Zootopia": "cat-fox_in_Zootopia.pkl", + "golden_aluminum_animal": "cat-golden_aluminum_animal.pkl", + } + +manip_model_ckpts = {"super_mario": "ffhq-super_mario.pkl", + "lego": "ffhq-lego.pkl", + "neanderthal": "ffhq-neanderthal.pkl", + "orc": "ffhq-orc.pkl", + "pixar": "ffhq-pixar.pkl", + "skeleton": "ffhq-skeleton.pkl", + "stone_golem": "ffhq-stone_golem.pkl", + "tekken": "ffhq-tekken.pkl", + "greek_statue": "ffhq-greek_statue.pkl", + "yoda": "ffhq-yoda.pkl", + "zombie": "ffhq-zombie.pkl", + "elf": "ffhq-elf.pkl", + } + + +def TextGuidedImageTo3D(intermediate, img, model_name, num_inversion_steps, truncation): + if img != intermediate.input_img: + if os.path.exists('input_imgs_gradio'): + shutil.rmtree('input_imgs_gradio') + os.makedirs('input_imgs_gradio', exist_ok=True) + img.save('input_imgs_gradio/input.png') + intermediate.input_img = img + now = datetime.now() + intermediate.input_img_time = now.strftime('%Y-%m-%d_%H:%M:%S') + + all_model_names = manip_model_ckpts.keys() + generator_type = 'ffhq' + + if model_name == 'all': + _no_video_models = [] + for _model_name in all_model_names: + if not os.path.exists(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/finetuned___{model_ckpts[_model_name]}__input_inv.mp4'): + print() + _no_video_models.append(_model_name) + + model_names_command = '' + for _model_name in _no_video_models: + if not 
os.path.exists(f'finetuned/{model_ckpts[_model_name]}'):
+                command = f"""wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/{model_ckpts[_model_name]} -O finetuned/{model_ckpts[_model_name]}
+                """
+                os.system(command)
+
+            model_names_command += f"finetuned/{model_ckpts[_model_name]} "
+
+        w_pths = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/3_inversion_result/*.pt'))
+        if len(w_pths) == 0:
+            mode = 'manip'
+        else:
+            mode = 'manip_from_inv'
+
+        if len(_no_video_models) > 0:
+            command = f"""python datid3d_test.py --mode {mode} \
+            --indir='input_imgs_gradio' \
+            --generator_type={generator_type} \
+            --outdir='test_runs' \
+            --trunc={truncation} \
+            --network {model_names_command} \
+            --num_inv_steps={num_inversion_steps} \
+            --down_src_eg3d_from_nvidia=False \
+            --name_tag='_gradio_{intermediate.input_img_time}' \
+            --shape=False \
+            --w_frames 60
+            """
+            print(command)
+            os.system(command)
+
+        aligned_img_pth = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/2_pose_result/*.png'))[0]
+        aligned_img = Image.open(aligned_img_pth)
+
+        result_imgs = []
+        for _model_name in all_model_names:
+            img_pth = f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/finetuned___{model_ckpts[_model_name]}__input_inv.png'
+            result_imgs.append(read_image(img_pth))
+
+        result_grid_pt = make_grid(result_imgs, nrow=1)
+        result_img = F.to_pil_image(result_grid_pt)
+    else:
+        if not os.path.exists(f'finetuned/{model_ckpts[model_name]}'):
+            command = f"""wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/{model_ckpts[model_name]} -O finetuned/{model_ckpts[model_name]}
+            """
+            os.system(command)
+
+        if not os.path.exists(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/finetuned___{model_ckpts[model_name]}__input_inv.mp4'):
+            w_pths = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/3_inversion_result/*.pt'))
+            if len(w_pths) == 0:
+                mode = 'manip'
+            else:
+                mode = 'manip_from_inv'
+
+            command = f"""python datid3d_test.py --mode {mode} \
+            --indir='input_imgs_gradio' \
+            --generator_type={generator_type} \
+            --outdir='test_runs' \
+            --trunc={truncation} \
+            --network finetuned/{model_ckpts[model_name]} \
+            --num_inv_steps={num_inversion_steps} \
+            --down_src_eg3d_from_nvidia=False \
+            --name_tag='_gradio_{intermediate.input_img_time}' \
+            --shape=False \
+            --w_frames 60"""
+            print(command)
+            os.system(command)
+
+        aligned_img_pth = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/2_pose_result/*.png'))[0]
+        aligned_img = Image.open(aligned_img_pth)
+
+        result_img_pth = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/*{model_ckpts[model_name]}*.png'))[0]
+        result_img = Image.open(result_img_pth)
+
+
+
+
+    if model_name == 'all':
+        result_video_pth = f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/finetuned___ffhq-all__input_inv.mp4'
+        if os.path.exists(result_video_pth):
+            os.remove(result_video_pth)
+        command = 'ffmpeg '
+        for _model_name in all_model_names:
+            command += f'-i test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/finetuned___ffhq-{_model_name}.pkl__input_inv.mp4 '
+        command += '-filter_complex 
"[0:v]scale=2*iw:-1[v0];[1:v]scale=2*iw:-1[v1];[2:v]scale=2*iw:-1[v2];[3:v]scale=2*iw:-1[v3];[4:v]scale=2*iw:-1[v4];[5:v]scale=2*iw:-1[v5];[6:v]scale=2*iw:-1[v6];[7:v]scale=2*iw:-1[v7];[8:v]scale=2*iw:-1[v8];[9:v]scale=2*iw:-1[v9];[10:v]scale=2*iw:-1[v10];[11:v]scale=2*iw:-1[v11];[v0][v1][v2][v3][v4][v5][v6][v7][v8][v9][v10][v11]xstack=inputs=12:layout=0_0|w0_0|w0+w1_0|w0+w1+w2_0|0_h0|w4_h0|w4+w5_h0|w4+w5+w6_h0|0_h0+h4|w8_h0+h4|w8+w9_h0+h4|w8+w9+w10_h0+h4" ' + command += f" -vcodec libx264 {result_video_pth}" + print() + print(command) + os.system(command) + + else: + result_video_pth = sorted(glob(f'test_runs/manip_3D_recon_gradio_{intermediate.input_img_time}/4_manip_result/*{model_ckpts[model_name]}*.mp4'))[0] + + return aligned_img, result_img, result_video_pth + + +def SampleImage(model_name, num_samples, truncation, seed): + seed_list = np.random.RandomState(seed).choice(np.arange(10000), num_samples).tolist() + seeds = '' + for seed in seed_list: + seeds += f'{seed},' + seeds = seeds[:-1] + + if model_name in ["fox_in_Zootopia", "cat_in_Zootopia", "golden_aluminum_animal"]: + generator_type = 'cat' + else: + generator_type = 'ffhq' + + if not os.path.exists(f'finetuned/{model_ckpts[model_name]}'): + command = f"""wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/{model_ckpts[model_name]} -O finetuned/{model_ckpts[model_name]} + """ + os.system(command) + + command = f"""python datid3d_test.py --mode image \ + --generator_type={generator_type} \ + --outdir='test_runs' \ + --seeds={seeds} \ + --trunc={truncation} \ + --network=finetuned/{model_ckpts[model_name]} \ + --shape=False""" + print(command) + os.system(command) + + result_img_pths = sorted(glob(f'test_runs/image/*{model_ckpts[model_name]}*.png')) + result_imgs = [] + for img_pth in result_img_pths: + result_imgs.append(read_image(img_pth)) + + result_grid_pt = make_grid(result_imgs, nrow=1) + result_grid_pil = F.to_pil_image(result_grid_pt) + return result_grid_pil + + + + +def SampleVideo(model_name, grid_height, truncation, seed): + seed_list = np.random.RandomState(seed).choice(np.arange(10000), grid_height**2).tolist() + seeds = '' + for seed in seed_list: + seeds += f'{seed},' + seeds = seeds[:-1] + + if model_name in ["fox_in_Zootopia", "cat_in_Zootopia", "golden_aluminum_animal"]: + generator_type = 'cat' + else: + generator_type = 'ffhq' + + if not os.path.exists(f'finetuned/{model_ckpts[model_name]}'): + command = f"""wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/{model_ckpts[model_name]} -O finetuned/{model_ckpts[model_name]} + """ + os.system(command) + + command = f"""python datid3d_test.py --mode video \ + --generator_type={generator_type} \ + --outdir='test_runs' \ + --seeds={seeds} \ + --trunc={truncation} \ + --grid={grid_height}x{grid_height} \ + --network=finetuned/{model_ckpts[model_name]} \ + --shape=False""" + print(command) + os.system(command) + + result_video_pth = sorted(glob(f'test_runs/video/*{model_ckpts[model_name]}*.mp4'))[0] + + return result_video_pth + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--share', action='store_true', help="public url") + args = parser.parse_args() + + demo = gr.Blocks(title="DATID-3D Interactive Demo") + os.makedirs('finetuned', exist_ok=True) + intermediate = Intermediate() + with demo: + gr.Markdown("# DATID-3D Interactive Demo") + gr.Markdown( + "### Demo of the CVPR 2023 paper \"DATID-3D: Diversity-Preserved Domain 
Adaptation Using Text-to-Image Diffusion for 3D Generative Model\"")
+
+        with gr.Tab("Text-guided Manipulated 3D reconstruction"):
+            gr.Markdown("Text-guided Image-to-3D Translation")
+            with gr.Row():
+                with gr.Column(scale=1, variant='panel'):
+                    t_image_input = gr.Image(source='upload', type="pil", interactive=True)
+
+                    t_model_name = gr.Radio(["super_mario", "lego", "neanderthal", "orc",
+                                             "pixar", "skeleton", "stone_golem", "tekken",
+                                             "greek_statue", "yoda", "zombie", "elf", "all"],
+                                            label="Model fine-tuned through DATID-3D",
+                                            value="super_mario", interactive=True)
+                    with gr.Accordion("Advanced Options", open=False):
+                        t_truncation = gr.Slider(label="Truncation psi", minimum=0, maximum=1.0, step=0.01, randomize=False, value=0.8)
+                        t_num_inversion_steps = gr.Slider(200, 1000, value=200, step=1, label='Number of steps for the inversion')
+                    with gr.Row():
+                        t_button_gen_result = gr.Button("Generate Result", variant='primary')
+                        # t_button_gen_video = gr.Button("Generate Video", variant='primary')
+                        # t_button_gen_image = gr.Button("Generate Image", variant='secondary')
+                    with gr.Row():
+                        t_align_image_result = gr.Image(label="Alignment result", interactive=False)
+                with gr.Column(scale=1, variant='panel'):
+                    with gr.Row():
+                        t_video_result = gr.Video(label="Video result", interactive=False)
+
+                    with gr.Row():
+                        t_image_result = gr.Image(label="Image result", interactive=False)
+
+
+        with gr.Tab("Sample Images"):
+            with gr.Row():
+                with gr.Column(scale=1, variant='panel'):
+                    i_model_name = gr.Radio(
+                        ["elf", "greek_statue", "hobbit", "lego", "masquerade", "neanderthal", "orc", "pixar",
+                         "skeleton", "stone_golem", "super_mario", "tekken", "yoda", "zombie", "fox_in_Zootopia",
+                         "cat_in_Zootopia", "golden_aluminum_animal"],
+                        label="Model fine-tuned through DATID-3D",
+                        value="super_mario", interactive=True)
+                    i_num_samples = gr.Slider(0, 20, value=4, step=1, label='Number of samples')
+                    i_seed = gr.Slider(label="Seed", minimum=0, maximum=1000000000, step=1, value=1235)
+                    with gr.Accordion("Advanced Options", open=False):
+                        i_truncation = gr.Slider(label="Truncation psi", minimum=0, maximum=1.0, step=0.01, randomize=False, value=0.8)
+                    with gr.Row():
+                        i_button_gen_image = gr.Button("Generate Image", variant='primary')
+                with gr.Column(scale=1, variant='panel'):
+                    with gr.Row():
+                        i_image_result = gr.Image(label="Image result", interactive=False)
+
+
+        with gr.Tab("Sample Videos"):
+            with gr.Row():
+                with gr.Column(scale=1, variant='panel'):
+                    v_model_name = gr.Radio(
+                        ["elf", "greek_statue", "hobbit", "lego", "masquerade", "neanderthal", "orc", "pixar",
+                         "skeleton", "stone_golem", "super_mario", "tekken", "yoda", "zombie", "fox_in_Zootopia",
+                         "cat_in_Zootopia", "golden_aluminum_animal"],
+                        label="Model fine-tuned through DATID-3D",
+                        value="super_mario", interactive=True)
+                    v_grid_height = gr.Slider(0, 5, value=2, step=1, label='Height of the grid')
+                    v_seed = gr.Slider(label="Seed", minimum=0, maximum=1000000000, step=1, value=1235)
+                    with gr.Accordion("Advanced Options", open=False):
+                        v_truncation = gr.Slider(label="Truncation psi", minimum=0, maximum=1.0, step=0.01, randomize=False,
+                                                 value=0.8)
+
+                    with gr.Row():
+                        v_button_gen_video = gr.Button("Generate Video", variant='primary')
+
+                with gr.Column(scale=1, variant='panel'):
+
+                    with gr.Row():
+                        v_video_result = gr.Video(label="Video result", interactive=False)
+
+
+
+
+
+        # functions
+        t_button_gen_result.click(fn=partial(TextGuidedImageTo3D, intermediate),
+                                  inputs=[t_image_input, t_model_name, t_num_inversion_steps, t_truncation],
+                                  outputs=[t_align_image_result, t_image_result, t_video_result])
+        i_button_gen_image.click(fn=SampleImage,
+                                 inputs=[i_model_name, i_num_samples, i_truncation, i_seed],
+                                 outputs=[i_image_result])
+        v_button_gen_video.click(fn=SampleVideo,
+                                 inputs=[v_model_name, v_grid_height, v_truncation, v_seed],
+                                 outputs=[v_video_result])
+
+    demo.queue(concurrency_count=1)
+    demo.launch(share=args.share)
+
diff --git a/datid3d_test.py b/datid3d_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..058c45c3bd9f8290794ed60a7a8be33c1e16dc25
--- /dev/null
+++ b/datid3d_test.py
@@ -0,0 +1,251 @@
+import os
+from os.path import join as opj
+import argparse
+from glob import glob
+
+### Parameters
+parser = argparse.ArgumentParser()
+
+# For all
+parser.add_argument('--mode', type=str, required=True, choices=['image', 'video', 'manip', 'manip_from_inv'],
+                    help="image: Sample images and shapes, "
+                         "video: Sample pose-controlled videos, "
+                         "manip: Manipulated 3D reconstruction from images, "
+                         "manip_from_inv: Manipulated 3D reconstruction from inverted latent")
+parser.add_argument('--network', type=str, nargs='+', required=True)
+parser.add_argument('--generator_type', default='ffhq', type=str, choices=['ffhq', 'cat'])  # ffhq, cat
+parser.add_argument('--outdir', type=str, default='test_runs')
+parser.add_argument('--trunc', type=float, default=0.7)
+parser.add_argument('--seeds', type=str, default='100-200')
+parser.add_argument('--down_src_eg3d_from_nvidia', default=True)
+parser.add_argument('--num_inv_steps', default=300, type=int)
+# Manipulated 3D reconstruction
+parser.add_argument('--indir', type=str, default='input_imgs')
+parser.add_argument('--name_tag', type=str, default='')
+# Sample images
+parser.add_argument('--shape', default=True)
+parser.add_argument('--shape_format', type=str, choices=['.mrc', '.ply'], default='.mrc')
+parser.add_argument('--shape_only_first', type=bool, default=False)
+# Sample pose-controlled videos
+parser.add_argument('--grid', default='1x1')
+parser.add_argument('--w_frames', type=int, default=120)
+
+
+
+args = parser.parse_args()
+os.makedirs(args.outdir, exist_ok=True)
+print()
+
+
+network_command = ''
+for network_path in args.network:
+    network_command += f"--network {opj('..', network_path)} "
+
+
+
+### Sample images
+if args.mode == 'image':
+    image_path = opj(args.outdir, f'image{args.name_tag}')
+    os.makedirs(image_path, exist_ok=True)
+
+    os.chdir('eg3d')
+    command = f"""python gen_samples.py \
+    {network_command} \
+    --seeds={args.seeds} \
+    --generator_type={args.generator_type} \
+    --outdir={opj('..', image_path)} \
+    --shapes={args.shape} \
+    --shape_format={args.shape_format} \
+    --shape_only_first={args.shape_only_first} \
+    --trunc={args.trunc} \
+    """
+    print(f"{command} \n")
+    os.system(command)
+    os.chdir('..')
+
+
+
+
+
+### Sample pose-controlled videos
+if args.mode == 'video':
+    video_path = opj(args.outdir, f'video{args.name_tag}')
+    os.makedirs(video_path, exist_ok=True)
+
+    os.chdir('eg3d')
+    command = f"""python gen_videos.py \
+    {network_command} \
+    --seeds={args.seeds} \
+    --generator_type={args.generator_type} \
+    --outdir={opj('..', video_path)} \
+    --shapes=False \
+    --trunc={args.trunc} \
+    --grid={args.grid} \
+    --w-frames={args.w_frames}
+    """
+    print(f"{command} \n")
+    os.system(command)
+    os.chdir('..')
+
+
+### Manipulated 3D reconstruction from images
+if args.mode == 'manip':
+    input_path = opj(args.indir)
+    align_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', 
'1_align_result') + pose_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '2_pose_result') + inversion_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '3_inversion_result') + manip_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '4_manip_result') + + os.makedirs(opj(args.outdir, f'manip_3D_recon{args.name_tag}'), exist_ok=True) + os.makedirs(align_path, exist_ok=True) + os.makedirs(pose_path, exist_ok=True) + os.makedirs(inversion_path, exist_ok=True) + os.makedirs(manip_path, exist_ok=True) + + os.chdir('eg3d') + if args.generator_type == 'cat': + generator_id = 'afhqcats512-128.pkl' + else: + generator_id = 'ffhqrebalanced512-128.pkl' + generator_path = f'pretrained/{generator_id}' + if not os.path.exists(generator_path): + os.makedirs(f'pretrained', exist_ok=True) + print("Pretrained EG3D model cannot be found. Downloading the pretrained EG3D models.") + if args.down_src_eg3d_from_nvidia == True: + os.system(f'wget -c https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/{generator_id} -O {generator_path}') + else: + os.system(f'wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/nvidia_{generator_id} -O {generator_path}') + os.chdir('..') + + ## Align images and Pose extraction + os.chdir('pose_estimation') + if not os.path.exists('checkpoints/pretrained/epoch_20.pth') or not os.path.exists('BFM'): + print(f"BFM and pretrained DeepFaceRecon3D model cannot be found. Downloading the pretrained pose estimation model and BFM files, put epoch_20.pth in ./pose_estimation/checkpoints/pretrained/ and put unzip BFM.zip in ./pose_estimation/.") + + try: + from gdown import download as drive_download + drive_download(f'https://drive.google.com/uc?id=1mdqkEUepHZROeOj99pXogAPJPqzBDN2G', './BFM.zip', quiet=False) + os.system('unzip BFM.zip') + drive_download(f'https://drive.google.com/uc?id=1zawY7jYDJlUGnSAXn1pgIHgIvJpiSmj5', './checkpoints/pretrained/epoch_20.pth', quiet=False) + except: + os.system("pip install -U --no-cache-dir gdown --pre") + from gdown import download as drive_download + drive_download(f'https://drive.google.com/uc?id=1mdqkEUepHZROeOj99pXogAPJPqzBDN2G', './BFM.zip', quiet=False) + os.system('unzip BFM.zip') + drive_download(f'https://drive.google.com/uc?id=1zawY7jYDJlUGnSAXn1pgIHgIvJpiSmj5', './checkpoints/pretrained/epoch_20.pth', quiet=False) + + print() + command = f"""python extract_pose.py 0 \ + {opj('..', input_path)} {opj('..', align_path)} {opj('..', pose_path)} + """ + print(f"{command} \n") + os.system(command) + os.chdir('..') + + ## Invert images to the latent space of 3D GANs + os.chdir('eg3d') + command = f"""python run_inversion.py \ + --outdir={opj('..', inversion_path)} \ + --latent_space_type=w_plus \ + --network={generator_path} \ + --image_path={opj('..', pose_path)} \ + --num_steps={args.num_inv_steps} + """ + print(f"{command} \n") + os.system(command) + os.chdir('..') + + ## Generate videos, images and mesh + os.chdir('eg3d') + w_pths = sorted(glob(opj('..', inversion_path, '*.pt'))) + if len(w_pths) == 0: + print("No inverted latent") + exit() + for w_pth in w_pths: + print(f"{w_pth} \n") + + command = f"""python gen_samples.py \ + {network_command} \ + --w_pth={w_pth} \ + --seeds='100-200' \ + --generator_type={args.generator_type} \ + --outdir={opj('..', manip_path)} \ + --shapes={args.shape} \ + --shape_format={args.shape_format} \ + --shape_only_first={args.shape_only_first} \ + --trunc={args.trunc} \ + """ + print(f"{command} \n") + os.system(command) + + 
command = f"""python gen_videos.py \
+        {network_command} \
+        --w_pth={w_pth} \
+        --seeds='100-200' \
+        --generator_type={args.generator_type} \
+        --outdir={opj('..', manip_path)} \
+        --shapes=False \
+        --trunc={args.trunc} \
+        --grid=1x1 \
+        --w-frames={args.w_frames}
+        """
+        print(f"{command} \n")
+        os.system(command)
+    os.chdir('..')
+
+
+
+
+
+### Manipulated 3D reconstruction from inverted latent
+if args.mode == 'manip_from_inv':
+    input_path = opj(args.indir)
+    align_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '1_align_result')
+    pose_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '2_pose_result')
+    inversion_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '3_inversion_result')
+    manip_path = opj(args.outdir, f'manip_3D_recon{args.name_tag}', '4_manip_result')
+
+    os.makedirs(opj(args.outdir, f'manip_3D_recon{args.name_tag}'), exist_ok=True)
+    os.makedirs(align_path, exist_ok=True)
+    os.makedirs(pose_path, exist_ok=True)
+    os.makedirs(inversion_path, exist_ok=True)
+    os.makedirs(manip_path, exist_ok=True)
+
+    ## Generate videos, images and mesh
+    os.chdir('eg3d')
+    w_pths = sorted(glob(opj('..', inversion_path, '*.pt')))
+    if len(w_pths) == 0:
+        print("No inverted latent")
+        exit()
+    for w_pth in w_pths:
+        print(f"{w_pth} \n")
+
+        command = f"""python gen_samples.py \
+        {network_command} \
+        --w_pth={w_pth} \
+        --seeds='100-200' \
+        --generator_type={args.generator_type} \
+        --outdir={opj('..', manip_path)} \
+        --shapes={args.shape} \
+        --shape_format={args.shape_format} \
+        --shape_only_first={args.shape_only_first} \
+        --trunc={args.trunc} \
+        """
+        print(f"{command} \n")
+        os.system(command)
+
+        command = f"""python gen_videos.py \
+        {network_command} \
+        --w_pth={w_pth} \
+        --seeds='100-200' \
+        --generator_type={args.generator_type} \
+        --outdir={opj('..', manip_path)} \
+        --shapes=False \
+        --trunc={args.trunc} \
+        --grid=1x1
+        """
+        print(f"{command} \n")
+        os.system(command)
+    os.chdir('..')
+
+
diff --git a/datid3d_train.py b/datid3d_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..83fe0ad88730821258cf4f6905a2de352c95b844
--- /dev/null
+++ b/datid3d_train.py
@@ -0,0 +1,104 @@
+import os
+import argparse
+
+### Parameters
+parser = argparse.ArgumentParser()
+
+# For all
+parser.add_argument('--mode', type=str, required=True, choices=['pdg', 'ft', 'both'],
+                    help="pdg: Pose-aware dataset generation, ft: Fine-tuning 3D generative models, both: Doing both")
+parser.add_argument('--down_src_eg3d_from_nvidia', default=True)
+# Pose-aware dataset generation
+parser.add_argument('--pdg_prompt', type=str, required=True)
+parser.add_argument('--pdg_generator_type', default='ffhq', type=str, choices=['ffhq', 'cat'])  # ffhq, cat
+parser.add_argument('--pdg_strength', default=0.7, type=float)
+parser.add_argument('--pdg_guidance_scale', default=8, type=float)
+parser.add_argument('--pdg_num_images', default=1000, type=int)
+parser.add_argument('--pdg_sd_model_id', default='stabilityai/stable-diffusion-2-1-base', type=str)
+parser.add_argument('--pdg_num_inference_steps', default=50, type=int)
+parser.add_argument('--pdg_name_tag', default='', type=str)
+# Fine-tuning 3D generative models
+parser.add_argument('--ft_generator_type', default='same', help="'same': use the same type as pdg_generator_type", type=str, choices=['ffhq', 'cat', 'same'])
+parser.add_argument('--ft_kimg', default=200, type=int)
+parser.add_argument('--ft_batch', default=20, type=int)
+parser.add_argument('--ft_tick', default=1, type=int) +parser.add_argument('--ft_snap', default=50, type=int) +parser.add_argument('--ft_outdir', default='../training_runs', type=str) # +parser.add_argument('--ft_gpus', default=1, type=str) # +parser.add_argument('--ft_workers', default=8, type=int) # +parser.add_argument('--ft_data_max_size', default=500000000, type=int) # +parser.add_argument('--ft_freeze_dec_sr', default=True, type=bool) # + +args = parser.parse_args() + + +### Pose-aware target generation +if args.mode in ['pdg', 'both']: + os.chdir('eg3d') + if args.pdg_generator_type == 'cat': + pdg_generator_id = 'afhqcats512-128.pkl' + else: + pdg_generator_id = 'ffhqrebalanced512-128.pkl' + + pdg_generator_path = f'pretrained/{pdg_generator_id}' + if not os.path.exists(pdg_generator_path): + os.makedirs(f'pretrained', exist_ok=True) + print("Pretrained EG3D model cannot be found. Downloading the pretrained EG3D models.") + if args.down_src_eg3d_from_nvidia == True: + os.system(f'wget -c https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/{pdg_generator_id} -O {pdg_generator_path}') + else: + os.system(f'wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/nvidia_{pdg_generator_id} -O {pdg_generator_path}') + command = f"""python datid3d_data_gen.py \ + --prompt="{args.pdg_prompt}" \ + --data_type={args.pdg_generator_type} \ + --strength={args.pdg_strength} \ + --guidance_scale={args.pdg_guidance_scale} \ + --num_images={args.pdg_num_images} \ + --sd_model_id="{args.pdg_sd_model_id}" \ + --num_inference_steps={args.pdg_num_inference_steps} \ + --name_tag={args.pdg_name_tag} + """ + print(f"{command} \n") + os.system(command) + os.chdir('..') + +### Filtering process +# TODO + + +### Fine-tuning 3D generative models +if args.mode in ['ft', 'both']: + os.chdir('eg3d') + if args.ft_generator_type == 'same': + args.ft_generator_type = args.pdg_generator_type + + if args.ft_generator_type == 'cat': + ft_generator_id = 'afhqcats512-128.pkl' + else: + ft_generator_id = 'ffhqrebalanced512-128.pkl' + + ft_generator_path = f'pretrained/{ft_generator_id}' + if not os.path.exists(ft_generator_path): + os.makedirs(f'pretrained', exist_ok=True) + print("Pretrained EG3D model cannot be found. 
Downloading the pretrained EG3D models.") + if args.down_src_eg3d_from_nvidia == True: + os.system(f'wget -c https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/{ft_generator_id} -O {ft_generator_path}') + else: + os.system(f'wget https://huggingface.co/gwang-kim/datid3d-finetuned-eg3d-models/resolve/main/finetuned_models/nvidia_{ft_generator_id} -O {ft_generator_path}') + + dataset_id = f'data_{args.pdg_generator_type}_{args.pdg_prompt.replace(" ", "_")}{args.pdg_name_tag}' + dataset_path = f'./exp_data/{dataset_id}/{dataset_id}.zip' + + + command = f"""python train.py \ + --outdir={args.ft_outdir} \ + --cfg={args.ft_generator_type} \ + --data="{dataset_path}" \ + --resume={ft_generator_path} --freeze_dec_sr={args.ft_freeze_dec_sr} \ + --batch={args.ft_batch} --workers={args.ft_workers} --gpus={args.ft_gpus} \ + --tick={args.ft_tick} --snap={args.ft_snap} --data_max_size={args.ft_data_max_size} --kimg={args.ft_kimg} \ + --gamma=5 --aug=ada --neural_rendering_resolution_final=128 --gen_pose_cond=True --gpc_reg_prob=0.8 --metrics=None + """ + print(f"{command} \n") + os.system(command) + os.chdir('..') diff --git a/eg3d/LICENSE.txt b/eg3d/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4495c4a8ad8d0f01a5209f4d79f6c8f18252aea --- /dev/null +++ b/eg3d/LICENSE.txt @@ -0,0 +1,99 @@ +Copyright (c) 2021-2022, NVIDIA Corporation & affiliates. All rights +reserved. + + +NVIDIA Source Code License for EG3D + + +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. + +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. 
Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by NVIDIA + or it’s affiliates commercially or non-commercially. As used + herein, "non-commercially" means for research or evaluation + purposes only and not for any direct or indirect monetary gain. + + 3.4 Patent Claims. If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grants in Sections 2.1) will terminate + immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor’s or its affiliates’ names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grants in Sections 2.1) + will terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. + +======================================================================= diff --git a/eg3d/README.md b/eg3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..87a743936a9fe32959c63a7320713f90f9aa837a --- /dev/null +++ b/eg3d/README.md @@ -0,0 +1,216 @@ +## Efficient Geometry-aware 3D Generative Adversarial Networks (EG3D)
Official PyTorch implementation of the CVPR 2022 paper + +![Teaser image](./docs/teaser.jpeg) + +**Efficient Geometry-aware 3D Generative Adversarial Networks**
+Eric R. Chan*, Connor Z. Lin*, Matthew A. Chan*, Koki Nagano*, Boxiao Pan, Shalini De Mello, Orazio Gallo, Leonidas Guibas, Jonathan Tremblay, Sameh Khamis, Tero Karras, and Gordon Wetzstein
*\* equal contribution*
+
https://nvlabs.github.io/eg3d/
+ +Abstract: *Unsupervised generation of high-quality multi-view-consistent images and 3D shapes using only collections of single-view 2D photographs has been a long-standing challenge. Existing 3D GANs are either compute-intensive or make approximations that are not 3D-consistent; the former limits quality and resolution of the generated images and the latter adversely affects multi-view consistency and shape quality. In this work, we improve the computational efficiency and image quality of 3D GANs without overly relying on these approximations. We introduce an expressive hybrid explicit-implicit network architecture that, together with other design choices, synthesizes not only high-resolution multi-view-consistent images in real time but also produces high-quality 3D geometry. By decoupling feature generation and neural rendering, our framework is able to leverage state-of-the-art 2D CNN generators, such as StyleGAN2, and inherit their efficiency and expressiveness. We demonstrate state-of-the-art 3D-aware synthesis with FFHQ and AFHQ Cats, among other experiments.* + +For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/) + +## Requirements + +* We recommend Linux for performance and compatibility reasons. +* 1–8 high-end NVIDIA GPUs. We have done all testing and development using V100, RTX3090, and A100 GPUs. +* 64-bit Python 3.8 and PyTorch 1.11.0 (or later). See https://pytorch.org for PyTorch install instructions. +* CUDA toolkit 11.3 or later. (Why is a separate CUDA toolkit installation required? We use the custom CUDA extensions from the StyleGAN3 repo. Please see [Troubleshooting](https://github.com/NVlabs/stylegan3/blob/main/docs/troubleshooting.md#why-is-cuda-toolkit-installation-necessary)). +* Python libraries: see [environment.yml](../environment.yml) for exact library dependencies. You can use the following commands with Miniconda3 to create and activate your Python environment: + - `cd eg3d` + - `conda env create -f environment.yml` + - `conda activate eg3d` + +## Getting started + +Pre-trained networks are stored as `*.pkl` files that can be referenced using local filenames. See [Models](./docs/models.md) for download links to pre-trained checkpoints. + + +## Generating media + +```.bash +# Generate videos using pre-trained model + +python gen_videos.py --outdir=out --trunc=0.7 --seeds=0-3 --grid=2x2 \ + --network=networks/network_snapshot.pkl + +# Generate the same 4 seeds in an interpolation sequence + +python gen_videos.py --outdir=out --trunc=0.7 --seeds=0-3 --grid=1x1 \ + --network=networks/network_snapshot.pkl +``` + +```.bash +# Generate images and shapes (as .mrc files) using pre-trained model + +python gen_samples.py --outdir=out --trunc=0.7 --shapes=true --seeds=0-3 \ + --network=networks/network_snapshot.pkl +``` + +We visualize our .mrc shape files with [UCSF Chimerax](https://www.cgl.ucsf.edu/chimerax/). + +To visualize a shape in ChimeraX do the following: +1. Import the `.mrc` file with `File > Open` +1. Find the selected shape in the Volume Viewer tool + 1. The Volume Viewer tool is located under `Tools > Volume Data > Volume Viewer` +1. Change volume type to "Surface" +1. Change step size to 1 +1. Change level set to 10 + 1. Note that the optimal level can vary by each object, but is usually between 2 and 20. Individual adjustment may make certain shapes slightly sharper +1. 
In the `Lighting` menu in the top bar, change lighting to "Full" + + +## Interactive visualization + +This release contains an interactive model visualization tool that can be used to explore various characteristics of a trained model. To start it, run: + +```.bash +python visualizer.py +``` + +See the [`Visualizer Guide`](./docs/visualizer_guide.md) for a description of important options. + + +## Using networks from Python + +You can use pre-trained networks in your own Python code as follows: + +```.python +with open('ffhq.pkl', 'rb') as f: + G = pickle.load(f)['G_ema'].cuda() # torch.nn.Module +z = torch.randn([1, G.z_dim]).cuda() # latent codes +c = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) # camera parameters +img = G(z, c)['image'] # NCHW, float32, dynamic range [-1, +1], no truncation +``` + +The above code requires `torch_utils` and `dnnlib` to be accessible via `PYTHONPATH`. It does not need source code for the networks themselves — their class definitions are loaded from the pickle via `torch_utils.persistence`. + +The pickle contains three networks. `'G'` and `'D'` are instantaneous snapshots taken during training, and `'G_ema'` represents a moving average of the generator weights over several training steps. The networks are regular instances of `torch.nn.Module`, with all of their parameters and buffers placed on the CPU at import and gradient computation disabled by default. + +The generator consists of two submodules, `G.mapping` and `G.synthesis`, that can be executed separately. They also support various additional options: + +```.python +w = G.mapping(z, conditioning_params, truncation_psi=0.5, truncation_cutoff=8) +img = G.synthesis(w, camera_params)['image] +``` + +Please refer to [`gen_samples.py`](gen_samples.py) for complete code example. + +## Preparing datasets + +Datasets are stored as uncompressed ZIP archives containing uncompressed PNG files and a metadata file `dataset.json` for labels. Each label is a 25-length list of floating point numbers, which is the concatenation of the flattened 4x4 camera extrinsic matrix and flattened 3x3 camera intrinsic matrix. Custom datasets can be created from a folder containing images; see `python dataset_tool.py --help` for more information. Alternatively, the folder can also be used directly as a dataset, without running it through `dataset_tool.py` first, but doing so may lead to suboptimal performance. + +**FFHQ**: Download and process the [Flickr-Faces-HQ dataset](https://github.com/NVlabs/ffhq-dataset) using the following commands. + +1. Ensure the [Deep3DFaceRecon_pytorch](https://github.com/sicxu/Deep3DFaceRecon_pytorch/tree/6ba3d22f84bf508f0dde002da8fff277196fef21) submodule is properly initialized +```.bash +git submodule update --init --recursive +``` + +2. Run the following commands +```.bash +cd dataset_preprocessing/ffhq +python runme.py +``` + +Optional: preprocessing in-the-wild portrait images. +In case you want to crop in-the-wild face images and extract poses using [Deep3DFaceRecon_pytorch](https://github.com/sicxu/Deep3DFaceRecon_pytorch/tree/6ba3d22f84bf508f0dde002da8fff277196fef21) in a way that align with the FFHQ data above and the checkpoint, run the following commands +```.bash +cd dataset_preprocessing/ffhq +python preprocess_in_the_wild.py --indir=INPUT_IMAGE_FOLDER +``` + + +**AFHQv2**: Download and process the [AFHQv2 dataset](https://github.com/clovaai/stargan-v2/blob/master/README.md#animal-faces-hq-dataset-afhq) with the following. + +1. 
Download the AFHQv2 images zipfile from the [StarGAN V2 repository](https://github.com/clovaai/stargan-v2/) +2. Run the following commands: +```.bash +cd dataset_preprocessing/afhq +python runme.py "path/to/downloaded/afhq.zip" +``` + +**ShapeNet Cars**: Download and process renderings of the cars category of [ShapeNet](https://shapenet.org/) using the following commands. +NOTE: the following commands download renderings of the ShapeNet cars from the [Scene Representation Networks repository](https://www.vincentsitzmann.com/srns/). + +```.bash +cd dataset_preprocessing/shapenet +python runme.py +``` + +## Training + +You can train new networks using `train.py`. For example: + +```.bash +# Train with FFHQ from scratch with raw neural rendering resolution=64, using 8 GPUs. +python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_512.zip \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True + +# Second stage finetuning of FFHQ to 128 neural rendering resolution (optional). +python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_512.zip \ + --resume=~/training-runs/ffhq_experiment_dir/network-snapshot-025000.pkl \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True --neural_rendering_resolution_final=128 + +# Train with Shapenet from scratch, using 8 GPUs. +python train.py --outdir=~/training-runs --cfg=shapenet --data=~/datasets/cars_train.zip \ + --gpus=8 --batch=32 --gamma=0.3 + +# Train with AFHQ, finetuning from FFHQ with ADA, using 8 GPUs. +python train.py --outdir=~/training-runs --cfg=afhq --data=~/datasets/afhq.zip \ + --gpus=8 --batch=32 --gamma=5 --aug=ada --neural_rendering_resolution_final=128 --gen_pose_cond=True --gpc_reg_prob=0.8 +``` + +Please see the [Training Guide](./docs/training_guide.md) for a guide to setting up a training run on your own data. + +Please see [Models](./docs/models.md) for recommended training configurations and download links for pre-trained checkpoints. + + +The results of each training run are saved to a newly created directory, for example `~/training-runs/00000-ffhq-ffhq512-gpus8-batch32-gamma1`. The training loop exports network pickles (`network-snapshot-.pkl`) and random image grids (`fakes.png`) at regular intervals (controlled by `--snap`). For each exported pickle, it evaluates FID (controlled by `--metrics`) and logs the result in `metric-fid50k_full.jsonl`. It also records various statistics in `training_stats.jsonl`, as well as `*.tfevents` if TensorBoard is installed. + +## Quality metrics + +By default, `train.py` automatically computes FID for each network pickle exported during training. We recommend inspecting `metric-fid50k_full.jsonl` (or TensorBoard) at regular intervals to monitor the training progress. When desired, the automatic computation can be disabled with `--metrics=none` to speed up the training slightly. + +Additional quality metrics can also be computed after the training: + +```.bash +# Previous training run: look up options automatically, save result to JSONL file. +python calc_metrics.py --metrics=fid50k_full \ + --network=~/training-runs/network-snapshot-000000.pkl + +# Pre-trained network pickle: specify dataset explicitly, print result to stdout. +python calc_metrics.py --metrics=fid50k_full --data=~/datasets/ffhq_512.zip \ + --network=ffhq-128.pkl +``` + +Note that the metrics can be quite expensive to compute (up to 1h), and many of them have an additional one-off cost for each new dataset (up to 30min). 
Also note that the evaluation is done using a different random seed each time, so the results will vary if the same metric is computed multiple times. + +References: +1. [GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium](https://arxiv.org/abs/1706.08500), Heusel et al. 2017 +2. [Demystifying MMD GANs](https://arxiv.org/abs/1801.01401), Bińkowski et al. 2018 + + + +## Citation + +``` +@inproceedings{Chan2022, + author = {Eric R. Chan and Connor Z. Lin and Matthew A. Chan and Koki Nagano and Boxiao Pan and Shalini De Mello and Orazio Gallo and Leonidas Guibas and Jonathan Tremblay and Sameh Khamis and Tero Karras and Gordon Wetzstein}, + title = {Efficient Geometry-aware {3D} Generative Adversarial Networks}, + booktitle = {CVPR}, + year = {2022} +} +``` + +## Development + +This is a research reference implementation and is treated as a one-time code drop. As such, we do not accept outside code contributions in the form of pull requests. + +## Acknowledgements + +We thank David Luebke, Jan Kautz, Jaewoo Seo, Jonathan Granskog, Simon Yuen, Alex Evans, Stan Birchfield, Alexander Bergman, and Joy Hsu for feedback on drafts, Alex Chan, Giap Nguyen, and Trevor Chan for help with diagrams, and Colette Kress and Bryan Catanzaro for allowing use of their photographs. This project was in part supported by Stanford HAI and a Samsung GRO. Koki Nagano and Eric Chan were partially supported by DARPA’s Semantic Forensics (SemaFor) contract (HR0011-20-3-0005). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the U.S. Government. Distribution Statement "A" (Approved for Public Release, Distribution Unlimited). diff --git a/eg3d/calc_metrics.py b/eg3d/calc_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..d401b22554e142a4146a0eb0fc952cc20742e3e7 --- /dev/null +++ b/eg3d/calc_metrics.py @@ -0,0 +1,190 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Calculate quality metrics for previous training run or pretrained network pickle.""" + +import os +import click +import json +import tempfile +import copy +import torch + +import dnnlib +import legacy +from metrics import metric_main +from metrics import metric_utils +from torch_utils import training_stats +from torch_utils import custom_ops +from torch_utils import misc +from torch_utils.ops import conv2d_gradfix + +#---------------------------------------------------------------------------- + +def subprocess_fn(rank, args, temp_dir): + dnnlib.util.Logger(should_flush=True) + + # Init torch.distributed. 
+ if args.num_gpus > 1: + init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init')) + if os.name == 'nt': + init_method = 'file:///' + init_file.replace('\\', '/') + torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus) + else: + init_method = f'file://{init_file}' + torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus) + + # Init torch_utils. + sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None + training_stats.init_multiprocessing(rank=rank, sync_device=sync_device) + if rank != 0 or not args.verbose: + custom_ops.verbosity = 'none' + + # Configure torch. + device = torch.device('cuda', rank) + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + conv2d_gradfix.enabled = True + + # Print network summary. + G = copy.deepcopy(args.G).eval().requires_grad_(False).to(device) + if rank == 0 and args.verbose: + z = torch.empty([1, G.z_dim], device=device) + c = torch.empty([1, G.c_dim], device=device) + misc.print_module_summary(G, [z, c]) + + # Calculate each metric. + for metric in args.metrics: + if rank == 0 and args.verbose: + print(f'Calculating {metric}...') + progress = metric_utils.ProgressMonitor(verbose=args.verbose) + result_dict = metric_main.calc_metric(metric=metric, G=G, dataset_kwargs=args.dataset_kwargs, + num_gpus=args.num_gpus, rank=rank, device=device, progress=progress) + if rank == 0: + metric_main.report_metric(result_dict, run_dir=args.run_dir, snapshot_pkl=args.network_pkl) + if rank == 0 and args.verbose: + print() + + # Done. + if rank == 0 and args.verbose: + print('Exiting...') + +#---------------------------------------------------------------------------- + +def parse_comma_separated_list(s): + if isinstance(s, list): + return s + if s is None or s.lower() == 'none' or s == '': + return [] + return s.split(',') + +#---------------------------------------------------------------------------- + +@click.command() +@click.pass_context +@click.option('network_pkl', '--network', help='Network pickle filename or URL', metavar='PATH', required=True) +@click.option('--metrics', help='Quality metrics', metavar='[NAME|A,B,C|none]', type=parse_comma_separated_list, default='fid50k_full', show_default=True) +@click.option('--data', help='Dataset to evaluate against [default: look up]', metavar='[ZIP|DIR]') +@click.option('--mirror', help='Enable dataset x-flips [default: look up]', type=bool, metavar='BOOL') +@click.option('--gpus', help='Number of GPUs to use', type=int, default=1, metavar='INT', show_default=True) +@click.option('--verbose', help='Print optional information', type=bool, default=True, metavar='BOOL', show_default=True) + +def calc_metrics(ctx, network_pkl, metrics, data, mirror, gpus, verbose): + """Calculate quality metrics for previous training run or pretrained network pickle. + + Examples: + + \b + # Previous training run: look up options automatically, save result to JSONL file. + python calc_metrics.py --metrics=eqt50k_int,eqr50k \\ + --network=~/training-runs/00000-stylegan3-r-mydataset/network-snapshot-000000.pkl + + \b + # Pre-trained network pickle: specify dataset explicitly, print result to stdout. 
+ python calc_metrics.py --metrics=fid50k_full --data=~/datasets/ffhq-1024x1024.zip --mirror=1 \\ + --network=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-t-ffhq-1024x1024.pkl + + \b + Recommended metrics: + fid50k_full Frechet inception distance against the full dataset. + kid50k_full Kernel inception distance against the full dataset. + pr50k3_full Precision and recall againt the full dataset. + ppl2_wend Perceptual path length in W, endpoints, full image. + eqt50k_int Equivariance w.r.t. integer translation (EQ-T). + eqt50k_frac Equivariance w.r.t. fractional translation (EQ-T_frac). + eqr50k Equivariance w.r.t. rotation (EQ-R). + + \b + Legacy metrics: + fid50k Frechet inception distance against 50k real images. + kid50k Kernel inception distance against 50k real images. + pr50k3 Precision and recall against 50k real images. + is50k Inception score for CIFAR-10. + """ + dnnlib.util.Logger(should_flush=True) + + # Validate arguments. + args = dnnlib.EasyDict(metrics=metrics, num_gpus=gpus, network_pkl=network_pkl, verbose=verbose) + if not all(metric_main.is_valid_metric(metric) for metric in args.metrics): + ctx.fail('\n'.join(['--metrics can only contain the following values:'] + metric_main.list_valid_metrics())) + if not args.num_gpus >= 1: + ctx.fail('--gpus must be at least 1') + + # Load network. + if not dnnlib.util.is_url(network_pkl, allow_file_urls=True) and not os.path.isfile(network_pkl): + ctx.fail('--network must point to a file or URL') + if args.verbose: + print(f'Loading network from "{network_pkl}"...') + with dnnlib.util.open_url(network_pkl, verbose=args.verbose) as f: + network_dict = legacy.load_network_pkl(f) + args.G = network_dict['G_ema'] # subclass of torch.nn.Module + + # Initialize dataset options. + if data is not None: + args.dataset_kwargs = dnnlib.EasyDict(class_name='training.dataset.ImageFolderDataset', path=data) + elif network_dict['training_set_kwargs'] is not None: + args.dataset_kwargs = dnnlib.EasyDict(network_dict['training_set_kwargs']) + else: + ctx.fail('Could not look up dataset options; please specify --data') + + # Finalize dataset options. + args.dataset_kwargs.resolution = args.G.img_resolution + args.dataset_kwargs.use_labels = (args.G.c_dim != 0) + if mirror is not None: + args.dataset_kwargs.xflip = mirror + + # Print dataset options. + if args.verbose: + print('Dataset options:') + print(json.dumps(args.dataset_kwargs, indent=2)) + + # Locate run dir. + args.run_dir = None + if os.path.isfile(network_pkl): + pkl_dir = os.path.dirname(network_pkl) + if os.path.isfile(os.path.join(pkl_dir, 'training_options.json')): + args.run_dir = pkl_dir + + # Launch processes. 
+ if args.verbose: + print('Launching processes...') + torch.multiprocessing.set_start_method('spawn') + with tempfile.TemporaryDirectory() as temp_dir: + if args.num_gpus == 1: + subprocess_fn(rank=0, args=args, temp_dir=temp_dir) + else: + torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + calc_metrics() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/eg3d/camera_utils.py b/eg3d/camera_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d4be88a575b4f43cce42f71222215e9b912d9f9 --- /dev/null +++ b/eg3d/camera_utils.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +""" +Helper functions for constructing camera parameter matrices. Primarily used in visualization and inference scripts. +""" + +import math + +import torch +import torch.nn as nn + +from training.volumetric_rendering import math_utils + +class GaussianCameraPoseSampler: + """ + Samples pitch and yaw from a Gaussian distribution and returns a camera pose. + Camera is specified as looking at the origin. + If horizontal and vertical stddev (specified in radians) are zero, gives a + deterministic camera pose with yaw=horizontal_mean, pitch=vertical_mean. + The coordinate system is specified with y-up, z-forward, x-left. + Horizontal mean is the azimuthal angle (rotation around y axis) in radians, + vertical mean is the polar angle (angle from the y axis) in radians. + A point along the z-axis has azimuthal_angle=0, polar_angle=pi/2. + + Example: + For a camera pose looking at the origin with the camera at position [0, 0, 1]: + cam2world = GaussianCameraPoseSampler.sample(math.pi/2, math.pi/2, radius=1) + """ + + @staticmethod + def sample(horizontal_mean, vertical_mean, horizontal_stddev=0, vertical_stddev=0, radius=1, batch_size=1, device='cpu'): + h = torch.randn((batch_size, 1), device=device) * horizontal_stddev + horizontal_mean + v = torch.randn((batch_size, 1), device=device) * vertical_stddev + vertical_mean + v = torch.clamp(v, 1e-5, math.pi - 1e-5) + + theta = h + v = v / math.pi + phi = torch.arccos(1 - 2*v) + + camera_origins = torch.zeros((batch_size, 3), device=device) + + camera_origins[:, 0:1] = radius*torch.sin(phi) * torch.cos(math.pi-theta) + camera_origins[:, 2:3] = radius*torch.sin(phi) * torch.sin(math.pi-theta) + camera_origins[:, 1:2] = radius*torch.cos(phi) + + forward_vectors = math_utils.normalize_vecs(-camera_origins) + return create_cam2world_matrix(forward_vectors, camera_origins) + + +class LookAtPoseSampler: + """ + Same as GaussianCameraPoseSampler, except the + camera is specified as looking at 'lookat_position', a 3-vector. 
+ + Example: + For a camera pose looking at the origin with the camera at position [0, 0, 1]: + cam2world = LookAtPoseSampler.sample(math.pi/2, math.pi/2, torch.tensor([0, 0, 0]), radius=1) + """ + + @staticmethod + def sample(horizontal_mean, vertical_mean, lookat_position, horizontal_stddev=0, vertical_stddev=0, radius=1, batch_size=1, device='cpu'): + h = torch.randn((batch_size, 1), device=device) * horizontal_stddev + horizontal_mean + v = torch.randn((batch_size, 1), device=device) * vertical_stddev + vertical_mean + v = torch.clamp(v, 1e-5, math.pi - 1e-5) + + theta = h + v = v / math.pi + phi = torch.arccos(1 - 2*v) + + camera_origins = torch.zeros((batch_size, 3), device=device) + + camera_origins[:, 0:1] = radius*torch.sin(phi) * torch.cos(math.pi-theta) + camera_origins[:, 2:3] = radius*torch.sin(phi) * torch.sin(math.pi-theta) + camera_origins[:, 1:2] = radius*torch.cos(phi) + + # forward_vectors = math_utils.normalize_vecs(-camera_origins) + forward_vectors = math_utils.normalize_vecs(lookat_position - camera_origins) + return create_cam2world_matrix(forward_vectors, camera_origins) + +class UniformCameraPoseSampler: + """ + Same as GaussianCameraPoseSampler, except the + pose is sampled from a uniform distribution with range +-[horizontal/vertical]_stddev. + + Example: + For a batch of random camera poses looking at the origin with yaw sampled from [-pi/2, +pi/2] radians: + + cam2worlds = UniformCameraPoseSampler.sample(math.pi/2, math.pi/2, horizontal_stddev=math.pi/2, radius=1, batch_size=16) + """ + + @staticmethod + def sample(horizontal_mean, vertical_mean, horizontal_stddev=0, vertical_stddev=0, radius=1, batch_size=1, device='cpu'): + h = (torch.rand((batch_size, 1), device=device) * 2 - 1) * horizontal_stddev + horizontal_mean + v = (torch.rand((batch_size, 1), device=device) * 2 - 1) * vertical_stddev + vertical_mean + v = torch.clamp(v, 1e-5, math.pi - 1e-5) + + theta = h + v = v / math.pi + phi = torch.arccos(1 - 2*v) + + camera_origins = torch.zeros((batch_size, 3), device=device) + + camera_origins[:, 0:1] = radius*torch.sin(phi) * torch.cos(math.pi-theta) + camera_origins[:, 2:3] = radius*torch.sin(phi) * torch.sin(math.pi-theta) + camera_origins[:, 1:2] = radius*torch.cos(phi) + + forward_vectors = math_utils.normalize_vecs(-camera_origins) + return create_cam2world_matrix(forward_vectors, camera_origins) + +def create_cam2world_matrix(forward_vector, origin): + """ + Takes in the direction the camera is pointing and the camera origin and returns a cam2world matrix. + Works on batches of forward_vectors, origins. Assumes y-axis is up and that there is no camera roll. 
+ """ + + forward_vector = math_utils.normalize_vecs(forward_vector) + up_vector = torch.tensor([0, 1, 0], dtype=torch.float, device=origin.device).expand_as(forward_vector) + + right_vector = -math_utils.normalize_vecs(torch.cross(up_vector, forward_vector, dim=-1)) + up_vector = math_utils.normalize_vecs(torch.cross(forward_vector, right_vector, dim=-1)) + + rotation_matrix = torch.eye(4, device=origin.device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1) + rotation_matrix[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), axis=-1) + + translation_matrix = torch.eye(4, device=origin.device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1) + translation_matrix[:, :3, 3] = origin + cam2world = (translation_matrix @ rotation_matrix)[:, :, :] + assert(cam2world.shape[1:] == (4, 4)) + return cam2world + + +def FOV_to_intrinsics(fov_degrees, device='cpu'): + """ + Creates a 3x3 camera intrinsics matrix from the camera field of view, specified in degrees. + Note the intrinsics are returned as normalized by image size, rather than in pixel units. + Assumes principal point is at image center. + """ + + focal_length = float(1 / (math.tan(fov_degrees * 3.14159 / 360) * 1.414)) + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]], device=device) + return intrinsics \ No newline at end of file diff --git a/eg3d/dataset_tool.py b/eg3d/dataset_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..a400f770fa477ef09adf4804235be4d67898765a --- /dev/null +++ b/eg3d/dataset_tool.py @@ -0,0 +1,458 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Tool for creating ZIP/PNG based datasets.""" + +import functools +import gzip +import io +import json +import os +import pickle +import re +import sys +import tarfile +import zipfile +from pathlib import Path +from typing import Callable, Optional, Tuple, Union + +import click +import numpy as np +import PIL.Image +from tqdm import tqdm + +#---------------------------------------------------------------------------- + +def error(msg): + print('Error: ' + msg) + sys.exit(1) + +#---------------------------------------------------------------------------- + +def parse_tuple(s: str) -> Tuple[int, int]: + '''Parse a 'M,N' or 'MxN' integer tuple. 
+ + Example: + '4x2' returns (4,2) + '0,1' returns (0,1) + ''' + if m := re.match(r'^(\d+)[x,](\d+)$', s): + return (int(m.group(1)), int(m.group(2))) + raise ValueError(f'cannot parse tuple {s}') + +#---------------------------------------------------------------------------- + +def maybe_min(a: int, b: Optional[int]) -> int: + if b is not None: + return min(a, b) + return a + +#---------------------------------------------------------------------------- + +def file_ext(name: Union[str, Path]) -> str: + return str(name).split('.')[-1] + +#---------------------------------------------------------------------------- + +def is_image_ext(fname: Union[str, Path]) -> bool: + ext = file_ext(fname).lower() + return f'.{ext}' in PIL.Image.EXTENSION # type: ignore + +#---------------------------------------------------------------------------- + +def open_image_folder(source_dir, *, max_images: Optional[int]): + input_images = [str(f) for f in sorted(Path(source_dir).rglob('*')) if is_image_ext(f) and os.path.isfile(f)] + + # Load labels. + labels = {} + meta_fname = os.path.join(source_dir, 'dataset.json') + if os.path.isfile(meta_fname): + with open(meta_fname, 'r') as file: + labels = json.load(file)['labels'] + if labels is not None: + labels = { x[0]: x[1] for x in labels } + else: + labels = {} + + max_idx = maybe_min(len(input_images), max_images) + + def iterate_images(): + for idx, fname in enumerate(input_images): + arch_fname = os.path.relpath(fname, source_dir) + arch_fname = arch_fname.replace('\\', '/') + img = np.array(PIL.Image.open(fname)) + yield dict(img=img, label=labels.get(arch_fname)) + if idx >= max_idx-1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_image_zip(source, *, max_images: Optional[int]): + with zipfile.ZipFile(source, mode='r') as z: + input_images = [str(f) for f in sorted(z.namelist()) if is_image_ext(f)] + + # Load labels. 
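+        # 'labels' in dataset.json is a list of [filename, label] pairs; convert it to a
+        # dict keyed by the archive filename so each image can look up its label below.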
+ labels = {} + if 'dataset.json' in z.namelist(): + with z.open('dataset.json', 'r') as file: + labels = json.load(file)['labels'] + if labels is not None: + labels = { x[0]: x[1] for x in labels } + else: + labels = {} + + max_idx = maybe_min(len(input_images), max_images) + + def iterate_images(): + with zipfile.ZipFile(source, mode='r') as z: + for idx, fname in enumerate(input_images): + with z.open(fname, 'r') as file: + img = PIL.Image.open(file) # type: ignore + img = np.array(img) + yield dict(img=img, label=labels.get(fname)) + if idx >= max_idx-1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_lmdb(lmdb_dir: str, *, max_images: Optional[int]): + import cv2 # pip install opencv-python # pylint: disable=import-error + import lmdb # pip install lmdb # pylint: disable=import-error + + with lmdb.open(lmdb_dir, readonly=True, lock=False).begin(write=False) as txn: + max_idx = maybe_min(txn.stat()['entries'], max_images) + + def iterate_images(): + with lmdb.open(lmdb_dir, readonly=True, lock=False).begin(write=False) as txn: + for idx, (_key, value) in enumerate(txn.cursor()): + try: + try: + img = cv2.imdecode(np.frombuffer(value, dtype=np.uint8), 1) + if img is None: + raise IOError('cv2.imdecode failed') + img = img[:, :, ::-1] # BGR => RGB + except IOError: + img = np.array(PIL.Image.open(io.BytesIO(value))) + yield dict(img=img, label=None) + if idx >= max_idx-1: + break + except: + print(sys.exc_info()[1]) + + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_cifar10(tarball: str, *, max_images: Optional[int]): + images = [] + labels = [] + + with tarfile.open(tarball, 'r:gz') as tar: + for batch in range(1, 6): + member = tar.getmember(f'cifar-10-batches-py/data_batch_{batch}') + with tar.extractfile(member) as file: + data = pickle.load(file, encoding='latin1') + images.append(data['data'].reshape(-1, 3, 32, 32)) + labels.append(data['labels']) + + images = np.concatenate(images) + labels = np.concatenate(labels) + images = images.transpose([0, 2, 3, 1]) # NCHW -> NHWC + assert images.shape == (50000, 32, 32, 3) and images.dtype == np.uint8 + assert labels.shape == (50000,) and labels.dtype in [np.int32, np.int64] + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 9 + + max_idx = maybe_min(len(images), max_images) + + def iterate_images(): + for idx, img in enumerate(images): + yield dict(img=img, label=int(labels[idx])) + if idx >= max_idx-1: + break + + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_mnist(images_gz: str, *, max_images: Optional[int]): + labels_gz = images_gz.replace('-images-idx3-ubyte.gz', '-labels-idx1-ubyte.gz') + assert labels_gz != images_gz + images = [] + labels = [] + + with gzip.open(images_gz, 'rb') as f: + images = np.frombuffer(f.read(), np.uint8, offset=16) + with gzip.open(labels_gz, 'rb') as f: + labels = np.frombuffer(f.read(), np.uint8, offset=8) + + images = images.reshape(-1, 28, 28) + images = np.pad(images, [(0,0), (2,2), (2,2)], 'constant', constant_values=0) + assert images.shape == (60000, 32, 32) and images.dtype == np.uint8 + assert labels.shape == (60000,) and labels.dtype == np.uint8 + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 9 + + max_idx = maybe_min(len(images), max_images) 
+ + def iterate_images(): + for idx, img in enumerate(images): + yield dict(img=img, label=int(labels[idx])) + if idx >= max_idx-1: + break + + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def make_transform( + transform: Optional[str], + output_width: Optional[int], + output_height: Optional[int] +) -> Callable[[np.ndarray], Optional[np.ndarray]]: + def scale(width, height, img): + w = img.shape[1] + h = img.shape[0] + if width == w and height == h: + return img + img = PIL.Image.fromarray(img) + ww = width if width is not None else w + hh = height if height is not None else h + img = img.resize((ww, hh), PIL.Image.LANCZOS) + return np.array(img) + + def center_crop(width, height, img): + crop = np.min(img.shape[:2]) + img = img[(img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.LANCZOS) + return np.array(img) + + def center_crop_wide(width, height, img): + ch = int(np.round(width * img.shape[0] / img.shape[1])) + if img.shape[1] < width or ch < height: + return None + + img = img[(img.shape[0] - ch) // 2 : (img.shape[0] + ch) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.LANCZOS) + img = np.array(img) + + canvas = np.zeros([width, width, 3], dtype=np.uint8) + canvas[(width - height) // 2 : (width + height) // 2, :] = img + return canvas + + if transform is None: + return functools.partial(scale, output_width, output_height) + if transform == 'center-crop': + if (output_width is None) or (output_height is None): + error ('must specify --resolution=WxH when using ' + transform + 'transform') + return functools.partial(center_crop, output_width, output_height) + if transform == 'center-crop-wide': + if (output_width is None) or (output_height is None): + error ('must specify --resolution=WxH when using ' + transform + ' transform') + return functools.partial(center_crop_wide, output_width, output_height) + assert False, 'unknown transform' + +#---------------------------------------------------------------------------- + +def open_dataset(source, *, max_images: Optional[int]): + if os.path.isdir(source): + if source.rstrip('/').endswith('_lmdb'): + return open_lmdb(source, max_images=max_images) + else: + return open_image_folder(source, max_images=max_images) + elif os.path.isfile(source): + if os.path.basename(source) == 'cifar-10-python.tar.gz': + return open_cifar10(source, max_images=max_images) + elif os.path.basename(source) == 'train-images-idx3-ubyte.gz': + return open_mnist(source, max_images=max_images) + elif file_ext(source) == 'zip': + return open_image_zip(source, max_images=max_images) + else: + assert False, 'unknown archive type' + else: + error(f'Missing input file or directory: {source}') + +#---------------------------------------------------------------------------- + +def open_dest(dest: str) -> Tuple[str, Callable[[str, Union[bytes, str]], None], Callable[[], None]]: + dest_ext = file_ext(dest) + + if dest_ext == 'zip': + if os.path.dirname(dest) != '': + os.makedirs(os.path.dirname(dest), exist_ok=True) + zf = zipfile.ZipFile(file=dest, mode='w', compression=zipfile.ZIP_STORED) + def zip_write_bytes(fname: str, data: Union[bytes, str]): + zf.writestr(fname, data) + return '', zip_write_bytes, zf.close + else: + # If the output folder already exists, check that is is + # empty. 
+ # + # Note: creating the output directory is not strictly + # necessary as folder_write_bytes() also mkdirs, but it's better + # to give an error message earlier in case the dest folder + # somehow cannot be created. + if os.path.isdir(dest) and len(os.listdir(dest)) != 0: + error('--dest folder must be empty') + os.makedirs(dest, exist_ok=True) + + def folder_write_bytes(fname: str, data: Union[bytes, str]): + os.makedirs(os.path.dirname(fname), exist_ok=True) + with open(fname, 'wb') as fout: + if isinstance(data, str): + data = data.encode('utf8') + fout.write(data) + return dest, folder_write_bytes, lambda: None + +#---------------------------------------------------------------------------- + +@click.command() +@click.pass_context +@click.option('--source', help='Directory or archive name for input dataset', required=True, metavar='PATH') +@click.option('--dest', help='Output directory or archive name for output dataset', required=True, metavar='PATH') +@click.option('--max-images', help='Output only up to `max-images` images', type=int, default=None) +@click.option('--transform', help='Input crop/resize mode', type=click.Choice(['center-crop', 'center-crop-wide'])) +@click.option('--resolution', help='Output resolution (e.g., \'512x512\')', metavar='WxH', type=parse_tuple) +def convert_dataset( + ctx: click.Context, + source: str, + dest: str, + max_images: Optional[int], + transform: Optional[str], + resolution: Optional[Tuple[int, int]] +): + """Convert an image dataset into a dataset archive usable with StyleGAN2 ADA PyTorch. + + The input dataset format is guessed from the --source argument: + + \b + --source *_lmdb/ Load LSUN dataset + --source cifar-10-python.tar.gz Load CIFAR-10 dataset + --source train-images-idx3-ubyte.gz Load MNIST dataset + --source path/ Recursively load all images from path/ + --source dataset.zip Recursively load all images from dataset.zip + + Specifying the output format and path: + + \b + --dest /path/to/dir Save output files under /path/to/dir + --dest /path/to/dataset.zip Save output files into /path/to/dataset.zip + + The output dataset format can be either an image folder or an uncompressed zip archive. + Zip archives makes it easier to move datasets around file servers and clusters, and may + offer better training performance on network file systems. + + Images within the dataset archive will be stored as uncompressed PNG. + Uncompressed PNGs can be efficiently decoded in the training loop. + + Class labels are stored in a file called 'dataset.json' that is stored at the + dataset root folder. This file has the following structure: + + \b + { + "labels": [ + ["00000/img00000000.png",6], + ["00000/img00000001.png",9], + ... repeated for every image in the dataset + ["00049/img00049999.png",1] + ] + } + + If the 'dataset.json' file cannot be found, the dataset is interpreted as + not containing class labels. + + Image scale/crop and resolution requirements: + + Output images must be square-shaped and they must all have the same power-of-two + dimensions. + + To scale arbitrary input image size to a specific width and height, use the + --resolution option. Output resolution will be either the original + input resolution (if resolution was not specified) or the one specified with + --resolution option. + + Use the --transform=center-crop or --transform=center-crop-wide options to apply a + center crop transform on the input image. These options should be used with the + --resolution option. 
For example: + + \b + python dataset_tool.py --source LSUN/raw/cat_lmdb --dest /tmp/lsun_cat \\ + --transform=center-crop-wide --resolution=512x384 + """ + + PIL.Image.init() # type: ignore + + if dest == '': + ctx.fail('--dest output filename or directory must not be an empty string') + + num_files, input_iter = open_dataset(source, max_images=max_images) + archive_root_dir, save_bytes, close_dest = open_dest(dest) + + if resolution is None: resolution = (None, None) + transform_image = make_transform(transform, *resolution) + + dataset_attrs = None + + labels = [] + for idx, image in tqdm(enumerate(input_iter), total=num_files): + idx_str = f'{idx:08d}' + archive_fname = f'{idx_str[:5]}/img{idx_str}.png' + + # Apply crop and resize. + img = transform_image(image['img']) + + # Transform may drop images. + if img is None: + continue + + # Error check to require uniform image attributes across + # the whole dataset. + channels = img.shape[2] if img.ndim == 3 else 1 + cur_image_attrs = { + 'width': img.shape[1], + 'height': img.shape[0], + 'channels': channels + } + if dataset_attrs is None: + dataset_attrs = cur_image_attrs + width = dataset_attrs['width'] + height = dataset_attrs['height'] + if width != height: + error(f'Image dimensions after scale and crop are required to be square. Got {width}x{height}') + if dataset_attrs['channels'] not in [1, 3, 4]: + error('Input images must be stored as RGB or grayscale') + if width != 2 ** int(np.floor(np.log2(width))): + error('Image width/height after scale and crop are required to be power-of-two') + elif dataset_attrs != cur_image_attrs: + err = [f' dataset {k}/cur image {k}: {dataset_attrs[k]}/{cur_image_attrs[k]}' for k in dataset_attrs.keys()] # pylint: disable=unsubscriptable-object + error(f'Image {archive_fname} attributes must be equal across all images of the dataset. Got:\n' + '\n'.join(err)) + + # Save the image as an uncompressed PNG. 
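+        # compress_level=0 writes the PNG without zlib compression, trading archive size
+        # for cheap decoding in the training loop; RGBA inputs are flattened to RGB.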
+ img = PIL.Image.fromarray(img, { 1: 'L', 3: 'RGB', 4: 'RGBA'}[channels]) + if channels == 4: img = img.convert('RGB') + image_bits = io.BytesIO() + img.save(image_bits, format='png', compress_level=0, optimize=False) + save_bytes(os.path.join(archive_root_dir, archive_fname), image_bits.getbuffer()) + labels.append([archive_fname, image['label']] if image['label'] is not None else None) + + metadata = { + 'labels': labels if all(x is not None for x in labels) else None + } + save_bytes(os.path.join(archive_root_dir, 'dataset.json'), json.dumps(metadata)) + close_dest() + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + convert_dataset() # pylint: disable=no-value-for-parameter diff --git a/eg3d/datid3d_data_gen.py b/eg3d/datid3d_data_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..497a1fd7d489493eb73ccb9768e346d3afa6f0a0 --- /dev/null +++ b/eg3d/datid3d_data_gen.py @@ -0,0 +1,204 @@ + +import sys, os +sys.path.append(os.getcwd()) +from os.path import join as opj +import zipfile +import json +import pickle +from tqdm import tqdm +import argparse + +import numpy as np +import torch +import torch.nn.functional as F +from torch import autocast +from torchvision.transforms import ToPILImage +from diffusers import StableDiffusionImg2ImgPipeline, PNDMScheduler +from camera_utils import LookAtPoseSampler, FOV_to_intrinsics + + + +def parse_args(): + """Parse input arguments.""" + parser = argparse.ArgumentParser(description='Pose-aware dataset generation') + parser.add_argument('--strength', default=0.7, type=float) + parser.add_argument('--prompt', type=str) + parser.add_argument('--data_type', default='ffhq', type=str) # ffhq, cat + parser.add_argument('--guidance_scale', default=8, type=float) + parser.add_argument('--num_images', default=1000, type=int) + parser.add_argument('--sd_model_id', default='stabilityai/stable-diffusion-2-1-base', type=str) + parser.add_argument('--num_inference_steps', default=30, type=int) + parser.add_argument('--ffhq_eg3d_path', default='pretrained/ffhqrebalanced512-128.pkl', type=str) + parser.add_argument('--cat_eg3d_path', default='pretrained/afhqcats512-128.pkl', type=str) + parser.add_argument('--ffhq_pivot', default=0.2, type=float) + parser.add_argument('--cat_pivot', default=0.05, type=float) + parser.add_argument('--pitch_range', default=0.3, type=float) + parser.add_argument('--yaw_range', default=0.3, type=float) + parser.add_argument('--name_tag', default='', type=str) + parser.add_argument('--seed', default=15, type=int) + + args = parser.parse_args() + return args + +def make_zip(base_dir, prompt, data_type='ffhq', name_tag=''): + base_dir = os.path.abspath(base_dir) + + owd = os.path.abspath(os.getcwd()) + os.chdir(base_dir) + + json_path = opj(base_dir, "dataset.json") + + zip_path = opj(base_dir, f'data_{data_type}_{prompt.replace(" ", "_")}{name_tag}.zip') + zip_file = zipfile.ZipFile(zip_path, "w") + + with open(json_path, 'r') as file: + data = json.load(file) + zip_file.write(os.path.relpath(json_path, base_dir), compress_type=zipfile.ZIP_STORED) + + for label in data['labels']: + trg_img_path = label[0] + zip_file.write(trg_img_path, compress_type=zipfile.ZIP_STORED) + + zip_file.close() + os.chdir(owd) + +def pts2pil(pts): + pts = (pts + 1) / 2 + pts[pts > 1] = 1 + pts[pts < 0] = 0 + return ToPILImage()(pts[0]) + +if __name__ == '__main__': + args = parse_args() + + device = "cuda" + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + 
data_type = args.data_type + prompt = args.prompt + strength = args.strength + guidance_scale = args.guidance_scale + num_inference_steps = args.num_inference_steps + num_images = args.num_images + name_tag = args.name_tag + + # 3DG options + ffhq_eg3d_path = args.ffhq_eg3d_path + cat_eg3d_path = args.cat_eg3d_path + cat_pivot = args.cat_pivot + ffhq_pivot = args.ffhq_pivot + pitch_range = args.pitch_range + yaw_range = args.yaw_range + num_frames = 240 + truncation_psi = 0.7 + truncation_cutoff = 14 + fov_deg = 18.837 + ft_img_size = 512 + + # Load 3DG + eg3d_path = None + if data_type == 'ffhq': + eg3d_path = args.ffhq_eg3d_path + pivot = ffhq_pivot + elif data_type == 'cat': + eg3d_path = args.cat_eg3d_path + pivot = cat_pivot + + with open(eg3d_path, 'rb') as f: + G = pickle.load(f)['G_ema'].to(device) # torch.nn.Module + G.train() + for param in G.parameters(): + param.requires_grad_(True) + + # SD options + model_id = args.sd_model_id + negative_prompt = None + eta = 0.0 + batch_size = 1 + model_inversion = False + + # Load SD + pipe = StableDiffusionImg2ImgPipeline.from_pretrained( + model_id, + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=True, + scheduler=PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", + num_train_timesteps=1000, set_alpha_to_one=False, steps_offset=1, skip_prk_steps=1), + ).to(device) + pipe.safety_checker = None + print('SD model is loaded') + + # Outputs directory + base_dir = opj(f'./exp_data/data_{data_type}_{prompt.replace(" ", "_")}{name_tag}') + + src_img_dir = opj(base_dir, "src_imgs") + trg_img_dir = opj(base_dir, "trg_imgs") + + os.makedirs('exp_data', exist_ok=True) + os.makedirs(base_dir, exist_ok=True) + os.makedirs(src_img_dir, exist_ok=True) + os.makedirs(trg_img_dir, exist_ok=True) + labels = [] + + # Fine-tuning 3D generator + for i in tqdm(range(num_images)): + G.eval() + z = torch.from_numpy(np.random.randn(batch_size, G.z_dim)).to(device) + intrinsics = FOV_to_intrinsics(fov_deg, device=device) + + with torch.no_grad(): + yaw_idx = np.random.randint(num_frames) + pitch_idx = np.random.randint(num_frames) + + cam_pivot = torch.tensor([0, 0, pivot], device=device) + cam_radius = G.rendering_kwargs.get('avg_camera_radius', 2.7) + cam2world_pose = LookAtPoseSampler.sample(np.pi / 2 + yaw_range * np.sin(2 * np.pi * yaw_idx / num_frames), + np.pi / 2 - 0.05 + pitch_range * np.cos( + 2 * np.pi * pitch_idx / num_frames), + cam_pivot, radius=cam_radius, device=device, + batch_size=batch_size) + conditioning_cam2world_pose = LookAtPoseSampler.sample(np.pi / 2, np.pi / 2, cam_pivot, radius=cam_radius, + device=device, batch_size=batch_size) + camera_params = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9).repeat(batch_size, 1)], + 1) + conditioning_params = torch.cat( + [conditioning_cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9).repeat(batch_size, 1)], 1) + + ws = G.mapping(z, conditioning_params, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff) + + img_pts = G.synthesis(ws, camera_params)['image'] + + src_img_pts = img_pts.detach() + src_img_pts = F.interpolate(src_img_pts, (ft_img_size, ft_img_size), mode='bilinear', align_corners=False) + with autocast("cuda"): + trg_img_pil = pipe(prompt=prompt, + image=src_img_pts, + strength=strength, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + )['images'][0] + + src_idx = f'{i:05d}_src.png' + trg_idx = f'{i:05d}_trg.png' + + src_img_pil_path = opj(src_img_dir, src_idx) + 
trg_img_pil_path = opj(trg_img_dir, trg_idx) + + src_img_pil = pts2pil(src_img_pts.cpu()) + + src_img_pil.save(src_img_pil_path) + trg_img_pil.save(trg_img_pil_path) + + label = [trg_img_pil_path.replace(base_dir, '').replace('/trg_', 'trg_'), camera_params[0].tolist()] + + labels.append(label) + + + json_path = opj(base_dir, "dataset.json") + json_data = {'labels': labels} + with open(json_path, 'w') as outfile: + json.dump(json_data, outfile, indent=4) + + make_zip(base_dir, prompt, data_type, name_tag) diff --git a/eg3d/dnnlib/__init__.py b/eg3d/dnnlib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dd91ed142e955581e83948455fb71cd837215f61 --- /dev/null +++ b/eg3d/dnnlib/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +from .util import EasyDict, make_cache_dir_path diff --git a/eg3d/dnnlib/util.py b/eg3d/dnnlib/util.py new file mode 100644 index 0000000000000000000000000000000000000000..80b67c4e312cd1b847ca21fd3b929802a57e6f6d --- /dev/null +++ b/eg3d/dnnlib/util.py @@ -0,0 +1,493 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +"""Miscellaneous utility classes and functions.""" + +import ctypes +import fnmatch +import importlib +import inspect +import numpy as np +import os +import shutil +import sys +import types +import io +import pickle +import re +import requests +import html +import hashlib +import glob +import tempfile +import urllib +import urllib.request +import uuid + +from distutils.util import strtobool +from typing import Any, List, Tuple, Union + + +# Util classes +# ------------------------------------------------------------------------------------------ + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +class Logger(object): + """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file.""" + + def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True): + self.file = None + + if file_name is not None: + self.file = open(file_name, file_mode) + + self.should_flush = should_flush + self.stdout = sys.stdout + self.stderr = sys.stderr + + sys.stdout = self + sys.stderr = self + + def __enter__(self) -> "Logger": + return self + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + self.close() + + def write(self, text: Union[str, bytes]) -> None: + """Write text to stdout (and a file) and optionally flush.""" + if isinstance(text, bytes): + text = text.decode() + if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash + return + + if self.file is not None: + self.file.write(text) + + self.stdout.write(text) + + if self.should_flush: + self.flush() + + def flush(self) -> None: + """Flush written text to both stdout and a file, if open.""" + if self.file is not None: + self.file.flush() + + self.stdout.flush() + + def close(self) -> None: + """Flush, close possible files, and remove stdout/stderr mirroring.""" + self.flush() + + # if using multiple loggers, prevent closing in wrong order + if sys.stdout is self: + sys.stdout = self.stdout + if sys.stderr is self: + sys.stderr = self.stderr + + if self.file is not None: + self.file.close() + self.file = None + + +# Cache directories +# ------------------------------------------------------------------------------------------ + +_dnnlib_cache_dir = None + +def set_cache_dir(path: str) -> None: + global _dnnlib_cache_dir + _dnnlib_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _dnnlib_cache_dir is not None: + return os.path.join(_dnnlib_cache_dir, *paths) + if 'DNNLIB_CACHE_DIR' in os.environ: + return os.path.join(os.environ['DNNLIB_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'dnnlib', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'dnnlib', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths) + +# Small util functions +# ------------------------------------------------------------------------------------------ + + +def format_time(seconds: Union[int, float]) -> str: + """Convert the seconds to human readable string with days, hours, minutes and seconds.""" + s = int(np.rint(seconds)) + + 
if s < 60: + return "{0}s".format(s) + elif s < 60 * 60: + return "{0}m {1:02}s".format(s // 60, s % 60) + elif s < 24 * 60 * 60: + return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60) + else: + return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60) + + +def format_time_brief(seconds: Union[int, float]) -> str: + """Convert the seconds to human readable string with days, hours, minutes and seconds.""" + s = int(np.rint(seconds)) + + if s < 60: + return "{0}s".format(s) + elif s < 60 * 60: + return "{0}m {1:02}s".format(s // 60, s % 60) + elif s < 24 * 60 * 60: + return "{0}h {1:02}m".format(s // (60 * 60), (s // 60) % 60) + else: + return "{0}d {1:02}h".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24) + + +def ask_yes_no(question: str) -> bool: + """Ask the user the question until the user inputs a valid answer.""" + while True: + try: + print("{0} [y/n]".format(question)) + return strtobool(input().lower()) + except ValueError: + pass + + +def tuple_product(t: Tuple) -> Any: + """Calculate the product of the tuple elements.""" + result = 1 + + for v in t: + result *= v + + return result + + +_str_to_ctype = { + "uint8": ctypes.c_ubyte, + "uint16": ctypes.c_uint16, + "uint32": ctypes.c_uint32, + "uint64": ctypes.c_uint64, + "int8": ctypes.c_byte, + "int16": ctypes.c_int16, + "int32": ctypes.c_int32, + "int64": ctypes.c_int64, + "float32": ctypes.c_float, + "float64": ctypes.c_double +} + + +def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]: + """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes.""" + type_str = None + + if isinstance(type_obj, str): + type_str = type_obj + elif hasattr(type_obj, "__name__"): + type_str = type_obj.__name__ + elif hasattr(type_obj, "name"): + type_str = type_obj.name + else: + raise RuntimeError("Cannot infer type name from input") + + assert type_str in _str_to_ctype.keys() + + my_dtype = np.dtype(type_str) + my_ctype = _str_to_ctype[type_str] + + assert my_dtype.itemsize == ctypes.sizeof(my_ctype) + + return my_dtype, my_ctype + + +def is_pickleable(obj: Any) -> bool: + try: + with io.BytesIO() as stream: + pickle.dump(obj, stream) + return True + except: + return False + + +# Functionality to import modules/objects by name, and call functions by name +# ------------------------------------------------------------------------------------------ + +def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]: + """Searches for the underlying module behind the name to some python object. + Returns the module and the object name (original name with module part removed).""" + + # allow convenience shorthands, substitute them by full names + obj_name = re.sub("^np.", "numpy.", obj_name) + obj_name = re.sub("^tf.", "tensorflow.", obj_name) + + # list alternatives for (module_name, local_obj_name) + parts = obj_name.split(".") + name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)] + + # try each alternative in turn + for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + return module, local_obj_name + except: + pass + + # maybe some of the modules themselves contain errors? 
+ for module_name, _local_obj_name in name_pairs: + try: + importlib.import_module(module_name) # may raise ImportError + except ImportError: + if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"): + raise + + # maybe the requested attribute is missing? + for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + except ImportError: + pass + + # we are out of luck, but we have no idea why + raise ImportError(obj_name) + + +def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any: + """Traverses the object name and returns the last (rightmost) python object.""" + if obj_name == '': + return module + obj = module + for part in obj_name.split("."): + obj = getattr(obj, part) + return obj + + +def get_obj_by_name(name: str) -> Any: + """Finds the python object with the given name.""" + module, obj_name = get_module_from_obj_name(name) + return get_obj_from_module(module, obj_name) + + +def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any: + """Finds the python object with the given name and calls it as a function.""" + assert func_name is not None + func_obj = get_obj_by_name(func_name) + assert callable(func_obj) + return func_obj(*args, **kwargs) + + +def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any: + """Finds the python class with the given name and constructs it with the given arguments.""" + return call_func_by_name(*args, func_name=class_name, **kwargs) + + +def get_module_dir_by_obj_name(obj_name: str) -> str: + """Get the directory path of the module containing the given object name.""" + module, _ = get_module_from_obj_name(obj_name) + return os.path.dirname(inspect.getfile(module)) + + +def is_top_level_function(obj: Any) -> bool: + """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'.""" + return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__ + + +def get_top_level_function_name(obj: Any) -> str: + """Return the fully-qualified name of a top-level function.""" + assert is_top_level_function(obj) + module = obj.__module__ + if module == '__main__': + module = os.path.splitext(os.path.basename(sys.modules[module].__file__))[0] + return module + "." + obj.__name__ + + +# File system helpers +# ------------------------------------------------------------------------------------------ + +def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]: + """List all files recursively in a given directory while ignoring given file and directory names. 
+ Returns list of tuples containing both absolute and relative paths.""" + assert os.path.isdir(dir_path) + base_name = os.path.basename(os.path.normpath(dir_path)) + + if ignores is None: + ignores = [] + + result = [] + + for root, dirs, files in os.walk(dir_path, topdown=True): + for ignore_ in ignores: + dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)] + + # dirs need to be edited in-place + for d in dirs_to_remove: + dirs.remove(d) + + files = [f for f in files if not fnmatch.fnmatch(f, ignore_)] + + absolute_paths = [os.path.join(root, f) for f in files] + relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths] + + if add_base_to_relative: + relative_paths = [os.path.join(base_name, p) for p in relative_paths] + + assert len(absolute_paths) == len(relative_paths) + result += zip(absolute_paths, relative_paths) + + return result + + +def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None: + """Takes in a list of tuples of (src, dst) paths and copies files. + Will create all necessary directories.""" + for file in files: + target_dir_name = os.path.dirname(file[1]) + + # will create all intermediate-level directories + if not os.path.exists(target_dir_name): + os.makedirs(target_dir_name) + + shutil.copyfile(file[0], file[1]) + + +# URL helpers +# ------------------------------------------------------------------------------------------ + +def is_url(obj: Any, allow_file_urls: bool = False) -> bool: + """Determine whether the given object is a valid URL string.""" + if not isinstance(obj, str) or not "://" in obj: + return False + if allow_file_urls and obj.startswith('file://'): + return True + try: + res = requests.compat.urlparse(obj) + if not res.scheme or not res.netloc or not "." in res.netloc: + return False + res = requests.compat.urlparse(requests.compat.urljoin(obj, "/")) + if not res.scheme or not res.netloc or not "." in res.netloc: + return False + except: + return False + return True + + +def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any: + """Download the given URL and return a binary-mode file object to access the data.""" + assert num_attempts >= 1 + assert not (return_filename and (not cache)) + + # Doesn't look like an URL scheme so interpret it as a local filename. + if not re.match('^[a-z]+://', url): + return url if return_filename else open(url, "rb") + + # Handle file URLs. This code handles unusual file:// patterns that + # arise on Windows: + # + # file:///c:/foo.txt + # + # which would translate to a local '/c:/foo.txt' filename that's + # invalid. Drop the forward slash for such pathnames. + # + # If you touch this code path, you should test it on both Linux and + # Windows. + # + # Some internet resources suggest using urllib.request.url2pathname() but + # but that converts forward slashes to backslashes and this causes + # its own set of problems. + if url.startswith('file://'): + filename = urllib.parse.urlparse(url).path + if re.match(r'^/[a-zA-Z]:', filename): + filename = filename[1:] + return filename if return_filename else open(filename, "rb") + + assert is_url(url) + + # Lookup from cache. 
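+    # Cached downloads are keyed by the MD5 of the URL plus a sanitized copy of the
+    # original filename, so repeated calls for the same URL reuse the file on disk.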
+ if cache_dir is None: + cache_dir = make_cache_dir_path('downloads') + + url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest() + if cache: + cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*")) + if len(cache_files) == 1: + filename = cache_files[0] + return filename if return_filename else open(filename, "rb") + + # Download. + url_name = None + url_data = None + with requests.Session() as session: + if verbose: + print("Downloading %s ..." % url, end="", flush=True) + for attempts_left in reversed(range(num_attempts)): + try: + with session.get(url) as res: + res.raise_for_status() + if len(res.content) == 0: + raise IOError("No data received") + + if len(res.content) < 8192: + content_str = res.content.decode("utf-8") + if "download_warning" in res.headers.get("Set-Cookie", ""): + links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link] + if len(links) == 1: + url = requests.compat.urljoin(url, links[0]) + raise IOError("Google Drive virus checker nag") + if "Google Drive - Quota exceeded" in content_str: + raise IOError("Google Drive download quota exceeded -- please try again later") + + match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", "")) + url_name = match[1] if match else url + url_data = res.content + if verbose: + print(" done") + break + except KeyboardInterrupt: + raise + except: + if not attempts_left: + if verbose: + print(" failed") + raise + if verbose: + print(".", end="", flush=True) + + # Save to cache. + if cache: + safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name) + cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name) + temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name) + os.makedirs(cache_dir, exist_ok=True) + with open(temp_file, "wb") as f: + f.write(url_data) + os.replace(temp_file, cache_file) # atomic + if return_filename: + return cache_file + + # Return data as file object. + assert not return_filename + return io.BytesIO(url_data) diff --git a/eg3d/docs/camera_conventions.md b/eg3d/docs/camera_conventions.md new file mode 100644 index 0000000000000000000000000000000000000000..28203a03b60879e829ff4353640c356c70ba7b87 --- /dev/null +++ b/eg3d/docs/camera_conventions.md @@ -0,0 +1,2 @@ +Camera poses are in OpenCV Cam2World format. +Intrinsics are normalized. \ No newline at end of file diff --git a/eg3d/docs/camera_coordinate_conventions.jpg b/eg3d/docs/camera_coordinate_conventions.jpg new file mode 100644 index 0000000000000000000000000000000000000000..537e1445751f2ac1efb2fb33f2633cda8d20a5d9 Binary files /dev/null and b/eg3d/docs/camera_coordinate_conventions.jpg differ diff --git a/eg3d/docs/models.md b/eg3d/docs/models.md new file mode 100644 index 0000000000000000000000000000000000000000..6a2681d11536dae67397ec60c5939113c4fbe9d9 --- /dev/null +++ b/eg3d/docs/models.md @@ -0,0 +1,71 @@ +Pre-trained checkpoints can be found on the [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/research/models/eg3d). + +Brief descriptions of models and the commands used to train them are found below. + +--- + +# FFHQ + +**ffhq512-64.pkl** + +FFHQ 512, trained with neural rendering resolution of 64x64. + +```.bash +# Train with FFHQ from scratch with raw neural rendering resolution=64, using 8 GPUs. 
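+# (Assumes FFHQ_512.zip was prepared with the FFHQ scripts under dataset_preprocessing/.)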
+python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_512.zip \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True +``` + +**ffhq512-128.pkl** + +Fine-tune FFHQ 512, with neural rendering resolution of 128x128. + +```.bash +# Second stage finetuning of FFHQ to 128 neural rendering resolution. +python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_512.zip \ + --resume=ffhq-64.pkl \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True --neural_rendering_resolution_final=128 --kimg=2000 +``` + +## FFHQ Rebalanced + +Same as the models above, but fine-tuned using a rebalanced version of FFHQ that has a more uniform pose distribution. Compared to models trained on standard FFHQ, these models should produce better 3D shapes and better renderings from steep angles. + +**ffhqrebalanced512-64.pkl** + +```.bash +# Finetune with rebalanced FFHQ at rendering resolution 64. +python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_rebalanced_512.zip \ + --resume=ffhq-64.pkl \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True --gpc_reg_prob=0.8 +``` + +**ffhqrebalanced512-128.pkl** +```.bash +# Finetune with rebalanced FFHQ at 128 neural rendering resolution. +python train.py --outdir=~/training-runs --cfg=ffhq --data=~/datasets/FFHQ_rebalanced_512.zip \ + --resume=ffhq-rebalanced-64.pkl \ + --gpus=8 --batch=32 --gamma=1 --gen_pose_cond=True --gpc_reg_prob=0.8 --neural_rendering_resolution_final=128 +``` + +# AFHQ Cats + +**afhqcats512-128.pkl** + +```.bash +# Train with AFHQ, finetuning from FFHQ with ADA, using 8 GPUs. +python train.py --outdir=~/training-runs --cfg=afhq --data=~/datasets/afhq.zip \ + --resume=ffhq-64.pkl \ + --gpus=8 --batch=32 --gamma=5 --aug=ada --gen_pose_cond=True --gpc_reg_prob=0.8 --neural_rendering_resolution_final=128 +``` + + +# Shapenet + +**shapenetcars128-64.pkl** + +```.bash +# Train with Shapenet from scratch, using 8 GPUs. +python train.py --outdir=~/training-runs --cfg=shapenet --data=~/datasets/cars_train.zip \ + --gpus=8 --batch=32 --gamma=0.3 +``` \ No newline at end of file diff --git a/eg3d/docs/teaser.jpeg b/eg3d/docs/teaser.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b8743d3acb4026ebf5c4fb56123902d50413be93 Binary files /dev/null and b/eg3d/docs/teaser.jpeg differ diff --git a/eg3d/docs/training_guide.md b/eg3d/docs/training_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..c18abd0116da6039db4c95a432f04705361251b8 --- /dev/null +++ b/eg3d/docs/training_guide.md @@ -0,0 +1,165 @@ +## Guide to Training + +Tips and tricks for setting up your own training runs. This guide looks at the most important options when setting up a training run with new data. + +--- + +## Preparing your data + +Your dataset should be a directory that includes your images and a dataset.json file that fits the following format: + +``` +{ + 'labels': [ + ["img_0000.png", [0.1, 0.2, -0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, -1.5, 1.6, 1.7, 1.8, -1.9, 2.0, 2.1, -2.2, 2.3, 2.4, -2.5]] + ] +} +``` +Each entry of the 'labels' list contains the relative filename and a 25-length camera parameters vector. The first 16 entries of the camera parameters are the 4x4 OpenCV Cam2World extrinsics matrix. The last 9 parameters are the 3x3 intrinsics matrix normalized by image size. + +## Camera Conventions + +![Teaser image](camera_coordinate_conventions.jpg) + +### Intrinsics +We use normalized intrinsics so we can ignore image size during training. 
You can easily normalize intrinsics by dividing by your image size in number of pixels. For a camera intrinsics matrix with focal length f_x, f_y, principal point offset x_0, y_0, axis skew s, and image size (in pixels) size_x, size_y: + +``` +unnormalized normalized + +[[ f_x, s, x_0] [[ f_x/size_x, s, x_0/size_x] + [ 0, f_y, y_0] -> [ 0, f_y/size_y, y_0/size_y] + [ 0, 0, 1 ]] [ 0, 0, 1 ]] +``` + +As a sanity check, after normalization, your principal point should be close to 0.5, 0.5. + +## Mirrored Data + +We recommend you mirror data manually by duplicating images and creating a duplicate camera pose label in your dataset.json file. See the FFHQ dataset preprocessing scripts for an example. + +## Uncompressed Zip + +While you can train with simply a directory of images and the dataset.json file, it's sometimes easier to zip the directory into an archive for more efficient transfer on networked systems. We use uncompressed .zip archives so that reading from the archive is as efficient as possible. + +``` +cd my_dataset +zip -0 -r ../my_dataset.zip * +``` + +## Examples + +Please see the dataset_preprocessing directory for example scripts for preparing FFHQ, AFHQ, ShapeNet datasets. + +--- + +## Basic Training and Rendering Parameters + +### Finetuning + +`--resume=my_pretrained_model.pkl` + +Once you have your data, it's time to start some training runs. If possible, we highly recommend using finetuning off of a pre-trained model. Doing so dramatically improves the rate of convergence, so you can get better results in much less time. If your new dataset is front-facing, FFHQ is a great choice. If your dataset is imaged from 360 degrees, ShapeNet is going to be a better option. + +### Batch Size + +`--gpus=8 --batch=32` + +If possible, use 8 gpus and a batch size of 32; these were our defaults for all of our major experiments. However, good results have also been obtained with small batch sizes on one or two GPUs, especially when finetuning is used. The batch size you specify is split evenly across the number of GPUs. If your batch size is small, use stronger R1 regularization (higher gamma). + +### Gamma + +`--gamma=5` + +The strength of R1 regularization is an important hyperparameter for ensuring stability of GAN training. The best value of gamma may vary widely between datasets. If you have nothing to go on, ```--gamma=5``` is a safe choice. If training seems stable, and your model starts to produce diverse and reasonable outputs, you can try lowering gamma. If you experience training instability or mode collapse, try increasing gamma. In general, if your batch size is small, or if your images are large, you will need more regularization (higher gamma). + +Finding the optimal value of gamma is important for maximizing your image quality. + +### Generator Pose Conditioning + +`--gen_pose_cond=True --gpc_reg_prob=0.8` + +Generator pose conditioning (GPC) is when we condition the generator on the rendering camera pose. In doing so, we allow the camera pose to influence the identity of the scene, which is important for modelling pose-appearance correlations. + +The above options control the presence and strength of GPC. `--gpc_reg_prob` adjusts probability of swapping regularization—when instead of conditioning on the rendering camera pose, we instead condition with a random camera pose. 
A high (close to 1) swapping regularization makes the conditioning vector "unreliable" and the effect of GPC weaker; a low (close to 0) swapping regularization means the effect of GPC is stronger but may introduce artifacts. + +Our recommendation when starting with a new dataset is to train *without* generator pose conditioning by setting `--gen_pose_cond=False`. Whether you should use GPC is dependent on your dataset. If you use synthetic data, and know that all of your scenes are sampled randomly, you probably won't need it; by contrast, if you know your dataset has clear pose-appearance biases, turning on GPC may improve your image quality. After obtaining reasonable results without generator pose conditioning, you can try turning it on. Try setting your `--gpc_reg_prob` to somewhere between `0.5` and `0.8`. + + +### Neural Rendering Resolution + +`--neural_rendering_resolution_final=128` + +Neural rendering resolution is the resolution at which we volumetrically render, and it is independent of your output image size. In general, low neural rendering resolutions (e.g. 64) are faster at training and at inference. Higher neural rendering resolutions (e.g. 128) are more compute intensive but have less aliasing, produce more detailed shapes, and more view-consistent 3D renderings. For most models, we train at neural rendering resolution of 64 and optionally continue training with a neural rendering resolution of 128. **For the best quality and multi-view consistency, we strongly recommend fine-tuning at the 128 neural rendering resolution.** + +To train with a static neural rendering resolution of 64: +```.bash +python train.py \ + --neural_rendering_resolution_initial=64 \ + ... +``` + +To train with a neural rendering resolution that changes gradually increases from 64 to 128 over 1 million images: +```.bash +python train.py \ + --neural_rendering_resolution_initial=64 \ + --neural_rendering_resolution_final=128 \ + --neural_rendering_resolution_fade_kimg=1000 \ + ... +``` + +Please see **Two-stage training** (Section 3 of the supplemental) for additional details. + +### Adaptive Discriminator Augmentation + +With small datasets, the discriminator can memorize the real images and destabilize training. Enable ADA by setting `--aug=ada`. Note that for small datasets, you'll see the largest benefit if you use both ADA as well as finetuning. + +### Discriminator Pose Conditioning Regularization + +We condition the discriminator on the rendering camera pose in order to aid convergence to accurate 3D objects. However, it's sometimes possible for discriminator pose conditioning to hurt training stability. If your input poses are accurate and unique, e.g. if they were generated synthetically with random camera poses, it's possible for the discriminator to memorize which poses must be paired with which images. We can regularize this effect by corrupting these poses with Gaussian noise before they are seen by the discriminator. To add 1 standard deviation of Gaussian noise, set `--disc_c_noise=1`. + +--- + +## Rendering Config + +``` +if opts.cfg == 'shapenet': + rendering_options.update({ + 'depth_resolution': 64, + 'depth_resolution_importance': 64, + 'ray_start': 'auto', + 'ray_end': 'auto', + 'box_warp': 1.6, + 'white_back': True, + 'avg_camera_radius': 1.7, + 'avg_camera_pivot': [0, 0, 0], + }) +``` + +The last step before training a model is to set up a rendering config, which you can do in the `train.py` script. 
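+
+As a rough sketch, a new dataset gets its own branch alongside the existing ones in `train.py`; the `'mydata'` name and the values below are placeholders to tune for your data, not settings shipped with the code:
+
+```
+if opts.cfg == 'mydata':
+    rendering_options.update({
+        'depth_resolution': 48,
+        'depth_resolution_importance': 48,
+        'ray_start': 'auto',
+        'ray_end': 'auto',
+        'box_warp': 1.0,
+        'white_back': False,
+        'avg_camera_radius': 2.7,
+        'avg_camera_pivot': [0, 0, 0],
+    })
+```
+
+Each option is described below.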
+ +**depth_resolution:** How many uniformly spaced samples to take along each ray. + +**depth_resolution_importance:** How many importance samples to take along each ray. + +**ray_start:** The distance between the camera origin and the first depth sample along the ray. Can be a float, e.g. `0.1` or `'auto'`, if you want to use the ray-box intersection of the volume to set ray bounds. + +**ray_end:** The distance between the camera origin and the last depth sample along the ray. Can be a float, e.g. `1.5` or `'auto'`, if you want to use the ray-box intersection of the volume to set ray bounds. + +**box_warp:** The side length of the cube spanned by the tri-planes. The box is axis-aligned, centered at the origin, and has limits [-box_warp/2, -box_warp/2, -box_warp/2] - [box_warp/2, box_warp/2, box_warp/2]. If `box_warp=1.8`, it has vertices at [0.9, 0.9, 0.9], [0.9, 0.9, -0.9], ... + +**white_back:** Controls the color of rays that pass through the volume without encountering any solid objects. Set to True if your background is white; set to false if the background is black. + +**avg_camera_radius:** The average radius of the camera, assuming it rotates on a sphere about the origin. This option is unused at training—it is used only to specify the camera path in the visualizer. + +**avg_camera_pivot:** The point at which the camera looks, assuming it rotates on a sphere about the origin. This option is unused at training—it is used only to specify the camera path in the visualizer. + +--- + +Taking all of the above into account, you'll likely have a command that is similar to this one: + +`python train.py --data=/data/mydata.zip --gpus=2 --batch=8 --cfg=myconfig --gamma=5 --resume=shapenet.pkl --outdir=training_runs` + +For the training commands used to create the supplied pre-trained models, see [Models](models.md). + +Good luck! \ No newline at end of file diff --git a/eg3d/docs/visualizer.png b/eg3d/docs/visualizer.png new file mode 100644 index 0000000000000000000000000000000000000000..9950a22a3bf3dc68a3f1d28708f61c770a91ad24 Binary files /dev/null and b/eg3d/docs/visualizer.png differ diff --git a/eg3d/docs/visualizer_guide.md b/eg3d/docs/visualizer_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..9cb2edae7f9ae3348cc1a7ae670f7a85e935114e --- /dev/null +++ b/eg3d/docs/visualizer_guide.md @@ -0,0 +1,66 @@ +## Guide to the Visualizer + +![Visualizer](visualizer.png) + +We include a 3D visualizer that is based on the amazing tool introduced in StyleGAN3. The following document describes important options and sliders of the visualizer UI. + +TLDR: +1. Press the "Pickle/Recent" button to select a pretrained EG3D model. +2. Click and drag the "Latent/Drag" button to sweep latent codes and change the scene identity. +3. Click and drag the rendering on the right to move the camera. + +--- + +## Network & Latent + +### Pickle +Specify the path of the model checkpoint to visualize. You have a few options: +1. Drag and drop the .pkl file from your file browser into the visualizer window +1. Type the path (or url) of your .pkl file into the text field +1. Press the recent box to access a list of recently used checkpoints + +### Pose +Control the pitch and yaw of the camera by clicking and dragging the rendering on the right. By default, the camera rotates on a sphere with fixed radius, pointed at the origin. + +### FOV +Control the field of view of the camera with this slider to zoom the camera in and out. 
For FFHQ, 18 degrees is about right; for ShapeNet, use a FOV of 45 degrees. + +### Cond Pose +The pose with which we condition the generator (see Generator Pose Conditioning in Sec. 4.4). By default, we condition on the fixed frontal camera pose. For models trained without generator pose conditioning, this will have no effect. + +### Render Type +Toggle between the final super-resolved output (RGB image), a depth map (Depth image) or the raw neural rendering without super resolution (Neural rendering). + +### Depth Sample Multiplier / Depth Sample Importance Multiplier +Adjust the number of depth samples taken per ray. By increasing the number of depth samples, we reduce flickering artifacts caused by depth aliasing, which leads to more temporally-consistent videos. However, the tradeoff is slower rendering and slightly blurrier images. At 1X / 1X, render in the visualizer with the same number of depth samples as at training; at 2X / 2X, take double the uniformly spaced and double the importance samples per ray. As an example: we train FFHQ with 48 uniformly spaced depth samples and 48 importance samples per ray. Using 2X / 2X, we instead take 96 uniformly spaced depth samples and 96 importance samples (192 total). + +### Latent +The seed for the latent code, *z*, that is the input to the generator. Click and drag the "drag" button to sweep between scene identities. Press the "Anim" checkbox to play an animation sweeping through latent codes. + +### Stylemix +The seed for a second latent code for style mixing. Check the boxes on the right to select which layers should be conditioned by this second code. + +### Truncate +Apply the truncation trick in *w*-space to trade off fidelity for diversity. Psi=1 means no truncation. Psi=0 gives the "average" scene learned by the generator. A Psi between 0 and 1, e.g. 0.7 is a compromise that reduces diversity somewhat but improves the overall consistency in quality. (See the Truncation Trick in StyleGAN for more info.) + +--- + +## Performance & capture + +### Render + +Displays the framerate of rendering. On an RTX 3090, with neural rendering resolution of 128, and with 48 uniform and 48 importance depth samples, we get 25-30 FPS. + +### Capture + +Save screenshots to the directory specified by the text field. Save image saves just the rendering; Save GUI saves the complete pane including the user interface. + +--- + +## Layers & channels + +### Cache backbone +For rendering where the scene identity (the latent code *z* and conditioning pose) remain static, but rendering parameters (the camera pose, fov, render type, etc...) change, we can enable 'backbone caching' which will enable us to cache and reuse the existing triplanes computed by the convolutional backbone. Backbone caching slightly improves rendering speed. + +### Layer viewer +View and analyze the intermediate weights and layers of the generator. Scroll through the network and select a layer using the checkbox. Use the "Channel" slider on the right to view different activations. Do note that when 'cache backbone' is enabled, you will be unable to view the intermediate weights of the convolutional backbone/triplanes. diff --git a/eg3d/gen_samples.py b/eg3d/gen_samples.py new file mode 100644 index 0000000000000000000000000000000000000000..06b69a581071feb3814eac423f46c9d5b75169d9 --- /dev/null +++ b/eg3d/gen_samples.py @@ -0,0 +1,280 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Generate images and shapes using pretrained network pickle.""" + +import os +import re +from typing import List, Optional, Tuple, Union + +import click +import dnnlib +import numpy as np +import PIL.Image +import torch +from tqdm import tqdm +import mrcfile + + +import legacy +from camera_utils import LookAtPoseSampler, FOV_to_intrinsics +from torch_utils import misc +from training.triplane import TriPlaneGenerator + + +#---------------------------------------------------------------------------- + +def parse_range(s: Union[str, List]) -> List[int]: + '''Parse a comma separated list of numbers or ranges and return a list of ints. + + Example: '1,2,5-10' returns [1, 2, 5, 6, 7] + ''' + if isinstance(s, list): return s + ranges = [] + range_re = re.compile(r'^(\d+)-(\d+)$') + for p in s.split(','): + if m := range_re.match(p): + ranges.extend(range(int(m.group(1)), int(m.group(2))+1)) + else: + ranges.append(int(p)) + return ranges + +#---------------------------------------------------------------------------- + +def parse_vec2(s: Union[str, Tuple[float, float]]) -> Tuple[float, float]: + '''Parse a floating point 2-vector of syntax 'a,b'. + + Example: + '0,1' returns (0,1) + ''' + if isinstance(s, tuple): return s + parts = s.split(',') + if len(parts) == 2: + return (float(parts[0]), float(parts[1])) + raise ValueError(f'cannot parse 2-vector {s}') + +#---------------------------------------------------------------------------- + +def make_transform(translate: Tuple[float,float], angle: float): + m = np.eye(3) + s = np.sin(angle/360.0*np.pi*2) + c = np.cos(angle/360.0*np.pi*2) + m[0][0] = c + m[0][1] = s + m[0][2] = translate[0] + m[1][0] = -s + m[1][1] = c + m[1][2] = translate[1] + return m + +#---------------------------------------------------------------------------- + +def create_samples(N=256, voxel_origin=[0, 0, 0], cube_length=2.0): + # NOTE: the voxel_origin is actually the (bottom, left, down) corner, not the middle + voxel_origin = np.array(voxel_origin) - cube_length/2 + voxel_size = cube_length / (N - 1) + + overall_index = torch.arange(0, N ** 3, 1, out=torch.LongTensor()) + samples = torch.zeros(N ** 3, 3) + + # transform first 3 columns + # to be the x, y, z index + samples[:, 2] = overall_index % N + samples[:, 1] = (overall_index.float() / N) % N + samples[:, 0] = ((overall_index.float() / N) / N) % N + + # transform first 3 columns + # to be the x, y, z coordinate + samples[:, 0] = (samples[:, 0] * voxel_size) + voxel_origin[2] + samples[:, 1] = (samples[:, 1] * voxel_size) + voxel_origin[1] + samples[:, 2] = (samples[:, 2] * voxel_size) + voxel_origin[0] + + num_samples = N ** 3 + + return samples.unsqueeze(0), voxel_origin, voxel_size + +#---------------------------------------------------------------------------- + +@click.command() +@click.option('--network', help='Network path', multiple=True, required=True) +@click.option('--w_pth', help='latent path') +@click.option('--generator_type', help='Generator type', type=click.Choice(['ffhq', 'cat']), required=False, metavar='STR', default='ffhq', show_default=True) 
+@click.option('--model_is_state_dict', type=bool, default=False) +@click.option('--seeds', type=parse_range, help='List of random seeds (e.g., \'0,1,4-6\')', required=True) +@click.option('--trunc', 'truncation_psi', type=float, help='Truncation psi', default=1, show_default=True) +@click.option('--trunc-cutoff', 'truncation_cutoff', type=int, help='Truncation cutoff', default=14, show_default=True) +@click.option('--outdir', help='Where to save the output images', type=str, required=True, metavar='DIR') +@click.option('--shapes', help='Export shapes as .mrc files viewable in ChimeraX', type=bool, required=False, metavar='BOOL', default=False, show_default=True) +@click.option('--shape-res', help='', type=int, required=False, metavar='int', default=512, show_default=True) +@click.option('--shape_only_first', type=bool, default=False) +@click.option('--fov-deg', help='Field of View of camera in degrees', type=int, required=False, metavar='float', default=18.837, show_default=True) +@click.option('--shape_format', help='Shape Format', type=click.Choice(['.mrc', '.ply']), default='.mrc') +def generate_images( + network: List[str], + w_pth: str, + generator_type: str, + seeds: List[int], + truncation_psi: float, + truncation_cutoff: int, + outdir: str, + shapes: bool, + shape_res: int, + fov_deg: float, + shape_format: str, + model_is_state_dict: bool, + shape_only_first: bool, +): + + + if not os.path.exists(outdir): + os.makedirs(outdir, exist_ok=True) + + device = torch.device('cuda') + + if generator_type == 'ffhq': + network_pkl_tmp = 'pretrained/ffhqrebalanced512-128.pkl' + elif generator_type == 'cat': + network_pkl_tmp = 'pretrained/afhqcats512-128.pkl' + else: + NotImplementedError() + + G_list = [] + outputs = [] + for network_path in network: + print('Loading networks from "%s"...' % network_path) + dir_label = network_path.split('/')[-2] + '___' + network_path.split('/')[-1] + output = os.path.join(outdir, dir_label) + outputs.append(output) + if model_is_state_dict: + with dnnlib.util.open_url(network_pkl_tmp) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + ckpt = torch.load(network_path) + G.load_state_dict(ckpt, strict=False) + else: + with dnnlib.util.open_url(network_path) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + + G.rendering_kwargs['depth_resolution'] = int(G.rendering_kwargs['depth_resolution']) + G.rendering_kwargs['depth_resolution_importance'] = int( + G.rendering_kwargs['depth_resolution_importance']) + + if generator_type == 'cat': + G.rendering_kwargs['avg_camera_pivot'] = [0, 0, -0.06] + elif generator_type == 'ffhq': + G.rendering_kwargs['avg_camera_pivot'] = [0, 0, 0.2] + + G_list.append(G) + + if truncation_cutoff == 0: + truncation_psi = 1.0 # truncation cutoff of 0 means no truncation anyways + if truncation_psi == 1.0: + truncation_cutoff = 14 # no truncation so doesn't matter where we cutoff + + if w_pth is not None: + seeds = [0] + seed_idx = '' + for i, seed in enumerate(seeds): + if i < len(seeds) - 1: + seed_idx += f'{seed}_' + else: + seed_idx += f'{seed}' + + intrinsics = FOV_to_intrinsics(fov_deg, device=device) + + print(seeds) + + # Generate images. + for G, output in zip(G_list, outputs): + for seed_idx, seed in enumerate(seeds): + print('Generating image for seed %d (%d/%d) ...' 
% (seed, seed_idx, len(seeds))) + + z = torch.from_numpy(np.random.RandomState(seed).randn(1, G.z_dim)).to(device) + + imgs = [] + angle_p = -0.2 + for angle_y, angle_p in [(.4, angle_p), (0, angle_p), (-.4, angle_p)]: + cam_pivot = torch.tensor(G.rendering_kwargs.get('avg_camera_pivot', [0, 0, 0]), device=device) + cam_radius = G.rendering_kwargs.get('avg_camera_radius', 2.7) + cam2world_pose = LookAtPoseSampler.sample(np.pi/2 + angle_y, np.pi/2 + angle_p, cam_pivot, radius=cam_radius, device=device) + conditioning_cam2world_pose = LookAtPoseSampler.sample(np.pi/2, np.pi/2, cam_pivot, radius=cam_radius, device=device) + camera_params = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + conditioning_params = torch.cat([conditioning_cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + + if w_pth is not None: + ws = torch.load(w_pth).cuda() + w_given_id = os.path.split(w_pth)[-1].split('.')[-2] + output_img = output + f'__{w_given_id}.png' + output_shape = output + f'__{w_given_id}.mrc' + else: + ws = G.mapping(z, conditioning_params, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff) + output_img = output + f'__{seed_idx:05d}.png' + output_shape = output + f'__{seed_idx:05d}.mrc' + + + img = G.synthesis(ws, camera_params)['image'] + + img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8) + imgs.append(img) + + img = torch.cat(imgs, dim=2) + + PIL.Image.fromarray(img[0].cpu().numpy(), 'RGB').save(output_img) + if shape_only_first and seed_idx != 0: + continue + + + if shapes: + # extract a shape.mrc with marching cubes. You can view the .mrc file using ChimeraX from UCSF. + max_batch=1000000 + + samples, voxel_origin, voxel_size = create_samples(N=shape_res, voxel_origin=[0, 0, 0], cube_length=G.rendering_kwargs['box_warp'] * 1)#.reshape(1, -1, 3) + samples = samples.to(z.device) + sigmas = torch.zeros((samples.shape[0], samples.shape[1], 1), device=z.device) + transformed_ray_directions_expanded = torch.zeros((samples.shape[0], max_batch, 3), device=z.device) + transformed_ray_directions_expanded[..., -1] = -1 + + head = 0 + with tqdm(total = samples.shape[1]) as pbar: + with torch.no_grad(): + while head < samples.shape[1]: + torch.manual_seed(0) + sigma = G.sample(samples[:, head:head+max_batch], transformed_ray_directions_expanded[:, :samples.shape[1]-head], z, conditioning_params, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, noise_mode='const')['sigma'] + sigmas[:, head:head+max_batch] = sigma + head += max_batch + pbar.update(max_batch) + + sigmas = sigmas.reshape((shape_res, shape_res, shape_res)).cpu().numpy() + sigmas = np.flip(sigmas, 0) + + # Trim the border of the extracted cube + pad = int(30 * shape_res / 256) + pad_value = -1000 + sigmas[:pad] = pad_value + sigmas[-pad:] = pad_value + sigmas[:, :pad] = pad_value + sigmas[:, -pad:] = pad_value + sigmas[:, :, :pad] = pad_value + sigmas[:, :, -pad:] = pad_value + + + if shape_format == '.ply': + from shape_utils import convert_sdf_samples_to_ply + convert_sdf_samples_to_ply(np.transpose(sigmas, (2, 1, 0)), [0, 0, 0], 1, output_shape.replace('.mrc','.ply'), level=10) + elif shape_format == '.mrc': # output mrc + with mrcfile.new_mmap(output_shape, overwrite=True, shape=sigmas.shape, mrc_mode=2) as mrc: + mrc.data[:] = sigmas + + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + generate_images() # pylint: disable=no-value-for-parameter + 
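+# Example invocation (illustrative only; the checkpoint path below is the default FFHQ
+# pickle referenced above, and the seeds/output directory are placeholders):
+#
+#   python gen_samples.py --network pretrained/ffhqrebalanced512-128.pkl \
+#       --seeds 0-3 --trunc 0.7 --outdir out --shapes True --shape-res 256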
+#---------------------------------------------------------------------------- diff --git a/eg3d/gen_videos.py b/eg3d/gen_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..f00e589ac62f7cf74dbe1a71879ebc2da508df99 --- /dev/null +++ b/eg3d/gen_videos.py @@ -0,0 +1,371 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Generate lerp videos using pretrained network pickle.""" + +import os +import re +from typing import List, Optional, Tuple, Union + +import click +import dnnlib +import imageio +import numpy as np +import scipy.interpolate +import torch +from tqdm import tqdm +import mrcfile + +import legacy + +from camera_utils import LookAtPoseSampler +from torch_utils import misc +#---------------------------------------------------------------------------- + +def layout_grid(img, grid_w=None, grid_h=1, float_to_uint8=True, chw_to_hwc=True, to_numpy=True): + batch_size, channels, img_h, img_w = img.shape + if grid_w is None: + grid_w = batch_size // grid_h + assert batch_size == grid_w * grid_h + if float_to_uint8: + img = (img * 127.5 + 128).clamp(0, 255).to(torch.uint8) + img = img.reshape(grid_h, grid_w, channels, img_h, img_w) + img = img.permute(2, 0, 3, 1, 4) + img = img.reshape(channels, grid_h * img_h, grid_w * img_w) + if chw_to_hwc: + img = img.permute(1, 2, 0) + if to_numpy: + img = img.cpu().numpy() + return img + +def create_samples(N=256, voxel_origin=[0, 0, 0], cube_length=2.0): + # NOTE: the voxel_origin is actually the (bottom, left, down) corner, not the middle + voxel_origin = np.array(voxel_origin) - cube_length/2 + voxel_size = cube_length / (N - 1) + + overall_index = torch.arange(0, N ** 3, 1, out=torch.LongTensor()) + samples = torch.zeros(N ** 3, 3) + + # transform first 3 columns + # to be the x, y, z index + samples[:, 2] = overall_index % N + samples[:, 1] = (overall_index.float() / N) % N + samples[:, 0] = ((overall_index.float() / N) / N) % N + + # transform first 3 columns + # to be the x, y, z coordinate + samples[:, 0] = (samples[:, 0] * voxel_size) + voxel_origin[2] + samples[:, 1] = (samples[:, 1] * voxel_size) + voxel_origin[1] + samples[:, 2] = (samples[:, 2] * voxel_size) + voxel_origin[0] + + num_samples = N ** 3 + + return samples.unsqueeze(0), voxel_origin, voxel_size + +#---------------------------------------------------------------------------- + +def gen_interp_video(G, w_given, mp4: str, seeds, shuffle_seed=None, w_frames=60*4, kind='cubic', grid_dims=(1,1), num_keyframes=None, wraps=2, psi=1., truncation_cutoff=14, generator_type='ffhq', image_mode='image', gen_shapes=False, device=torch.device('cuda'), **video_kwargs): + grid_w = grid_dims[0] + grid_h = grid_dims[1] + + if num_keyframes is None: + if len(seeds) % (grid_w*grid_h) != 0: + raise ValueError('Number of input seeds must be divisible by grid W*H') + num_keyframes = len(seeds) // (grid_w*grid_h) + + all_seeds = np.zeros(num_keyframes*grid_h*grid_w, dtype=np.int64) + for idx in range(num_keyframes*grid_h*grid_w): + all_seeds[idx] = seeds[idx % 
len(seeds)] + + if shuffle_seed is not None: + rng = np.random.RandomState(seed=shuffle_seed) + rng.shuffle(all_seeds) + + camera_lookat_point = torch.tensor(G.rendering_kwargs['avg_camera_pivot'], device=device) + zs = torch.from_numpy(np.stack([np.random.RandomState(seed).randn(G.z_dim) for seed in all_seeds])).to(device) + cam2world_pose = LookAtPoseSampler.sample(3.14/2, 3.14/2, camera_lookat_point, radius=G.rendering_kwargs['avg_camera_radius'], device=device) + focal_length = 4.2647 #if generator_type != 'Shapenet' else 1.7074 # shapenet has higher FOV + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]], device=device) + c = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + c = c.repeat(len(zs), 1) + + if w_given is not None: + ws = w_given + if ws.shape[1] != G.backbone.mapping.num_ws: + ws = ws.repeat([1, G.backbone.mapping.num_ws, 1]) + else: + ws = G.mapping(z=zs, c=c, truncation_psi=psi, truncation_cutoff=truncation_cutoff) + # _ = G.synthesis(ws[:1], c[:1]) # warm up + ws = ws.reshape(grid_h, grid_w, num_keyframes, *ws.shape[1:]) + + # Interpolation. + grid = [] + for yi in range(grid_h): + row = [] + for xi in range(grid_w): + x = np.arange(-num_keyframes * wraps, num_keyframes * (wraps + 1)) + y = np.tile(ws[yi][xi].cpu().numpy(), [wraps * 2 + 1, 1, 1]) + interp = scipy.interpolate.interp1d(x, y, kind=kind, axis=0) + row.append(interp) + grid.append(row) + + # Render video. + max_batch = 10000000 + voxel_resolution = 512 + video_out = imageio.get_writer(mp4, mode='I', fps=60, codec='libx264', **video_kwargs) + + if gen_shapes: + outdir = 'interpolation_{}_{}/'.format(all_seeds[0], all_seeds[1]) + os.makedirs(outdir, exist_ok=True) + all_poses = [] + for frame_idx in tqdm(range(num_keyframes * w_frames)): + imgs = [] + for yi in range(grid_h): + for xi in range(grid_w): + pitch_range = 0.25 + yaw_range = 0.35 + cam2world_pose = LookAtPoseSampler.sample(3.14/2 + yaw_range * np.sin(2 * 3.14 * frame_idx / (num_keyframes * w_frames)), + 3.14/2 -0.05 + pitch_range * np.cos(2 * 3.14 * frame_idx / (num_keyframes * w_frames)), + camera_lookat_point, radius=G.rendering_kwargs['avg_camera_radius'], device=device) + all_poses.append(cam2world_pose.squeeze().cpu().numpy()) + focal_length = 4.2647 if generator_type != 'Shapenet' else 1.7074 # shapenet has higher FOV + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]], device=device) + c = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + + interp = grid[yi][xi] + w = torch.from_numpy(interp(frame_idx / w_frames)).to(device) + + entangle = 'camera' + if entangle == 'conditioning': + c_forward = torch.cat([LookAtPoseSampler.sample(3.14/2, + 3.14/2, + camera_lookat_point, + radius=G.rendering_kwargs['avg_camera_radius'], device=device).reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + w_c = G.mapping(z=zs[0:1], c=c[0:1], truncation_psi=psi, truncation_cutoff=truncation_cutoff) + img = G.synthesis(ws=w_c, c=c_forward, noise_mode='const')[image_mode][0] + elif entangle == 'camera': + img = G.synthesis(ws=w.unsqueeze(0), c=c[0:1], noise_mode='const')[image_mode][0] + # img = G.synthesis(ws=ws[yi, xi], c=c[0:1], noise_mode='const')[image_mode][0] + elif entangle == 'both': + w_c = G.mapping(z=zs[0:1], c=c[0:1], truncation_psi=psi, truncation_cutoff=truncation_cutoff) + img = G.synthesis(ws=w_c, c=c[0:1], noise_mode='const')[image_mode][0] + + if image_mode == 'image_depth': + img = -img + img = (img - img.min()) / (img.max() 
- img.min()) * 2 - 1 + + imgs.append(img) + + if gen_shapes: + # generate shapes + print('Generating shape for frame %d / %d ...' % (frame_idx, num_keyframes * w_frames)) + + samples, voxel_origin, voxel_size = create_samples(N=voxel_resolution, voxel_origin=[0, 0, 0], cube_length=G.rendering_kwargs['box_warp']) + samples = samples.to(device) + sigmas = torch.zeros((samples.shape[0], samples.shape[1], 1), device=device) + transformed_ray_directions_expanded = torch.zeros((samples.shape[0], max_batch, 3), device=device) + transformed_ray_directions_expanded[..., -1] = -1 + + head = 0 + with tqdm(total = samples.shape[1]) as pbar: + with torch.no_grad(): + while head < samples.shape[1]: + torch.manual_seed(0) + sigma = G.sample_mixed(samples[:, head:head+max_batch], transformed_ray_directions_expanded[:, :samples.shape[1]-head], w.unsqueeze(0), truncation_psi=psi, noise_mode='const')['sigma'] + sigmas[:, head:head+max_batch] = sigma + head += max_batch + pbar.update(max_batch) + + sigmas = sigmas.reshape((voxel_resolution, voxel_resolution, voxel_resolution)).cpu().numpy() + sigmas = np.flip(sigmas, 0) + + pad = int(30 * voxel_resolution / 256) + pad_top = int(38 * voxel_resolution / 256) + sigmas[:pad] = 0 + sigmas[-pad:] = 0 + sigmas[:, :pad] = 0 + sigmas[:, -pad_top:] = 0 + sigmas[:, :, :pad] = 0 + sigmas[:, :, -pad:] = 0 + + output_ply = False + if output_ply: + try: + from shape_utils import convert_sdf_samples_to_ply + convert_sdf_samples_to_ply(np.transpose(sigmas, (2, 1, 0)), [0, 0, 0], 1, os.path.join(outdir, f'{frame_idx:04d}_shape.ply'), level=10) + except: + pass + else: # output mrc + with mrcfile.new_mmap(outdir + f'{frame_idx:04d}_shape.mrc', overwrite=True, shape=sigmas.shape, mrc_mode=2) as mrc: + mrc.data[:] = sigmas + + video_out.append_data(layout_grid(torch.stack(imgs), grid_w=grid_w, grid_h=grid_h)) + video_out.close() + all_poses = np.stack(all_poses) + + if gen_shapes: + print(all_poses.shape) + with open(mp4.replace('.mp4', '_trajectory.npy'), 'wb') as f: + np.save(f, all_poses) + +#---------------------------------------------------------------------------- + +def parse_range(s: Union[str, List[int]]) -> List[int]: + '''Parse a comma separated list of numbers or ranges and return a list of ints. + + Example: '1,2,5-10' returns [1, 2, 5, 6, 7] + ''' + if isinstance(s, list): return s + ranges = [] + range_re = re.compile(r'^(\d+)-(\d+)$') + for p in s.split(','): + if m := range_re.match(p): + ranges.extend(range(int(m.group(1)), int(m.group(2))+1)) + else: + ranges.append(int(p)) + return ranges + +#---------------------------------------------------------------------------- + +def parse_tuple(s: Union[str, Tuple[int,int]]) -> Tuple[int, int]: + '''Parse a 'M,N' or 'MxN' integer tuple. 
+ + Example: + '4x2' returns (4,2) + '0,1' returns (0,1) + ''' + if isinstance(s, tuple): return s + if m := re.match(r'^(\d+)[x,](\d+)$', s): + return (int(m.group(1)), int(m.group(2))) + raise ValueError(f'cannot parse tuple {s}') + +#---------------------------------------------------------------------------- + +@click.command() +@click.option('--network', help='Network path',multiple=True, required=True) +@click.option('--w_pth', help='latent path') +@click.option('--generator_type', help='Generator type', type=click.Choice(['ffhq', 'cat']), required=False, metavar='STR', default='ffhq', show_default=True) +@click.option('--model_is_state_dict', type=bool, default=False) +@click.option('--seeds', type=parse_range, help='List of random seeds', required=True) +@click.option('--shuffle-seed', type=int, help='Random seed to use for shuffling seed order', default=None) +@click.option('--grid', type=parse_tuple, help='Grid width/height, e.g. \'4x3\' (default: 1x1)', default=(1,1)) +@click.option('--num-keyframes', type=int, help='Number of seeds to interpolate through. If not specified, determine based on the length of the seeds array given by --seeds.', default=None) +@click.option('--w-frames', type=int, help='Number of frames to interpolate between latents', default=120) +@click.option('--trunc', 'truncation_psi', type=float, help='Truncation psi', default=1, show_default=True) +@click.option('--trunc-cutoff', 'truncation_cutoff', type=int, help='Truncation cutoff', default=14, show_default=True) +@click.option('--outdir', help='Output directory', type=str, default='../test_runs/manip_3D_recon/4_manip_result', metavar='DIR') +@click.option('--image_mode', help='Image mode', type=click.Choice(['image', 'image_depth', 'image_raw']), required=False, metavar='STR', default='image', show_default=True) +@click.option('--sample_mult', 'sampling_multiplier', type=float, help='Multiplier for depth sampling in volume rendering', default=2, show_default=True) +@click.option('--nrr', type=int, help='Neural rendering resolution override', default=None, show_default=True) +@click.option('--shapes', type=bool, help='Gen shapes for shape interpolation', default=False, show_default=True) + +def generate_images( + network: List[str], + w_pth: str, + seeds: List[int], + shuffle_seed: Optional[int], + truncation_psi: float, + truncation_cutoff: int, + grid: Tuple[int,int], + num_keyframes: Optional[int], + w_frames: int, + outdir: str, + generator_type: str, + image_mode: str, + sampling_multiplier: float, + nrr: Optional[int], + shapes: bool, + model_is_state_dict: bool, +): + + if not os.path.exists(outdir): + os.makedirs(outdir, exist_ok=True) + + device = torch.device('cuda') + + if generator_type == 'ffhq': + network_pkl_tmp = 'pretrained/ffhqrebalanced512-128.pkl' + elif generator_type == 'cat': + network_pkl_tmp = 'pretrained/afhqcats512-128.pkl' + else: + NotImplementedError() + + G_list = [] + outputs = [] + for network_path in network: + print('Loading networks from "%s"...' 
% network_path) + dir_label = network_path.split('/')[-2] + '___' + network_path.split('/')[-1] + output = os.path.join(outdir, dir_label) + outputs.append(output) + if model_is_state_dict: + with dnnlib.util.open_url(network_pkl_tmp) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + ckpt = torch.load(network_path) + G.load_state_dict(ckpt, strict=False) + else: + with dnnlib.util.open_url(network_path) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + + G.rendering_kwargs['depth_resolution'] = int(G.rendering_kwargs['depth_resolution'] * sampling_multiplier) + G.rendering_kwargs['depth_resolution_importance'] = int(G.rendering_kwargs['depth_resolution_importance'] * sampling_multiplier) + + if generator_type == 'cat': + G.rendering_kwargs['avg_camera_pivot'] = [0, 0, -0.06] + elif generator_type == 'ffhq': + G.rendering_kwargs['avg_camera_pivot'] = [0, 0, 0.2] + + if nrr is not None: G.neural_rendering_resolution = nrr + G_list.append(G) + + + if truncation_cutoff == 0: + truncation_psi = 1.0 # truncation cutoff of 0 means no truncation anyways + if truncation_psi == 1.0: + truncation_cutoff = 14 # no truncation so doesn't matter where we cutoff + + grid_w, grid_h = grid + seeds = seeds[:grid_w * grid_h] + + seed_idx = '' + + for i, seed in enumerate(seeds): + if i < len(seeds) - 1: + seed_idx += f'{seed}_' + else: + seed_idx += f'{seed}' + + + for G, output in zip(G_list, outputs): + if w_pth is not None: + grid = (1, 1) + w_given = torch.load(w_pth).cuda() + w_given_id = os.path.split(w_pth)[-1].split('.')[-2] + output = output + f'__{w_given_id}.mp4' + gen_interp_video(G=G, w_given=w_given, mp4=output, bitrate='10M', grid_dims=grid, num_keyframes=num_keyframes, + w_frames=w_frames, + seeds=seeds, shuffle_seed=shuffle_seed, psi=truncation_psi, + truncation_cutoff=truncation_cutoff, generator_type=generator_type, image_mode=image_mode, + gen_shapes=shapes) + + else: + output = output + f'__{seed_idx}.mp4' + gen_interp_video(G=G, w_given=None, mp4=output, bitrate='10M', grid_dims=grid, num_keyframes=num_keyframes, + w_frames=w_frames, + seeds=seeds, shuffle_seed=shuffle_seed, psi=truncation_psi, + truncation_cutoff=truncation_cutoff, generator_type=generator_type, image_mode=image_mode, + gen_shapes=shapes) + + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + generate_images() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/eg3d/gui_utils/__init__.py b/eg3d/gui_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/gui_utils/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +# empty diff --git a/eg3d/gui_utils/gl_utils.py b/eg3d/gui_utils/gl_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1312f027c23bbb80eb489bba7a0f9014d95ac5b0 --- /dev/null +++ b/eg3d/gui_utils/gl_utils.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import os +import functools +import contextlib +import numpy as np +import OpenGL.GL as gl +import OpenGL.GL.ARB.texture_float +import dnnlib + +#---------------------------------------------------------------------------- + +def init_egl(): + assert os.environ['PYOPENGL_PLATFORM'] == 'egl' # Must be set before importing OpenGL. + import OpenGL.EGL as egl + import ctypes + + # Initialize EGL. + display = egl.eglGetDisplay(egl.EGL_DEFAULT_DISPLAY) + assert display != egl.EGL_NO_DISPLAY + major = ctypes.c_int32() + minor = ctypes.c_int32() + ok = egl.eglInitialize(display, major, minor) + assert ok + assert major.value * 10 + minor.value >= 14 + + # Choose config. + config_attribs = [ + egl.EGL_RENDERABLE_TYPE, egl.EGL_OPENGL_BIT, + egl.EGL_SURFACE_TYPE, egl.EGL_PBUFFER_BIT, + egl.EGL_NONE + ] + configs = (ctypes.c_int32 * 1)() + num_configs = ctypes.c_int32() + ok = egl.eglChooseConfig(display, config_attribs, configs, 1, num_configs) + assert ok + assert num_configs.value == 1 + config = configs[0] + + # Create dummy pbuffer surface. + surface_attribs = [ + egl.EGL_WIDTH, 1, + egl.EGL_HEIGHT, 1, + egl.EGL_NONE + ] + surface = egl.eglCreatePbufferSurface(display, config, surface_attribs) + assert surface != egl.EGL_NO_SURFACE + + # Setup GL context. 
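+    # Bind the OpenGL API and make the new context current against the dummy
+    # pbuffer surface, so rendering can run headlessly through EGL without a display.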
+ ok = egl.eglBindAPI(egl.EGL_OPENGL_API) + assert ok + context = egl.eglCreateContext(display, config, egl.EGL_NO_CONTEXT, None) + assert context != egl.EGL_NO_CONTEXT + ok = egl.eglMakeCurrent(display, surface, surface, context) + assert ok + +#---------------------------------------------------------------------------- + +_texture_formats = { + ('uint8', 1): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_LUMINANCE, internalformat=gl.GL_LUMINANCE8), + ('uint8', 2): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_LUMINANCE_ALPHA, internalformat=gl.GL_LUMINANCE8_ALPHA8), + ('uint8', 3): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_RGB, internalformat=gl.GL_RGB8), + ('uint8', 4): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_RGBA, internalformat=gl.GL_RGBA8), + ('float32', 1): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_LUMINANCE, internalformat=OpenGL.GL.ARB.texture_float.GL_LUMINANCE32F_ARB), + ('float32', 2): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_LUMINANCE_ALPHA, internalformat=OpenGL.GL.ARB.texture_float.GL_LUMINANCE_ALPHA32F_ARB), + ('float32', 3): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_RGB, internalformat=gl.GL_RGB32F), + ('float32', 4): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_RGBA, internalformat=gl.GL_RGBA32F), +} + +def get_texture_format(dtype, channels): + return _texture_formats[(np.dtype(dtype).name, int(channels))] + +#---------------------------------------------------------------------------- + +def prepare_texture_data(image): + image = np.asarray(image) + if image.ndim == 2: + image = image[:, :, np.newaxis] + if image.dtype.name == 'float64': + image = image.astype('float32') + return image + +#---------------------------------------------------------------------------- + +def draw_pixels(image, *, pos=0, zoom=1, align=0, rint=True): + pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2]) + zoom = np.broadcast_to(np.asarray(zoom, dtype='float32'), [2]) + align = np.broadcast_to(np.asarray(align, dtype='float32'), [2]) + image = prepare_texture_data(image) + height, width, channels = image.shape + size = zoom * [width, height] + pos = pos - size * align + if rint: + pos = np.rint(pos) + fmt = get_texture_format(image.dtype, channels) + + gl.glPushAttrib(gl.GL_CURRENT_BIT | gl.GL_PIXEL_MODE_BIT) + gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT) + gl.glRasterPos2f(pos[0], pos[1]) + gl.glPixelZoom(zoom[0], -zoom[1]) + gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1) + gl.glDrawPixels(width, height, fmt.format, fmt.type, image) + gl.glPopClientAttrib() + gl.glPopAttrib() + +#---------------------------------------------------------------------------- + +def read_pixels(width, height, *, pos=0, dtype='uint8', channels=3): + pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2]) + dtype = np.dtype(dtype) + fmt = get_texture_format(dtype, channels) + image = np.empty([height, width, channels], dtype=dtype) + + gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT) + gl.glPixelStorei(gl.GL_PACK_ALIGNMENT, 1) + gl.glReadPixels(int(np.round(pos[0])), int(np.round(pos[1])), width, height, fmt.format, fmt.type, image) + gl.glPopClientAttrib() + return np.flipud(image) + +#---------------------------------------------------------------------------- + +class Texture: + def __init__(self, *, image=None, width=None, height=None, channels=None, dtype=None, bilinear=True, mipmap=True): + self.gl_id = None + self.bilinear = bilinear + self.mipmap = mipmap + + # Determine size and dtype. 
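+        # If an image is supplied, infer width/height/channels/dtype from it;
+        # otherwise fall back to the explicit arguments (uint8, 3 channels by default).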
+ if image is not None: + image = prepare_texture_data(image) + self.height, self.width, self.channels = image.shape + self.dtype = image.dtype + else: + assert width is not None and height is not None + self.width = width + self.height = height + self.channels = channels if channels is not None else 3 + self.dtype = np.dtype(dtype) if dtype is not None else np.uint8 + + # Validate size and dtype. + assert isinstance(self.width, int) and self.width >= 0 + assert isinstance(self.height, int) and self.height >= 0 + assert isinstance(self.channels, int) and self.channels >= 1 + assert self.is_compatible(width=width, height=height, channels=channels, dtype=dtype) + + # Create texture object. + self.gl_id = gl.glGenTextures(1) + with self.bind(): + gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_S, gl.GL_CLAMP_TO_EDGE) + gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_T, gl.GL_CLAMP_TO_EDGE) + gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR if self.bilinear else gl.GL_NEAREST) + gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR_MIPMAP_LINEAR if self.mipmap else gl.GL_NEAREST) + self.update(image) + + def delete(self): + if self.gl_id is not None: + gl.glDeleteTextures([self.gl_id]) + self.gl_id = None + + def __del__(self): + try: + self.delete() + except: + pass + + @contextlib.contextmanager + def bind(self): + prev_id = gl.glGetInteger(gl.GL_TEXTURE_BINDING_2D) + gl.glBindTexture(gl.GL_TEXTURE_2D, self.gl_id) + yield + gl.glBindTexture(gl.GL_TEXTURE_2D, prev_id) + + def update(self, image): + if image is not None: + image = prepare_texture_data(image) + assert self.is_compatible(image=image) + with self.bind(): + fmt = get_texture_format(self.dtype, self.channels) + gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT) + gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1) + gl.glTexImage2D(gl.GL_TEXTURE_2D, 0, fmt.internalformat, self.width, self.height, 0, fmt.format, fmt.type, image) + if self.mipmap: + gl.glGenerateMipmap(gl.GL_TEXTURE_2D) + gl.glPopClientAttrib() + + def draw(self, *, pos=0, zoom=1, align=0, rint=False, color=1, alpha=1, rounding=0): + zoom = np.broadcast_to(np.asarray(zoom, dtype='float32'), [2]) + size = zoom * [self.width, self.height] + with self.bind(): + gl.glPushAttrib(gl.GL_ENABLE_BIT) + gl.glEnable(gl.GL_TEXTURE_2D) + draw_rect(pos=pos, size=size, align=align, rint=rint, color=color, alpha=alpha, rounding=rounding) + gl.glPopAttrib() + + def is_compatible(self, *, image=None, width=None, height=None, channels=None, dtype=None): # pylint: disable=too-many-return-statements + if image is not None: + if image.ndim != 3: + return False + ih, iw, ic = image.shape + if not self.is_compatible(width=iw, height=ih, channels=ic, dtype=image.dtype): + return False + if width is not None and self.width != width: + return False + if height is not None and self.height != height: + return False + if channels is not None and self.channels != channels: + return False + if dtype is not None and self.dtype != dtype: + return False + return True + +#---------------------------------------------------------------------------- + +class Framebuffer: + def __init__(self, *, texture=None, width=None, height=None, channels=None, dtype=None, msaa=0): + self.texture = texture + self.gl_id = None + self.gl_color = None + self.gl_depth_stencil = None + self.msaa = msaa + + # Determine size and dtype. 
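+        # If a texture is supplied, mirror its size and dtype; otherwise use the
+        # explicit arguments (float32, 4 channels by default for the color buffer).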
+ if texture is not None: + assert isinstance(self.texture, Texture) + self.width = texture.width + self.height = texture.height + self.channels = texture.channels + self.dtype = texture.dtype + else: + assert width is not None and height is not None + self.width = width + self.height = height + self.channels = channels if channels is not None else 4 + self.dtype = np.dtype(dtype) if dtype is not None else np.float32 + + # Validate size and dtype. + assert isinstance(self.width, int) and self.width >= 0 + assert isinstance(self.height, int) and self.height >= 0 + assert isinstance(self.channels, int) and self.channels >= 1 + assert width is None or width == self.width + assert height is None or height == self.height + assert channels is None or channels == self.channels + assert dtype is None or dtype == self.dtype + + # Create framebuffer object. + self.gl_id = gl.glGenFramebuffers(1) + with self.bind(): + + # Setup color buffer. + if self.texture is not None: + assert self.msaa == 0 + gl.glFramebufferTexture2D(gl.GL_FRAMEBUFFER, gl.GL_COLOR_ATTACHMENT0, gl.GL_TEXTURE_2D, self.texture.gl_id, 0) + else: + fmt = get_texture_format(self.dtype, self.channels) + self.gl_color = gl.glGenRenderbuffers(1) + gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, self.gl_color) + gl.glRenderbufferStorageMultisample(gl.GL_RENDERBUFFER, self.msaa, fmt.internalformat, self.width, self.height) + gl.glFramebufferRenderbuffer(gl.GL_FRAMEBUFFER, gl.GL_COLOR_ATTACHMENT0, gl.GL_RENDERBUFFER, self.gl_color) + + # Setup depth/stencil buffer. + self.gl_depth_stencil = gl.glGenRenderbuffers(1) + gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, self.gl_depth_stencil) + gl.glRenderbufferStorageMultisample(gl.GL_RENDERBUFFER, self.msaa, gl.GL_DEPTH24_STENCIL8, self.width, self.height) + gl.glFramebufferRenderbuffer(gl.GL_FRAMEBUFFER, gl.GL_DEPTH_STENCIL_ATTACHMENT, gl.GL_RENDERBUFFER, self.gl_depth_stencil) + + def delete(self): + if self.gl_id is not None: + gl.glDeleteFramebuffers([self.gl_id]) + self.gl_id = None + if self.gl_color is not None: + gl.glDeleteRenderbuffers(1, [self.gl_color]) + self.gl_color = None + if self.gl_depth_stencil is not None: + gl.glDeleteRenderbuffers(1, [self.gl_depth_stencil]) + self.gl_depth_stencil = None + + def __del__(self): + try: + self.delete() + except: + pass + + @contextlib.contextmanager + def bind(self): + prev_fbo = gl.glGetInteger(gl.GL_FRAMEBUFFER_BINDING) + prev_rbo = gl.glGetInteger(gl.GL_RENDERBUFFER_BINDING) + gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, self.gl_id) + if self.width is not None and self.height is not None: + gl.glViewport(0, 0, self.width, self.height) + yield + gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, prev_fbo) + gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, prev_rbo) + + def blit(self, dst=None): + assert dst is None or isinstance(dst, Framebuffer) + with self.bind(): + gl.glBindFramebuffer(gl.GL_DRAW_FRAMEBUFFER, 0 if dst is None else dst.fbo) + gl.glBlitFramebuffer(0, 0, self.width, self.height, 0, 0, self.width, self.height, gl.GL_COLOR_BUFFER_BIT, gl.GL_NEAREST) + +#---------------------------------------------------------------------------- + +def draw_shape(vertices, *, mode=gl.GL_TRIANGLE_FAN, pos=0, size=1, color=1, alpha=1): + assert vertices.ndim == 2 and vertices.shape[1] == 2 + pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2]) + size = np.broadcast_to(np.asarray(size, dtype='float32'), [2]) + color = np.broadcast_to(np.asarray(color, dtype='float32'), [3]) + alpha = np.clip(np.broadcast_to(np.asarray(alpha, dtype='float32'), []), 0, 1) + + 
gl.glPushClientAttrib(gl.GL_CLIENT_VERTEX_ARRAY_BIT) + gl.glPushAttrib(gl.GL_CURRENT_BIT | gl.GL_TRANSFORM_BIT) + gl.glMatrixMode(gl.GL_MODELVIEW) + gl.glPushMatrix() + + gl.glEnableClientState(gl.GL_VERTEX_ARRAY) + gl.glEnableClientState(gl.GL_TEXTURE_COORD_ARRAY) + gl.glVertexPointer(2, gl.GL_FLOAT, 0, vertices) + gl.glTexCoordPointer(2, gl.GL_FLOAT, 0, vertices) + gl.glTranslate(pos[0], pos[1], 0) + gl.glScale(size[0], size[1], 1) + gl.glColor4f(color[0] * alpha, color[1] * alpha, color[2] * alpha, alpha) + gl.glDrawArrays(mode, 0, vertices.shape[0]) + + gl.glPopMatrix() + gl.glPopAttrib() + gl.glPopClientAttrib() + +#---------------------------------------------------------------------------- + +def draw_rect(*, pos=0, pos2=None, size=None, align=0, rint=False, color=1, alpha=1, rounding=0): + assert pos2 is None or size is None + pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2]) + pos2 = np.broadcast_to(np.asarray(pos2, dtype='float32'), [2]) if pos2 is not None else None + size = np.broadcast_to(np.asarray(size, dtype='float32'), [2]) if size is not None else None + size = size if size is not None else pos2 - pos if pos2 is not None else np.array([1, 1], dtype='float32') + pos = pos - size * align + if rint: + pos = np.rint(pos) + rounding = np.broadcast_to(np.asarray(rounding, dtype='float32'), [2]) + rounding = np.minimum(np.abs(rounding) / np.maximum(np.abs(size), 1e-8), 0.5) + if np.min(rounding) == 0: + rounding *= 0 + vertices = _setup_rect(float(rounding[0]), float(rounding[1])) + draw_shape(vertices, mode=gl.GL_TRIANGLE_FAN, pos=pos, size=size, color=color, alpha=alpha) + +@functools.lru_cache(maxsize=10000) +def _setup_rect(rx, ry): + t = np.linspace(0, np.pi / 2, 1 if max(rx, ry) == 0 else 64) + s = 1 - np.sin(t); c = 1 - np.cos(t) + x = [c * rx, 1 - s * rx, 1 - c * rx, s * rx] + y = [s * ry, c * ry, 1 - s * ry, 1 - c * ry] + v = np.stack([x, y], axis=-1).reshape(-1, 2) + return v.astype('float32') + +#---------------------------------------------------------------------------- + +def draw_circle(*, center=0, radius=100, hole=0, color=1, alpha=1): + hole = np.broadcast_to(np.asarray(hole, dtype='float32'), []) + vertices = _setup_circle(float(hole)) + draw_shape(vertices, mode=gl.GL_TRIANGLE_STRIP, pos=center, size=radius, color=color, alpha=alpha) + +@functools.lru_cache(maxsize=10000) +def _setup_circle(hole): + t = np.linspace(0, np.pi * 2, 128) + s = np.sin(t); c = np.cos(t) + v = np.stack([c, s, c * hole, s * hole], axis=-1).reshape(-1, 2) + return v.astype('float32') + +#---------------------------------------------------------------------------- diff --git a/eg3d/gui_utils/glfw_window.py b/eg3d/gui_utils/glfw_window.py new file mode 100644 index 0000000000000000000000000000000000000000..aeb96e8707db91c620825541c9b3c846b7362407 --- /dev/null +++ b/eg3d/gui_utils/glfw_window.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import time +import glfw +import OpenGL.GL as gl +from . 
import gl_utils + +#---------------------------------------------------------------------------- + +class GlfwWindow: # pylint: disable=too-many-public-methods + def __init__(self, *, title='GlfwWindow', window_width=1920, window_height=1080, deferred_show=True, close_on_esc=True): + self._glfw_window = None + self._drawing_frame = False + self._frame_start_time = None + self._frame_delta = 0 + self._fps_limit = None + self._vsync = None + self._skip_frames = 0 + self._deferred_show = deferred_show + self._close_on_esc = close_on_esc + self._esc_pressed = False + self._drag_and_drop_paths = None + self._capture_next_frame = False + self._captured_frame = None + + # Create window. + glfw.init() + glfw.window_hint(glfw.VISIBLE, False) + self._glfw_window = glfw.create_window(width=window_width, height=window_height, title=title, monitor=None, share=None) + self._attach_glfw_callbacks() + self.make_context_current() + + # Adjust window. + self.set_vsync(False) + self.set_window_size(window_width, window_height) + if not self._deferred_show: + glfw.show_window(self._glfw_window) + + def close(self): + if self._drawing_frame: + self.end_frame() + if self._glfw_window is not None: + glfw.destroy_window(self._glfw_window) + self._glfw_window = None + #glfw.terminate() # Commented out to play it nice with other glfw clients. + + def __del__(self): + try: + self.close() + except: + pass + + @property + def window_width(self): + return self.content_width + + @property + def window_height(self): + return self.content_height + self.title_bar_height + + @property + def content_width(self): + width, _height = glfw.get_window_size(self._glfw_window) + return width + + @property + def content_height(self): + _width, height = glfw.get_window_size(self._glfw_window) + return height + + @property + def title_bar_height(self): + _left, top, _right, _bottom = glfw.get_window_frame_size(self._glfw_window) + return top + + @property + def monitor_width(self): + _, _, width, _height = glfw.get_monitor_workarea(glfw.get_primary_monitor()) + return width + + @property + def monitor_height(self): + _, _, _width, height = glfw.get_monitor_workarea(glfw.get_primary_monitor()) + return height + + @property + def frame_delta(self): + return self._frame_delta + + def set_title(self, title): + glfw.set_window_title(self._glfw_window, title) + + def set_window_size(self, width, height): + width = min(width, self.monitor_width) + height = min(height, self.monitor_height) + glfw.set_window_size(self._glfw_window, width, max(height - self.title_bar_height, 0)) + if width == self.monitor_width and height == self.monitor_height: + self.maximize() + + def set_content_size(self, width, height): + self.set_window_size(width, height + self.title_bar_height) + + def maximize(self): + glfw.maximize_window(self._glfw_window) + + def set_position(self, x, y): + glfw.set_window_pos(self._glfw_window, x, y + self.title_bar_height) + + def center(self): + self.set_position((self.monitor_width - self.window_width) // 2, (self.monitor_height - self.window_height) // 2) + + def set_vsync(self, vsync): + vsync = bool(vsync) + if vsync != self._vsync: + glfw.swap_interval(1 if vsync else 0) + self._vsync = vsync + + def set_fps_limit(self, fps_limit): + self._fps_limit = int(fps_limit) + + def should_close(self): + return glfw.window_should_close(self._glfw_window) or (self._close_on_esc and self._esc_pressed) + + def skip_frame(self): + self.skip_frames(1) + + def skip_frames(self, num): # Do not update window for the next N frames. 
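+        # Take the max so overlapping requests extend, rather than shorten, the skip window.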
+ self._skip_frames = max(self._skip_frames, int(num)) + + def is_skipping_frames(self): + return self._skip_frames > 0 + + def capture_next_frame(self): + self._capture_next_frame = True + + def pop_captured_frame(self): + frame = self._captured_frame + self._captured_frame = None + return frame + + def pop_drag_and_drop_paths(self): + paths = self._drag_and_drop_paths + self._drag_and_drop_paths = None + return paths + + def draw_frame(self): # To be overridden by subclass. + self.begin_frame() + # Rendering code goes here. + self.end_frame() + + def make_context_current(self): + if self._glfw_window is not None: + glfw.make_context_current(self._glfw_window) + + def begin_frame(self): + # End previous frame. + if self._drawing_frame: + self.end_frame() + + # Apply FPS limit. + if self._frame_start_time is not None and self._fps_limit is not None: + delay = self._frame_start_time - time.perf_counter() + 1 / self._fps_limit + if delay > 0: + time.sleep(delay) + cur_time = time.perf_counter() + if self._frame_start_time is not None: + self._frame_delta = cur_time - self._frame_start_time + self._frame_start_time = cur_time + + # Process events. + glfw.poll_events() + + # Begin frame. + self._drawing_frame = True + self.make_context_current() + + # Initialize GL state. + gl.glViewport(0, 0, self.content_width, self.content_height) + gl.glMatrixMode(gl.GL_PROJECTION) + gl.glLoadIdentity() + gl.glTranslate(-1, 1, 0) + gl.glScale(2 / max(self.content_width, 1), -2 / max(self.content_height, 1), 1) + gl.glMatrixMode(gl.GL_MODELVIEW) + gl.glLoadIdentity() + gl.glEnable(gl.GL_BLEND) + gl.glBlendFunc(gl.GL_ONE, gl.GL_ONE_MINUS_SRC_ALPHA) # Pre-multiplied alpha. + + # Clear. + gl.glClearColor(0, 0, 0, 1) + gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT) + + def end_frame(self): + assert self._drawing_frame + self._drawing_frame = False + + # Skip frames if requested. + if self._skip_frames > 0: + self._skip_frames -= 1 + return + + # Capture frame if requested. + if self._capture_next_frame: + self._captured_frame = gl_utils.read_pixels(self.content_width, self.content_height) + self._capture_next_frame = False + + # Update window. + if self._deferred_show: + glfw.show_window(self._glfw_window) + self._deferred_show = False + glfw.swap_buffers(self._glfw_window) + + def _attach_glfw_callbacks(self): + glfw.set_key_callback(self._glfw_window, self._glfw_key_callback) + glfw.set_drop_callback(self._glfw_window, self._glfw_drop_callback) + + def _glfw_key_callback(self, _window, key, _scancode, action, _mods): + if action == glfw.PRESS and key == glfw.KEY_ESCAPE: + self._esc_pressed = True + + def _glfw_drop_callback(self, _window, paths): + self._drag_and_drop_paths = paths + +#---------------------------------------------------------------------------- diff --git a/eg3d/gui_utils/imgui_utils.py b/eg3d/gui_utils/imgui_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..05a8357caf20493956769984f32776441beefd27 --- /dev/null +++ b/eg3d/gui_utils/imgui_utils.py @@ -0,0 +1,171 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import contextlib +import imgui + +#---------------------------------------------------------------------------- + +def set_default_style(color_scheme='dark', spacing=9, indent=23, scrollbar=27): + s = imgui.get_style() + s.window_padding = [spacing, spacing] + s.item_spacing = [spacing, spacing] + s.item_inner_spacing = [spacing, spacing] + s.columns_min_spacing = spacing + s.indent_spacing = indent + s.scrollbar_size = scrollbar + s.frame_padding = [4, 3] + s.window_border_size = 1 + s.child_border_size = 1 + s.popup_border_size = 1 + s.frame_border_size = 1 + s.window_rounding = 0 + s.child_rounding = 0 + s.popup_rounding = 3 + s.frame_rounding = 3 + s.scrollbar_rounding = 3 + s.grab_rounding = 3 + + getattr(imgui, f'style_colors_{color_scheme}')(s) + c0 = s.colors[imgui.COLOR_MENUBAR_BACKGROUND] + c1 = s.colors[imgui.COLOR_FRAME_BACKGROUND] + s.colors[imgui.COLOR_POPUP_BACKGROUND] = [x * 0.7 + y * 0.3 for x, y in zip(c0, c1)][:3] + [1] + +#---------------------------------------------------------------------------- + +@contextlib.contextmanager +def grayed_out(cond=True): + if cond: + s = imgui.get_style() + text = s.colors[imgui.COLOR_TEXT_DISABLED] + grab = s.colors[imgui.COLOR_SCROLLBAR_GRAB] + back = s.colors[imgui.COLOR_MENUBAR_BACKGROUND] + imgui.push_style_color(imgui.COLOR_TEXT, *text) + imgui.push_style_color(imgui.COLOR_CHECK_MARK, *grab) + imgui.push_style_color(imgui.COLOR_SLIDER_GRAB, *grab) + imgui.push_style_color(imgui.COLOR_SLIDER_GRAB_ACTIVE, *grab) + imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND, *back) + imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND_HOVERED, *back) + imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND_ACTIVE, *back) + imgui.push_style_color(imgui.COLOR_BUTTON, *back) + imgui.push_style_color(imgui.COLOR_BUTTON_HOVERED, *back) + imgui.push_style_color(imgui.COLOR_BUTTON_ACTIVE, *back) + imgui.push_style_color(imgui.COLOR_HEADER, *back) + imgui.push_style_color(imgui.COLOR_HEADER_HOVERED, *back) + imgui.push_style_color(imgui.COLOR_HEADER_ACTIVE, *back) + imgui.push_style_color(imgui.COLOR_POPUP_BACKGROUND, *back) + yield + imgui.pop_style_color(14) + else: + yield + +#---------------------------------------------------------------------------- + +@contextlib.contextmanager +def item_width(width=None): + if width is not None: + imgui.push_item_width(width) + yield + imgui.pop_item_width() + else: + yield + +#---------------------------------------------------------------------------- + +def scoped_by_object_id(method): + def decorator(self, *args, **kwargs): + imgui.push_id(str(id(self))) + res = method(self, *args, **kwargs) + imgui.pop_id() + return res + return decorator + +#---------------------------------------------------------------------------- + +def button(label, width=0, enabled=True): + with grayed_out(not enabled): + clicked = imgui.button(label, width=width) + clicked = clicked and enabled + return clicked + +#---------------------------------------------------------------------------- + +def collapsing_header(text, visible=None, flags=0, default=False, enabled=True, show=True): + expanded = False + if show: + if default: + flags |= imgui.TREE_NODE_DEFAULT_OPEN + if not enabled: + flags |= imgui.TREE_NODE_LEAF + with grayed_out(not enabled): + expanded, visible = imgui.collapsing_header(text, visible=visible, 
flags=flags) + expanded = expanded and enabled + return expanded, visible + +#---------------------------------------------------------------------------- + +def popup_button(label, width=0, enabled=True): + if button(label, width, enabled): + imgui.open_popup(label) + opened = imgui.begin_popup(label) + return opened + +#---------------------------------------------------------------------------- + +def input_text(label, value, buffer_length, flags, width=None, help_text=''): + old_value = value + color = list(imgui.get_style().colors[imgui.COLOR_TEXT]) + if value == '': + color[-1] *= 0.5 + with item_width(width): + imgui.push_style_color(imgui.COLOR_TEXT, *color) + value = value if value != '' else help_text + changed, value = imgui.input_text(label, value, buffer_length, flags) + value = value if value != help_text else '' + imgui.pop_style_color(1) + if not flags & imgui.INPUT_TEXT_ENTER_RETURNS_TRUE: + changed = (value != old_value) + return changed, value + +#---------------------------------------------------------------------------- + +def drag_previous_control(enabled=True): + dragging = False + dx = 0 + dy = 0 + if imgui.begin_drag_drop_source(imgui.DRAG_DROP_SOURCE_NO_PREVIEW_TOOLTIP): + if enabled: + dragging = True + dx, dy = imgui.get_mouse_drag_delta() + imgui.reset_mouse_drag_delta() + imgui.end_drag_drop_source() + return dragging, dx, dy + +#---------------------------------------------------------------------------- + +def drag_button(label, width=0, enabled=True): + clicked = button(label, width=width, enabled=enabled) + dragging, dx, dy = drag_previous_control(enabled=enabled) + return clicked, dragging, dx, dy + +#---------------------------------------------------------------------------- + +def drag_hidden_window(label, x, y, width, height, enabled=True): + imgui.push_style_color(imgui.COLOR_WINDOW_BACKGROUND, 0, 0, 0, 0) + imgui.push_style_color(imgui.COLOR_BORDER, 0, 0, 0, 0) + imgui.set_next_window_position(x, y) + imgui.set_next_window_size(width, height) + imgui.begin(label, closable=False, flags=(imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_MOVE)) + dragging, dx, dy = drag_previous_control(enabled=enabled) + imgui.end() + imgui.pop_style_color(2) + return dragging, dx, dy + +#---------------------------------------------------------------------------- diff --git a/eg3d/gui_utils/imgui_window.py b/eg3d/gui_utils/imgui_window.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1a6382b41c593c5ea4d9d2888c716282e575ec --- /dev/null +++ b/eg3d/gui_utils/imgui_window.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import os +import imgui +import imgui.integrations.glfw + +from . import glfw_window +from . import imgui_utils +from . 
import text_utils + +#---------------------------------------------------------------------------- + +class ImguiWindow(glfw_window.GlfwWindow): + def __init__(self, *, title='ImguiWindow', font=None, font_sizes=range(14,24), **glfw_kwargs): + if font is None: + font = text_utils.get_default_font() + font_sizes = {int(size) for size in font_sizes} + super().__init__(title=title, **glfw_kwargs) + + # Init fields. + self._imgui_context = None + self._imgui_renderer = None + self._imgui_fonts = None + self._cur_font_size = max(font_sizes) + + # Delete leftover imgui.ini to avoid unexpected behavior. + if os.path.isfile('imgui.ini'): + os.remove('imgui.ini') + + # Init ImGui. + self._imgui_context = imgui.create_context() + self._imgui_renderer = _GlfwRenderer(self._glfw_window) + self._attach_glfw_callbacks() + imgui.get_io().ini_saving_rate = 0 # Disable creating imgui.ini at runtime. + imgui.get_io().mouse_drag_threshold = 0 # Improve behavior with imgui_utils.drag_custom(). + self._imgui_fonts = {size: imgui.get_io().fonts.add_font_from_file_ttf(font, size) for size in font_sizes} + self._imgui_renderer.refresh_font_texture() + + def close(self): + self.make_context_current() + self._imgui_fonts = None + if self._imgui_renderer is not None: + self._imgui_renderer.shutdown() + self._imgui_renderer = None + if self._imgui_context is not None: + #imgui.destroy_context(self._imgui_context) # Commented out to avoid creating imgui.ini at the end. + self._imgui_context = None + super().close() + + def _glfw_key_callback(self, *args): + super()._glfw_key_callback(*args) + self._imgui_renderer.keyboard_callback(*args) + + @property + def font_size(self): + return self._cur_font_size + + @property + def spacing(self): + return round(self._cur_font_size * 0.4) + + def set_font_size(self, target): # Applied on next frame. + self._cur_font_size = min((abs(key - target), key) for key in self._imgui_fonts.keys())[1] + + def begin_frame(self): + # Begin glfw frame. + super().begin_frame() + + # Process imgui events. + self._imgui_renderer.mouse_wheel_multiplier = self._cur_font_size / 10 + if self.content_width > 0 and self.content_height > 0: + self._imgui_renderer.process_inputs() + + # Begin imgui frame. + imgui.new_frame() + imgui.push_font(self._imgui_fonts[self._cur_font_size]) + imgui_utils.set_default_style(spacing=self.spacing, indent=self.font_size, scrollbar=self.font_size+4) + + def end_frame(self): + imgui.pop_font() + imgui.render() + imgui.end_frame() + self._imgui_renderer.render(imgui.get_draw_data()) + super().end_frame() + +#---------------------------------------------------------------------------- +# Wrapper class for GlfwRenderer to fix a mouse wheel bug on Linux. + +class _GlfwRenderer(imgui.integrations.glfw.GlfwRenderer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.mouse_wheel_multiplier = 1 + + def scroll_callback(self, window, x_offset, y_offset): + self.io.mouse_wheel += y_offset * self.mouse_wheel_multiplier + +#---------------------------------------------------------------------------- diff --git a/eg3d/gui_utils/text_utils.py b/eg3d/gui_utils/text_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e64a34d1287d58960141fa06a8e76446cd9cebc8 --- /dev/null +++ b/eg3d/gui_utils/text_utils.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import functools +from typing import Optional + +import dnnlib +import numpy as np +import PIL.Image +import PIL.ImageFont +import scipy.ndimage + +from . import gl_utils + +#---------------------------------------------------------------------------- + +def get_default_font(): + url = 'http://fonts.gstatic.com/s/opensans/v17/mem8YaGs126MiZpBA-U1UpcaXcl0Aw.ttf' # Open Sans regular + return dnnlib.util.open_url(url, return_filename=True) + +#---------------------------------------------------------------------------- + +@functools.lru_cache(maxsize=None) +def get_pil_font(font=None, size=32): + if font is None: + font = get_default_font() + return PIL.ImageFont.truetype(font=font, size=size) + +#---------------------------------------------------------------------------- + +def get_array(string, *, dropshadow_radius: int=None, **kwargs): + if dropshadow_radius is not None: + offset_x = int(np.ceil(dropshadow_radius*2/3)) + offset_y = int(np.ceil(dropshadow_radius*2/3)) + return _get_array_priv(string, dropshadow_radius=dropshadow_radius, offset_x=offset_x, offset_y=offset_y, **kwargs) + else: + return _get_array_priv(string, **kwargs) + +@functools.lru_cache(maxsize=10000) +def _get_array_priv( + string: str, *, + size: int = 32, + max_width: Optional[int]=None, + max_height: Optional[int]=None, + min_size=10, + shrink_coef=0.8, + dropshadow_radius: int=None, + offset_x: int=None, + offset_y: int=None, + **kwargs +): + cur_size = size + array = None + while True: + if dropshadow_radius is not None: + # separate implementation for dropshadow text rendering + array = _get_array_impl_dropshadow(string, size=cur_size, radius=dropshadow_radius, offset_x=offset_x, offset_y=offset_y, **kwargs) + else: + array = _get_array_impl(string, size=cur_size, **kwargs) + height, width, _ = array.shape + if (max_width is None or width <= max_width) and (max_height is None or height <= max_height) or (cur_size <= min_size): + break + cur_size = max(int(cur_size * shrink_coef), min_size) + return array + +#---------------------------------------------------------------------------- + +@functools.lru_cache(maxsize=10000) +def _get_array_impl(string, *, font=None, size=32, outline=0, outline_pad=3, outline_coef=3, outline_exp=2, line_pad: int=None): + pil_font = get_pil_font(font=font, size=size) + lines = [pil_font.getmask(line, 'L') for line in string.split('\n')] + lines = [np.array(line, dtype=np.uint8).reshape([line.size[1], line.size[0]]) for line in lines] + width = max(line.shape[1] for line in lines) + lines = [np.pad(line, ((0, 0), (0, width - line.shape[1])), mode='constant') for line in lines] + line_spacing = line_pad if line_pad is not None else size // 2 + lines = [np.pad(line, ((0, line_spacing), (0, 0)), mode='constant') for line in lines[:-1]] + lines[-1:] + mask = np.concatenate(lines, axis=0) + alpha = mask + if outline > 0: + mask = np.pad(mask, int(np.ceil(outline * outline_pad)), mode='constant', constant_values=0) + alpha = mask.astype(np.float32) / 255 + alpha = scipy.ndimage.gaussian_filter(alpha, outline) + alpha = 1 - np.maximum(1 - 
alpha * outline_coef, 0) ** outline_exp + alpha = (alpha * 255 + 0.5).clip(0, 255).astype(np.uint8) + alpha = np.maximum(alpha, mask) + return np.stack([mask, alpha], axis=-1) + +#---------------------------------------------------------------------------- + +@functools.lru_cache(maxsize=10000) +def _get_array_impl_dropshadow(string, *, font=None, size=32, radius: int, offset_x: int, offset_y: int, line_pad: int=None, **kwargs): + assert (offset_x > 0) and (offset_y > 0) + pil_font = get_pil_font(font=font, size=size) + lines = [pil_font.getmask(line, 'L') for line in string.split('\n')] + lines = [np.array(line, dtype=np.uint8).reshape([line.size[1], line.size[0]]) for line in lines] + width = max(line.shape[1] for line in lines) + lines = [np.pad(line, ((0, 0), (0, width - line.shape[1])), mode='constant') for line in lines] + line_spacing = line_pad if line_pad is not None else size // 2 + lines = [np.pad(line, ((0, line_spacing), (0, 0)), mode='constant') for line in lines[:-1]] + lines[-1:] + mask = np.concatenate(lines, axis=0) + alpha = mask + + mask = np.pad(mask, 2*radius + max(abs(offset_x), abs(offset_y)), mode='constant', constant_values=0) + alpha = mask.astype(np.float32) / 255 + alpha = scipy.ndimage.gaussian_filter(alpha, radius) + alpha = 1 - np.maximum(1 - alpha * 1.5, 0) ** 1.4 + alpha = (alpha * 255 + 0.5).clip(0, 255).astype(np.uint8) + alpha = np.pad(alpha, [(offset_y, 0), (offset_x, 0)], mode='constant')[:-offset_y, :-offset_x] + alpha = np.maximum(alpha, mask) + return np.stack([mask, alpha], axis=-1) + +#---------------------------------------------------------------------------- + +@functools.lru_cache(maxsize=10000) +def get_texture(string, bilinear=True, mipmap=True, **kwargs): + return gl_utils.Texture(image=get_array(string, **kwargs), bilinear=bilinear, mipmap=mipmap) + +#---------------------------------------------------------------------------- diff --git a/eg3d/legacy.py b/eg3d/legacy.py new file mode 100644 index 0000000000000000000000000000000000000000..f30944a15c8f7da114c3b1d94da8c31b1ed13ae8 --- /dev/null +++ b/eg3d/legacy.py @@ -0,0 +1,325 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Converting legacy network pickle into the new format.""" + +import click +import pickle +import re +import copy +import numpy as np +import torch +import dnnlib +from torch_utils import misc + +#---------------------------------------------------------------------------- + +def load_network_pkl(f, force_fp16=False): + data = _LegacyUnpickler(f).load() + + # Legacy TensorFlow pickle => convert. + if isinstance(data, tuple) and len(data) == 3 and all(isinstance(net, _TFNetworkStub) for net in data): + tf_G, tf_D, tf_Gs = data + G = convert_tf_generator(tf_G) + D = convert_tf_discriminator(tf_D) + G_ema = convert_tf_generator(tf_Gs) + data = dict(G=G, D=D, G_ema=G_ema) + + # Add missing fields. + if 'training_set_kwargs' not in data: + data['training_set_kwargs'] = None + if 'augment_pipe' not in data: + data['augment_pipe'] = None + + # Validate contents. 
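+# At this point `data` should be a dict holding the three networks plus the two
+# optional fields added above; the asserts below enforce that layout. A typical
+# caller (rough sketch, assuming `network_pkl` is a local path or URL and
+# `device` a torch.device) only needs the EMA generator:
+#
+#   with dnnlib.util.open_url(network_pkl) as f:
+#       G = load_network_pkl(f)['G_ema'].requires_grad_(False).to(device)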
+ assert isinstance(data['G'], torch.nn.Module) + assert isinstance(data['D'], torch.nn.Module) + assert isinstance(data['G_ema'], torch.nn.Module) + assert isinstance(data['training_set_kwargs'], (dict, type(None))) + assert isinstance(data['augment_pipe'], (torch.nn.Module, type(None))) + + # Force FP16. + if force_fp16: + for key in ['G', 'D', 'G_ema']: + old = data[key] + kwargs = copy.deepcopy(old.init_kwargs) + fp16_kwargs = kwargs.get('synthesis_kwargs', kwargs) + fp16_kwargs.num_fp16_res = 4 + fp16_kwargs.conv_clamp = 256 + if kwargs != old.init_kwargs: + new = type(old)(**kwargs).eval().requires_grad_(False) + misc.copy_params_and_buffers(old, new, require_all=True) + data[key] = new + return data + +#---------------------------------------------------------------------------- + +class _TFNetworkStub(dnnlib.EasyDict): + pass + +class _LegacyUnpickler(pickle.Unpickler): + def find_class(self, module, name): + if module == 'dnnlib.tflib.network' and name == 'Network': + return _TFNetworkStub + return super().find_class(module, name) + +#---------------------------------------------------------------------------- + +def _collect_tf_params(tf_net): + # pylint: disable=protected-access + tf_params = dict() + def recurse(prefix, tf_net): + for name, value in tf_net.variables: + tf_params[prefix + name] = value + for name, comp in tf_net.components.items(): + recurse(prefix + name + '/', comp) + recurse('', tf_net) + return tf_params + +#---------------------------------------------------------------------------- + +def _populate_module_params(module, *patterns): + for name, tensor in misc.named_params_and_buffers(module): + found = False + value = None + for pattern, value_fn in zip(patterns[0::2], patterns[1::2]): + match = re.fullmatch(pattern, name) + if match: + found = True + if value_fn is not None: + value = value_fn(*match.groups()) + break + try: + assert found + if value is not None: + tensor.copy_(torch.from_numpy(np.array(value))) + except: + print(name, list(tensor.shape)) + raise + +#---------------------------------------------------------------------------- + +def convert_tf_generator(tf_G): + if tf_G.version < 4: + raise ValueError('TensorFlow pickle version too low') + + # Collect kwargs. + tf_kwargs = tf_G.static_kwargs + known_kwargs = set() + def kwarg(tf_name, default=None, none=None): + known_kwargs.add(tf_name) + val = tf_kwargs.get(tf_name, default) + return val if val is not None else none + + # Convert kwargs. + from training import networks_stylegan2 + network_class = networks_stylegan2.Generator + kwargs = dnnlib.EasyDict( + z_dim = kwarg('latent_size', 512), + c_dim = kwarg('label_size', 0), + w_dim = kwarg('dlatent_size', 512), + img_resolution = kwarg('resolution', 1024), + img_channels = kwarg('num_channels', 3), + channel_base = kwarg('fmap_base', 16384) * 2, + channel_max = kwarg('fmap_max', 512), + num_fp16_res = kwarg('num_fp16_res', 0), + conv_clamp = kwarg('conv_clamp', None), + architecture = kwarg('architecture', 'skip'), + resample_filter = kwarg('resample_kernel', [1,3,3,1]), + use_noise = kwarg('use_noise', True), + activation = kwarg('nonlinearity', 'lrelu'), + mapping_kwargs = dnnlib.EasyDict( + num_layers = kwarg('mapping_layers', 8), + embed_features = kwarg('label_fmaps', None), + layer_features = kwarg('mapping_fmaps', None), + activation = kwarg('mapping_nonlinearity', 'lrelu'), + lr_multiplier = kwarg('mapping_lrmul', 0.01), + w_avg_beta = kwarg('w_avg_beta', 0.995, none=1), + ), + ) + + # Check for unknown kwargs. 
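+# Every kwarg() call, including the throwaway ones just below, records its TF
+# key in `known_kwargs` even when the value is discarded (e.g. truncation_psi
+# only matters at sampling time). Whatever is left in `tf_kwargs` afterwards is
+# a config option this converter cannot translate, so it raises instead of
+# silently dropping it.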
+ kwarg('truncation_psi') + kwarg('truncation_cutoff') + kwarg('style_mixing_prob') + kwarg('structure') + kwarg('conditioning') + kwarg('fused_modconv') + unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs) + if len(unknown_kwargs) > 0: + raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0]) + + # Collect params. + tf_params = _collect_tf_params(tf_G) + for name, value in list(tf_params.items()): + match = re.fullmatch(r'ToRGB_lod(\d+)/(.*)', name) + if match: + r = kwargs.img_resolution // (2 ** int(match.group(1))) + tf_params[f'{r}x{r}/ToRGB/{match.group(2)}'] = value + kwargs.synthesis.kwargs.architecture = 'orig' + #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}') + + # Convert params. + G = network_class(**kwargs).eval().requires_grad_(False) + # pylint: disable=unnecessary-lambda + # pylint: disable=f-string-without-interpolation + _populate_module_params(G, + r'mapping\.w_avg', lambda: tf_params[f'dlatent_avg'], + r'mapping\.embed\.weight', lambda: tf_params[f'mapping/LabelEmbed/weight'].transpose(), + r'mapping\.embed\.bias', lambda: tf_params[f'mapping/LabelEmbed/bias'], + r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'mapping/Dense{i}/weight'].transpose(), + r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'mapping/Dense{i}/bias'], + r'synthesis\.b4\.const', lambda: tf_params[f'synthesis/4x4/Const/const'][0], + r'synthesis\.b4\.conv1\.weight', lambda: tf_params[f'synthesis/4x4/Conv/weight'].transpose(3, 2, 0, 1), + r'synthesis\.b4\.conv1\.bias', lambda: tf_params[f'synthesis/4x4/Conv/bias'], + r'synthesis\.b4\.conv1\.noise_const', lambda: tf_params[f'synthesis/noise0'][0, 0], + r'synthesis\.b4\.conv1\.noise_strength', lambda: tf_params[f'synthesis/4x4/Conv/noise_strength'], + r'synthesis\.b4\.conv1\.affine\.weight', lambda: tf_params[f'synthesis/4x4/Conv/mod_weight'].transpose(), + r'synthesis\.b4\.conv1\.affine\.bias', lambda: tf_params[f'synthesis/4x4/Conv/mod_bias'] + 1, + r'synthesis\.b(\d+)\.conv0\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/weight'][::-1, ::-1].transpose(3, 2, 0, 1), + r'synthesis\.b(\d+)\.conv0\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/bias'], + r'synthesis\.b(\d+)\.conv0\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-5}'][0, 0], + r'synthesis\.b(\d+)\.conv0\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/noise_strength'], + r'synthesis\.b(\d+)\.conv0\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_weight'].transpose(), + r'synthesis\.b(\d+)\.conv0\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_bias'] + 1, + r'synthesis\.b(\d+)\.conv1\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/weight'].transpose(3, 2, 0, 1), + r'synthesis\.b(\d+)\.conv1\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/bias'], + r'synthesis\.b(\d+)\.conv1\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-4}'][0, 0], + r'synthesis\.b(\d+)\.conv1\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/noise_strength'], + r'synthesis\.b(\d+)\.conv1\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_weight'].transpose(), + r'synthesis\.b(\d+)\.conv1\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_bias'] + 1, + r'synthesis\.b(\d+)\.torgb\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/weight'].transpose(3, 2, 0, 1), + r'synthesis\.b(\d+)\.torgb\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/bias'], + 
r'synthesis\.b(\d+)\.torgb\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_weight'].transpose(), + r'synthesis\.b(\d+)\.torgb\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_bias'] + 1, + r'synthesis\.b(\d+)\.skip\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Skip/weight'][::-1, ::-1].transpose(3, 2, 0, 1), + r'.*\.resample_filter', None, + r'.*\.act_filter', None, + ) + return G + +#---------------------------------------------------------------------------- + +def convert_tf_discriminator(tf_D): + if tf_D.version < 4: + raise ValueError('TensorFlow pickle version too low') + + # Collect kwargs. + tf_kwargs = tf_D.static_kwargs + known_kwargs = set() + def kwarg(tf_name, default=None): + known_kwargs.add(tf_name) + return tf_kwargs.get(tf_name, default) + + # Convert kwargs. + kwargs = dnnlib.EasyDict( + c_dim = kwarg('label_size', 0), + img_resolution = kwarg('resolution', 1024), + img_channels = kwarg('num_channels', 3), + architecture = kwarg('architecture', 'resnet'), + channel_base = kwarg('fmap_base', 16384) * 2, + channel_max = kwarg('fmap_max', 512), + num_fp16_res = kwarg('num_fp16_res', 0), + conv_clamp = kwarg('conv_clamp', None), + cmap_dim = kwarg('mapping_fmaps', None), + block_kwargs = dnnlib.EasyDict( + activation = kwarg('nonlinearity', 'lrelu'), + resample_filter = kwarg('resample_kernel', [1,3,3,1]), + freeze_layers = kwarg('freeze_layers', 0), + ), + mapping_kwargs = dnnlib.EasyDict( + num_layers = kwarg('mapping_layers', 0), + embed_features = kwarg('mapping_fmaps', None), + layer_features = kwarg('mapping_fmaps', None), + activation = kwarg('nonlinearity', 'lrelu'), + lr_multiplier = kwarg('mapping_lrmul', 0.1), + ), + epilogue_kwargs = dnnlib.EasyDict( + mbstd_group_size = kwarg('mbstd_group_size', None), + mbstd_num_channels = kwarg('mbstd_num_features', 1), + activation = kwarg('nonlinearity', 'lrelu'), + ), + ) + + # Check for unknown kwargs. + kwarg('structure') + kwarg('conditioning') + unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs) + if len(unknown_kwargs) > 0: + raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0]) + + # Collect params. + tf_params = _collect_tf_params(tf_D) + for name, value in list(tf_params.items()): + match = re.fullmatch(r'FromRGB_lod(\d+)/(.*)', name) + if match: + r = kwargs.img_resolution // (2 ** int(match.group(1))) + tf_params[f'{r}x{r}/FromRGB/{match.group(2)}'] = value + kwargs.architecture = 'orig' + #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}') + + # Convert params. 
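+# The pattern table below pairs a regex over PyTorch parameter/buffer names
+# with a lambda that fetches the matching TF tensor; regex capture groups
+# (block resolution r, conv index i) become the lambda arguments. TF stores
+# conv weights as [kh, kw, in, out], hence transpose(3, 2, 0, 1) to PyTorch's
+# [out, in, kh, kw]; dense weights need only a plain transpose, and patterns
+# mapped to None (e.g. resample_filter) keep their default buffers.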
+ from training import networks_stylegan2 + D = networks_stylegan2.Discriminator(**kwargs).eval().requires_grad_(False) + # pylint: disable=unnecessary-lambda + # pylint: disable=f-string-without-interpolation + _populate_module_params(D, + r'b(\d+)\.fromrgb\.weight', lambda r: tf_params[f'{r}x{r}/FromRGB/weight'].transpose(3, 2, 0, 1), + r'b(\d+)\.fromrgb\.bias', lambda r: tf_params[f'{r}x{r}/FromRGB/bias'], + r'b(\d+)\.conv(\d+)\.weight', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/weight'].transpose(3, 2, 0, 1), + r'b(\d+)\.conv(\d+)\.bias', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/bias'], + r'b(\d+)\.skip\.weight', lambda r: tf_params[f'{r}x{r}/Skip/weight'].transpose(3, 2, 0, 1), + r'mapping\.embed\.weight', lambda: tf_params[f'LabelEmbed/weight'].transpose(), + r'mapping\.embed\.bias', lambda: tf_params[f'LabelEmbed/bias'], + r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'Mapping{i}/weight'].transpose(), + r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'Mapping{i}/bias'], + r'b4\.conv\.weight', lambda: tf_params[f'4x4/Conv/weight'].transpose(3, 2, 0, 1), + r'b4\.conv\.bias', lambda: tf_params[f'4x4/Conv/bias'], + r'b4\.fc\.weight', lambda: tf_params[f'4x4/Dense0/weight'].transpose(), + r'b4\.fc\.bias', lambda: tf_params[f'4x4/Dense0/bias'], + r'b4\.out\.weight', lambda: tf_params[f'Output/weight'].transpose(), + r'b4\.out\.bias', lambda: tf_params[f'Output/bias'], + r'.*\.resample_filter', None, + ) + return D + +#---------------------------------------------------------------------------- + +@click.command() +@click.option('--source', help='Input pickle', required=True, metavar='PATH') +@click.option('--dest', help='Output pickle', required=True, metavar='PATH') +@click.option('--force-fp16', help='Force the networks to use FP16', type=bool, default=False, metavar='BOOL', show_default=True) +def convert_network_pickle(source, dest, force_fp16): + """Convert legacy network pickle into the native PyTorch format. + + The tool is able to load the main network configurations exported using the TensorFlow version of StyleGAN2 or StyleGAN2-ADA. + It does not support e.g. StyleGAN2-ADA comparison methods, StyleGAN2 configs A-D, or StyleGAN1 networks. + + Example: + + \b + python legacy.py \\ + --source=https://nvlabs-fi-cdn.nvidia.com/stylegan2/networks/stylegan2-cat-config-f.pkl \\ + --dest=stylegan2-cat-config-f.pkl + """ + print(f'Loading "{source}"...') + with dnnlib.util.open_url(source) as f: + data = load_network_pkl(f, force_fp16=force_fp16) + print(f'Saving "{dest}"...') + with open(dest, 'wb') as f: + pickle.dump(data, f) + print('Done.') + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + convert_network_pickle() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/__init__.py b/eg3d/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/metrics/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# empty diff --git a/eg3d/metrics/equivariance.py b/eg3d/metrics/equivariance.py new file mode 100644 index 0000000000000000000000000000000000000000..4609296593dd60cf0a1afa28ae4abb17d5b23576 --- /dev/null +++ b/eg3d/metrics/equivariance.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Equivariance metrics (EQ-T, EQ-T_frac, and EQ-R) from the paper +"Alias-Free Generative Adversarial Networks".""" + +import copy +import numpy as np +import torch +import torch.fft +from torch_utils.ops import upfirdn2d +from . import metric_utils + +#---------------------------------------------------------------------------- +# Utilities. + +def sinc(x): + y = (x * np.pi).abs() + z = torch.sin(y) / y.clamp(1e-30, float('inf')) + return torch.where(y < 1e-30, torch.ones_like(x), z) + +def lanczos_window(x, a): + x = x.abs() / a + return torch.where(x < 1, sinc(x), torch.zeros_like(x)) + +def rotation_matrix(angle): + angle = torch.as_tensor(angle).to(torch.float32) + mat = torch.eye(3, device=angle.device) + mat[0, 0] = angle.cos() + mat[0, 1] = angle.sin() + mat[1, 0] = -angle.sin() + mat[1, 1] = angle.cos() + return mat + +#---------------------------------------------------------------------------- +# Apply integer translation to a batch of 2D images. Corresponds to the +# operator T_x in Appendix E.1. + +def apply_integer_translation(x, tx, ty): + _N, _C, H, W = x.shape + tx = torch.as_tensor(tx * W).to(dtype=torch.float32, device=x.device) + ty = torch.as_tensor(ty * H).to(dtype=torch.float32, device=x.device) + ix = tx.round().to(torch.int64) + iy = ty.round().to(torch.int64) + + z = torch.zeros_like(x) + m = torch.zeros_like(x) + if abs(ix) < W and abs(iy) < H: + y = x[:, :, max(-iy,0) : H+min(-iy,0), max(-ix,0) : W+min(-ix,0)] + z[:, :, max(iy,0) : H+min(iy,0), max(ix,0) : W+min(ix,0)] = y + m[:, :, max(iy,0) : H+min(iy,0), max(ix,0) : W+min(ix,0)] = 1 + return z, m + +#---------------------------------------------------------------------------- +# Apply integer translation to a batch of 2D images. Corresponds to the +# operator T_x in Appendix E.2. 
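+# Note: the routine below handles *fractional* (sub-pixel) translation. The
+# integer part of the offset becomes a plain index shift, the fractional
+# remainder is resampled with separable windowed-sinc taps, and the returned
+# mask marks the region that is kept for comparison.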
+ +def apply_fractional_translation(x, tx, ty, a=3): + _N, _C, H, W = x.shape + tx = torch.as_tensor(tx * W).to(dtype=torch.float32, device=x.device) + ty = torch.as_tensor(ty * H).to(dtype=torch.float32, device=x.device) + ix = tx.floor().to(torch.int64) + iy = ty.floor().to(torch.int64) + fx = tx - ix + fy = ty - iy + b = a - 1 + + z = torch.zeros_like(x) + zx0 = max(ix - b, 0) + zy0 = max(iy - b, 0) + zx1 = min(ix + a, 0) + W + zy1 = min(iy + a, 0) + H + if zx0 < zx1 and zy0 < zy1: + taps = torch.arange(a * 2, device=x.device) - b + filter_x = (sinc(taps - fx) * sinc((taps - fx) / a)).unsqueeze(0) + filter_y = (sinc(taps - fy) * sinc((taps - fy) / a)).unsqueeze(1) + y = x + y = upfirdn2d.filter2d(y, filter_x / filter_x.sum(), padding=[b,a,0,0]) + y = upfirdn2d.filter2d(y, filter_y / filter_y.sum(), padding=[0,0,b,a]) + y = y[:, :, max(b-iy,0) : H+b+a+min(-iy-a,0), max(b-ix,0) : W+b+a+min(-ix-a,0)] + z[:, :, zy0:zy1, zx0:zx1] = y + + m = torch.zeros_like(x) + mx0 = max(ix + a, 0) + my0 = max(iy + a, 0) + mx1 = min(ix - b, 0) + W + my1 = min(iy - b, 0) + H + if mx0 < mx1 and my0 < my1: + m[:, :, my0:my1, mx0:mx1] = 1 + return z, m + +#---------------------------------------------------------------------------- +# Construct an oriented low-pass filter that applies the appropriate +# bandlimit with respect to the input and output of the given affine 2D +# image transformation. + +def construct_affine_bandlimit_filter(mat, a=3, amax=16, aflt=64, up=4, cutoff_in=1, cutoff_out=1): + assert a <= amax < aflt + mat = torch.as_tensor(mat).to(torch.float32) + + # Construct 2D filter taps in input & output coordinate spaces. + taps = ((torch.arange(aflt * up * 2 - 1, device=mat.device) + 1) / up - aflt).roll(1 - aflt * up) + yi, xi = torch.meshgrid(taps, taps) + xo, yo = (torch.stack([xi, yi], dim=2) @ mat[:2, :2].t()).unbind(2) + + # Convolution of two oriented 2D sinc filters. + fi = sinc(xi * cutoff_in) * sinc(yi * cutoff_in) + fo = sinc(xo * cutoff_out) * sinc(yo * cutoff_out) + f = torch.fft.ifftn(torch.fft.fftn(fi) * torch.fft.fftn(fo)).real + + # Convolution of two oriented 2D Lanczos windows. + wi = lanczos_window(xi, a) * lanczos_window(yi, a) + wo = lanczos_window(xo, a) * lanczos_window(yo, a) + w = torch.fft.ifftn(torch.fft.fftn(wi) * torch.fft.fftn(wo)).real + + # Construct windowed FIR filter. + f = f * w + + # Finalize. + c = (aflt - amax) * up + f = f.roll([aflt * up - 1] * 2, dims=[0,1])[c:-c, c:-c] + f = torch.nn.functional.pad(f, [0, 1, 0, 1]).reshape(amax * 2, up, amax * 2, up) + f = f / f.sum([0,2], keepdim=True) / (up ** 2) + f = f.reshape(amax * 2 * up, amax * 2 * up)[:-1, :-1] + return f + +#---------------------------------------------------------------------------- +# Apply the given affine transformation to a batch of 2D images. + +def apply_affine_transformation(x, mat, up=4, **filter_kwargs): + _N, _C, H, W = x.shape + mat = torch.as_tensor(mat).to(dtype=torch.float32, device=x.device) + + # Construct filter. + f = construct_affine_bandlimit_filter(mat, up=up, **filter_kwargs) + assert f.ndim == 2 and f.shape[0] == f.shape[1] and f.shape[0] % 2 == 1 + p = f.shape[0] // 2 + + # Construct sampling grid. + theta = mat.inverse() + theta[:2, 2] *= 2 + theta[0, 2] += 1 / up / W + theta[1, 2] += 1 / up / H + theta[0, :] *= W / (W + p / up * 2) + theta[1, :] *= H / (H + p / up * 2) + theta = theta[:2, :3].unsqueeze(0).repeat([x.shape[0], 1, 1]) + g = torch.nn.functional.affine_grid(theta, x.shape, align_corners=False) + + # Resample image. 
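+# Upsample by `up` with the band-limiting filter first, so that the bilinear
+# grid_sample of the affine warp below does not alias; the mask is warped the
+# same way (nearest-neighbour) to flag which output pixels received valid,
+# fully supported data.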
+ y = upfirdn2d.upsample2d(x=x, f=f, up=up, padding=p) + z = torch.nn.functional.grid_sample(y, g, mode='bilinear', padding_mode='zeros', align_corners=False) + + # Form mask. + m = torch.zeros_like(y) + c = p * 2 + 1 + m[:, :, c:-c, c:-c] = 1 + m = torch.nn.functional.grid_sample(m, g, mode='nearest', padding_mode='zeros', align_corners=False) + return z, m + +#---------------------------------------------------------------------------- +# Apply fractional rotation to a batch of 2D images. Corresponds to the +# operator R_\alpha in Appendix E.3. + +def apply_fractional_rotation(x, angle, a=3, **filter_kwargs): + angle = torch.as_tensor(angle).to(dtype=torch.float32, device=x.device) + mat = rotation_matrix(angle) + return apply_affine_transformation(x, mat, a=a, amax=a*2, **filter_kwargs) + +#---------------------------------------------------------------------------- +# Modify the frequency content of a batch of 2D images as if they had undergo +# fractional rotation -- but without actually rotating them. Corresponds to +# the operator R^*_\alpha in Appendix E.3. + +def apply_fractional_pseudo_rotation(x, angle, a=3, **filter_kwargs): + angle = torch.as_tensor(angle).to(dtype=torch.float32, device=x.device) + mat = rotation_matrix(-angle) + f = construct_affine_bandlimit_filter(mat, a=a, amax=a*2, up=1, **filter_kwargs) + y = upfirdn2d.filter2d(x=x, f=f) + m = torch.zeros_like(y) + c = f.shape[0] // 2 + m[:, :, c:-c, c:-c] = 1 + return y, m + +#---------------------------------------------------------------------------- +# Compute the selected equivariance metrics for the given generator. + +def compute_equivariance_metrics(opts, num_samples, batch_size, translate_max=0.125, rotate_max=1, compute_eqt_int=False, compute_eqt_frac=False, compute_eqr=False): + assert compute_eqt_int or compute_eqt_frac or compute_eqr + + # Setup generator and labels. + G = copy.deepcopy(opts.G).eval().requires_grad_(False).to(opts.device) + I = torch.eye(3, device=opts.device) + M = getattr(getattr(getattr(G, 'synthesis', None), 'input', None), 'transform', None) + if M is None: + raise ValueError('Cannot compute equivariance metrics; the given generator does not support user-specified image transformations') + c_iter = metric_utils.iterate_random_labels(opts=opts, batch_size=batch_size) + + # Sampling loop. + sums = None + progress = opts.progress.sub(tag='eq sampling', num_items=num_samples) + for batch_start in range(0, num_samples, batch_size * opts.num_gpus): + progress.update(batch_start) + s = [] + + # Randomize noise buffers, if any. + for name, buf in G.named_buffers(): + if name.endswith('.noise_const'): + buf.copy_(torch.randn_like(buf)) + + # Run mapping network. + z = torch.randn([batch_size, G.z_dim], device=opts.device) + c = next(c_iter) + ws = G.mapping(z=z, c=c) + + # Generate reference image. + M[:] = I + orig = G.synthesis(ws=ws, noise_mode='const', **opts.G_kwargs) + + # Integer translation (EQ-T). + if compute_eqt_int: + t = (torch.rand(2, device=opts.device) * 2 - 1) * translate_max + t = (t * G.img_resolution).round() / G.img_resolution + M[:] = I + M[:2, 2] = -t + img = G.synthesis(ws=ws, noise_mode='const', **opts.G_kwargs) + ref, mask = apply_integer_translation(orig, t[0], t[1]) + s += [(ref - img).square() * mask, mask] + + # Fractional translation (EQ-T_frac). 
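+# Same recipe as EQ-T above: bake a random transform into the generator's
+# input-grid matrix M, apply the matching transform to the reference output in
+# image space, and accumulate masked squared error together with the mask.
+# Dividing the two sums at the end gives a per-metric MSE, reported as a PSNR
+# with peak signal 2 (images live in [-1, 1]):
+#   psnr = 20*log10(2) - 10*log10(mse)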
+ if compute_eqt_frac: + t = (torch.rand(2, device=opts.device) * 2 - 1) * translate_max + M[:] = I + M[:2, 2] = -t + img = G.synthesis(ws=ws, noise_mode='const', **opts.G_kwargs) + ref, mask = apply_fractional_translation(orig, t[0], t[1]) + s += [(ref - img).square() * mask, mask] + + # Rotation (EQ-R). + if compute_eqr: + angle = (torch.rand([], device=opts.device) * 2 - 1) * (rotate_max * np.pi) + M[:] = rotation_matrix(-angle) + img = G.synthesis(ws=ws, noise_mode='const', **opts.G_kwargs) + ref, ref_mask = apply_fractional_rotation(orig, angle) + pseudo, pseudo_mask = apply_fractional_pseudo_rotation(img, angle) + mask = ref_mask * pseudo_mask + s += [(ref - pseudo).square() * mask, mask] + + # Accumulate results. + s = torch.stack([x.to(torch.float64).sum() for x in s]) + sums = sums + s if sums is not None else s + progress.update(num_samples) + + # Compute PSNRs. + if opts.num_gpus > 1: + torch.distributed.all_reduce(sums) + sums = sums.cpu() + mses = sums[0::2] / sums[1::2] + psnrs = np.log10(2) * 20 - mses.log10() * 10 + psnrs = tuple(psnrs.numpy()) + return psnrs[0] if len(psnrs) == 1 else psnrs + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/frechet_inception_distance.py b/eg3d/metrics/frechet_inception_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..c2944eb21dbb88d2f383991ff88f557513b38168 --- /dev/null +++ b/eg3d/metrics/frechet_inception_distance.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Frechet Inception Distance (FID) from the paper +"GANs trained by a two time-scale update rule converge to a local Nash +equilibrium". Matches the original implementation by Heusel et al. at +https://github.com/bioinf-jku/TTUR/blob/master/fid.py""" + +import numpy as np +import scipy.linalg +from . import metric_utils + +#---------------------------------------------------------------------------- + +def compute_fid(opts, max_real, num_gen): + # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz + detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' + detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. 
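+# The two calls below accumulate per-set feature means and covariances; the
+# final score is the Frechet distance between the two Gaussian fits,
+#   FID = ||mu_r - mu_g||^2 + Tr(Sigma_r + Sigma_g - 2*(Sigma_r @ Sigma_g)^(1/2)),
+# which is exactly the m + trace(...) expression evaluated at the end.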
+ + mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=0, capture_mean_cov=True, max_items=max_real).get_mean_cov() + + mu_gen, sigma_gen = metric_utils.compute_feature_stats_for_generator( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=1, capture_mean_cov=True, max_items=num_gen).get_mean_cov() + + if opts.rank != 0: + return float('nan') + + m = np.square(mu_gen - mu_real).sum() + s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member + fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2)) + return float(fid) + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/inception_score.py b/eg3d/metrics/inception_score.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5e247280f76471819550295bf2fc5ea3f7b42e --- /dev/null +++ b/eg3d/metrics/inception_score.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Inception Score (IS) from the paper "Improved techniques for training +GANs". Matches the original implementation by Salimans et al. at +https://github.com/openai/improved-gan/blob/master/inception_score/model.py""" + +import numpy as np +from . import metric_utils + +#---------------------------------------------------------------------------- + +def compute_is(opts, num_gen, num_splits): + # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz + detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' + detector_kwargs = dict(no_output_bias=True) # Match the original implementation by not applying bias in the softmax layer. + + gen_probs = metric_utils.compute_feature_stats_for_generator( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + capture_all=True, max_items=num_gen).get_all() + + if opts.rank != 0: + return float('nan'), float('nan') + + scores = [] + for i in range(num_splits): + part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits] + kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True))) + kl = np.mean(np.sum(kl, axis=1)) + scores.append(np.exp(kl)) + return float(np.mean(scores)), float(np.std(scores)) + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/kernel_inception_distance.py b/eg3d/metrics/kernel_inception_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..48906eba23a7d29ba912b7d209f83fba6d0b9f37 --- /dev/null +++ b/eg3d/metrics/kernel_inception_distance.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Kernel Inception Distance (KID) from the paper "Demystifying MMD +GANs". Matches the original implementation by Binkowski et al. at +https://github.com/mbinkowski/MMD-GAN/blob/master/gan/compute_scores.py""" + +import numpy as np +from . import metric_utils + +#---------------------------------------------------------------------------- + +def compute_kid(opts, max_real, num_gen, num_subsets, max_subset_size): + # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz + detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' + detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. + + real_features = metric_utils.compute_feature_stats_for_dataset( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all() + + gen_features = metric_utils.compute_feature_stats_for_generator( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all() + + if opts.rank != 0: + return float('nan') + + n = real_features.shape[1] + m = min(min(real_features.shape[0], gen_features.shape[0]), max_subset_size) + t = 0 + for _subset_idx in range(num_subsets): + x = gen_features[np.random.choice(gen_features.shape[0], m, replace=False)] + y = real_features[np.random.choice(real_features.shape[0], m, replace=False)] + a = (x @ x.T / n + 1) ** 3 + (y @ y.T / n + 1) ** 3 + b = (x @ y.T / n + 1) ** 3 + t += (a.sum() - np.diag(a).sum()) / (m - 1) - b.sum() * 2 / m + kid = t / num_subsets / m + return float(kid) + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/metric_main.py b/eg3d/metrics/metric_main.py new file mode 100644 index 0000000000000000000000000000000000000000..52318ee48a523f30e7eace0b62b936c7826ffc56 --- /dev/null +++ b/eg3d/metrics/metric_main.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Main API for computing and reporting quality metrics.""" + +import os +import time +import json +import torch +import dnnlib + +from . import metric_utils +from . import frechet_inception_distance +from . import kernel_inception_distance +from . import precision_recall +from . import perceptual_path_length +from . import inception_score +from . 
import equivariance + +#---------------------------------------------------------------------------- + +_metric_dict = dict() # name => fn + +def register_metric(fn): + assert callable(fn) + _metric_dict[fn.__name__] = fn + return fn + +def is_valid_metric(metric): + return metric in _metric_dict + +def list_valid_metrics(): + return list(_metric_dict.keys()) + +#---------------------------------------------------------------------------- + +def calc_metric(metric, **kwargs): # See metric_utils.MetricOptions for the full list of arguments. + assert is_valid_metric(metric) + opts = metric_utils.MetricOptions(**kwargs) + + # Calculate. + start_time = time.time() + results = _metric_dict[metric](opts) + total_time = time.time() - start_time + + # Broadcast results. + for key, value in list(results.items()): + if opts.num_gpus > 1: + value = torch.as_tensor(value, dtype=torch.float64, device=opts.device) + torch.distributed.broadcast(tensor=value, src=0) + value = float(value.cpu()) + results[key] = value + + # Decorate with metadata. + return dnnlib.EasyDict( + results = dnnlib.EasyDict(results), + metric = metric, + total_time = total_time, + total_time_str = dnnlib.util.format_time(total_time), + num_gpus = opts.num_gpus, + ) + +#---------------------------------------------------------------------------- + +def report_metric(result_dict, run_dir=None, snapshot_pkl=None): + metric = result_dict['metric'] + assert is_valid_metric(metric) + if run_dir is not None and snapshot_pkl is not None: + snapshot_pkl = os.path.relpath(snapshot_pkl, run_dir) + + jsonl_line = json.dumps(dict(result_dict, snapshot_pkl=snapshot_pkl, timestamp=time.time())) + print(jsonl_line) + if run_dir is not None and os.path.isdir(run_dir): + with open(os.path.join(run_dir, f'metric-{metric}.jsonl'), 'at') as f: + f.write(jsonl_line + '\n') + +#---------------------------------------------------------------------------- +# Recommended metrics. 
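+# Each function below registers itself by name, so callers go through
+# calc_metric() / report_metric() rather than importing the metric modules
+# directly. Rough usage (sketch; assumes a loaded generator `G`, its
+# `dataset_kwargs`, and a single GPU — names are illustrative only):
+#
+#   result = calc_metric('fid50k_full', G=G, dataset_kwargs=dataset_kwargs,
+#                        num_gpus=1, rank=0, device=torch.device('cuda'))
+#   report_metric(result, run_dir='training-runs/exp0', snapshot_pkl='snap.pkl')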
+ +@register_metric +def fid50k_full(opts): + opts.dataset_kwargs.update(max_size=None, xflip=False) + fid = frechet_inception_distance.compute_fid(opts, max_real=None, num_gen=50000) + return dict(fid50k_full=fid) + +@register_metric +def kid50k_full(opts): + opts.dataset_kwargs.update(max_size=None, xflip=False) + kid = kernel_inception_distance.compute_kid(opts, max_real=1000000, num_gen=50000, num_subsets=100, max_subset_size=1000) + return dict(kid50k_full=kid) + +@register_metric +def pr50k3_full(opts): + opts.dataset_kwargs.update(max_size=None, xflip=False) + precision, recall = precision_recall.compute_pr(opts, max_real=200000, num_gen=50000, nhood_size=3, row_batch_size=10000, col_batch_size=10000) + return dict(pr50k3_full_precision=precision, pr50k3_full_recall=recall) + +@register_metric +def ppl2_wend(opts): + ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='w', sampling='end', crop=False, batch_size=2) + return dict(ppl2_wend=ppl) + +@register_metric +def eqt50k_int(opts): + opts.G_kwargs.update(force_fp32=True) + psnr = equivariance.compute_equivariance_metrics(opts, num_samples=50000, batch_size=4, compute_eqt_int=True) + return dict(eqt50k_int=psnr) + +@register_metric +def eqt50k_frac(opts): + opts.G_kwargs.update(force_fp32=True) + psnr = equivariance.compute_equivariance_metrics(opts, num_samples=50000, batch_size=4, compute_eqt_frac=True) + return dict(eqt50k_frac=psnr) + +@register_metric +def eqr50k(opts): + opts.G_kwargs.update(force_fp32=True) + psnr = equivariance.compute_equivariance_metrics(opts, num_samples=50000, batch_size=4, compute_eqr=True) + return dict(eqr50k=psnr) + +#---------------------------------------------------------------------------- +# Legacy metrics. + +@register_metric +def fid50k(opts): + opts.dataset_kwargs.update(max_size=None) + fid = frechet_inception_distance.compute_fid(opts, max_real=50000, num_gen=50000) + return dict(fid50k=fid) + +@register_metric +def kid50k(opts): + opts.dataset_kwargs.update(max_size=None) + kid = kernel_inception_distance.compute_kid(opts, max_real=50000, num_gen=50000, num_subsets=100, max_subset_size=1000) + return dict(kid50k=kid) + +@register_metric +def pr50k3(opts): + opts.dataset_kwargs.update(max_size=None) + precision, recall = precision_recall.compute_pr(opts, max_real=50000, num_gen=50000, nhood_size=3, row_batch_size=10000, col_batch_size=10000) + return dict(pr50k3_precision=precision, pr50k3_recall=recall) + +@register_metric +def is50k(opts): + opts.dataset_kwargs.update(max_size=None, xflip=False) + mean, std = inception_score.compute_is(opts, num_gen=50000, num_splits=10) + return dict(is50k_mean=mean, is50k_std=std) + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/metric_utils.py b/eg3d/metrics/metric_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..212cb7d38fabf6c7b60c55a0fa0a07560ac602b2 --- /dev/null +++ b/eg3d/metrics/metric_utils.py @@ -0,0 +1,281 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Miscellaneous utilities used internally by the quality metrics.""" + +import os +import time +import hashlib +import pickle +import copy +import uuid +import numpy as np +import torch +import dnnlib + +#---------------------------------------------------------------------------- + +class MetricOptions: + def __init__(self, G=None, G_kwargs={}, dataset_kwargs={}, num_gpus=1, rank=0, device=None, progress=None, cache=True): + assert 0 <= rank < num_gpus + self.G = G + self.G_kwargs = dnnlib.EasyDict(G_kwargs) + self.dataset_kwargs = dnnlib.EasyDict(dataset_kwargs) + self.num_gpus = num_gpus + self.rank = rank + self.device = device if device is not None else torch.device('cuda', rank) + self.progress = progress.sub() if progress is not None and rank == 0 else ProgressMonitor() + self.cache = cache + +#---------------------------------------------------------------------------- + +_feature_detector_cache = dict() + +def get_feature_detector_name(url): + return os.path.splitext(url.split('/')[-1])[0] + +def get_feature_detector(url, device=torch.device('cpu'), num_gpus=1, rank=0, verbose=False): + assert 0 <= rank < num_gpus + key = (url, device) + if key not in _feature_detector_cache: + is_leader = (rank == 0) + if not is_leader and num_gpus > 1: + torch.distributed.barrier() # leader goes first + with dnnlib.util.open_url(url, verbose=(verbose and is_leader)) as f: + _feature_detector_cache[key] = pickle.load(f).to(device) + if is_leader and num_gpus > 1: + torch.distributed.barrier() # others follow + return _feature_detector_cache[key] + +#---------------------------------------------------------------------------- + +def iterate_random_labels(opts, batch_size): + if opts.G.c_dim == 0: + c = torch.zeros([batch_size, opts.G.c_dim], device=opts.device) + while True: + yield c + else: + dataset = dnnlib.util.construct_class_by_name(**opts.dataset_kwargs) + while True: + c = [dataset.get_label(np.random.randint(len(dataset))) for _i in range(batch_size)] + c = torch.from_numpy(np.stack(c)).pin_memory().to(opts.device) + yield c + +#---------------------------------------------------------------------------- + +class FeatureStats: + def __init__(self, capture_all=False, capture_mean_cov=False, max_items=None): + self.capture_all = capture_all + self.capture_mean_cov = capture_mean_cov + self.max_items = max_items + self.num_items = 0 + self.num_features = None + self.all_features = None + self.raw_mean = None + self.raw_cov = None + + def set_num_features(self, num_features): + if self.num_features is not None: + assert num_features == self.num_features + else: + self.num_features = num_features + self.all_features = [] + self.raw_mean = np.zeros([num_features], dtype=np.float64) + self.raw_cov = np.zeros([num_features, num_features], dtype=np.float64) + + def is_full(self): + return (self.max_items is not None) and (self.num_items >= self.max_items) + + def append(self, x): + x = np.asarray(x, dtype=np.float32) + assert x.ndim == 2 + if (self.max_items is not None) and (self.num_items + x.shape[0] > self.max_items): + if self.num_items >= self.max_items: + return + x = x[:self.max_items - self.num_items] + + self.set_num_features(x.shape[1]) + self.num_items += x.shape[0] + if self.capture_all: + self.all_features.append(x) + if self.capture_mean_cov: + x64 = x.astype(np.float64) + 
self.raw_mean += x64.sum(axis=0) + self.raw_cov += x64.T @ x64 + + def append_torch(self, x, num_gpus=1, rank=0): + assert isinstance(x, torch.Tensor) and x.ndim == 2 + assert 0 <= rank < num_gpus + if num_gpus > 1: + ys = [] + for src in range(num_gpus): + y = x.clone() + torch.distributed.broadcast(y, src=src) + ys.append(y) + x = torch.stack(ys, dim=1).flatten(0, 1) # interleave samples + self.append(x.cpu().numpy()) + + def get_all(self): + assert self.capture_all + return np.concatenate(self.all_features, axis=0) + + def get_all_torch(self): + return torch.from_numpy(self.get_all()) + + def get_mean_cov(self): + assert self.capture_mean_cov + mean = self.raw_mean / self.num_items + cov = self.raw_cov / self.num_items + cov = cov - np.outer(mean, mean) + return mean, cov + + def save(self, pkl_file): + with open(pkl_file, 'wb') as f: + pickle.dump(self.__dict__, f) + + @staticmethod + def load(pkl_file): + with open(pkl_file, 'rb') as f: + s = dnnlib.EasyDict(pickle.load(f)) + obj = FeatureStats(capture_all=s.capture_all, max_items=s.max_items) + obj.__dict__.update(s) + return obj + +#---------------------------------------------------------------------------- + +class ProgressMonitor: + def __init__(self, tag=None, num_items=None, flush_interval=1000, verbose=False, progress_fn=None, pfn_lo=0, pfn_hi=1000, pfn_total=1000): + self.tag = tag + self.num_items = num_items + self.verbose = verbose + self.flush_interval = flush_interval + self.progress_fn = progress_fn + self.pfn_lo = pfn_lo + self.pfn_hi = pfn_hi + self.pfn_total = pfn_total + self.start_time = time.time() + self.batch_time = self.start_time + self.batch_items = 0 + if self.progress_fn is not None: + self.progress_fn(self.pfn_lo, self.pfn_total) + + def update(self, cur_items): + assert (self.num_items is None) or (cur_items <= self.num_items) + if (cur_items < self.batch_items + self.flush_interval) and (self.num_items is None or cur_items < self.num_items): + return + cur_time = time.time() + total_time = cur_time - self.start_time + time_per_item = (cur_time - self.batch_time) / max(cur_items - self.batch_items, 1) + if (self.verbose) and (self.tag is not None): + print(f'{self.tag:<19s} items {cur_items:<7d} time {dnnlib.util.format_time(total_time):<12s} ms/item {time_per_item*1e3:.2f}') + self.batch_time = cur_time + self.batch_items = cur_items + + if (self.progress_fn is not None) and (self.num_items is not None): + self.progress_fn(self.pfn_lo + (self.pfn_hi - self.pfn_lo) * (cur_items / self.num_items), self.pfn_total) + + def sub(self, tag=None, num_items=None, flush_interval=1000, rel_lo=0, rel_hi=1): + return ProgressMonitor( + tag = tag, + num_items = num_items, + flush_interval = flush_interval, + verbose = self.verbose, + progress_fn = self.progress_fn, + pfn_lo = self.pfn_lo + (self.pfn_hi - self.pfn_lo) * rel_lo, + pfn_hi = self.pfn_lo + (self.pfn_hi - self.pfn_lo) * rel_hi, + pfn_total = self.pfn_total, + ) + +#---------------------------------------------------------------------------- + +def compute_feature_stats_for_dataset(opts, detector_url, detector_kwargs, rel_lo=0, rel_hi=1, batch_size=64, data_loader_kwargs=None, max_items=None, **stats_kwargs): + dataset = dnnlib.util.construct_class_by_name(**opts.dataset_kwargs) + if data_loader_kwargs is None: + data_loader_kwargs = dict(pin_memory=True, num_workers=3, prefetch_factor=2) + + # Try to lookup from cache. + cache_file = None + if opts.cache: + # Choose cache file name. 
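+# The cache key is an MD5 over everything that affects the statistics (dataset
+# kwargs, detector URL and kwargs, stats kwargs), so e.g. the real-image side
+# of FID is computed once per dataset/detector combination and reused from the
+# dnnlib 'gan-metrics' cache directory on later runs.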
+ args = dict(dataset_kwargs=opts.dataset_kwargs, detector_url=detector_url, detector_kwargs=detector_kwargs, stats_kwargs=stats_kwargs) + md5 = hashlib.md5(repr(sorted(args.items())).encode('utf-8')) + cache_tag = f'{dataset.name}-{get_feature_detector_name(detector_url)}-{md5.hexdigest()}' + cache_file = dnnlib.make_cache_dir_path('gan-metrics', cache_tag + '.pkl') + + # Check if the file exists (all processes must agree). + flag = os.path.isfile(cache_file) if opts.rank == 0 else False + if opts.num_gpus > 1: + flag = torch.as_tensor(flag, dtype=torch.float32, device=opts.device) + torch.distributed.broadcast(tensor=flag, src=0) + flag = (float(flag.cpu()) != 0) + + # Load. + if flag: + return FeatureStats.load(cache_file) + + # Initialize. + num_items = len(dataset) + if max_items is not None: + num_items = min(num_items, max_items) + stats = FeatureStats(max_items=num_items, **stats_kwargs) + progress = opts.progress.sub(tag='dataset features', num_items=num_items, rel_lo=rel_lo, rel_hi=rel_hi) + detector = get_feature_detector(url=detector_url, device=opts.device, num_gpus=opts.num_gpus, rank=opts.rank, verbose=progress.verbose) + + # Main loop. + item_subset = [(i * opts.num_gpus + opts.rank) % num_items for i in range((num_items - 1) // opts.num_gpus + 1)] + for images, _labels in torch.utils.data.DataLoader(dataset=dataset, sampler=item_subset, batch_size=batch_size, **data_loader_kwargs): + if images.shape[1] == 1: + images = images.repeat([1, 3, 1, 1]) + features = detector(images.to(opts.device), **detector_kwargs) + stats.append_torch(features, num_gpus=opts.num_gpus, rank=opts.rank) + progress.update(stats.num_items) + + # Save to cache. + if cache_file is not None and opts.rank == 0: + os.makedirs(os.path.dirname(cache_file), exist_ok=True) + temp_file = cache_file + '.' + uuid.uuid4().hex + stats.save(temp_file) + os.replace(temp_file, cache_file) # atomic + return stats + +#---------------------------------------------------------------------------- + +def compute_feature_stats_for_generator(opts, detector_url, detector_kwargs, rel_lo=0, rel_hi=1, batch_size=64, batch_gen=None, **stats_kwargs): + if batch_gen is None: + batch_gen = min(batch_size, 4) + assert batch_size % batch_gen == 0 + + # Setup generator and labels. + G = copy.deepcopy(opts.G).eval().requires_grad_(False).to(opts.device) + c_iter = iterate_random_labels(opts=opts, batch_size=batch_gen) + + # Initialize. + stats = FeatureStats(**stats_kwargs) + assert stats.max_items is not None + progress = opts.progress.sub(tag='generator features', num_items=stats.max_items, rel_lo=rel_lo, rel_hi=rel_hi) + detector = get_feature_detector(url=detector_url, device=opts.device, num_gpus=opts.num_gpus, rank=opts.rank, verbose=progress.verbose) + + # Main loop. 
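+# Generate in chunks of `batch_gen` latents until `max_items` features have
+# been collected: sample z ~ N(0, I), draw a conditioning label from c_iter,
+# map the generator's [-1, 1] output to uint8 [0, 255] (x * 127.5 + 128),
+# replicate grayscale to three channels, and push the batch through the
+# feature detector.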
+ while not stats.is_full(): + images = [] + for _i in range(batch_size // batch_gen): + z = torch.randn([batch_gen, G.z_dim], device=opts.device) + img = G(z=z, c=next(c_iter), **opts.G_kwargs)['image'] + img = (img * 127.5 + 128).clamp(0, 255).to(torch.uint8) + images.append(img) + images = torch.cat(images) + if images.shape[1] == 1: + images = images.repeat([1, 3, 1, 1]) + features = detector(images, **detector_kwargs) + stats.append_torch(features, num_gpus=opts.num_gpus, rank=opts.rank) + progress.update(stats.num_items) + return stats + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/perceptual_path_length.py b/eg3d/metrics/perceptual_path_length.py new file mode 100644 index 0000000000000000000000000000000000000000..5e58dac3317733e2ace6d64ee1f97cafa0a38225 --- /dev/null +++ b/eg3d/metrics/perceptual_path_length.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Perceptual Path Length (PPL) from the paper "A Style-Based Generator +Architecture for Generative Adversarial Networks". Matches the original +implementation by Karras et al. at +https://github.com/NVlabs/stylegan/blob/master/metrics/perceptual_path_length.py""" + +import copy +import numpy as np +import torch +from . import metric_utils + +#---------------------------------------------------------------------------- + +# Spherical interpolation of a batch of vectors. +def slerp(a, b, t): + a = a / a.norm(dim=-1, keepdim=True) + b = b / b.norm(dim=-1, keepdim=True) + d = (a * b).sum(dim=-1, keepdim=True) + p = t * torch.acos(d) + c = b - d * a + c = c / c.norm(dim=-1, keepdim=True) + d = a * torch.cos(p) + c * torch.sin(p) + d = d / d.norm(dim=-1, keepdim=True) + return d + +#---------------------------------------------------------------------------- + +class PPLSampler(torch.nn.Module): + def __init__(self, G, G_kwargs, epsilon, space, sampling, crop, vgg16): + assert space in ['z', 'w'] + assert sampling in ['full', 'end'] + super().__init__() + self.G = copy.deepcopy(G) + self.G_kwargs = G_kwargs + self.epsilon = epsilon + self.space = space + self.sampling = sampling + self.crop = crop + self.vgg16 = copy.deepcopy(vgg16) + + def forward(self, c): + # Generate random latents and interpolation t-values. + t = torch.rand([c.shape[0]], device=c.device) * (1 if self.sampling == 'full' else 0) + z0, z1 = torch.randn([c.shape[0] * 2, self.G.z_dim], device=c.device).chunk(2) + + # Interpolate in W or Z. + if self.space == 'w': + w0, w1 = self.G.mapping(z=torch.cat([z0,z1]), c=torch.cat([c,c])).chunk(2) + wt0 = w0.lerp(w1, t.unsqueeze(1).unsqueeze(2)) + wt1 = w0.lerp(w1, t.unsqueeze(1).unsqueeze(2) + self.epsilon) + else: # space == 'z' + zt0 = slerp(z0, z1, t.unsqueeze(1)) + zt1 = slerp(z0, z1, t.unsqueeze(1) + self.epsilon) + wt0, wt1 = self.G.mapping(z=torch.cat([zt0,zt1]), c=torch.cat([c,c])).chunk(2) + + # Randomize noise buffers. 
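+# Re-randomize every '.noise_const' buffer per call so the path length measures
+# the smoothness of the latent mapping itself rather than a fixed noise
+# realization; the two endpoints rendered below (at t and t + epsilon) are then
+# compared with LPIPS and the squared distance is divided by epsilon**2.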
+ for name, buf in self.G.named_buffers(): + if name.endswith('.noise_const'): + buf.copy_(torch.randn_like(buf)) + + # Generate images. + img = self.G.synthesis(ws=torch.cat([wt0,wt1]), noise_mode='const', force_fp32=True, **self.G_kwargs) + + # Center crop. + if self.crop: + assert img.shape[2] == img.shape[3] + c = img.shape[2] // 8 + img = img[:, :, c*3 : c*7, c*2 : c*6] + + # Downsample to 256x256. + factor = self.G.img_resolution // 256 + if factor > 1: + img = img.reshape([-1, img.shape[1], img.shape[2] // factor, factor, img.shape[3] // factor, factor]).mean([3, 5]) + + # Scale dynamic range from [-1,1] to [0,255]. + img = (img + 1) * (255 / 2) + if self.G.img_channels == 1: + img = img.repeat([1, 3, 1, 1]) + + # Evaluate differential LPIPS. + lpips_t0, lpips_t1 = self.vgg16(img, resize_images=False, return_lpips=True).chunk(2) + dist = (lpips_t0 - lpips_t1).square().sum(1) / self.epsilon ** 2 + return dist + +#---------------------------------------------------------------------------- + +def compute_ppl(opts, num_samples, epsilon, space, sampling, crop, batch_size): + vgg16_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/vgg16.pkl' + vgg16 = metric_utils.get_feature_detector(vgg16_url, num_gpus=opts.num_gpus, rank=opts.rank, verbose=opts.progress.verbose) + + # Setup sampler and labels. + sampler = PPLSampler(G=opts.G, G_kwargs=opts.G_kwargs, epsilon=epsilon, space=space, sampling=sampling, crop=crop, vgg16=vgg16) + sampler.eval().requires_grad_(False).to(opts.device) + c_iter = metric_utils.iterate_random_labels(opts=opts, batch_size=batch_size) + + # Sampling loop. + dist = [] + progress = opts.progress.sub(tag='ppl sampling', num_items=num_samples) + for batch_start in range(0, num_samples, batch_size * opts.num_gpus): + progress.update(batch_start) + x = sampler(next(c_iter)) + for src in range(opts.num_gpus): + y = x.clone() + if opts.num_gpus > 1: + torch.distributed.broadcast(y, src=src) + dist.append(y) + progress.update(num_samples) + + # Compute PPL. + if opts.rank != 0: + return float('nan') + dist = torch.cat(dist)[:num_samples].cpu().numpy() + lo = np.percentile(dist, 1, interpolation='lower') + hi = np.percentile(dist, 99, interpolation='higher') + ppl = np.extract(np.logical_and(dist >= lo, dist <= hi), dist).mean() + return float(ppl) + +#---------------------------------------------------------------------------- diff --git a/eg3d/metrics/precision_recall.py b/eg3d/metrics/precision_recall.py new file mode 100644 index 0000000000000000000000000000000000000000..e33e85f64de81fa211135edaf3863c2fe851a6f4 --- /dev/null +++ b/eg3d/metrics/precision_recall.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Precision/Recall (PR) from the paper "Improved Precision and Recall +Metric for Assessing Generative Models". Matches the original implementation +by Kynkaanniemi et al. at +https://github.com/kynkaat/improved-precision-and-recall-metric/blob/master/precision_recall.py""" + +import torch +from . 
import metric_utils + +#---------------------------------------------------------------------------- + +def compute_distances(row_features, col_features, num_gpus, rank, col_batch_size): + assert 0 <= rank < num_gpus + num_cols = col_features.shape[0] + num_batches = ((num_cols - 1) // col_batch_size // num_gpus + 1) * num_gpus + col_batches = torch.nn.functional.pad(col_features, [0, 0, 0, -num_cols % num_batches]).chunk(num_batches) + dist_batches = [] + for col_batch in col_batches[rank :: num_gpus]: + dist_batch = torch.cdist(row_features.unsqueeze(0), col_batch.unsqueeze(0))[0] + for src in range(num_gpus): + dist_broadcast = dist_batch.clone() + if num_gpus > 1: + torch.distributed.broadcast(dist_broadcast, src=src) + dist_batches.append(dist_broadcast.cpu() if rank == 0 else None) + return torch.cat(dist_batches, dim=1)[:, :num_cols] if rank == 0 else None + +#---------------------------------------------------------------------------- + +def compute_pr(opts, max_real, num_gen, nhood_size, row_batch_size, col_batch_size): + detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/vgg16.pkl' + detector_kwargs = dict(return_features=True) + + real_features = metric_utils.compute_feature_stats_for_dataset( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all_torch().to(torch.float16).to(opts.device) + + gen_features = metric_utils.compute_feature_stats_for_generator( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all_torch().to(torch.float16).to(opts.device) + + results = dict() + for name, manifold, probes in [('precision', real_features, gen_features), ('recall', gen_features, real_features)]: + kth = [] + for manifold_batch in manifold.split(row_batch_size): + dist = compute_distances(row_features=manifold_batch, col_features=manifold, num_gpus=opts.num_gpus, rank=opts.rank, col_batch_size=col_batch_size) + kth.append(dist.to(torch.float32).kthvalue(nhood_size + 1).values.to(torch.float16) if opts.rank == 0 else None) + kth = torch.cat(kth) if opts.rank == 0 else None + pred = [] + for probes_batch in probes.split(row_batch_size): + dist = compute_distances(row_features=probes_batch, col_features=manifold, num_gpus=opts.num_gpus, rank=opts.rank, col_batch_size=col_batch_size) + pred.append((dist <= kth).any(dim=1) if opts.rank == 0 else None) + results[name] = float(torch.cat(pred).to(torch.float32).mean() if opts.rank == 0 else 'nan') + return results['precision'], results['recall'] + +#---------------------------------------------------------------------------- diff --git a/eg3d/projector/w_plus_projector.py b/eg3d/projector/w_plus_projector.py new file mode 100644 index 0000000000000000000000000000000000000000..46a8040cbb93637314c03c15061784900d993b40 --- /dev/null +++ b/eg3d/projector/w_plus_projector.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +"""Project given image to the latent space of pretrained network pickle.""" + +import copy +import os +import numpy as np +import torch +import torch.nn.functional as F +from tqdm import tqdm +import dnnlib +import PIL +from camera_utils import LookAtPoseSampler + +def project( + G, + c, + outdir, + target: torch.Tensor, # [C,H,W] and dynamic range [0,255], W & H must match G output resolution + *, + num_steps=1000, + w_avg_samples=10000, + initial_learning_rate=0.01, + initial_noise_factor=0.05, + lr_rampdown_length=0.25, + lr_rampup_length=0.05, + noise_ramp_length=0.75, + regularize_noise_weight=1e5, + verbose=False, + device: torch.device, + initial_w=None, + image_log_step=100, + w_name: str +): + os.makedirs(f'{outdir}/{w_name}_w_plus', exist_ok=True) + outdir = f'{outdir}/{w_name}_w_plus' + assert target.shape == (G.img_channels, G.img_resolution, G.img_resolution) + + def logprint(*args): + if verbose: + print(*args) + + G = copy.deepcopy(G).eval().requires_grad_(False).to(device).float() # type: ignore + + # Compute w stats. + w_avg_path = './w_avg.npy' + w_std_path = './w_std.npy' + if (not os.path.exists(w_avg_path)) or (not os.path.exists(w_std_path)): + print(f'Computing W midpoint and stddev using {w_avg_samples} samples...') + z_samples = np.random.RandomState(123).randn(w_avg_samples, G.z_dim) + # c_samples = c.repeat(w_avg_samples, 1) + + # use avg look at point + + camera_lookat_point = torch.tensor(G.rendering_kwargs['avg_camera_pivot'], device=device) + cam2world_pose = LookAtPoseSampler.sample(3.14 / 2, 3.14 / 2, camera_lookat_point, + radius=G.rendering_kwargs['avg_camera_radius'], device=device) + focal_length = 4.2647 # FFHQ's FOV + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]], device=device) + c_samples = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + c_samples = c_samples.repeat(w_avg_samples, 1) + + w_samples = G.mapping(torch.from_numpy(z_samples).to(device), c_samples) # [N, L, C] + w_samples = w_samples[:, :1, :].cpu().numpy().astype(np.float32) # [N, 1, C] + w_avg = np.mean(w_samples, axis=0, keepdims=True) # [1, 1, C] + # print('save w_avg to ./w_avg.npy') + # np.save('./w_avg.npy',w_avg) + w_avg_tensor = torch.from_numpy(w_avg).cuda() + w_std = (np.sum((w_samples - w_avg) ** 2) / w_avg_samples) ** 0.5 + + # np.save(w_avg_path, w_avg) + # np.save(w_std_path, w_std) + else: + # w_avg = np.load(w_avg_path) + # w_std = np.load(w_std_path) + raise Exception(' ') + + # z_samples = np.random.RandomState(123).randn(w_avg_samples, G.z_dim) + # c_samples = c.repeat(w_avg_samples, 1) + # w_samples = G.mapping(torch.from_numpy(z_samples).to(device), c_samples) # [N, L, C] + # w_samples = w_samples[:, :1, :].cpu().numpy().astype(np.float32) # [N, 1, C] + # w_avg = np.mean(w_samples, axis=0, keepdims=True) # [1, 1, C] + # w_avg_tensor = torch.from_numpy(w_avg).cuda() + # w_std = (np.sum((w_samples - w_avg) ** 2) / w_avg_samples) ** 0.5 + + start_w = initial_w if initial_w is not None else w_avg + + # Setup noise inputs. + noise_bufs = {name: buf for (name, buf) in G.backbone.synthesis.named_buffers() if 'noise_const' in name} + + # Load VGG16 feature detector. + url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt' + # url = './networks/vgg16.pt' + with dnnlib.util.open_url(url) as f: + vgg16 = torch.jit.load(f).eval().to(device) + + # Features for target image. 
+ target_images = target.unsqueeze(0).to(device).to(torch.float32) + if target_images.shape[2] > 256: + target_images = F.interpolate(target_images, size=(256, 256), mode='area') + target_features = vgg16(target_images, resize_images=False, return_lpips=True) + + start_w = np.repeat(start_w, G.backbone.mapping.num_ws, axis=1) + w_opt = torch.tensor(start_w, dtype=torch.float32, device=device, + requires_grad=True) # pylint: disable=not-callable + + optimizer = torch.optim.Adam([w_opt] + list(noise_bufs.values()), betas=(0.9, 0.999), + lr=0.1) + + # Init noise. + for buf in noise_bufs.values(): + buf[:] = torch.randn_like(buf) + buf.requires_grad = True + + for step in tqdm(range(num_steps), position=0, leave=True): + + # Learning rate schedule. + t = step / num_steps + w_noise_scale = w_std * initial_noise_factor * max(0.0, 1.0 - t / noise_ramp_length) ** 2 + lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length) + lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi) + lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length) + lr = initial_learning_rate * lr_ramp + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # Synth images from opt_w. + w_noise = torch.randn_like(w_opt) * w_noise_scale + ws = (w_opt + w_noise) + synth_images = G.synthesis(ws,c, noise_mode='const')['image'] + + if step % image_log_step == 0: + with torch.no_grad(): + vis_img = (synth_images.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8) + + PIL.Image.fromarray(vis_img[0].cpu().numpy(), 'RGB').save(f'{outdir}/{step}.png') + + # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images. + synth_images = (synth_images + 1) * (255 / 2) + if synth_images.shape[2] > 256: + synth_images = F.interpolate(synth_images, size=(256, 256), mode='area') + + # Features for synth images. + synth_features = vgg16(synth_images, resize_images=False, return_lpips=True) + dist = (target_features - synth_features).square().sum() + + # Noise regularization. + reg_loss = 0.0 + for v in noise_bufs.values(): + noise = v[None, None, :, :] # must be [1,1,H,W] for F.avg_pool2d() + while True: + reg_loss += (noise * torch.roll(noise, shifts=1, dims=3)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=1, dims=2)).mean() ** 2 + if noise.shape[2] <= 8: + break + noise = F.avg_pool2d(noise, kernel_size=2) + loss = dist + reg_loss * regularize_noise_weight + + # if step % 10 == 0: + # with torch.no_grad(): + # print({f'step {step}, first projection _{w_name}': loss.detach().cpu()}) + + # Step + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + logprint(f'step {step + 1:>4d}/{num_steps}: dist {dist:<4.2f} loss {float(loss):<5.2f}') + + # Normalize noise. + with torch.no_grad(): + for buf in noise_bufs.values(): + buf -= buf.mean() + buf *= buf.square().mean().rsqrt() + + del G + return w_opt diff --git a/eg3d/projector/w_projector.py b/eg3d/projector/w_projector.py new file mode 100644 index 0000000000000000000000000000000000000000..355252db1a51a7ec56126efee00fff746cbf37a5 --- /dev/null +++ b/eg3d/projector/w_projector.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +"""Project given image to the latent space of pretrained network pickle.""" + +import copy +import os +import numpy as np +import torch +import torch.nn.functional as F +from tqdm import tqdm +import dnnlib +import PIL +from camera_utils import LookAtPoseSampler +def project( + G, + c, + outdir, + target: torch.Tensor, # [C,H,W] and dynamic range [0,255], W & H must match G output resolution + *, + num_steps=1000, + w_avg_samples=10000, + initial_learning_rate=0.01, + initial_noise_factor=0.05, + lr_rampdown_length=0.25, + lr_rampup_length=0.05, + noise_ramp_length=0.75, + regularize_noise_weight=1e5, + verbose=False, + device: torch.device, + initial_w=None, + image_log_step=100, + w_name: str +): + os.makedirs(f'{outdir}/{w_name}_w',exist_ok=True) + outdir = f'{outdir}/{w_name}_w' + assert target.shape == (G.img_channels, G.img_resolution, G.img_resolution) + + def logprint(*args): + if verbose: + print(*args) + + G = copy.deepcopy(G).eval().requires_grad_(False).to(device).float() # type: ignore + + # Compute w stats. + + w_avg_path = './w_avg.npy' + w_std_path = './w_std.npy' + if (not os.path.exists(w_avg_path)) or (not os.path.exists(w_std_path)): + print(f'Computing W midpoint and stddev using {w_avg_samples} samples...') + z_samples = np.random.RandomState(123).randn(w_avg_samples, G.z_dim) + #c_samples = c.repeat(w_avg_samples, 1) + + # use avg look at point + + camera_lookat_point = torch.tensor(G.rendering_kwargs['avg_camera_pivot'], device=device) + cam2world_pose = LookAtPoseSampler.sample(3.14 / 2, 3.14 / 2, camera_lookat_point, + radius=G.rendering_kwargs['avg_camera_radius'], device=device) + focal_length = 4.2647 # FFHQ's FOV + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]], device=device) + c_samples = torch.cat([cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + c_samples = c_samples.repeat(w_avg_samples, 1) + + + + w_samples = G.mapping(torch.from_numpy(z_samples).to(device), c_samples) # [N, L, C] + w_samples = w_samples[:, :1, :].cpu().numpy().astype(np.float32) # [N, 1, C] + w_avg = np.mean(w_samples, axis=0, keepdims=True) # [1, 1, C] + # print('save w_avg to ./w_avg.npy') + # np.save('./w_avg.npy',w_avg) + w_avg_tensor = torch.from_numpy(w_avg).cuda() + w_std = (np.sum((w_samples - w_avg) ** 2) / w_avg_samples) ** 0.5 + + # np.save(w_avg_path, w_avg) + # np.save(w_std_path, w_std) + else: + # w_avg = np.load(w_avg_path) + # w_std = np.load(w_std_path) + raise Exception(' ') + + start_w = initial_w if initial_w is not None else w_avg + + # Setup noise inputs. + noise_bufs = {name: buf for (name, buf) in G.backbone.synthesis.named_buffers() if 'noise_const' in name} + + # Load VGG16 feature detector. + url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt' + # url = './networks/vgg16.pt' + with dnnlib.util.open_url(url) as f: + vgg16 = torch.jit.load(f).eval().to(device) + + # Features for target image. + target_images = target.unsqueeze(0).to(device).to(torch.float32) + if target_images.shape[2] > 256: + target_images = F.interpolate(target_images, size=(256, 256), mode='area') + target_features = vgg16(target_images, resize_images=False, return_lpips=True) + + w_opt = torch.tensor(start_w, dtype=torch.float32, device=device, + requires_grad=True) # pylint: disable=not-callable + print('w_opt shape: ',w_opt.shape) + + optimizer = torch.optim.Adam([w_opt] + list(noise_bufs.values()), betas=(0.9, 0.999), + lr=0.1) + + # Init noise. 
+ for buf in noise_bufs.values(): + buf[:] = torch.randn_like(buf) + buf.requires_grad = True + + for step in tqdm(range(num_steps), position=0, leave=True): + + # Learning rate schedule. + t = step / num_steps + w_noise_scale = w_std * initial_noise_factor * max(0.0, 1.0 - t / noise_ramp_length) ** 2 + lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length) + lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi) + lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length) + lr = initial_learning_rate * lr_ramp + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # Synth images from opt_w. + w_noise = torch.randn_like(w_opt) * w_noise_scale + ws = (w_opt + w_noise).repeat([1, G.backbone.mapping.num_ws, 1]) + synth_images = G.synthesis(ws,c, noise_mode='const')['image'] + + if step % image_log_step == 0: + with torch.no_grad(): + vis_img = (synth_images.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8) + + PIL.Image.fromarray(vis_img[0].cpu().numpy(), 'RGB').save(f'{outdir}/{step}.png') + + # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images. + synth_images = (synth_images + 1) * (255 / 2) + if synth_images.shape[2] > 256: + synth_images = F.interpolate(synth_images, size=(256, 256), mode='area') + + # Features for synth images. + synth_features = vgg16(synth_images, resize_images=False, return_lpips=True) + dist = (target_features - synth_features).square().sum() + + # Noise regularization. + reg_loss = 0.0 + for v in noise_bufs.values(): + noise = v[None, None, :, :] # must be [1,1,H,W] for F.avg_pool2d() + while True: + reg_loss += (noise * torch.roll(noise, shifts=1, dims=3)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=1, dims=2)).mean() ** 2 + if noise.shape[2] <= 8: + break + noise = F.avg_pool2d(noise, kernel_size=2) + loss = dist + reg_loss * regularize_noise_weight + + # if step % 10 == 0: + # with torch.no_grad(): + # print({f'step {step } first projection _{w_name}': loss.detach().cpu()}) + + # Step + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + logprint(f'step {step + 1:>4d}/{num_steps}: dist {dist:<4.2f} loss {float(loss):<5.2f}') + + # Normalize noise. + with torch.no_grad(): + for buf in noise_bufs.values(): + buf -= buf.mean() + buf *= buf.square().mean().rsqrt() + + + return w_opt.repeat([1, G.backbone.mapping.num_ws, 1]) + del G diff --git a/eg3d/run_inversion.py b/eg3d/run_inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..584393842c46d1620675a3844fd269cf378c6ad2 --- /dev/null +++ b/eg3d/run_inversion.py @@ -0,0 +1,106 @@ + +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +"""Generate lerp videos using pretrained network pickle.""" + +import os +import re +from typing import List, Optional, Tuple, Union + +import click +import dnnlib +import numpy as np +import torch +import legacy +from torchvision.transforms import transforms +from projector import w_projector,w_plus_projector +from PIL import Image +from glob import glob +from os.path import join as opj + +@click.command() +@click.option('--image_path', help='path of image file or image directory', type=str, required=True, metavar='STR', show_default=True) +@click.option('--c_path', help='camera parameters path', type=str, required=True, default='test-runs', metavar='STR', show_default=True) +@click.option('--network', 'network_pkl', help='Network pickle filename', required=True) +@click.option('--outdir', help='Output directory', type=str, required=True, metavar='DIR') +@click.option('--latent_space_type', help='latent_space_type', type=click.Choice(['w', 'w_plus']), required=False, metavar='STR', + default='w', show_default=True) +@click.option('--num_steps', 'num_steps', type=int, + help='Multiplier for depth sampling in volume rendering', default=500, show_default=True) +@click.option('--sample_mult', 'sampling_multiplier', type=float, + help='Multiplier for depth sampling in volume rendering', default=2, show_default=True) +@click.option('--nrr', type=int, help='Neural rendering resolution override', default=None, show_default=True) +def run( + network_pkl: str, + outdir: str, + sampling_multiplier: float, + nrr: Optional[int], + latent_space_type:str, + image_path:str, + c_path:str, + num_steps:int +): + os.makedirs(outdir, exist_ok=True) + print('Loading networks from "%s"...' % network_pkl) + device = torch.device('cuda') + with dnnlib.util.open_url(network_pkl) as f: + G = legacy.load_network_pkl(f)['G_ema'] + + G = G.to(device) + G.rendering_kwargs['depth_resolution'] = int(G.rendering_kwargs['depth_resolution'] * sampling_multiplier) + G.rendering_kwargs['depth_resolution_importance'] = int( + G.rendering_kwargs['depth_resolution_importance'] * sampling_multiplier) + if nrr is not None: G.neural_rendering_resolution = nrr + + + if os.path.isdir(image_path): + img_paths = sorted(glob(opj(image_path,"*.png"))) + else: + img_paths = [image_path] + + trans = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), + transforms.Resize((512, 512)) + ]) + + + for img_path in img_paths: + img = Image.open(img_path).convert('RGB') + img_id = os.path.split(img_path)[-1].split('.')[0] + img.save(f'{outdir}/{img_id}_orig.png') + c = np.load(img_path.replace('png','npy')) + c = np.reshape(c,(1,25)) + c = torch.FloatTensor(c).cuda() + + from_im = trans(img).cuda() + id_image = torch.squeeze((from_im.cuda() + 1) / 2) * 255 + + if latent_space_type == 'w': + w = w_projector.project(G, c, outdir,id_image, device=torch.device('cuda'), w_avg_samples=600, num_steps = num_steps, w_name=img_id) + else: + w = w_plus_projector.project(G, c,outdir, id_image, device=torch.device('cuda'), w_avg_samples=600, w_name=img_id, num_steps = num_steps ) + + result_img = G.synthesis(w, c, noise_mode='const')['image'] + vis_img = (result_img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8) + Image.fromarray(vis_img[0].cpu().numpy(), 'RGB').save(f'{outdir}/{img_id}_inv.png') + + torch.save(w.detach().cpu(), f'{outdir}/{img_id}_inv.pt') + +# ---------------------------------------------------------------------------- + +if __name__ == "__main__": + run() # pylint: 
disable=no-value-for-parameter + +# ---------------------------------------------------------------------------- + + + diff --git a/eg3d/shape_utils.py b/eg3d/shape_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e16f6cc82a59d9d3e455ba334abf68b576fdc10f --- /dev/null +++ b/eg3d/shape_utils.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + + +""" +Utils for extracting 3D shapes using marching cubes. Based on code from DeepSDF (Park et al.) + +Takes as input an .mrc file and extracts a mesh. + +Ex. + python shape_utils.py my_shape.mrc +Ex. + python shape_utils.py myshapes_directory --level=12 +""" + + +import time +import plyfile +import glob +import logging +import numpy as np +import os +import random +import torch +import torch.utils.data +import trimesh +import skimage.measure +import argparse +import mrcfile +from tqdm import tqdm + + +def convert_sdf_samples_to_ply( + numpy_3d_sdf_tensor, + voxel_grid_origin, + voxel_size, + ply_filename_out, + offset=None, + scale=None, + level=0.0 +): + """ + Convert sdf samples to .ply + :param pytorch_3d_sdf_tensor: a torch.FloatTensor of shape (n,n,n) + :voxel_grid_origin: a list of three floats: the bottom, left, down origin of the voxel grid + :voxel_size: float, the size of the voxels + :ply_filename_out: string, path of the filename to save to + This function adapted from: https://github.com/RobotLocomotion/spartan + """ + start_time = time.time() + + verts, faces, normals, values = np.zeros((0, 3)), np.zeros((0, 3)), np.zeros((0, 3)), np.zeros(0) + # try: + verts, faces, normals, values = skimage.measure.marching_cubes( + numpy_3d_sdf_tensor, level=level, spacing=[voxel_size] * 3 + ) + # except: + # pass + + # transform from voxel coordinates to camera coordinates + # note x and y are flipped in the output of marching_cubes + mesh_points = np.zeros_like(verts) + mesh_points[:, 0] = voxel_grid_origin[0] + verts[:, 0] + mesh_points[:, 1] = voxel_grid_origin[1] + verts[:, 1] + mesh_points[:, 2] = voxel_grid_origin[2] + verts[:, 2] + + # apply additional offset and scale + if scale is not None: + mesh_points = mesh_points / scale + if offset is not None: + mesh_points = mesh_points - offset + + # try writing to the ply file + + num_verts = verts.shape[0] + num_faces = faces.shape[0] + + verts_tuple = np.zeros((num_verts,), dtype=[("x", "f4"), ("y", "f4"), ("z", "f4")]) + + for i in range(0, num_verts): + verts_tuple[i] = tuple(mesh_points[i, :]) + + faces_building = [] + for i in range(0, num_faces): + faces_building.append(((faces[i, :].tolist(),))) + faces_tuple = np.array(faces_building, dtype=[("vertex_indices", "i4", (3,))]) + + el_verts = plyfile.PlyElement.describe(verts_tuple, "vertex") + el_faces = plyfile.PlyElement.describe(faces_tuple, "face") + + ply_data = plyfile.PlyData([el_verts, el_faces]) + ply_data.write(ply_filename_out) + print(f"wrote to {ply_filename_out}") + + +def convert_mrc(input_filename, output_filename, isosurface_level=1): + with mrcfile.open(input_filename) as 
mrc: + convert_sdf_samples_to_ply(np.transpose(mrc.data, (2, 1, 0)), [0, 0, 0], 1, output_filename, level=isosurface_level) + +if __name__ == '__main__': + start_time = time.time() + parser = argparse.ArgumentParser() + parser.add_argument('input_mrc_path') + parser.add_argument('--level', type=float, default=10, help="The isosurface level for marching cubes") + args = parser.parse_args() + + if os.path.isfile(args.input_mrc_path) and args.input_mrc_path.split('.')[-1] == 'ply': + output_obj_path = args.input_mrc_path.split('.mrc')[0] + '.ply' + convert_mrc(args.input_mrc_path, output_obj_path, isosurface_level=1) + + print(f"{time.time() - start_time:02f} s") + else: + assert os.path.isdir(args.input_mrc_path) + + for mrc_path in tqdm(glob.glob(os.path.join(args.input_mrc_path, '*.mrc'))): + output_obj_path = mrc_path.split('.mrc')[0] + '.ply' + convert_mrc(mrc_path, output_obj_path, isosurface_level=args.level) \ No newline at end of file diff --git a/eg3d/torch_utils/__init__.py b/eg3d/torch_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/torch_utils/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# empty diff --git a/eg3d/torch_utils/custom_ops.py b/eg3d/torch_utils/custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ed2524f47ab3d5b8750cfb868cc14012f424acc8 --- /dev/null +++ b/eg3d/torch_utils/custom_ops.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import glob +import hashlib +import importlib +import os +import re +import shutil +import uuid + +import torch +import torch.utils.cpp_extension +from torch.utils.file_baton import FileBaton + +#---------------------------------------------------------------------------- +# Global options. + +verbosity = 'brief' # Verbosity level: 'none', 'brief', 'full' + +#---------------------------------------------------------------------------- +# Internal helper funcs. 
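+# _find_compiler_bindir() locates an MSVC bin directory on Windows when cl.exe is
+# not already on PATH; _get_mangled_gpu_name() turns the CUDA device name into a
+# filesystem-safe tag used to keep per-GPU build caches separate.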
+ +def _find_compiler_bindir(): + patterns = [ + 'C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64', + 'C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64', + 'C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64', + 'C:/Program Files (x86)/Microsoft Visual Studio */vc/bin', + ] + for pattern in patterns: + matches = sorted(glob.glob(pattern)) + if len(matches): + return matches[-1] + return None + +#---------------------------------------------------------------------------- + +def _get_mangled_gpu_name(): + name = torch.cuda.get_device_name().lower() + out = [] + for c in name: + if re.match('[a-z0-9_-]+', c): + out.append(c) + else: + out.append('-') + return ''.join(out) + +#---------------------------------------------------------------------------- +# Main entry point for compiling and loading C++/CUDA plugins. + +_cached_plugins = dict() + +def get_plugin(module_name, sources, headers=None, source_dir=None, **build_kwargs): + assert verbosity in ['none', 'brief', 'full'] + if headers is None: + headers = [] + if source_dir is not None: + sources = [os.path.join(source_dir, fname) for fname in sources] + headers = [os.path.join(source_dir, fname) for fname in headers] + + # Already cached? + if module_name in _cached_plugins: + return _cached_plugins[module_name] + + # Print status. + if verbosity == 'full': + print(f'Setting up PyTorch plugin "{module_name}"...') + elif verbosity == 'brief': + print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True) + verbose_build = (verbosity == 'full') + + # Compile and load. + try: # pylint: disable=too-many-nested-blocks + # Make sure we can find the necessary compiler binaries. + if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0: + compiler_bindir = _find_compiler_bindir() + if compiler_bindir is None: + raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".') + os.environ['PATH'] += ';' + compiler_bindir + + # Some containers set TORCH_CUDA_ARCH_LIST to a list that can either + # break the build or unnecessarily restrict what's available to nvcc. + # Unset it to let nvcc decide based on what's available on the + # machine. + os.environ['TORCH_CUDA_ARCH_LIST'] = '' + + # Incremental build md5sum trickery. Copies all the input source files + # into a cached build directory under a combined md5 digest of the input + # source files. Copying is done only if the combined digest has changed. + # This keeps input file timestamps and filenames the same as in previous + # extension builds, allowing for fast incremental rebuilds. + # + # This optimization is done only in case all the source files reside in + # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR + # environment variable is set (we take this as a signal that the user + # actually cares about this.) + # + # EDIT: We now do it regardless of TORCH_EXTENSIOS_DIR, in order to work + # around the *.cu dependency bug in ninja config. + # + all_source_files = sorted(sources + headers) + all_source_dirs = set(os.path.dirname(fname) for fname in all_source_files) + if len(all_source_dirs) == 1: # and ('TORCH_EXTENSIONS_DIR' in os.environ): + + # Compute combined hash digest for all source files. + hash_md5 = hashlib.md5() + for src in all_source_files: + with open(src, 'rb') as f: + hash_md5.update(f.read()) + + # Select cached build directory name. 
+ source_digest = hash_md5.hexdigest() + build_top_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access + cached_build_dir = os.path.join(build_top_dir, f'{source_digest}-{_get_mangled_gpu_name()}') + + if not os.path.isdir(cached_build_dir): + tmpdir = f'{build_top_dir}/srctmp-{uuid.uuid4().hex}' + os.makedirs(tmpdir) + for src in all_source_files: + shutil.copyfile(src, os.path.join(tmpdir, os.path.basename(src))) + try: + os.replace(tmpdir, cached_build_dir) # atomic + except OSError: + # source directory already exists, delete tmpdir and its contents. + shutil.rmtree(tmpdir) + if not os.path.isdir(cached_build_dir): raise + + # Compile. + cached_sources = [os.path.join(cached_build_dir, os.path.basename(fname)) for fname in sources] + torch.utils.cpp_extension.load(name=module_name, build_directory=cached_build_dir, + verbose=verbose_build, sources=cached_sources, **build_kwargs) + else: + torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs) + + # Load. + module = importlib.import_module(module_name) + + except: + if verbosity == 'brief': + print('Failed!') + raise + + # Print status and add to cache dict. + if verbosity == 'full': + print(f'Done setting up PyTorch plugin "{module_name}".') + elif verbosity == 'brief': + print('Done.') + _cached_plugins[module_name] = module + return module + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/misc.py b/eg3d/torch_utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..3c10e139c3b61bf67514725a39d4e220598c1ec3 --- /dev/null +++ b/eg3d/torch_utils/misc.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import re +import contextlib +import numpy as np +import torch +import warnings +import dnnlib + +#---------------------------------------------------------------------------- +# Cached construction of constant tensors. Avoids CPU=>GPU copy when the +# same constant is used multiple times. + +_constant_cache = dict() + +def constant(value, shape=None, dtype=None, device=None, memory_format=None): + value = np.asarray(value) + if shape is not None: + shape = tuple(shape) + if dtype is None: + dtype = torch.get_default_dtype() + if device is None: + device = torch.device('cpu') + if memory_format is None: + memory_format = torch.contiguous_format + + key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format) + tensor = _constant_cache.get(key, None) + if tensor is None: + tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device) + if shape is not None: + tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape)) + tensor = tensor.contiguous(memory_format=memory_format) + _constant_cache[key] = tensor + return tensor + +#---------------------------------------------------------------------------- +# Replace NaN/Inf with specified numerical values. 
+ +try: + nan_to_num = torch.nan_to_num # 1.8.0a0 +except AttributeError: + def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin + assert isinstance(input, torch.Tensor) + if posinf is None: + posinf = torch.finfo(input.dtype).max + if neginf is None: + neginf = torch.finfo(input.dtype).min + assert nan == 0 + return torch.clamp(input.unsqueeze(0).nansum(0), min=neginf, max=posinf, out=out) + +#---------------------------------------------------------------------------- +# Symbolic assert. + +try: + symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access +except AttributeError: + symbolic_assert = torch.Assert # 1.7.0 + +#---------------------------------------------------------------------------- +# Context manager to temporarily suppress known warnings in torch.jit.trace(). +# Note: Cannot use catch_warnings because of https://bugs.python.org/issue29672 + +@contextlib.contextmanager +def suppress_tracer_warnings(): + flt = ('ignore', None, torch.jit.TracerWarning, None, 0) + warnings.filters.insert(0, flt) + yield + warnings.filters.remove(flt) + +#---------------------------------------------------------------------------- +# Assert that the shape of a tensor matches the given list of integers. +# None indicates that the size of a dimension is allowed to vary. +# Performs symbolic assertion when used in torch.jit.trace(). + +def assert_shape(tensor, ref_shape): + if tensor.ndim != len(ref_shape): + raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}') + for idx, (size, ref_size) in enumerate(zip(tensor.shape, ref_shape)): + if ref_size is None: + pass + elif isinstance(ref_size, torch.Tensor): + with suppress_tracer_warnings(): # as_tensor results are registered as constants + symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}') + elif isinstance(size, torch.Tensor): + with suppress_tracer_warnings(): # as_tensor results are registered as constants + symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}') + elif size != ref_size: + raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}') + +#---------------------------------------------------------------------------- +# Function decorator that calls torch.autograd.profiler.record_function(). + +def profiled_function(fn): + def decorator(*args, **kwargs): + with torch.autograd.profiler.record_function(fn.__name__): + return fn(*args, **kwargs) + decorator.__name__ = fn.__name__ + return decorator + +#---------------------------------------------------------------------------- +# Sampler for torch.utils.data.DataLoader that loops over the dataset +# indefinitely, shuffling items as it goes. 
+ +class InfiniteSampler(torch.utils.data.Sampler): + def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5): + assert len(dataset) > 0 + assert num_replicas > 0 + assert 0 <= rank < num_replicas + assert 0 <= window_size <= 1 + super().__init__(dataset) + self.dataset = dataset + self.rank = rank + self.num_replicas = num_replicas + self.shuffle = shuffle + self.seed = seed + self.window_size = window_size + + def __iter__(self): + order = np.arange(len(self.dataset)) + rnd = None + window = 0 + if self.shuffle: + rnd = np.random.RandomState(self.seed) + rnd.shuffle(order) + window = int(np.rint(order.size * self.window_size)) + + idx = 0 + while True: + i = idx % order.size + if idx % self.num_replicas == self.rank: + yield order[i] + if window >= 2: + j = (i - rnd.randint(window)) % order.size + order[i], order[j] = order[j], order[i] + idx += 1 + +#---------------------------------------------------------------------------- +# Utilities for operating with torch.nn.Module parameters and buffers. + +def params_and_buffers(module): + assert isinstance(module, torch.nn.Module) + return list(module.parameters()) + list(module.buffers()) + +def named_params_and_buffers(module): + assert isinstance(module, torch.nn.Module) + return list(module.named_parameters()) + list(module.named_buffers()) + +def copy_params_and_buffers(src_module, dst_module, require_all=False): + assert isinstance(src_module, torch.nn.Module) + assert isinstance(dst_module, torch.nn.Module) + src_tensors = dict(named_params_and_buffers(src_module)) + for name, tensor in named_params_and_buffers(dst_module): + assert (name in src_tensors) or (not require_all) + if name in src_tensors: + tensor.copy_(src_tensors[name].detach()).requires_grad_(tensor.requires_grad) + +#---------------------------------------------------------------------------- +# Context manager for easily enabling/disabling DistributedDataParallel +# synchronization. + +@contextlib.contextmanager +def ddp_sync(module, sync): + assert isinstance(module, torch.nn.Module) + if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel): + yield + else: + with module.no_sync(): + yield + +#---------------------------------------------------------------------------- +# Check DistributedDataParallel consistency across processes. + +def check_ddp_consistency(module, ignore_regex=None): + assert isinstance(module, torch.nn.Module) + for name, tensor in named_params_and_buffers(module): + fullname = type(module).__name__ + '.' + name + if ignore_regex is not None and re.fullmatch(ignore_regex, fullname): + continue + tensor = tensor.detach() + if tensor.is_floating_point(): + tensor = nan_to_num(tensor) + other = tensor.clone() + torch.distributed.broadcast(tensor=other, src=0) + assert (tensor == other).all(), fullname + +#---------------------------------------------------------------------------- +# Print summary table of module hierarchy. + +def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True): + assert isinstance(module, torch.nn.Module) + assert not isinstance(module, torch.jit.ScriptModule) + assert isinstance(inputs, (tuple, list)) + + # Register hooks. 
+ entries = [] + nesting = [0] + def pre_hook(_mod, _inputs): + nesting[0] += 1 + def post_hook(mod, _inputs, outputs): + nesting[0] -= 1 + if nesting[0] <= max_nesting: + outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs] + outputs = [t for t in outputs if isinstance(t, torch.Tensor)] + entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs)) + hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()] + hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()] + + # Run module. + outputs = module(*inputs) + for hook in hooks: + hook.remove() + + # Identify unique outputs, parameters, and buffers. + tensors_seen = set() + for e in entries: + e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen] + e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen] + e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen] + tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs} + + # Filter out redundant entries. + if skip_redundant: + entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)] + + # Construct table. + rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']] + rows += [['---'] * len(rows[0])] + param_total = 0 + buffer_total = 0 + submodule_names = {mod: name for name, mod in module.named_modules()} + for e in entries: + name = '' if e.mod is module else submodule_names[e.mod] + param_size = sum(t.numel() for t in e.unique_params) + buffer_size = sum(t.numel() for t in e.unique_buffers) + output_shapes = [str(list(t.shape)) for t in e.outputs] + output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs] + rows += [[ + name + (':0' if len(e.outputs) >= 2 else ''), + str(param_size) if param_size else '-', + str(buffer_size) if buffer_size else '-', + (output_shapes + ['-'])[0], + (output_dtypes + ['-'])[0], + ]] + for idx in range(1, len(e.outputs)): + rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]] + param_total += param_size + buffer_total += buffer_size + rows += [['---'] * len(rows[0])] + rows += [['Total', str(param_total), str(buffer_total), '-', '-']] + + # Print table. + widths = [max(len(cell) for cell in column) for column in zip(*rows)] + print() + for row in rows: + print(' '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths))) + print() + return outputs + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/__init__.py b/eg3d/torch_utils/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/torch_utils/ops/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +# empty diff --git a/eg3d/torch_utils/ops/bias_act.cpp b/eg3d/torch_utils/ops/bias_act.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee6f6d0caaf4f84b94851d223e384344e1109cdc --- /dev/null +++ b/eg3d/torch_utils/ops/bias_act.cpp @@ -0,0 +1,103 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include +#include "bias_act.h" + +//------------------------------------------------------------------------ + +static bool has_same_layout(torch::Tensor x, torch::Tensor y) +{ + if (x.dim() != y.dim()) + return false; + for (int64_t i = 0; i < x.dim(); i++) + { + if (x.size(i) != y.size(i)) + return false; + if (x.size(i) >= 2 && x.stride(i) != y.stride(i)) + return false; + } + return true; +} + +//------------------------------------------------------------------------ + +static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp) +{ + // Validate arguments. + TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x"); + TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x"); + TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x"); + TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same dtype and device as x"); + TORCH_CHECK(x.numel() <= INT_MAX, "x is too large"); + TORCH_CHECK(b.dim() == 1, "b must have rank 1"); + TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds"); + TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements"); + TORCH_CHECK(grad >= 0, "grad must be non-negative"); + + // Validate layout. + TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense"); + TORCH_CHECK(b.is_contiguous(), "b must be contiguous"); + TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x"); + TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x"); + TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x"); + + // Create output tensor. + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + torch::Tensor y = torch::empty_like(x); + TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x"); + + // Initialize CUDA kernel parameters. + bias_act_kernel_params p; + p.x = x.data_ptr(); + p.b = (b.numel()) ? b.data_ptr() : NULL; + p.xref = (xref.numel()) ? xref.data_ptr() : NULL; + p.yref = (yref.numel()) ? 
yref.data_ptr() : NULL; + p.dy = (dy.numel()) ? dy.data_ptr() : NULL; + p.y = y.data_ptr(); + p.grad = grad; + p.act = act; + p.alpha = alpha; + p.gain = gain; + p.clamp = clamp; + p.sizeX = (int)x.numel(); + p.sizeB = (int)b.numel(); + p.stepB = (b.numel()) ? (int)x.stride(dim) : 1; + + // Choose CUDA kernel. + void* kernel; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] + { + kernel = choose_bias_act_kernel(p); + }); + TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func"); + + // Launch CUDA kernel. + p.loopX = 4; + int blockSize = 4 * 32; + int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1; + void* args[] = {&p}; + AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); + return y; +} + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("bias_act", &bias_act); +} + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/bias_act.cu b/eg3d/torch_utils/ops/bias_act.cu new file mode 100644 index 0000000000000000000000000000000000000000..71ca3900deda41e62d80044f0e409875f4c794b5 --- /dev/null +++ b/eg3d/torch_utils/ops/bias_act.cu @@ -0,0 +1,177 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include "bias_act.h" + +//------------------------------------------------------------------------ +// Helpers. + +template struct InternalType; +template <> struct InternalType { typedef double scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; + +//------------------------------------------------------------------------ +// CUDA kernel. + +template +__global__ void bias_act_kernel(bias_act_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + int G = p.grad; + scalar_t alpha = (scalar_t)p.alpha; + scalar_t gain = (scalar_t)p.gain; + scalar_t clamp = (scalar_t)p.clamp; + scalar_t one = (scalar_t)1; + scalar_t two = (scalar_t)2; + scalar_t expRange = (scalar_t)80; + scalar_t halfExpRange = (scalar_t)40; + scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946; + scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717; + + // Loop over elements. + int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x; + for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x) + { + // Load. + scalar_t x = (scalar_t)((const T*)p.x)[xi]; + scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0; + scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0; + scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0; + scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one; + scalar_t yy = (gain != 0) ? yref / gain : 0; + scalar_t y = 0; + + // Apply bias. + ((G == 0) ? 
x : xref) += b; + + // linear + if (A == 1) + { + if (G == 0) y = x; + if (G == 1) y = x; + } + + // relu + if (A == 2) + { + if (G == 0) y = (x > 0) ? x : 0; + if (G == 1) y = (yy > 0) ? x : 0; + } + + // lrelu + if (A == 3) + { + if (G == 0) y = (x > 0) ? x : x * alpha; + if (G == 1) y = (yy > 0) ? x : x * alpha; + } + + // tanh + if (A == 4) + { + if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); } + if (G == 1) y = x * (one - yy * yy); + if (G == 2) y = x * (one - yy * yy) * (-two * yy); + } + + // sigmoid + if (A == 5) + { + if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one); + if (G == 1) y = x * yy * (one - yy); + if (G == 2) y = x * yy * (one - yy) * (one - two * yy); + } + + // elu + if (A == 6) + { + if (G == 0) y = (x >= 0) ? x : exp(x) - one; + if (G == 1) y = (yy >= 0) ? x : x * (yy + one); + if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one); + } + + // selu + if (A == 7) + { + if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one); + if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha); + if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha); + } + + // softplus + if (A == 8) + { + if (G == 0) y = (x > expRange) ? x : log(exp(x) + one); + if (G == 1) y = x * (one - exp(-yy)); + if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); } + } + + // swish + if (A == 9) + { + if (G == 0) + y = (x < -expRange) ? 0 : x / (exp(-x) + one); + else + { + scalar_t c = exp(xref); + scalar_t d = c + one; + if (G == 1) + y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d); + else + y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d); + yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain; + } + } + + // Apply gain. + y *= gain * dy; + + // Clamp. + if (clamp >= 0) + { + if (G == 0) + y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp; + else + y = (yref > -clamp & yref < clamp) ? y : 0; + } + + // Store. + ((T*)p.y)[xi] = (T)y; + } +} + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template void* choose_bias_act_kernel(const bias_act_kernel_params& p) +{ + if (p.act == 1) return (void*)bias_act_kernel; + if (p.act == 2) return (void*)bias_act_kernel; + if (p.act == 3) return (void*)bias_act_kernel; + if (p.act == 4) return (void*)bias_act_kernel; + if (p.act == 5) return (void*)bias_act_kernel; + if (p.act == 6) return (void*)bias_act_kernel; + if (p.act == 7) return (void*)bias_act_kernel; + if (p.act == 8) return (void*)bias_act_kernel; + if (p.act == 9) return (void*)bias_act_kernel; + return NULL; +} + +//------------------------------------------------------------------------ +// Template specializations. + +template void* choose_bias_act_kernel (const bias_act_kernel_params& p); +template void* choose_bias_act_kernel (const bias_act_kernel_params& p); +template void* choose_bias_act_kernel (const bias_act_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/bias_act.h b/eg3d/torch_utils/ops/bias_act.h new file mode 100644 index 0000000000000000000000000000000000000000..8994bfb4e9cae790865348e08de5f685152d3344 --- /dev/null +++ b/eg3d/torch_utils/ops/bias_act.h @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +//------------------------------------------------------------------------ +// CUDA kernel parameters. + +struct bias_act_kernel_params +{ + const void* x; // [sizeX] + const void* b; // [sizeB] or NULL + const void* xref; // [sizeX] or NULL + const void* yref; // [sizeX] or NULL + const void* dy; // [sizeX] or NULL + void* y; // [sizeX] + + int grad; + int act; + float alpha; + float gain; + float clamp; + + int sizeX; + int sizeB; + int stepB; + int loopX; +}; + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template void* choose_bias_act_kernel(const bias_act_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/bias_act.py b/eg3d/torch_utils/ops/bias_act.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f4d39d643f7d6f4fdc64f9cab99be9087b780d --- /dev/null +++ b/eg3d/torch_utils/ops/bias_act.py @@ -0,0 +1,211 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Custom PyTorch ops for efficient bias and activation.""" + +import os +import numpy as np +import torch +import dnnlib + +from .. import custom_ops +from .. 
import misc + +#---------------------------------------------------------------------------- + +activation_funcs = { + 'linear': dnnlib.EasyDict(func=lambda x, **_: x, def_alpha=0, def_gain=1, cuda_idx=1, ref='', has_2nd_grad=False), + 'relu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.relu(x), def_alpha=0, def_gain=np.sqrt(2), cuda_idx=2, ref='y', has_2nd_grad=False), + 'lrelu': dnnlib.EasyDict(func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), cuda_idx=3, ref='y', has_2nd_grad=False), + 'tanh': dnnlib.EasyDict(func=lambda x, **_: torch.tanh(x), def_alpha=0, def_gain=1, cuda_idx=4, ref='y', has_2nd_grad=True), + 'sigmoid': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x), def_alpha=0, def_gain=1, cuda_idx=5, ref='y', has_2nd_grad=True), + 'elu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.elu(x), def_alpha=0, def_gain=1, cuda_idx=6, ref='y', has_2nd_grad=True), + 'selu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.selu(x), def_alpha=0, def_gain=1, cuda_idx=7, ref='y', has_2nd_grad=True), + 'softplus': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.softplus(x), def_alpha=0, def_gain=1, cuda_idx=8, ref='y', has_2nd_grad=True), + 'swish': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x) * x, def_alpha=0, def_gain=np.sqrt(2), cuda_idx=9, ref='x', has_2nd_grad=True), +} + +#---------------------------------------------------------------------------- + +_plugin = None +_null_tensor = torch.empty([0]) + +def _init(): + global _plugin + if _plugin is None: + _plugin = custom_ops.get_plugin( + module_name='bias_act_plugin', + sources=['bias_act.cpp', 'bias_act.cu'], + headers=['bias_act.h'], + source_dir=os.path.dirname(__file__), + extra_cuda_cflags=['--use_fast_math'], + ) + return True + +#---------------------------------------------------------------------------- + +def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'): + r"""Fused bias and activation function. + + Adds bias `b` to activation tensor `x`, evaluates activation function `act`, + and scales the result by `gain`. Each of the steps is optional. In most cases, + the fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports first and second order gradients, + but not third order gradients. + + Args: + x: Input activation tensor. Can be of any shape. + b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type + as `x`. The shape must be known, and it must match the dimension of `x` + corresponding to `dim`. + dim: The dimension in `x` corresponding to the elements of `b`. + The value of `dim` is ignored if `b` is not specified. + act: Name of the activation function to evaluate, or `"linear"` to disable. + Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc. + See `activation_funcs` for a full list. `None` is not allowed. + alpha: Shape parameter for the activation function, or `None` to use the default. + gain: Scaling factor for the output tensor, or `None` to use default. + See `activation_funcs` for the default scaling of each activation function. + If unsure, consider specifying 1. + clamp: Clamp the output values to `[-clamp, +clamp]`, or `None` to disable + the clamping (default). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the same shape and datatype as `x`. 
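+
+     Example:
+         Illustrative sketch only (assumes a CUDA device and that the extension plugin
+         builds); the tensors below are hypothetical.
+
+         >>> import torch
+         >>> x = torch.randn(4, 512, 16, 16, device='cuda')
+         >>> b = torch.zeros(512, device='cuda')
+         >>> y = bias_act(x, b, dim=1, act='lrelu')  # fused bias + leaky ReLU, default gain sqrt(2)
+         >>> y.shape
+         torch.Size([4, 512, 16, 16])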
+ """ + assert isinstance(x, torch.Tensor) + assert impl in ['ref', 'cuda'] + if impl == 'cuda' and x.device.type == 'cuda' and _init(): + return _bias_act_cuda(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(x, b) + return _bias_act_ref(x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp) + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None): + """Slow reference implementation of `bias_act()` using standard TensorFlow ops. + """ + assert isinstance(x, torch.Tensor) + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Add bias. + if b is not None: + assert isinstance(b, torch.Tensor) and b.ndim == 1 + assert 0 <= dim < x.ndim + assert b.shape[0] == x.shape[dim] + x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)]) + + # Evaluate activation function. + alpha = float(alpha) + x = spec.func(x, alpha=alpha) + + # Scale by gain. + gain = float(gain) + if gain != 1: + x = x * gain + + # Clamp. + if clamp >= 0: + x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type + return x + +#---------------------------------------------------------------------------- + +_bias_act_cuda_cache = dict() + +def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None): + """Fast CUDA implementation of `bias_act()` using custom ops. + """ + # Parse arguments. + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Lookup from cache. + key = (dim, act, alpha, gain, clamp) + if key in _bias_act_cuda_cache: + return _bias_act_cuda_cache[key] + + # Forward op. + class BiasActCuda(torch.autograd.Function): + @staticmethod + def forward(ctx, x, b): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(1) == 1 else torch.contiguous_format + x = x.contiguous(memory_format=ctx.memory_format) + b = b.contiguous() if b is not None else _null_tensor + y = x + if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor: + y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor, + b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor, + y if 'y' in spec.ref else _null_tensor) + return y + + @staticmethod + def backward(ctx, dy): # pylint: disable=arguments-differ + dy = dy.contiguous(memory_format=ctx.memory_format) + x, b, y = ctx.saved_tensors + dx = None + db = None + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + dx = dy + if act != 'linear' or gain != 1 or clamp >= 0: + dx = BiasActCudaGrad.apply(dy, x, b, y) + + if ctx.needs_input_grad[1]: + db = dx.sum([i for i in range(dx.ndim) if i != dim]) + + return dx, db + + # Backward op. 
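+     # BiasActCudaGrad evaluates dL/dx by calling the plugin with grad=1 on the saved
+     # reference tensors selected via spec.ref. Its own backward() re-applies the same op
+     # to propagate through dy, and uses the grad=2 plugin path for second-order terms
+     # when spec.has_2nd_grad is set.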
+ class BiasActCudaGrad(torch.autograd.Function): + @staticmethod + def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride(1) == 1 else torch.contiguous_format + dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + dy if spec.has_2nd_grad else _null_tensor, + x, b, y) + return dx + + @staticmethod + def backward(ctx, d_dx): # pylint: disable=arguments-differ + d_dx = d_dx.contiguous(memory_format=ctx.memory_format) + dy, x, b, y = ctx.saved_tensors + d_dy = None + d_x = None + d_b = None + d_y = None + + if ctx.needs_input_grad[0]: + d_dy = BiasActCudaGrad.apply(d_dx, x, b, y) + + if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]): + d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp) + + if spec.has_2nd_grad and ctx.needs_input_grad[2]: + d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) + + return d_dy, d_x, d_b, d_y + + # Add to cache. + _bias_act_cuda_cache[key] = BiasActCuda + return BiasActCuda + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/conv2d_gradfix.py b/eg3d/torch_utils/ops/conv2d_gradfix.py new file mode 100644 index 0000000000000000000000000000000000000000..9a177cc1c0b6eabf16908cf9afaa4387e7716b72 --- /dev/null +++ b/eg3d/torch_utils/ops/conv2d_gradfix.py @@ -0,0 +1,199 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Custom replacement for `torch.nn.functional.conv2d` that supports +arbitrarily high order gradients with zero performance penalty.""" + +import contextlib +import torch + +# pylint: disable=redefined-builtin +# pylint: disable=arguments-differ +# pylint: disable=protected-access + +#---------------------------------------------------------------------------- + +enabled = False # Enable the custom op by setting this to true. +weight_gradients_disabled = False # Forcefully disable computation of gradients with respect to the weights. 
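+
+ # Illustrative usage sketch (hypothetical call site, not part of this module):
+ #
+ #     from torch_utils.ops import conv2d_gradfix
+ #     conv2d_gradfix.enabled = True                # opt in to the custom op globally
+ #     y = conv2d_gradfix.conv2d(x, w, padding=1)   # falls back to F.conv2d when disabled or on CPU
+ #     with conv2d_gradfix.no_weight_gradients():   # suppress dL/dw inside this block
+ #         y_reg = conv2d_gradfix.conv2d(x, w, padding=1)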
+ +@contextlib.contextmanager +def no_weight_gradients(disable=True): + global weight_gradients_disabled + old = weight_gradients_disabled + if disable: + weight_gradients_disabled = True + yield + weight_gradients_disabled = old + +#---------------------------------------------------------------------------- + +def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): + if _should_use_custom_op(input): + return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias) + return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups) + +def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1): + if _should_use_custom_op(input): + return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias) + return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation) + +#---------------------------------------------------------------------------- + +def _should_use_custom_op(input): + assert isinstance(input, torch.Tensor) + if (not enabled) or (not torch.backends.cudnn.enabled): + return False + if input.device.type != 'cuda': + return False + return True + +def _tuple_of_ints(xs, ndim): + xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim + assert len(xs) == ndim + assert all(isinstance(x, int) for x in xs) + return xs + +#---------------------------------------------------------------------------- + +_conv2d_gradfix_cache = dict() +_null_tensor = torch.empty([0]) + +def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups): + # Parse arguments. + ndim = 2 + weight_shape = tuple(weight_shape) + stride = _tuple_of_ints(stride, ndim) + padding = _tuple_of_ints(padding, ndim) + output_padding = _tuple_of_ints(output_padding, ndim) + dilation = _tuple_of_ints(dilation, ndim) + + # Lookup from cache. + key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups) + if key in _conv2d_gradfix_cache: + return _conv2d_gradfix_cache[key] + + # Validate arguments. + assert groups >= 1 + assert len(weight_shape) == ndim + 2 + assert all(stride[i] >= 1 for i in range(ndim)) + assert all(padding[i] >= 0 for i in range(ndim)) + assert all(dilation[i] >= 0 for i in range(ndim)) + if not transpose: + assert all(output_padding[i] == 0 for i in range(ndim)) + else: # transpose + assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim)) + + # Helpers. + common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups) + def calc_output_padding(input_shape, output_shape): + if transpose: + return [0, 0] + return [ + input_shape[i + 2] + - (output_shape[i + 2] - 1) * stride[i] + - (1 - 2 * padding[i]) + - dilation[i] * (weight_shape[i + 2] - 1) + for i in range(ndim) + ] + + # Forward & backward. 
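+     # Conv2d below behaves like torch.nn.functional.conv2d / conv_transpose2d, but routes
+     # its weight gradient through Conv2dGradWeight so that gradients of any order remain
+     # differentiable. calc_output_padding() picks the output_padding that makes the
+     # transposed op in backward() reproduce the original input_shape exactly.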
+ class Conv2d(torch.autograd.Function): + @staticmethod + def forward(ctx, input, weight, bias): + assert weight.shape == weight_shape + ctx.save_for_backward( + input if weight.requires_grad else _null_tensor, + weight if input.requires_grad else _null_tensor, + ) + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere). + if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0) and torch.cuda.get_device_capability(input.device) < (8, 0): + a = weight.reshape(groups, weight_shape[0] // groups, weight_shape[1]) + b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1) + c = (a.transpose(1, 2) if transpose else a) @ b.permute(1, 2, 0, 3).flatten(2) + c = c.reshape(-1, input.shape[0], *input.shape[2:]).transpose(0, 1) + c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(2).unsqueeze(3) + return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format)) + + # General case => cuDNN. + if transpose: + return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs) + return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs) + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + input_shape = ctx.input_shape + grad_input = None + grad_weight = None + grad_bias = None + + if ctx.needs_input_grad[0]: + p = calc_output_padding(input_shape=input_shape, output_shape=grad_output.shape) + op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs) + grad_input = op.apply(grad_output, weight, None) + assert grad_input.shape == input_shape + + if ctx.needs_input_grad[1] and not weight_gradients_disabled: + grad_weight = Conv2dGradWeight.apply(grad_output, input, weight) + assert grad_weight.shape == weight_shape + + if ctx.needs_input_grad[2]: + grad_bias = grad_output.sum([0, 2, 3]) + + return grad_input, grad_weight, grad_bias + + # Gradient with respect to the weights. + class Conv2dGradWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, grad_output, input, weight): + ctx.save_for_backward( + grad_output if input.requires_grad else _null_tensor, + input if grad_output.requires_grad else _null_tensor, + ) + ctx.grad_output_shape = grad_output.shape + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere). + if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0): + a = grad_output.reshape(grad_output.shape[0], groups, grad_output.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2) + b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2) + c = (b @ a.transpose(1, 2) if transpose else a @ b.transpose(1, 2)).reshape(weight_shape) + return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format)) + + # General case => cuDNN. 
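+             # convolution_backward's output_mask selects [grad_input, grad_weight, grad_bias];
+             # only the weight gradient (index 1) is requested and returned here.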
+ return torch.ops.aten.convolution_backward(grad_output=grad_output, input=input, weight=weight, bias_sizes=None, stride=stride, padding=padding, dilation=dilation, transposed=transpose, output_padding=output_padding, groups=groups, output_mask=[False, True, False])[1] + + + @staticmethod + def backward(ctx, grad2_grad_weight): + grad_output, input = ctx.saved_tensors + grad_output_shape = ctx.grad_output_shape + input_shape = ctx.input_shape + grad2_grad_output = None + grad2_input = None + + if ctx.needs_input_grad[0]: + grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None) + assert grad2_grad_output.shape == grad_output_shape + + if ctx.needs_input_grad[1]: + p = calc_output_padding(input_shape=input_shape, output_shape=grad_output_shape) + op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs) + grad2_input = op.apply(grad_output, grad2_grad_weight, None) + assert grad2_input.shape == input_shape + + return grad2_grad_output, grad2_input + + _conv2d_gradfix_cache[key] = Conv2d + return Conv2d + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/conv2d_resample.py b/eg3d/torch_utils/ops/conv2d_resample.py new file mode 100644 index 0000000000000000000000000000000000000000..d46f4ddd85606b9032d08efe3556ecad4676cee5 --- /dev/null +++ b/eg3d/torch_utils/ops/conv2d_resample.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""2D convolution with optional up/downsampling.""" + +import torch + +from .. import misc +from . import conv2d_gradfix +from . import upfirdn2d +from .upfirdn2d import _parse_padding +from .upfirdn2d import _get_filter_size + +#---------------------------------------------------------------------------- + +def _get_weight_shape(w): + with misc.suppress_tracer_warnings(): # this value will be treated as a constant + shape = [int(sz) for sz in w.shape] + misc.assert_shape(w, shape) + return shape + +#---------------------------------------------------------------------------- + +def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True): + """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations. + """ + _out_channels, _in_channels_per_group, kh, kw = _get_weight_shape(w) + + # Flip weight if requested. + # Note: conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False). + if not flip_weight and (kw > 1 or kh > 1): + w = w.flip([2, 3]) + + # Execute using conv2d_gradfix. + op = conv2d_gradfix.conv_transpose2d if transpose else conv2d_gradfix.conv2d + return op(x, w, stride=stride, padding=padding, groups=groups) + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False): + r"""2D convolution with optional up/downsampling. 
+ + Padding is performed only once at the beginning, not between the operations. + + Args: + x: Input tensor of shape + `[batch_size, in_channels, in_height, in_width]`. + w: Weight tensor of shape + `[out_channels, in_channels//groups, kernel_height, kernel_width]`. + f: Low-pass filter for up/downsampling. Must be prepared beforehand by + calling upfirdn2d.setup_filter(). None = identity (default). + up: Integer upsampling factor (default: 1). + down: Integer downsampling factor (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + groups: Split input channels into N groups (default: 1). + flip_weight: False = convolution, True = correlation (default: True). + flip_filter: False = convolution, True = correlation (default: False). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + # Validate arguments. + assert isinstance(x, torch.Tensor) and (x.ndim == 4) + assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype) + assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32) + assert isinstance(up, int) and (up >= 1) + assert isinstance(down, int) and (down >= 1) + assert isinstance(groups, int) and (groups >= 1) + out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w) + fw, fh = _get_filter_size(f) + px0, px1, py0, py1 = _parse_padding(padding) + + # Adjust padding to account for up/downsampling. + if up > 1: + px0 += (fw + up - 1) // 2 + px1 += (fw - up) // 2 + py0 += (fh + up - 1) // 2 + py1 += (fh - up) // 2 + if down > 1: + px0 += (fw - down + 1) // 2 + px1 += (fw - down) // 2 + py0 += (fh - down + 1) // 2 + py1 += (fh - down) // 2 + + # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve. + if kw == 1 and kh == 1 and (down > 1 and up == 1): + x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter) + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + return x + + # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample. + if kw == 1 and kh == 1 and (up > 1 and down == 1): + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter) + return x + + # Fast path: downsampling only => use strided convolution. + if down > 1 and up == 1: + x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter) + x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight) + return x + + # Fast path: upsampling with optional downsampling => use transpose strided convolution. 
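+     # (The weight is regrouped into the [in_channels, out_channels // groups, kh, kw] layout
+     # that conv_transpose2d expects; most of the implicit cropping is folded into its padding
+     # via pxt/pyt, and the small remainder is applied by the trailing upfirdn2d call.)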
+ if up > 1: + if groups == 1: + w = w.transpose(0, 1) + else: + w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw) + w = w.transpose(1, 2) + w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw) + px0 -= kw - 1 + px1 -= kw - up + py0 -= kh - 1 + py1 -= kh - up + pxt = max(min(-px0, -px1), 0) + pyt = max(min(-py0, -py1), 0) + x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight)) + x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter) + if down > 1: + x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter) + return x + + # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d. + if up == 1 and down == 1: + if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0: + return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight) + + # Fallback: Generic reference implementation. + x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter) + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + if down > 1: + x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter) + return x + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/filtered_lrelu.cpp b/eg3d/torch_utils/ops/filtered_lrelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f55466235a020b0f5e150350bfdcd8b2a1e579d --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu.cpp @@ -0,0 +1,304 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include +#include "filtered_lrelu.h" + +//------------------------------------------------------------------------ + +static std::tuple filtered_lrelu( + torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b, torch::Tensor si, + int up, int down, int px0, int px1, int py0, int py1, int sx, int sy, float gain, float slope, float clamp, bool flip_filters, bool writeSigns) +{ + // Set CUDA device. + TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + + // Validate arguments. 
+ TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() && b.device() == x.device(), "all input tensors must reside on the same device"); + TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat, "fu and fd must be float32"); + TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype"); + TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat, "x and b must be float16 or float32"); + TORCH_CHECK(x.dim() == 4, "x must be rank 4"); + TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large"); + TORCH_CHECK(x.numel() > 0, "x is empty"); + TORCH_CHECK((fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2), "fu and fd must be rank 1 or 2"); + TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX, "fu is too large"); + TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX, "fd is too large"); + TORCH_CHECK(fu.numel() > 0, "fu is empty"); + TORCH_CHECK(fd.numel() > 0, "fd is empty"); + TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1), "b must be a vector with the same number of channels as x"); + TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1"); + + // Figure out how much shared memory is available on the device. + int maxSharedBytes = 0; + AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, x.device().index())); + int sharedKB = maxSharedBytes >> 10; + + // Populate enough launch parameters to check if a CUDA kernel exists. + filtered_lrelu_kernel_params p; + p.up = up; + p.down = down; + p.fuShape = make_int2((int)fu.size(-1), fu.dim() == 2 ? (int)fu.size(0) : 0); // shape [n, 0] indicates separable filter. + p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0); + filtered_lrelu_kernel_spec test_spec = choose_filtered_lrelu_kernel(p, sharedKB); + if (!test_spec.exec) + { + // No kernel found - return empty tensors and indicate missing kernel with return code of -1. + return std::make_tuple(torch::Tensor(), torch::Tensor(), -1); + } + + // Input/output element size. + int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4; + + // Input sizes. + int64_t xw = (int)x.size(3); + int64_t xh = (int)x.size(2); + int64_t fut_w = (int)fu.size(-1) - 1; + int64_t fut_h = (int)fu.size(0) - 1; + int64_t fdt_w = (int)fd.size(-1) - 1; + int64_t fdt_h = (int)fd.size(0) - 1; + + // Logical size of upsampled buffer. + int64_t cw = xw * up + (px0 + px1) - fut_w; + int64_t ch = xh * up + (py0 + py1) - fut_h; + TORCH_CHECK(cw > fdt_w && ch > fdt_h, "upsampled buffer must be at least the size of downsampling filter"); + TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large"); + + // Compute output size and allocate. + int64_t yw = (cw - fdt_w + (down - 1)) / down; + int64_t yh = (ch - fdt_h + (down - 1)) / down; + TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1"); + TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large"); + torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(), x.suggest_memory_format()); + + // Allocate sign tensor. + torch::Tensor so; + torch::Tensor s = si; + bool readSigns = !!s.numel(); + int64_t sw_active = 0; // Active width of sign tensor. + if (writeSigns) + { + sw_active = yw * down - (down - 1) + fdt_w; // Active width in elements. + int64_t sh = yh * down - (down - 1) + fdt_h; // Height = active height. + int64_t sw = (sw_active + 15) & ~15; // Width = active width in elements, rounded up to multiple of 16. 
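+         // Each sign entry takes 2 bits (negative flag + clamped flag), so four elements
+         // pack into one byte and the tensor below is allocated with sw >> 2 uint8 columns.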
+ TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large"); + s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous); + } + else if (readSigns) + sw_active = s.size(3) << 2; + + // Validate sign tensor if in use. + if (readSigns || writeSigns) + { + TORCH_CHECK(s.is_contiguous(), "signs must be contiguous"); + TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8"); + TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x"); + TORCH_CHECK(s.dim() == 4, "signs must be rank 4"); + TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x"); + TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX, "signs is too large"); + } + + // Populate rest of CUDA kernel parameters. + p.x = x.data_ptr(); + p.y = y.data_ptr(); + p.b = b.data_ptr(); + p.s = (readSigns || writeSigns) ? s.data_ptr() : 0; + p.fu = fu.data_ptr(); + p.fd = fd.data_ptr(); + p.pad0 = make_int2(px0, py0); + p.gain = gain; + p.slope = slope; + p.clamp = clamp; + p.flip = (flip_filters) ? 1 : 0; + p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0)); + p.yShape = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0)); + p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3), (int)s.size(2)) : make_int2(0, 0); // Width is in bytes. Contiguous. + p.sOfs = make_int2(sx, sy); + p.swLimit = (sw_active + 3) >> 2; // Rounded up to bytes. + + // x, y, b strides are in bytes. + p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2), sz * x.stride(1), sz * x.stride(0)); + p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2), sz * y.stride(1), sz * y.stride(0)); + p.bStride = sz * b.stride(0); + + // fu, fd strides are in elements. + p.fuStride = make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0); + p.fdStride = make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0); + + // Determine if indices don't fit in int32. Support negative strides although Torch currently never produces those. + bool index64b = false; + if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true; + if (std::min(x.size(0) * p.xStride.w, 0ll) + std::min(x.size(1) * p.xStride.z, 0ll) + std::min(x.size(2) * p.xStride.y, 0ll) + std::min(x.size(3) * p.xStride.x, 0ll) < -INT_MAX) index64b = true; + if (std::max(x.size(0) * p.xStride.w, 0ll) + std::max(x.size(1) * p.xStride.z, 0ll) + std::max(x.size(2) * p.xStride.y, 0ll) + std::max(x.size(3) * p.xStride.x, 0ll) > INT_MAX) index64b = true; + if (std::min(y.size(0) * p.yStride.w, 0ll) + std::min(y.size(1) * p.yStride.z, 0ll) + std::min(y.size(2) * p.yStride.y, 0ll) + std::min(y.size(3) * p.yStride.x, 0ll) < -INT_MAX) index64b = true; + if (std::max(y.size(0) * p.yStride.w, 0ll) + std::max(y.size(1) * p.yStride.z, 0ll) + std::max(y.size(2) * p.yStride.y, 0ll) + std::max(y.size(3) * p.yStride.x, 0ll) > INT_MAX) index64b = true; + if (s.numel() > INT_MAX) index64b = true; + + // Choose CUDA kernel. + filtered_lrelu_kernel_spec spec = { 0 }; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_cuda", [&] + { + if constexpr (sizeof(scalar_t) <= 4) // Exclude doubles. constexpr prevents template instantiation. + { + // Choose kernel based on index type, datatype and sign read/write modes. 
+ if (!index64b && writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + else if (!index64b && !writeSigns && readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + else if (!index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + else if ( index64b && writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + else if ( index64b && !writeSigns && readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + else if ( index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel(p, sharedKB); + } + }); + TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found") // This should not happen because we tested earlier that kernel exists. + + // Launch CUDA kernel. + void* args[] = {&p}; + int bx = spec.numWarps * 32; + int gx = (p.yShape.x - 1) / spec.tileOut.x + 1; + int gy = (p.yShape.y - 1) / spec.tileOut.y + 1; + int gz = p.yShape.z * p.yShape.w; + + // Repeat multiple horizontal tiles in a CTA? + if (spec.xrep) + { + p.tilesXrep = spec.xrep; + p.tilesXdim = gx; + + gx = (gx + p.tilesXrep - 1) / p.tilesXrep; + std::swap(gx, gy); + } + else + { + p.tilesXrep = 0; + p.tilesXdim = 0; + } + + // Launch filter setup kernel. + AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0, at::cuda::getCurrentCUDAStream())); + + // Copy kernels to constant memory. + if ( writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream()))); + else if (!writeSigns && readSigns) AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream()))); + else if (!writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream()))); + + // Set cache and shared memory configurations for main kernel. + AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared)); + if (spec.dynamicSharedKB) // Need dynamically allocated shared memory? + AT_CUDA_CHECK(cudaFuncSetAttribute(spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize, spec.dynamicSharedKB << 10)); + AT_CUDA_CHECK(cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte)); + + // Launch main kernel. + const int maxSubGz = 65535; // CUDA maximum for block z dimension. + for (int zofs=0; zofs < gz; zofs += maxSubGz) // Do multiple launches if gz is too big. + { + p.blockZofs = zofs; + int subGz = std::min(maxSubGz, gz - zofs); + AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args, spec.dynamicSharedKB << 10, at::cuda::getCurrentCUDAStream())); + } + + // Done. + return std::make_tuple(y, so, 0); +} + +//------------------------------------------------------------------------ + +static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor si, int sx, int sy, float gain, float slope, float clamp, bool writeSigns) +{ + // Set CUDA device. + TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + + // Validate arguments. + TORCH_CHECK(x.dim() == 4, "x must be rank 4"); + TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large"); + TORCH_CHECK(x.numel() > 0, "x is empty"); + TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat || x.dtype() == torch::kDouble, "x must be float16, float32 or float64"); + + // Output signs if we don't have sign input. 
+ torch::Tensor so; + torch::Tensor s = si; + bool readSigns = !!s.numel(); + if (writeSigns) + { + int64_t sw = x.size(3); + sw = (sw + 15) & ~15; // Round to a multiple of 16 for coalescing. + s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous); + } + + // Validate sign tensor if in use. + if (readSigns || writeSigns) + { + TORCH_CHECK(s.is_contiguous(), "signs must be contiguous"); + TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8"); + TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x"); + TORCH_CHECK(s.dim() == 4, "signs must be rank 4"); + TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x"); + TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX, "signs tensor is too large"); + } + + // Initialize CUDA kernel parameters. + filtered_lrelu_act_kernel_params p; + p.x = x.data_ptr(); + p.s = (readSigns || writeSigns) ? s.data_ptr() : 0; + p.gain = gain; + p.slope = slope; + p.clamp = clamp; + p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0)); + p.xStride = make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0)); + p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3) << 2, (int)s.size(2)) : make_int2(0, 0); // Width is in elements. Contiguous. + p.sOfs = make_int2(sx, sy); + + // Choose CUDA kernel. + void* func = 0; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_act_cuda", [&] + { + if (writeSigns) + func = choose_filtered_lrelu_act_kernel(); + else if (readSigns) + func = choose_filtered_lrelu_act_kernel(); + else + func = choose_filtered_lrelu_act_kernel(); + }); + TORCH_CHECK(func, "internal error - CUDA kernel not found"); + + // Launch CUDA kernel. + void* args[] = {&p}; + int bx = 128; // 4 warps per block. + + // Logical size of launch = writeSigns ? p.s : p.x + uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x; + uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y; + uint32_t gz = p.xShape.z * p.xShape.w; // Same as in p.sShape if signs are in use. + gx = (gx - 1) / bx + 1; + + // Make sure grid y and z dimensions are within CUDA launch limits. Kernel loops internally to do the rest. + const uint32_t gmax = 65535; + gy = std::min(gy, gmax); + gz = std::min(gz, gmax); + + // Launch. + AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0, at::cuda::getCurrentCUDAStream())); + return so; +} + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("filtered_lrelu", &filtered_lrelu); // The whole thing. + m.def("filtered_lrelu_act_", &filtered_lrelu_act); // Activation and sign tensor handling only. Modifies data tensor in-place. +} + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/filtered_lrelu.cu b/eg3d/torch_utils/ops/filtered_lrelu.cu new file mode 100644 index 0000000000000000000000000000000000000000..aaac95408365f023ffaa4cb89348d499d3b948f0 --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu.cu @@ -0,0 +1,1288 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include "filtered_lrelu.h" +#include + +//------------------------------------------------------------------------ +// Helpers. + +enum // Filter modes. +{ + MODE_SUSD = 0, // Separable upsampling, separable downsampling. + MODE_FUSD = 1, // Full upsampling, separable downsampling. + MODE_SUFD = 2, // Separable upsampling, full downsampling. + MODE_FUFD = 3, // Full upsampling, full downsampling. +}; + +template struct InternalType; +template <> struct InternalType +{ + typedef double scalar_t; typedef double2 vec2_t; typedef double4 vec4_t; + __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_double2(0, 0); } + __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_double4(0, 0, 0, 0); } + __device__ __forceinline__ static double clamp(double x, double c) { return fmin(fmax(x, -c), c); } +}; +template <> struct InternalType +{ + typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t; + __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); } + __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); } + __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); } +}; +template <> struct InternalType +{ + typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t; + __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); } + __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); } + __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); } +}; + +#define MIN(A, B) ((A) < (B) ? (A) : (B)) +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define CEIL_DIV(A, B) (((B)==1) ? (A) : \ + ((B)==2) ? ((int)((A)+1) >> 1) : \ + ((B)==4) ? ((int)((A)+3) >> 2) : \ + (((A) + ((A) > 0 ? (B) - 1 : 0)) / (B))) + +// This works only up to blocks of size 256 x 256 and for all N that are powers of two. +template __device__ __forceinline__ void fast_div_mod(int& x, int& y, unsigned int i) +{ + if ((N & (N-1)) && N <= 256) + y = (i * ((1<<24)/N + 1)) >> 24; // Assumes N <= 256, i < N*256. + else + y = i/N; + + x = i - y*N; +} + +// Type cast stride before reading it. +template __device__ __forceinline__ T get_stride(const int64_t& x) +{ + return *reinterpret_cast(&x); +} + +//------------------------------------------------------------------------ +// Filters, setup kernel, copying function. + +#define MAX_FILTER_SIZE 32 + +// Combined up/down filter buffers so that transfer can be done with one copy. +__device__ float g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in global memory, written by setup kernel. +__device__ __constant__ float c_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in constant memory, read by main kernel. + +// Accessors to combined buffers to index up/down filters individually. 
+#define c_fu (c_fbuf) +#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE) +#define g_fu (g_fbuf) +#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE) + +// Set up filters into global memory buffer. +static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p) +{ + for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE; idx += blockDim.x) + { + int x, y; + fast_div_mod(x, y, idx); + + int fu_x = p.flip ? x : (p.fuShape.x - 1 - x); + int fu_y = p.flip ? y : (p.fuShape.y - 1 - y); + if (p.fuShape.y > 0) + g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y) ? 0.0f : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y]; + else + g_fu[idx] = (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x]; + + int fd_x = p.flip ? x : (p.fdShape.x - 1 - x); + int fd_y = p.flip ? y : (p.fdShape.y - 1 - y); + if (p.fdShape.y > 0) + g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y) ? 0.0f : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y]; + else + g_fd[idx] = (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x]; + } +} + +// Host function to copy filters written by setup kernel into constant buffer for main kernel. +template static cudaError_t copy_filters(cudaStream_t stream) +{ + void* src = 0; + cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf); + if (err) return err; + return cudaMemcpyToSymbolAsync(c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream); +} + +//------------------------------------------------------------------------ +// Coordinate spaces: +// - Relative to input tensor: inX, inY, tileInX, tileInY +// - Relative to input tile: relInX, relInY, tileInW, tileInH +// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH +// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH +// - Relative to output tensor: outX, outY, tileOutX, tileOutY +// +// Relationships between coordinate spaces: +// - inX = tileInX + relInX +// - inY = tileInY + relInY +// - relUpX = relInX * up + phaseInX +// - relUpY = relInY * up + phaseInY +// - relUpX = relOutX * down +// - relUpY = relOutY * down +// - outX = tileOutX + relOutX +// - outY = tileOutY + relOutY + +extern __shared__ char s_buf_raw[]; // When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer. + +template +static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) +{ + // Check that we don't try to support non-existing filter modes. 
+ static_assert(up == 1 || up == 2 || up == 4, "only up=1, up=2, up=4 scales supported"); + static_assert(down == 1 || down == 2 || down == 4, "only down=1, down=2, down=4 scales supported"); + static_assert(fuSize >= up, "upsampling filter size must be at least upsampling factor"); + static_assert(fdSize >= down, "downsampling filter size must be at least downsampling factor"); + static_assert(fuSize % up == 0, "upsampling filter size must be divisible with upsampling factor"); + static_assert(fdSize % down == 0, "downsampling filter size must be divisible with downsampling factor"); + static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE, "filter size greater than MAX_FILTER_SIZE"); + static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "up=1 supported only for 1x1 full filters"); + static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "down=1 supported only for 1x1 full filters"); + static_assert(!(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "full filters not supported for up=4"); + static_assert(!(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "full filters not supported for down=4"); + + // Static definitions. + typedef typename InternalType::scalar_t scalar_t; + typedef typename InternalType::vec2_t vec2_t; + typedef typename InternalType::vec4_t vec4_t; + const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) & ~3; // Upsampled tile width, rounded up to multiple of 4. + const int tileUpH = tileOutH * down + (fdSize - 1) - (down - 1); // Upsampled tile height. + const int tileInW = CEIL_DIV(tileUpW + (fuSize - 1), up); // Input tile width. + const int tileInH = CEIL_DIV(tileUpH + (fuSize - 1), up); // Input tile height. + const int tileUpH_up = CEIL_DIV(tileUpH, up) * up; // Upsampled tile height rounded up to a multiple of up. + const int tileInH_up = CEIL_DIV(tileUpH_up + (fuSize - 1), up); // For allocations only, to avoid shared memory read overruns with up=2 and up=4. + + // Merge 1x1 downsampling into last upsampling step for upf1 and ups2. + const bool downInline = (down == 1) && ((up == 1 && filterMode == MODE_FUFD) || (up == 2 && filterMode == MODE_SUFD)); + + // Sizes of logical buffers. + const int szIn = tileInH_up * tileInW; + const int szUpX = tileInH_up * tileUpW; + const int szUpXY = downInline ? 0 : (tileUpH * tileUpW); + const int szDownX = tileUpH * tileOutW; + + // Sizes for shared memory arrays. + const int s_buf0_size_base = + (filterMode == MODE_SUSD) ? MAX(szIn, szUpXY) : + (filterMode == MODE_FUSD) ? MAX(szIn, szDownX) : + (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY) : + (filterMode == MODE_FUFD) ? szIn : + -1; + const int s_buf1_size_base = + (filterMode == MODE_SUSD) ? MAX(szUpX, szDownX) : + (filterMode == MODE_FUSD) ? szUpXY : + (filterMode == MODE_SUFD) ? szUpX : + (filterMode == MODE_FUFD) ? szUpXY : + -1; + + // Ensure U128 alignment. + const int s_buf0_size = (s_buf0_size_base + 3) & ~3; + const int s_buf1_size = (s_buf1_size_base + 3) & ~3; + + // Check at compile time that we don't use too much shared memory. + static_assert((s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10), "shared memory overflow"); + + // Declare shared memory arrays. + scalar_t* s_buf0; + scalar_t* s_buf1; + if (sharedKB <= 48) + { + // Allocate shared memory arrays here. + __shared__ scalar_t s_buf0_st[(sharedKB > 48) ? 
(1<<24) : (s_buf0_size + s_buf1_size)]; // Prevent launching if this isn't optimized away when unused. + s_buf0 = s_buf0_st; + s_buf1 = s_buf0 + s_buf0_size; + } + else + { + // Use the dynamically allocated shared memory array. + s_buf0 = (scalar_t*)s_buf_raw; + s_buf1 = s_buf0 + s_buf0_size; + } + + // Pointers to the buffers. + scalar_t* s_tileIn; // Input tile: [relInX * tileInH + relInY] + scalar_t* s_tileUpX; // After horizontal upsampling: [relInY * tileUpW + relUpX] + scalar_t* s_tileUpXY; // After upsampling: [relUpY * tileUpW + relUpX] + scalar_t* s_tileDownX; // After horizontal downsampling: [relUpY * tileOutW + relOutX] + if (filterMode == MODE_SUSD) + { + s_tileIn = s_buf0; + s_tileUpX = s_buf1; + s_tileUpXY = s_buf0; + s_tileDownX = s_buf1; + } + else if (filterMode == MODE_FUSD) + { + s_tileIn = s_buf0; + s_tileUpXY = s_buf1; + s_tileDownX = s_buf0; + } + else if (filterMode == MODE_SUFD) + { + s_tileIn = s_buf0; + s_tileUpX = s_buf1; + s_tileUpXY = s_buf0; + } + else if (filterMode == MODE_FUFD) + { + s_tileIn = s_buf0; + s_tileUpXY = s_buf1; + } + + // Allow large grids in z direction via per-launch offset. + int channelIdx = blockIdx.z + p.blockZofs; + int batchIdx = channelIdx / p.yShape.z; + channelIdx -= batchIdx * p.yShape.z; + + // Offset to output feature map. In bytes. + index_t mapOfsOut = channelIdx * get_stride(p.yStride.z) + batchIdx * get_stride(p.yStride.w); + + // Sign shift amount. + uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6; + + // Inner tile loop. + #pragma unroll 1 + for (int tileIdx = 0; !enableXrep || (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y)); tileIdx++) + { + // Locate output tile. + int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x; + int tileOutX = tileX * tileOutW; + int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH; + + // Locate input tile. + int tmpX = tileOutX * down - p.pad0.x; + int tmpY = tileOutY * down - p.pad0.y; + int tileInX = CEIL_DIV(tmpX, up); + int tileInY = CEIL_DIV(tmpY, up); + const int phaseInX = tileInX * up - tmpX; + const int phaseInY = tileInY * up - tmpY; + + // Extra sync if input and output buffers are the same and we are not on first tile. + if (enableXrep && tileIdx > 0 && (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) || (filterMode == MODE_FUFD && downInline))) + __syncthreads(); + + // Load input tile & apply bias. Unrolled. + scalar_t b = (scalar_t)*(const T*)((const char*)p.b + (channelIdx * get_stride(p.bStride))); + index_t mapOfsIn = channelIdx * get_stride(p.xStride.z) + batchIdx * get_stride(p.xStride.w); + int idx = threadIdx.x; + const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock); + #pragma unroll + for (int loop = 0; loop < loopCountIN; loop++) + { + int relInX, relInY; + fast_div_mod(relInX, relInY, idx); + int inX = tileInX + relInX; + int inY = tileInY + relInY; + scalar_t v = 0; + + if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y) + v = (scalar_t)*((const T*)((const char*)p.x + (inX * get_stride(p.xStride.x) + inY * get_stride(p.xStride.y) + mapOfsIn))) + b; + + bool skip = (loop == loopCountIN-1) && (idx >= tileInW * tileInH); + if (!skip) + s_tileIn[idx] = v; + + idx += threadsPerBlock; + } + + if (filterMode == MODE_SUSD || filterMode == MODE_SUFD) // Separable upsampling filter. + { + // Horizontal upsampling. 
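+             // Separable path: filter along x into s_tileUpX here, then filter along y below,
+             // fusing the leaky-ReLU / clamp and the sign read/write into that vertical pass.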
+ __syncthreads(); + if (up == 4) + { + for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up) + { + int relUpX0, relInY; + fast_div_mod(relUpX0, relInY, idx); + int relInX0 = relUpX0 / up; + int src0 = relInX0 + tileInW * relInY; + int dst = relInY * tileUpW + relUpX0; + vec4_t v = InternalType::zero_vec4(); + scalar_t a = s_tileIn[src0]; + if (phaseInX == 0) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + v.y += a * (scalar_t)c_fu[step * up + 3]; + v.z += a * (scalar_t)c_fu[step * up + 2]; + v.w += a * (scalar_t)c_fu[step * up + 1]; + } + } + else if (phaseInX == 1) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 1]; + v.y += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + v.z += a * (scalar_t)c_fu[step * up + 3]; + v.w += a * (scalar_t)c_fu[step * up + 2]; + } + } + else if (phaseInX == 2) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 2]; + v.y += a * (scalar_t)c_fu[step * up + 1]; + v.z += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + v.w += a * (scalar_t)c_fu[step * up + 3]; + } + } + else // (phaseInX == 3) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 3]; + v.y += a * (scalar_t)c_fu[step * up + 2]; + v.z += a * (scalar_t)c_fu[step * up + 1]; + v.w += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + } + } + s_tileUpX[dst+0] = v.x; + s_tileUpX[dst+1] = v.y; + s_tileUpX[dst+2] = v.z; + s_tileUpX[dst+3] = v.w; + } + } + else if (up == 2) + { + bool p0 = (phaseInX == 0); + for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up) + { + int relUpX0, relInY; + fast_div_mod(relUpX0, relInY, idx); + int relInX0 = relUpX0 / up; + int src0 = relInX0 + tileInW * relInY; + int dst = relInY * tileUpW + relUpX0; + vec2_t v = InternalType::zero_vec2(); + scalar_t a = s_tileIn[src0]; + if (p0) // (phaseInX == 0) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + v.y += a * (scalar_t)c_fu[step * up + 1]; + } + } + else // (phaseInX == 1) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 1]; + v.y += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileIn[src0 + step + 1]; + } + } + s_tileUpX[dst+0] = v.x; + s_tileUpX[dst+1] = v.y; + } + } + + // Vertical upsampling & nonlinearity. + + __syncthreads(); + int groupMask = 15 << ((threadIdx.x & 31) & ~3); + int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs. + int sShapeMaxY = MIN(p.sShape.y, tileOutY * down + tileUpH); // Avoid out-of-tile sign writes. + if (up == 4) + { + minY -= 3; // Adjust according to block height. 
+ for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x) + { + int relUpX, relInY0; + fast_div_mod(relUpX, relInY0, idx); + int relUpY0 = relInY0 * up; + int src0 = relInY0 * tileUpW + relUpX; + int dst = relUpY0 * tileUpW + relUpX; + vec4_t v = InternalType::zero_vec4(); + + scalar_t a = s_tileUpX[src0]; + if (phaseInY == 0) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + v.y += a * (scalar_t)c_fu[step * up + 3]; + v.z += a * (scalar_t)c_fu[step * up + 2]; + v.w += a * (scalar_t)c_fu[step * up + 1]; + } + } + else if (phaseInY == 1) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 1]; + v.y += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + v.z += a * (scalar_t)c_fu[step * up + 3]; + v.w += a * (scalar_t)c_fu[step * up + 2]; + } + } + else if (phaseInY == 2) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 2]; + v.y += a * (scalar_t)c_fu[step * up + 1]; + v.z += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + v.w += a * (scalar_t)c_fu[step * up + 3]; + } + } + else // (phaseInY == 3) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 3]; + v.y += a * (scalar_t)c_fu[step * up + 2]; + v.z += a * (scalar_t)c_fu[step * up + 1]; + v.w += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + } + } + + int x = tileOutX * down + relUpX; + int y = tileOutY * down + relUpY0; + int signX = x + p.sOfs.x; + int signY = y + p.sOfs.y; + int signZ = blockIdx.z + p.blockZofs; + int signXb = signX >> 2; + index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ); + index_t si1 = si0 + p.sShape.x; + index_t si2 = si0 + p.sShape.x * 2; + index_t si3 = si0 + p.sShape.x * 3; + + v.x *= (scalar_t)((float)up * (float)up * p.gain); + v.y *= (scalar_t)((float)up * (float)up * p.gain); + v.z *= (scalar_t)((float)up * (float)up * p.gain); + v.w *= (scalar_t)((float)up * (float)up * p.gain); + + if (signWrite) + { + if (!enableWriteSkip) + { + // Determine and write signs. + int sx = __float_as_uint(v.x) >> 31 << 0; + int sy = __float_as_uint(v.y) >> 31 << 8; + int sz = __float_as_uint(v.z) >> 31 << 16; + int sw = __float_as_uint(v.w) >> 31 << 24; + if (sx) v.x *= p.slope; + if (sy) v.y *= p.slope; + if (sz) v.z *= p.slope; + if (sw) v.w *= p.slope; + if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType::clamp(v.x, p.clamp); } + if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType::clamp(v.y, p.clamp); } + if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType::clamp(v.z, p.clamp); } + if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType::clamp(v.w, p.clamp); } + + if ((uint32_t)signXb < p.swLimit && signY >= minY) + { + // Combine signs. + uint32_t s = sx + sy + sw + sz; + s <<= (signX & 3) << 1; + s |= __shfl_xor_sync(groupMask, s, 1); + s |= __shfl_xor_sync(groupMask, s, 2); + + // Write signs. 
+ if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); } + if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); } + if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); } + if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); } + } + } + else + { + // Determine and write signs. + if ((uint32_t)signXb < p.swLimit && signY >= minY) + { + int sx = __float_as_uint(v.x) >> 31 << 0; + int sy = __float_as_uint(v.y) >> 31 << 8; + int sz = __float_as_uint(v.z) >> 31 << 16; + int sw = __float_as_uint(v.w) >> 31 << 24; + if (sx) v.x *= p.slope; + if (sy) v.y *= p.slope; + if (sz) v.z *= p.slope; + if (sw) v.w *= p.slope; + if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType::clamp(v.x, p.clamp); } + if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType::clamp(v.y, p.clamp); } + if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType::clamp(v.z, p.clamp); } + if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType::clamp(v.w, p.clamp); } + + // Combine signs. + uint32_t s = sx + sy + sw + sz; + s <<= (signX & 3) << 1; + s |= __shfl_xor_sync(groupMask, s, 1); + s |= __shfl_xor_sync(groupMask, s, 2); + + // Write signs. + if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); } + if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); } + if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); } + if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); } + } + else + { + // Just compute the values. + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + if (v.z < 0.f) v.z *= p.slope; v.z = InternalType::clamp(v.z, p.clamp); + if (v.w < 0.f) v.w *= p.slope; v.w = InternalType::clamp(v.w, p.clamp); + } + } + } + else if (signRead) // Read signs and apply. + { + if ((uint32_t)signXb < p.swLimit) + { + int ss = (signX & 3) << 1; + if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> ss; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; } + if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> ss; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; } + if ((uint32_t)(signY + 2) < p.sShape.y) { int s = p.s[si2] >> ss; if (s & 1) v.z *= p.slope; if (s & 2) v.z = 0.f; } + if ((uint32_t)(signY + 3) < p.sShape.y) { int s = p.s[si3] >> ss; if (s & 1) v.w *= p.slope; if (s & 2) v.w = 0.f; } + } + } + else // Forward pass with no sign write. + { + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + if (v.z < 0.f) v.z *= p.slope; v.z = InternalType::clamp(v.z, p.clamp); + if (v.w < 0.f) v.w *= p.slope; v.w = InternalType::clamp(v.w, p.clamp); + } + + s_tileUpXY[dst + 0 * tileUpW] = v.x; + if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y; + if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z; + if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w; + } + } + else if (up == 2) + { + minY -= 1; // Adjust according to block height. 
+ for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x) + { + int relUpX, relInY0; + fast_div_mod(relUpX, relInY0, idx); + int relUpY0 = relInY0 * up; + int src0 = relInY0 * tileUpW + relUpX; + int dst = relUpY0 * tileUpW + relUpX; + vec2_t v = InternalType::zero_vec2(); + + scalar_t a = s_tileUpX[src0]; + if (phaseInY == 0) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + v.y += a * (scalar_t)c_fu[step * up + 1]; + } + } + else // (phaseInY == 1) + { + #pragma unroll + for (int step = 0; step < fuSize / up; step++) + { + v.x += a * (scalar_t)c_fu[step * up + 1]; + v.y += a * (scalar_t)c_fu[step * up + 0]; + a = s_tileUpX[src0 + (step + 1) * tileUpW]; + } + } + + int x = tileOutX * down + relUpX; + int y = tileOutY * down + relUpY0; + int signX = x + p.sOfs.x; + int signY = y + p.sOfs.y; + int signZ = blockIdx.z + p.blockZofs; + int signXb = signX >> 2; + index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ); + index_t si1 = si0 + p.sShape.x; + + v.x *= (scalar_t)((float)up * (float)up * p.gain); + v.y *= (scalar_t)((float)up * (float)up * p.gain); + + if (signWrite) + { + if (!enableWriteSkip) + { + // Determine and write signs. + int sx = __float_as_uint(v.x) >> 31 << 0; + int sy = __float_as_uint(v.y) >> 31 << 8; + if (sx) v.x *= p.slope; + if (sy) v.y *= p.slope; + if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType::clamp(v.x, p.clamp); } + if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType::clamp(v.y, p.clamp); } + + if ((uint32_t)signXb < p.swLimit && signY >= minY) + { + // Combine signs. + int s = sx + sy; + s <<= signXo; + s |= __shfl_xor_sync(groupMask, s, 1); + s |= __shfl_xor_sync(groupMask, s, 2); + + // Write signs. + if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); } + if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); } + } + } + else + { + // Determine and write signs. + if ((uint32_t)signXb < p.swLimit && signY >= minY) + { + int sx = __float_as_uint(v.x) >> 31 << 0; + int sy = __float_as_uint(v.y) >> 31 << 8; + if (sx) v.x *= p.slope; + if (sy) v.y *= p.slope; + if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType::clamp(v.x, p.clamp); } + if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType::clamp(v.y, p.clamp); } + + // Combine signs. + int s = sx + sy; + s <<= signXo; + s |= __shfl_xor_sync(groupMask, s, 1); + s |= __shfl_xor_sync(groupMask, s, 2); + + // Write signs. + if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); } + if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); } + } + else + { + // Just compute the values. + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + } + } + } + else if (signRead) // Read signs and apply. + { + if ((uint32_t)signXb < p.swLimit) + { + if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> signXo; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; } + if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> signXo; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; } + } + } + else // Forward pass with no sign write. + { + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + } + + if (!downInline) + { + // Write into temporary buffer. 
+ s_tileUpXY[dst] = v.x; + if (relUpY0 < tileUpH - 1) + s_tileUpXY[dst + tileUpW] = v.y; + } + else + { + // Write directly into output buffer. + if ((uint32_t)x < p.yShape.x) + { + int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down); + index_t ofs = x * get_stride(p.yStride.x) + y * get_stride(p.yStride.y) + mapOfsOut; + if ((uint32_t)y + 0 < p.yShape.y) *((T*)((char*)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]); + if ((uint32_t)y + 1 < ymax) *((T*)((char*)p.y + ofs + get_stride(p.yStride.y))) = (T)(v.y * (scalar_t)c_fd[0]); + } + } + } + } + } + else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD) + { + // Full upsampling filter. + + if (up == 2) + { + // 2 x 2-wide. + __syncthreads(); + int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y : 0; // Skip already written signs. + for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH; idx += blockDim.x * 4) + { + int relUpX0, relUpY0; + fast_div_mod(relUpX0, relUpY0, idx); + int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up); + int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up); + int src0 = relInX0 + tileInW * relInY0; + int tap0y = (relInY0 * up + phaseInY - relUpY0); + + #define X_LOOP(TAPY, PX) \ + for (int sx = 0; sx < fuSize / up; sx++) \ + { \ + v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \ + v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 0) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \ + v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \ + v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 1) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \ + } + + vec4_t v = InternalType::zero_vec4(); + if (tap0y == 0 && phaseInX == 0) + #pragma unroll + for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1]; + #pragma unroll + X_LOOP(0, 0) } + if (tap0y == 0 && phaseInX == 1) + #pragma unroll + for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1]; + #pragma unroll + X_LOOP(0, 1) } + if (tap0y == 1 && phaseInX == 0) + #pragma unroll + for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1]; + #pragma unroll + X_LOOP(1, 0) } + if (tap0y == 1 && phaseInX == 1) + #pragma unroll + for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1]; + #pragma unroll + X_LOOP(1, 1) } + + #undef X_LOOP + + int x = tileOutX * down + relUpX0; + int y = tileOutY * down + relUpY0; + int signX = x + p.sOfs.x; + int signY = y + p.sOfs.y; + int signZ = blockIdx.z + p.blockZofs; + int signXb = signX >> 2; + index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ); + + v.x *= (scalar_t)((float)up * (float)up * p.gain); + v.y *= (scalar_t)((float)up * (float)up * p.gain); + v.z *= (scalar_t)((float)up * (float)up * p.gain); + v.w *= (scalar_t)((float)up * (float)up * p.gain); + + if (signWrite) + { + if (!enableWriteSkip) + { + // Determine and write signs. 
+ int sx = __float_as_uint(v.x) >> 31; + int sy = __float_as_uint(v.y) >> 31; + int sz = __float_as_uint(v.z) >> 31; + int sw = __float_as_uint(v.w) >> 31; + if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType::clamp(v.x, p.clamp); } + if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType::clamp(v.y, p.clamp); } + if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType::clamp(v.z, p.clamp); } + if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType::clamp(v.w, p.clamp); } + + if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY) + { + p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6); + } + } + else + { + // Determine and write signs. + if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY) + { + int sx = __float_as_uint(v.x) >> 31; + int sy = __float_as_uint(v.y) >> 31; + int sz = __float_as_uint(v.z) >> 31; + int sw = __float_as_uint(v.w) >> 31; + if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType::clamp(v.x, p.clamp); } + if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType::clamp(v.y, p.clamp); } + if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType::clamp(v.z, p.clamp); } + if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType::clamp(v.w, p.clamp); } + + p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6); + } + else + { + // Just compute the values. + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + if (v.z < 0.f) v.z *= p.slope; v.z = InternalType::clamp(v.z, p.clamp); + if (v.w < 0.f) v.w *= p.slope; v.w = InternalType::clamp(v.w, p.clamp); + } + } + } + else if (signRead) // Read sign and apply. + { + if ((uint32_t)signY < p.sShape.y) + { + int s = 0; + if ((uint32_t)signXb < p.swLimit) s = p.s[si]; + if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8; + s >>= (signX & 3) << 1; + if (s & 0x01) v.x *= p.slope; if (s & 0x02) v.x = 0.f; + if (s & 0x04) v.y *= p.slope; if (s & 0x08) v.y = 0.f; + if (s & 0x10) v.z *= p.slope; if (s & 0x20) v.z = 0.f; + if (s & 0x40) v.w *= p.slope; if (s & 0x80) v.w = 0.f; + } + } + else // Forward pass with no sign write. + { + if (v.x < 0.f) v.x *= p.slope; v.x = InternalType::clamp(v.x, p.clamp); + if (v.y < 0.f) v.y *= p.slope; v.y = InternalType::clamp(v.y, p.clamp); + if (v.z < 0.f) v.z *= p.slope; v.z = InternalType::clamp(v.z, p.clamp); + if (v.w < 0.f) v.w *= p.slope; v.w = InternalType::clamp(v.w, p.clamp); + } + + s_tileUpXY[idx + 0] = v.x; + s_tileUpXY[idx + 1] = v.y; + s_tileUpXY[idx + 2] = v.z; + s_tileUpXY[idx + 3] = v.w; + } + } + else if (up == 1) + { + __syncthreads(); + uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3); + int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs. + for (int idx = threadIdx.x; idx < tileUpW * tileUpH; idx += blockDim.x) + { + int relUpX0, relUpY0; + fast_div_mod(relUpX0, relUpY0, idx); + scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0]; // 1x1 filter. 
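In all of these branches every sample is scaled by up²·gain before the nonlinearity: zero-stuffing by a factor of `up` in each dimension leaves only one in up² samples nonzero, so the extra up² restores the signal magnitude, and `gain` is the user-supplied factor (the Python wrapper defaults it to sqrt(2)). A one-value sketch of the activation applied here, using a hypothetical helper and illustrative numbers:

import numpy as np

def act(v, up, gain, slope, clamp):
    v = v * (up * up * gain)           # magnitude compensation plus user gain
    if v < 0.0:
        v *= slope                     # leaky ReLU on the negative side
    return float(np.clip(v, -clamp, clamp))

print(act(-0.5, up=2, gain=np.sqrt(2), slope=0.2, clamp=256.0))   # -> about -0.566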
+ + int x = tileOutX * down + relUpX0; + int y = tileOutY * down + relUpY0; + int signX = x + p.sOfs.x; + int signY = y + p.sOfs.y; + int signZ = blockIdx.z + p.blockZofs; + int signXb = signX >> 2; + index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ); + v *= (scalar_t)((float)up * (float)up * p.gain); + + if (signWrite) + { + if (!enableWriteSkip) + { + // Determine and write sign. + uint32_t s = 0; + uint32_t signXbit = (1u << signXo); + if (v < 0.f) + { + s = signXbit; + v *= p.slope; + } + if (fabsf(v) > p.clamp) + { + s = signXbit * 2; + v = InternalType::clamp(v, p.clamp); + } + if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY) + { + s += __shfl_xor_sync(groupMask, s, 1); // Coalesce. + s += __shfl_xor_sync(groupMask, s, 2); // Coalesce. + p.s[si] = s; // Write. + } + } + else + { + // Determine and write sign. + if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY) + { + uint32_t s = 0; + uint32_t signXbit = (1u << signXo); + if (v < 0.f) + { + s = signXbit; + v *= p.slope; + } + if (fabsf(v) > p.clamp) + { + s = signXbit * 2; + v = InternalType::clamp(v, p.clamp); + } + s += __shfl_xor_sync(groupMask, s, 1); // Coalesce. + s += __shfl_xor_sync(groupMask, s, 2); // Coalesce. + p.s[si] = s; // Write. + } + else + { + // Just compute the value. + if (v < 0.f) v *= p.slope; + v = InternalType::clamp(v, p.clamp); + } + } + } + else if (signRead) + { + // Read sign and apply if within sign tensor bounds. + if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y) + { + int s = p.s[si]; + s >>= signXo; + if (s & 1) v *= p.slope; + if (s & 2) v = 0.f; + } + } + else // Forward pass with no sign write. + { + if (v < 0.f) v *= p.slope; + v = InternalType::clamp(v, p.clamp); + } + + if (!downInline) // Write into temporary buffer. + s_tileUpXY[idx] = v; + else if ((uint32_t)x < p.yShape.x && (uint32_t)y < p.yShape.y) // Write directly into output buffer + *((T*)((char*)p.y + (x * get_stride(p.yStride.x) + y * get_stride(p.yStride.y) + mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]); + } + } + } + + // Downsampling. + if (filterMode == MODE_SUSD || filterMode == MODE_FUSD) + { + // Horizontal downsampling. + __syncthreads(); + if (down == 4 && tileOutW % 4 == 0) + { + // Calculate 4 pixels at a time. + for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH; idx += blockDim.x * 4) + { + int relOutX0, relUpY; + fast_div_mod(relOutX0, relUpY, idx); + int relUpX0 = relOutX0 * down; + int src0 = relUpY * tileUpW + relUpX0; + vec4_t v = InternalType::zero_vec4(); + #pragma unroll + for (int step = 0; step < fdSize; step++) + { + v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step]; + v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step]; + v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step]; + v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step]; + } + s_tileDownX[idx+0] = v.x; + s_tileDownX[idx+1] = v.y; + s_tileDownX[idx+2] = v.z; + s_tileDownX[idx+3] = v.w; + } + } + else if ((down == 2 || down == 4) && (tileOutW % 2 == 0)) + { + // Calculate 2 pixels at a time. 
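Horizontal downsampling here is an ordinary FIR filter evaluated only at every `down`-th position; the 4-, 2- and 1-pixel-per-thread loops in this section all compute y[i] = sum_k x[i*down + k] * fd[k], with no intermediate filtered-but-undecimated row. A NumPy sketch of that per-row arithmetic (hypothetical helper, box filter chosen only for a readable result; tile offsets omitted):

import numpy as np

def downsample_row(x, fd, down):
    n_out = (len(x) - len(fd)) // down + 1
    return np.array([np.dot(x[i * down : i * down + len(fd)], fd) for i in range(n_out)])

x  = np.arange(32, dtype=np.float64)
fd = np.ones(8) / 8.0                  # e.g. an 8-tap box filter, down = 2
print(downsample_row(x, fd, 2)[:4])    # [3.5 5.5 7.5 9.5]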
+ for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH; idx += blockDim.x * 2) + { + int relOutX0, relUpY; + fast_div_mod(relOutX0, relUpY, idx); + int relUpX0 = relOutX0 * down; + int src0 = relUpY * tileUpW + relUpX0; + vec2_t v = InternalType::zero_vec2(); + #pragma unroll + for (int step = 0; step < fdSize; step++) + { + v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step]; + v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step]; + } + s_tileDownX[idx+0] = v.x; + s_tileDownX[idx+1] = v.y; + } + } + else + { + // Calculate 1 pixel at a time. + for (int idx = threadIdx.x; idx < tileOutW * tileUpH; idx += blockDim.x) + { + int relOutX0, relUpY; + fast_div_mod(relOutX0, relUpY, idx); + int relUpX0 = relOutX0 * down; + int src = relUpY * tileUpW + relUpX0; + scalar_t v = 0.f; + #pragma unroll + for (int step = 0; step < fdSize; step++) + v += s_tileUpXY[src + step] * (scalar_t)c_fd[step]; + s_tileDownX[idx] = v; + } + } + + // Vertical downsampling & store output tile. + __syncthreads(); + for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x) + { + int relOutX, relOutY0; + fast_div_mod(relOutX, relOutY0, idx); + int relUpY0 = relOutY0 * down; + int src0 = relUpY0 * tileOutW + relOutX; + scalar_t v = 0; + #pragma unroll + for (int step = 0; step < fdSize; step++) + v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step]; + + int outX = tileOutX + relOutX; + int outY = tileOutY + relOutY0; + + if (outX < p.yShape.x & outY < p.yShape.y) + *((T*)((char*)p.y + (outX * get_stride(p.yStride.x) + outY * get_stride(p.yStride.y) + mapOfsOut))) = (T)v; + } + } + else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD) + { + // Full downsampling filter. + if (down == 2) + { + // 2-wide. + __syncthreads(); + for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH; idx += blockDim.x * 2) + { + int relOutX0, relOutY0; + fast_div_mod(relOutX0, relOutY0, idx); + int relUpX0 = relOutX0 * down; + int relUpY0 = relOutY0 * down; + int src0 = relUpY0 * tileUpW + relUpX0; + vec2_t v = InternalType::zero_vec2(); + #pragma unroll + for (int sy = 0; sy < fdSize; sy++) + #pragma unroll + for (int sx = 0; sx < fdSize; sx++) + { + v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE]; + v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE]; + } + + int outX = tileOutX + relOutX0; + int outY = tileOutY + relOutY0; + if ((uint32_t)outY < p.yShape.y) + { + index_t ofs = outX * get_stride(p.yStride.x) + outY * get_stride(p.yStride.y) + mapOfsOut; + if (outX + 0 < p.yShape.x) *((T*)((char*)p.y + ofs)) = (T)v.x; + if (outX + 1 < p.yShape.x) *((T*)((char*)p.y + ofs + get_stride(p.yStride.x))) = (T)v.y; + } + } + } + else if (down == 1 && !downInline) + { + // Thread per pixel. + __syncthreads(); + for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x) + { + int relOutX0, relOutY0; + fast_div_mod(relOutX0, relOutY0, idx); + scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0]; // 1x1 filter. + + int outX = tileOutX + relOutX0; + int outY = tileOutY + relOutY0; + if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y) + *((T*)((char*)p.y + (outX * get_stride(p.yStride.x) + outY * get_stride(p.yStride.y) + mapOfsOut))) = (T)v; + } + } + } + + if (!enableXrep) + break; + } +} + +//------------------------------------------------------------------------ +// Compute activation function and signs for upsampled data tensor, modifying data tensor in-place. 
Used for accelerating the generic variant. +// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w dimensions. 64-bit indexing is always used. + +template +static __global__ void filtered_lrelu_act_kernel(filtered_lrelu_act_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + + // Indexing. + int32_t x = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ymax = signWrite ? p.sShape.y : p.xShape.y; + int32_t qmax = p.xShape.z * p.xShape.w; // Combined minibatch*channel maximum index. + + // Loop to accommodate oversized tensors. + for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z) + for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y) + { + // Extract z and w (channel, minibatch index). + int32_t w = q / p.xShape.z; + int32_t z = q - w * p.xShape.z; + + // Choose behavior based on sign read/write mode. + if (signWrite) + { + // Process value if in p.x. + uint32_t s = 0; + if (x < p.xShape.x && y < p.xShape.y) + { + int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w; + T* pv = ((T*)p.x) + ix; + scalar_t v = (scalar_t)(*pv); + + // Gain, LReLU, clamp. + v *= p.gain; + if (v < 0.f) + { + v *= p.slope; + s = 1; // Sign. + } + if (fabsf(v) > p.clamp) + { + v = InternalType::clamp(v, p.clamp); + s = 2; // Clamp. + } + + *pv = (T)v; // Write value. + } + + // Coalesce into threads 0 and 16 of warp. + uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu; + s <<= ((threadIdx.x & 15) << 1); // Shift into place. + s |= __shfl_xor_sync(m, s, 1); // Distribute. + s |= __shfl_xor_sync(m, s, 2); + s |= __shfl_xor_sync(m, s, 4); + s |= __shfl_xor_sync(m, s, 8); + + // Write signs if leader and in p.s. + if (!(threadIdx.x & 15) && x < p.sShape.x) // y is always in. + { + uint64_t is = x + p.sShape.x * (y + (int64_t)p.sShape.y * q); // Contiguous. + ((uint32_t*)p.s)[is >> 4] = s; + } + } + else if (signRead) + { + // Process value if in p.x. + if (x < p.xShape.x) // y is always in. + { + int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w; + T* pv = ((T*)p.x) + ix; + scalar_t v = (scalar_t)(*pv); + v *= p.gain; + + // Apply sign buffer offset. + uint32_t sx = x + p.sOfs.x; + uint32_t sy = y + p.sOfs.y; + + // Read and apply signs if we land inside valid region of sign buffer. + if (sx < p.sShape.x && sy < p.sShape.y) + { + uint64_t is = (sx >> 2) + (p.sShape.x >> 2) * (sy + (uint64_t)p.sShape.y * q); // Contiguous. + unsigned char s = p.s[is]; + s >>= (sx & 3) << 1; // Shift into place. + if (s & 1) // Sign? + v *= p.slope; + if (s & 2) // Clamp? + v = 0.f; + } + + *pv = (T)v; // Write value. + } + } + else + { + // Forward pass with no sign write. Process value if in p.x. + if (x < p.xShape.x) // y is always in. + { + int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w; + T* pv = ((T*)p.x) + ix; + scalar_t v = (scalar_t)(*pv); + v *= p.gain; + if (v < 0.f) + v *= p.slope; + if (fabsf(v) > p.clamp) + v = InternalType::clamp(v, p.clamp); + *pv = (T)v; // Write value. + } + } + } +} + +template void* choose_filtered_lrelu_act_kernel(void) +{ + return (void*)filtered_lrelu_act_kernel; +} + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB) +{ + filtered_lrelu_kernel_spec s = { 0 }; + + // Return the first matching kernel. 
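Selection is a straight first-match scan of the table below: a row is taken only if its mode matches whether the up/down filters are separable (fuShape.y == 0) or full, the up/down factors match exactly, the requested filters fit within the row's maximum sizes, and the device offers at least the row's shared-memory budget. That is why smaller-filter rows and larger-shared-memory variants are listed first. A compact Python model of that policy; the table values here are placeholders, not the tuning constants below:

# Each entry: (sharedKB, up, max_fu, down, max_fd, separable_up, separable_down)
TABLE = [
    (48, 1, 1,  1, 1,  False, False),
    (48, 2, 8,  2, 8,  True,  True),
    (48, 2, 16, 2, 16, True,  True),   # larger-filter variant, listed after the 8-tap row
]

def choose(up, fu_size, fu_separable, down, fd_size, fd_separable, shared_kb):
    for kb, u, max_fu, d, max_fd, sep_u, sep_d in TABLE:
        if (shared_kb >= kb and u == up and d == down
                and sep_u == fu_separable and sep_d == fd_separable
                and fu_size <= max_fu and fd_size <= max_fd):
            return (kb, u, max_fu, d, max_fd)
    return None                         # no specialized kernel: caller falls back to the generic path

print(choose(up=2, fu_size=12, fu_separable=True, down=2, fd_size=12, fd_separable=True, shared_kb=48))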
+#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS) \ + if (sharedKB >= SH) \ + if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) || (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD))) \ + if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) || (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD))) \ + if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU && p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) \ + { \ + static_assert((D*TW % 4) == 0, "down * tileWidth must be divisible by 4"); \ + static_assert(FU % U == 0, "upscaling filter size must be multiple of upscaling factor"); \ + static_assert(FD % D == 0, "downscaling filter size must be multiple of downscaling factor"); \ + s.setup = (void*)setup_filters_kernel; \ + s.exec = (void*)filtered_lrelu_kernel; \ + s.tileOut = make_int2(TW, TH); \ + s.numWarps = W; \ + s.xrep = XR; \ + s.dynamicSharedKB = (SH == 48) ? 0 : SH; \ + return s; \ + } + + // Launch parameters for various kernel specializations. + // Small filters must be listed before large filters, otherwise the kernel for larger filter will always match first. + // Kernels that use more shared memory must be listed before those that use less, for the same reason. + + CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/1,1, /*mode*/MODE_FUFD, /*tw,th,warps,xrep,wskip*/64, 178, 32, 0, 0) // 1t-upf1-downf1 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/152, 95, 16, 0, 0) // 4t-ups2-downf1 + CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 22, 16, 0, 0) // 4t-upf1-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 29, 16, 11, 0) // 4t-ups2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/60, 28, 16, 0, 0) // 4t-upf2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 28, 16, 0, 0) // 4t-ups2-downf2 + CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 31, 16, 11, 0) // 4t-ups4-downs2 + CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 36, 16, 0, 0) // 4t-ups4-downf2 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 22, 16, 12, 0) // 4t-ups2-downs4 + CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/29, 15, 16, 0, 0) // 4t-upf2-downs4 + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/96, 150, 28, 0, 0) // 6t-ups2-downf1 + CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 35, 24, 0, 0) // 6t-upf1-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 16, 10, 0) // 6t-ups2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/58, 28, 24, 8, 0) // 6t-upf2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/52, 28, 16, 0, 0) // 6t-ups2-downf2 + CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 51, 16, 5, 0) // 6t-ups4-downs2 + CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 56, 16, 6, 0) // 
6t-ups4-downf2 + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 18, 16, 12, 0) // 6t-ups2-downs4 + CASE(/*sharedKB*/96, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 31, 32, 6, 0) // 6t-upf2-downs4 96kB + CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 13, 24, 0, 0) // 6t-upf2-downs4 + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/148, 89, 24, 0, 0) // 8t-ups2-downf1 + CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 31, 16, 5, 0) // 8t-upf1-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 41, 16, 9, 0) // 8t-ups2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 26, 24, 0, 0) // 8t-upf2-downs2 + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 40, 16, 0, 0) // 8t-ups2-downf2 + CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 24, 5, 0) // 8t-ups4-downs2 + CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 50, 16, 0, 0) // 8t-ups4-downf2 + CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/24, 24, 32, 12, 1) // 8t-ups2-downs4 96kB + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 13, 16, 10, 1) // 8t-ups2-downs4 + CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 28, 28, 4, 0) // 8t-upf2-downs4 96kB + CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 10, 24, 0, 0) // 8t-upf2-downs4 + + #undef CASE + return s; // No kernel found. +} + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/filtered_lrelu.h b/eg3d/torch_utils/ops/filtered_lrelu.h new file mode 100644 index 0000000000000000000000000000000000000000..f2bfd1dd537909de9cd3b14765a482056391683b --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu.h @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include + +//------------------------------------------------------------------------ +// CUDA kernel parameters. + +struct filtered_lrelu_kernel_params +{ + // These parameters decide which kernel to use. + int up; // upsampling ratio (1, 2, 4) + int down; // downsampling ratio (1, 2, 4) + int2 fuShape; // [size, 1] | [size, size] + int2 fdShape; // [size, 1] | [size, size] + + int _dummy; // Alignment. + + // Rest of the parameters. + const void* x; // Input tensor. + void* y; // Output tensor. + const void* b; // Bias tensor. + unsigned char* s; // Sign tensor in/out. NULL if unused. 
+ const float* fu; // Upsampling filter. + const float* fd; // Downsampling filter. + + int2 pad0; // Left/top padding. + float gain; // Additional gain factor. + float slope; // Leaky ReLU slope on negative side. + float clamp; // Clamp after nonlinearity. + int flip; // Filter kernel flip for gradient computation. + + int tilesXdim; // Original number of horizontal output tiles. + int tilesXrep; // Number of horizontal tiles per CTA. + int blockZofs; // Block z offset to support large minibatch, channel dimensions. + + int4 xShape; // [width, height, channel, batch] + int4 yShape; // [width, height, channel, batch] + int2 sShape; // [width, height] - width is in bytes. Contiguous. Zeros if unused. + int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor. + int swLimit; // Active width of sign tensor in bytes. + + longlong4 xStride; // Strides of all tensors except signs, same component order as shapes. + longlong4 yStride; // + int64_t bStride; // + longlong3 fuStride; // + longlong3 fdStride; // +}; + +struct filtered_lrelu_act_kernel_params +{ + void* x; // Input/output, modified in-place. + unsigned char* s; // Sign tensor in/out. NULL if unused. + + float gain; // Additional gain factor. + float slope; // Leaky ReLU slope on negative side. + float clamp; // Clamp after nonlinearity. + + int4 xShape; // [width, height, channel, batch] + longlong4 xStride; // Input/output tensor strides, same order as in shape. + int2 sShape; // [width, height] - width is in elements. Contiguous. Zeros if unused. + int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor. +}; + +//------------------------------------------------------------------------ +// CUDA kernel specialization. + +struct filtered_lrelu_kernel_spec +{ + void* setup; // Function for filter kernel setup. + void* exec; // Function for main operation. + int2 tileOut; // Width/height of launch tile. + int numWarps; // Number of warps per thread block, determines launch block size. + int xrep; // For processing multiple horizontal tiles per thread block. + int dynamicSharedKB; // How much dynamic shared memory the exec kernel wants. +}; + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template void* choose_filtered_lrelu_act_kernel(void); +template cudaError_t copy_filters(cudaStream_t stream); + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/filtered_lrelu.py b/eg3d/torch_utils/ops/filtered_lrelu.py new file mode 100644 index 0000000000000000000000000000000000000000..2047b7e19320e8d03e444ca1cb03fe00d0c5e96e --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu.py @@ -0,0 +1,276 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import os +import numpy as np +import torch +import warnings + +from .. import custom_ops +from .. import misc +from . 
import upfirdn2d +from . import bias_act + +#---------------------------------------------------------------------------- + +_plugin = None + +def _init(): + global _plugin + if _plugin is None: + _plugin = custom_ops.get_plugin( + module_name='filtered_lrelu_plugin', + sources=['filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu'], + headers=['filtered_lrelu.h', 'filtered_lrelu.cu'], + source_dir=os.path.dirname(__file__), + extra_cuda_cflags=['--use_fast_math'], + ) + return True + +def _get_filter_size(f): + if f is None: + return 1, 1 + assert isinstance(f, torch.Tensor) + assert 1 <= f.ndim <= 2 + return f.shape[-1], f.shape[0] # width, height + +def _parse_padding(padding): + if isinstance(padding, int): + padding = [padding, padding] + assert isinstance(padding, (list, tuple)) + assert all(isinstance(x, (int, np.integer)) for x in padding) + padding = [int(x) for x in padding] + if len(padding) == 2: + px, py = padding + padding = [px, px, py, py] + px0, px1, py0, py1 = padding + return px0, px1, py0, py1 + +#---------------------------------------------------------------------------- + +def filtered_lrelu(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False, impl='cuda'): + r"""Filtered leaky ReLU for a batch of 2D images. + + Performs the following sequence of operations for each channel: + + 1. Add channel-specific bias if provided (`b`). + + 2. Upsample the image by inserting N-1 zeros after each pixel (`up`). + + 3. Pad the image with the specified number of zeros on each side (`padding`). + Negative padding corresponds to cropping the image. + + 4. Convolve the image with the specified upsampling FIR filter (`fu`), shrinking it + so that the footprint of all output pixels lies within the input image. + + 5. Multiply each value by the provided gain factor (`gain`). + + 6. Apply leaky ReLU activation function to each value. + + 7. Clamp each value between -clamp and +clamp, if `clamp` parameter is provided. + + 8. Convolve the image with the specified downsampling FIR filter (`fd`), shrinking + it so that the footprint of all output pixels lies within the input image. + + 9. Downsample the image by keeping every Nth pixel (`down`). + + The fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports gradients of arbitrary order. + + Args: + x: Float32/float16/float64 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + fu: Float32 upsampling FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + fd: Float32 downsampling FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type + as `x`. The length of vector must must match the channel dimension of `x`. + up: Integer upsampling factor (default: 1). + down: Integer downsampling factor. (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + gain: Overall scaling factor for signal magnitude (default: sqrt(2)). + slope: Slope on the negative side of leaky ReLU (default: 0.2). + clamp: Maximum magnitude for leaky ReLU output (default: None). 
+ flip_filter: False = convolution, True = correlation (default: False). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + assert isinstance(x, torch.Tensor) + assert impl in ['ref', 'cuda'] + if impl == 'cuda' and x.device.type == 'cuda' and _init(): + return _filtered_lrelu_cuda(up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter).apply(x, fu, fd, b, None, 0, 0) + return _filtered_lrelu_ref(x, fu=fu, fd=fd, b=b, up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter) + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def _filtered_lrelu_ref(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False): + """Slow and memory-inefficient reference implementation of `filtered_lrelu()` using + existing `upfirdn2n()` and `bias_act()` ops. + """ + assert isinstance(x, torch.Tensor) and x.ndim == 4 + fu_w, fu_h = _get_filter_size(fu) + fd_w, fd_h = _get_filter_size(fd) + if b is not None: + assert isinstance(b, torch.Tensor) and b.dtype == x.dtype + misc.assert_shape(b, [x.shape[1]]) + assert isinstance(up, int) and up >= 1 + assert isinstance(down, int) and down >= 1 + px0, px1, py0, py1 = _parse_padding(padding) + assert gain == float(gain) and gain > 0 + assert slope == float(slope) and slope >= 0 + assert clamp is None or (clamp == float(clamp) and clamp >= 0) + + # Calculate output size. + batch_size, channels, in_h, in_w = x.shape + in_dtype = x.dtype + out_w = (in_w * up + (px0 + px1) - (fu_w - 1) - (fd_w - 1) + (down - 1)) // down + out_h = (in_h * up + (py0 + py1) - (fu_h - 1) - (fd_h - 1) + (down - 1)) // down + + # Compute using existing ops. + x = bias_act.bias_act(x=x, b=b) # Apply bias. + x = upfirdn2d.upfirdn2d(x=x, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample. + x = bias_act.bias_act(x=x, act='lrelu', alpha=slope, gain=gain, clamp=clamp) # Bias, leaky ReLU, clamp. + x = upfirdn2d.upfirdn2d(x=x, f=fd, down=down, flip_filter=flip_filter) # Downsample. + + # Check output shape & dtype. + misc.assert_shape(x, [batch_size, channels, out_h, out_w]) + assert x.dtype == in_dtype + return x + +#---------------------------------------------------------------------------- + +_filtered_lrelu_cuda_cache = dict() + +def _filtered_lrelu_cuda(up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False): + """Fast CUDA implementation of `filtered_lrelu()` using custom ops. + """ + assert isinstance(up, int) and up >= 1 + assert isinstance(down, int) and down >= 1 + px0, px1, py0, py1 = _parse_padding(padding) + assert gain == float(gain) and gain > 0 + gain = float(gain) + assert slope == float(slope) and slope >= 0 + slope = float(slope) + assert clamp is None or (clamp == float(clamp) and clamp >= 0) + clamp = float(clamp if clamp is not None else 'inf') + + # Lookup from cache. + key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter) + if key in _filtered_lrelu_cuda_cache: + return _filtered_lrelu_cuda_cache[key] + + # Forward op. 
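For a concrete feel of the shape arithmetic in `_filtered_lrelu_ref` above, the snippet below evaluates the same output-size formula for one configuration; the helper name and the numbers are purely illustrative:

def filtered_lrelu_out_size(in_size, up, down, pad0, pad1, fu_taps, fd_taps):
    # Same formula as in _filtered_lrelu_ref: upsample, pad, two "valid" FIRs, decimate.
    return (in_size * up + pad0 + pad1 - (fu_taps - 1) - (fd_taps - 1) + (down - 1)) // down

# e.g. a 64-pixel axis, up=2, down=2, 12-tap filters, symmetric padding of 5
print(filtered_lrelu_out_size(64, up=2, down=2, pad0=5, pad1=5, fu_taps=12, fd_taps=12))   # -> 58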
+ class FilteredLReluCuda(torch.autograd.Function): + @staticmethod + def forward(ctx, x, fu, fd, b, si, sx, sy): # pylint: disable=arguments-differ + assert isinstance(x, torch.Tensor) and x.ndim == 4 + + # Replace empty up/downsample kernels with full 1x1 kernels (faster than separable). + if fu is None: + fu = torch.ones([1, 1], dtype=torch.float32, device=x.device) + if fd is None: + fd = torch.ones([1, 1], dtype=torch.float32, device=x.device) + assert 1 <= fu.ndim <= 2 + assert 1 <= fd.ndim <= 2 + + # Replace separable 1x1 kernels with full 1x1 kernels when scale factor is 1. + if up == 1 and fu.ndim == 1 and fu.shape[0] == 1: + fu = fu.square()[None] + if down == 1 and fd.ndim == 1 and fd.shape[0] == 1: + fd = fd.square()[None] + + # Missing sign input tensor. + if si is None: + si = torch.empty([0]) + + # Missing bias tensor. + if b is None: + b = torch.zeros([x.shape[1]], dtype=x.dtype, device=x.device) + + # Construct internal sign tensor only if gradients are needed. + write_signs = (si.numel() == 0) and (x.requires_grad or b.requires_grad) + + # Warn if input storage strides are not in decreasing order due to e.g. channels-last layout. + strides = [x.stride(i) for i in range(x.ndim) if x.size(i) > 1] + if any(a < b for a, b in zip(strides[:-1], strides[1:])): + warnings.warn("low-performance memory layout detected in filtered_lrelu input", RuntimeWarning) + + # Call C++/Cuda plugin if datatype is supported. + if x.dtype in [torch.float16, torch.float32]: + if torch.cuda.current_stream(x.device) != torch.cuda.default_stream(x.device): + warnings.warn("filtered_lrelu called with non-default cuda stream but concurrent execution is not supported", RuntimeWarning) + y, so, return_code = _plugin.filtered_lrelu(x, fu, fd, b, si, up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp, flip_filter, write_signs) + else: + return_code = -1 + + # No Cuda kernel found? Fall back to generic implementation. Still more memory efficient than the reference implementation because + # only the bit-packed sign tensor is retained for gradient computation. + if return_code < 0: + warnings.warn("filtered_lrelu called with parameters that have no optimized CUDA kernel, using generic fallback", RuntimeWarning) + + y = x.add(b.unsqueeze(-1).unsqueeze(-1)) # Add bias. + y = upfirdn2d.upfirdn2d(x=y, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample. + so = _plugin.filtered_lrelu_act_(y, si, sx, sy, gain, slope, clamp, write_signs) # Activation function and sign handling. Modifies y in-place. + y = upfirdn2d.upfirdn2d(x=y, f=fd, down=down, flip_filter=flip_filter) # Downsample. + + # Prepare for gradient computation. 
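What gets saved here is exactly what the gradient op needs: the two filters and the bit-packed sign tensor. The backward pass defined next reuses the same op with the resampling roles swapped, so gradients flow through a filtered_lrelu whose upsampling factor is the forward `down`, whose filters are exchanged and flipped, whose gain is rescaled by up²/down², and whose nonlinearity is replayed from the saved signs instead of recomputed. A schematic of that parameter mapping, with a hypothetical helper and the padding arithmetic left to `backward()` below:

def backward_config(up, down, gain, flip_filter):
    # How the forward hyper-parameters map onto the gradient op (padding omitted here).
    return dict(
        up          = down,                         # roles of the two resamplers swap
        down        = up,
        gain        = gain * (up ** 2) / (down ** 2),
        flip_filter = not flip_filter,              # convolution <-> correlation
        filters     = "fd then fu",                 # downsampling filter now upsamples, and vice versa
    )

print(backward_config(up=2, down=2, gain=2 ** 0.5, flip_filter=False))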
+ ctx.save_for_backward(fu, fd, (si if si.numel() else so)) + ctx.x_shape = x.shape + ctx.y_shape = y.shape + ctx.s_ofs = sx, sy + return y + + @staticmethod + def backward(ctx, dy): # pylint: disable=arguments-differ + fu, fd, si = ctx.saved_tensors + _, _, xh, xw = ctx.x_shape + _, _, yh, yw = ctx.y_shape + sx, sy = ctx.s_ofs + dx = None # 0 + dfu = None; assert not ctx.needs_input_grad[1] + dfd = None; assert not ctx.needs_input_grad[2] + db = None # 3 + dsi = None; assert not ctx.needs_input_grad[4] + dsx = None; assert not ctx.needs_input_grad[5] + dsy = None; assert not ctx.needs_input_grad[6] + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]: + pp = [ + (fu.shape[-1] - 1) + (fd.shape[-1] - 1) - px0, + xw * up - yw * down + px0 - (up - 1), + (fu.shape[0] - 1) + (fd.shape[0] - 1) - py0, + xh * up - yh * down + py0 - (up - 1), + ] + gg = gain * (up ** 2) / (down ** 2) + ff = (not flip_filter) + sx = sx - (fu.shape[-1] - 1) + px0 + sy = sy - (fu.shape[0] - 1) + py0 + dx = _filtered_lrelu_cuda(up=down, down=up, padding=pp, gain=gg, slope=slope, clamp=None, flip_filter=ff).apply(dy, fd, fu, None, si, sx, sy) + + if ctx.needs_input_grad[3]: + db = dx.sum([0, 2, 3]) + + return dx, dfu, dfd, db, dsi, dsx, dsy + + # Add to cache. + _filtered_lrelu_cuda_cache[key] = FilteredLReluCuda + return FilteredLReluCuda + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/filtered_lrelu_ns.cu b/eg3d/torch_utils/ops/filtered_lrelu_ns.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a3eae46215c3babea2c54e3ae255b05f4d777af --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu_ns.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include "filtered_lrelu.cu" + +// Template/kernel specializations for no signs mode (no gradients required). + +// Full op, 32-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Full op, 64-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Activation/signs only for generic variant. 64-bit indexing. +template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); + +// Copy filters to constant memory. 
+template cudaError_t copy_filters(cudaStream_t stream); diff --git a/eg3d/torch_utils/ops/filtered_lrelu_rd.cu b/eg3d/torch_utils/ops/filtered_lrelu_rd.cu new file mode 100644 index 0000000000000000000000000000000000000000..3cd43ec0648d3db05e5808299fc0ee318e5ceaa6 --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu_rd.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include "filtered_lrelu.cu" + +// Template/kernel specializations for sign read mode. + +// Full op, 32-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Full op, 64-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Activation/signs only for generic variant. 64-bit indexing. +template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); + +// Copy filters to constant memory. +template cudaError_t copy_filters(cudaStream_t stream); diff --git a/eg3d/torch_utils/ops/filtered_lrelu_wr.cu b/eg3d/torch_utils/ops/filtered_lrelu_wr.cu new file mode 100644 index 0000000000000000000000000000000000000000..bc2fa06912eb703dd77ca64533208428bdf373ac --- /dev/null +++ b/eg3d/torch_utils/ops/filtered_lrelu_wr.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include "filtered_lrelu.cu" + +// Template/kernel specializations for sign write mode. + +// Full op, 32-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Full op, 64-bit indexing. +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); +template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); + +// Activation/signs only for generic variant. 64-bit indexing. 
+template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); +template void* choose_filtered_lrelu_act_kernel(void); + +// Copy filters to constant memory. +template cudaError_t copy_filters(cudaStream_t stream); diff --git a/eg3d/torch_utils/ops/fma.py b/eg3d/torch_utils/ops/fma.py new file mode 100644 index 0000000000000000000000000000000000000000..5458116d0b6f8b133608456bbe9003aa0283ac85 --- /dev/null +++ b/eg3d/torch_utils/ops/fma.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Fused multiply-add, with slightly faster gradients than `torch.addcmul()`.""" + +import torch + +#---------------------------------------------------------------------------- + +def fma(a, b, c): # => a * b + c + return _FusedMultiplyAdd.apply(a, b, c) + +#---------------------------------------------------------------------------- + +class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c + @staticmethod + def forward(ctx, a, b, c): # pylint: disable=arguments-differ + out = torch.addcmul(c, a, b) + ctx.save_for_backward(a, b) + ctx.c_shape = c.shape + return out + + @staticmethod + def backward(ctx, dout): # pylint: disable=arguments-differ + a, b = ctx.saved_tensors + c_shape = ctx.c_shape + da = None + db = None + dc = None + + if ctx.needs_input_grad[0]: + da = _unbroadcast(dout * b, a.shape) + + if ctx.needs_input_grad[1]: + db = _unbroadcast(dout * a, b.shape) + + if ctx.needs_input_grad[2]: + dc = _unbroadcast(dout, c_shape) + + return da, db, dc + +#---------------------------------------------------------------------------- + +def _unbroadcast(x, shape): + extra_dims = x.ndim - len(shape) + assert extra_dims >= 0 + dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)] + if len(dim): + x = x.sum(dim=dim, keepdim=True) + if extra_dims: + x = x.reshape(-1, *x.shape[extra_dims+1:]) + assert x.shape == shape + return x + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/grid_sample_gradfix.py b/eg3d/torch_utils/ops/grid_sample_gradfix.py new file mode 100644 index 0000000000000000000000000000000000000000..35d94724136ba162d8416803b1ad00d6da0db99f --- /dev/null +++ b/eg3d/torch_utils/ops/grid_sample_gradfix.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
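`_unbroadcast` above implements the usual rule that the gradient of a broadcast operand is the incoming gradient summed over every broadcast axis. A small PyTorch check of that rule against autograd, using made-up bias-style shapes; only standard torch calls (`torch.addcmul`, `backward`) are used:

import torch

a = torch.randn(4, 3, 8, 8, requires_grad=True)
b = torch.randn(3, 1, 1, requires_grad=True)       # broadcast over batch and spatial dims
c = torch.randn(3, 1, 1, requires_grad=True)

out = torch.addcmul(c, a, b)                        # a * b + c with broadcasting
out.sum().backward()

dout = torch.ones_like(out)
assert torch.allclose(b.grad, (dout * a).sum(dim=(0, 2, 3), keepdim=True).squeeze(0))
assert torch.allclose(c.grad, dout.sum(dim=(0, 2, 3), keepdim=True).squeeze(0))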
+ +"""Custom replacement for `torch.nn.functional.grid_sample` that +supports arbitrarily high order gradients between the input and output. +Only works on 2D images and assumes +`mode='bilinear'`, `padding_mode='zeros'`, `align_corners=False`.""" + +import torch + +# pylint: disable=redefined-builtin +# pylint: disable=arguments-differ +# pylint: disable=protected-access + +#---------------------------------------------------------------------------- + +enabled = False # Enable the custom op by setting this to true. + +#---------------------------------------------------------------------------- + +def grid_sample(input, grid): + if _should_use_custom_op(): + return _GridSample2dForward.apply(input, grid) + return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False) + +#---------------------------------------------------------------------------- + +def _should_use_custom_op(): + return enabled + +#---------------------------------------------------------------------------- + +class _GridSample2dForward(torch.autograd.Function): + @staticmethod + def forward(ctx, input, grid): + assert input.ndim == 4 + assert grid.ndim == 4 + output = torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False) + ctx.save_for_backward(input, grid) + return output + + @staticmethod + def backward(ctx, grad_output): + input, grid = ctx.saved_tensors + grad_input, grad_grid = _GridSample2dBackward.apply(grad_output, input, grid) + return grad_input, grad_grid + +#---------------------------------------------------------------------------- + +class _GridSample2dBackward(torch.autograd.Function): + @staticmethod + def forward(ctx, grad_output, input, grid): + op = torch._C._jit_get_operation('aten::grid_sampler_2d_backward') + grad_input, grad_grid = op(grad_output, input, grid, 0, 0, False) + ctx.save_for_backward(grid) + return grad_input, grad_grid + + @staticmethod + def backward(ctx, grad2_grad_input, grad2_grad_grid): + _ = grad2_grad_grid # unused + grid, = ctx.saved_tensors + grad2_grad_output = None + grad2_input = None + grad2_grid = None + + if ctx.needs_input_grad[0]: + grad2_grad_output = _GridSample2dForward.apply(grad2_grad_input, grid) + + assert not ctx.needs_input_grad[2] + return grad2_grad_output, grad2_input, grad2_grid + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/ops/upfirdn2d.cpp b/eg3d/torch_utils/ops/upfirdn2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c1769c3cbe4dd04f76f9ccef726680720e6f39c8 --- /dev/null +++ b/eg3d/torch_utils/ops/upfirdn2d.cpp @@ -0,0 +1,111 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#include +#include +#include +#include "upfirdn2d.h" + +//------------------------------------------------------------------------ + +static torch::Tensor upfirdn2d(torch::Tensor x, torch::Tensor f, int upx, int upy, int downx, int downy, int padx0, int padx1, int pady0, int pady1, bool flip, float gain) +{ + // Validate arguments. + TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + TORCH_CHECK(f.device() == x.device(), "f must reside on the same device as x"); + TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32"); + TORCH_CHECK(x.numel() <= INT_MAX, "x is too large"); + TORCH_CHECK(f.numel() <= INT_MAX, "f is too large"); + TORCH_CHECK(x.numel() > 0, "x has zero size"); + TORCH_CHECK(f.numel() > 0, "f has zero size"); + TORCH_CHECK(x.dim() == 4, "x must be rank 4"); + TORCH_CHECK(f.dim() == 2, "f must be rank 2"); + TORCH_CHECK((x.size(0)-1)*x.stride(0) + (x.size(1)-1)*x.stride(1) + (x.size(2)-1)*x.stride(2) + (x.size(3)-1)*x.stride(3) <= INT_MAX, "x memory footprint is too large"); + TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1"); + TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1"); + TORCH_CHECK(downx >= 1 && downy >= 1, "downsampling factor must be at least 1"); + + // Create output tensor. + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + int outW = ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx; + int outH = ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy; + TORCH_CHECK(outW >= 1 && outH >= 1, "output must be at least 1x1"); + torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW}, x.options(), x.suggest_memory_format()); + TORCH_CHECK(y.numel() <= INT_MAX, "output is too large"); + TORCH_CHECK((y.size(0)-1)*y.stride(0) + (y.size(1)-1)*y.stride(1) + (y.size(2)-1)*y.stride(2) + (y.size(3)-1)*y.stride(3) <= INT_MAX, "output memory footprint is too large"); + + // Initialize CUDA kernel parameters. + upfirdn2d_kernel_params p; + p.x = x.data_ptr(); + p.f = f.data_ptr(); + p.y = y.data_ptr(); + p.up = make_int2(upx, upy); + p.down = make_int2(downx, downy); + p.pad0 = make_int2(padx0, pady0); + p.flip = (flip) ? 1 : 0; + p.gain = gain; + p.inSize = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0)); + p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1), (int)x.stride(0)); + p.filterSize = make_int2((int)f.size(1), (int)f.size(0)); + p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0)); + p.outSize = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0)); + p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1), (int)y.stride(0)); + p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z; + p.sizeMinor = (p.inStride.z == 1) ? p.inSize.z : 1; + + // Choose CUDA kernel. + upfirdn2d_kernel_spec spec; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] + { + spec = choose_upfirdn2d_kernel(p); + }); + + // Set looping options. + p.loopMajor = (p.sizeMajor - 1) / 16384 + 1; + p.loopMinor = spec.loopMinor; + p.loopX = spec.loopX; + p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1; + p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1; + + // Compute grid size. 
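The output size computed above follows the usual up-pad-filter-down relation. The same formula in Python, with hypothetical sizes:

```python
def upfirdn2d_out_size(in_w, in_h, up=(1, 1), down=(1, 1),
                       pad=(0, 0, 0, 0), filt=(1, 1)):
    # Mirrors outW/outH in upfirdn2d.cpp:
    # out = (in * up + pad_before + pad_after - filter_taps + down) // down
    upx, upy = up
    downx, downy = down
    padx0, padx1, pady0, pady1 = pad
    fw, fh = filt
    out_w = (in_w * upx + padx0 + padx1 - fw + downx) // downx
    out_h = (in_h * upy + pady0 + pady1 - fh + downy) // downy
    return out_w, out_h

# 16x16 input, 2x upsampling, 4-tap filter, padding of 2+1 taps per axis:
print(upfirdn2d_out_size(16, 16, up=(2, 2), pad=(2, 1, 2, 1), filt=(4, 4)))  # -> (32, 32)
```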
+ dim3 blockSize, gridSize; + if (spec.tileOutW < 0) // large + { + blockSize = dim3(4, 32, 1); + gridSize = dim3( + ((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor, + (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, + p.launchMajor); + } + else // small + { + blockSize = dim3(256, 1, 1); + gridSize = dim3( + ((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor, + (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, + p.launchMajor); + } + + // Launch CUDA kernel. + void* args[] = {&p}; + AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); + return y; +} + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("upfirdn2d", &upfirdn2d); +} + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/upfirdn2d.cu b/eg3d/torch_utils/ops/upfirdn2d.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d182d7b86a9058d0c007b13716d6e7f08207f42 --- /dev/null +++ b/eg3d/torch_utils/ops/upfirdn2d.cu @@ -0,0 +1,388 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include "upfirdn2d.h" + +//------------------------------------------------------------------------ +// Helpers. + +template struct InternalType; +template <> struct InternalType { typedef double scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; + +static __device__ __forceinline__ int floor_div(int a, int b) +{ + int t = 1 - a / b; + return (a + t * b) / b - t; +} + +//------------------------------------------------------------------------ +// Generic CUDA implementation for large filters. + +template static __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + + // Calculate thread index. + int minorBase = blockIdx.x * blockDim.x + threadIdx.x; + int outY = minorBase / p.launchMinor; + minorBase -= outY * p.launchMinor; + int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y; + int majorBase = blockIdx.z * p.loopMajor; + if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor) + return; + + // Setup Y receptive field. + int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y; + int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y); + int h = min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY; + int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y; + if (p.flip) + filterY = p.filterSize.y - 1 - filterY; + + // Loop over major, minor, and X. 
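`floor_div()` above reproduces floor division (rounding toward negative infinity) on top of C's truncating integer division, which matters because the receptive-field coordinates `midX`/`midY` can go negative. A quick Python check of the same arithmetic (helper names are made up; `b > 0` is assumed, as in the kernel):

```python
def c_div(a: int, b: int) -> int:
    # C-style integer division: truncates toward zero.
    q = abs(a) // abs(b)
    return q if (a >= 0) == (b > 0) else -q

def floor_div(a: int, b: int) -> int:
    # Same steps as the CUDA helper above.
    t = 1 - c_div(a, b)
    return c_div(a + t * b, b) - t

assert all(floor_div(a, b) == a // b            # Python's // already floors
           for a in range(-12, 13) for b in (1, 2, 3, 5))
```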
+ for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) + for (int minorIdx = 0, minor = minorBase; minorIdx < p.loopMinor & minor < p.sizeMinor; minorIdx++, minor += p.launchMinor) + { + int nc = major * p.sizeMinor + minor; + int n = nc / p.inSize.z; + int c = nc - n * p.inSize.z; + for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x; loopX++, outX += blockDim.y) + { + // Setup X receptive field. + int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x; + int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x); + int w = min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) - inX; + int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x; + if (p.flip) + filterX = p.filterSize.x - 1 - filterX; + + // Initialize pointers. + const T* xp = &((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w]; + const float* fp = &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y]; + int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x; + int filterStepY = ((p.flip) ? p.up.y : -p.up.y) * p.filterStride.y; + + // Inner loop. + scalar_t v = 0; + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + v += (scalar_t)(*xp) * (scalar_t)(*fp); + xp += p.inStride.x; + fp += filterStepX; + } + xp += p.inStride.y - w * p.inStride.x; + fp += filterStepY - w * filterStepX; + } + + // Store result. + v *= p.gain; + ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v; + } + } +} + +//------------------------------------------------------------------------ +// Specialized CUDA implementation for small filters. + +template +static __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1; + const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1; + __shared__ volatile scalar_t sf[filterH][filterW]; + __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor]; + + // Calculate tile index. + int minorBase = blockIdx.x; + int tileOutY = minorBase / p.launchMinor; + minorBase -= tileOutY * p.launchMinor; + minorBase *= loopMinor; + tileOutY *= tileOutH; + int tileOutXBase = blockIdx.y * p.loopX * tileOutW; + int majorBase = blockIdx.z * p.loopMajor; + if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y | majorBase >= p.sizeMajor) + return; + + // Load filter (flipped). + for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW; tapIdx += blockDim.x) + { + int fy = tapIdx / filterW; + int fx = tapIdx - fy * filterW; + scalar_t v = 0; + if (fx < p.filterSize.x & fy < p.filterSize.y) + { + int ffx = (p.flip) ? fx : p.filterSize.x - 1 - fx; + int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy; + v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y]; + } + sf[fy][fx] = v; + } + + // Loop over major and X. + for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) + { + int baseNC = major * p.sizeMinor + minorBase; + int n = baseNC / p.inSize.z; + int baseC = baseNC - n * p.inSize.z; + for (int loopX = 0, tileOutX = tileOutXBase; loopX < p.loopX & tileOutX < p.outSize.x; loopX++, tileOutX += tileOutW) + { + // Load input pixels. 
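The shared-memory tiles in `upfirdn2d_kernel_small` are sized from the output tile with the same receptive-field arithmetic (`tileInW`/`tileInH` above). A worked example with hypothetical tile parameters:

```python
def tile_in_size(tile_out, down, filter_taps, up):
    # tileIn = ((tileOut - 1) * down + filterTaps - 1) // up + 1, as above.
    return ((tile_out - 1) * down + filter_taps - 1) // up + 1

# 16-wide output tile, 2x upsampling, 4-tap filter, no downsampling:
print(tile_in_size(16, down=1, filter_taps=4, up=2))   # -> 10 input pixels per row
```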
+ int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x; + int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y; + int tileInX = floor_div(tileMidX, upx); + int tileInY = floor_div(tileMidY, upy); + __syncthreads(); + for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor; inIdx += blockDim.x) + { + int relC = inIdx; + int relInX = relC / loopMinor; + int relInY = relInX / tileInW; + relC -= relInX * loopMinor; + relInX -= relInY * tileInW; + int c = baseC + relC; + int inX = tileInX + relInX; + int inY = tileInY + relInY; + scalar_t v = 0; + if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y & c < p.inSize.z) + v = (scalar_t)((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w]; + sx[relInY][relInX][relC] = v; + } + + // Loop over output pixels. + __syncthreads(); + for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor; outIdx += blockDim.x) + { + int relC = outIdx; + int relOutX = relC / loopMinor; + int relOutY = relOutX / tileOutW; + relC -= relOutX * loopMinor; + relOutX -= relOutY * tileOutW; + int c = baseC + relC; + int outX = tileOutX + relOutX; + int outY = tileOutY + relOutY; + + // Setup receptive field. + int midX = tileMidX + relOutX * downx; + int midY = tileMidY + relOutY * downy; + int inX = floor_div(midX, upx); + int inY = floor_div(midY, upy); + int relInX = inX - tileInX; + int relInY = inY - tileInY; + int filterX = (inX + 1) * upx - midX - 1; // flipped + int filterY = (inY + 1) * upy - midY - 1; // flipped + + // Inner loop. + if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) + { + scalar_t v = 0; + #pragma unroll + for (int y = 0; y < filterH / upy; y++) + #pragma unroll + for (int x = 0; x < filterW / upx; x++) + v += sx[relInY + y][relInX + x][relC] * sf[filterY + y * upy][filterX + x * upx]; + v *= p.gain; + ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v; + } + } + } + } +} + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p) +{ + int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y; + upfirdn2d_kernel_spec spec = {(void*)upfirdn2d_kernel_large, -1,-1,1, 4}; // contiguous + if (s == 1) spec = {(void*)upfirdn2d_kernel_large, -1,-1,4, 1}; // channels_last + + // No up/downsampling. 
+ if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + if (s != 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + if (s != 1 && fx <= 7 && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 5 && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 3 && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s != 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s != 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + // channels_last + if (s == 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s == 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s == 1 && fx <= 7 && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 5 && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 3 && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (s == 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (s == 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + } + + // 2x upsampling. 
+ if (p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + if (s != 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + if (s != 1 && fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (s != 1 && fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + // channels_last + if (s == 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s == 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s == 1 && fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (s == 1 && fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + } + if (p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 24 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 16 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 8 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + // channels_last + if (s == 1 && fx <= 24 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 16 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 8 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + } + if (p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s != 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s != 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + // channels_last + if (s == 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (s == 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (s == 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + } + + // 2x downsampling. 
+ if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) + { + // contiguous + if (s != 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (s != 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (s != 1 && fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (s != 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (s != 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (s != 1 && fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + // channels_last + if (s == 1 && fx <= 24 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 16,16,1, 1}; + if (s == 1 && fx <= 16 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 16,16,1, 1}; + if (s == 1 && fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (s == 1 && fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (s == 1 && fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (s == 1 && fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + } + if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 24 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (s != 1 && fx <= 16 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (s != 1 && fx <= 8 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + // channels_last + if (s == 1 && fx <= 24 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (s == 1 && fx <= 16 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (s == 1 && fx <= 8 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + } + if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) + { + // contiguous + if (s != 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (s != 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (s != 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + // channels_last + if (s == 1 && fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (s == 1 && fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (s == 1 && fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + } + + // 4x upsampling. 
+ if (p.up.x == 4 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 48 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + if (s != 1 && fx <= 32 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 64,32,1, 1}; + // channels_last + if (s == 1 && fx <= 48 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s == 1 && fx <= 32 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + } + if (p.up.x == 4 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 48 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (s != 1 && fx <= 32 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + // channels_last + if (s == 1 && fx <= 48 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (s == 1 && fx <= 32 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + } + if (p.up.x == 1 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 1 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (s != 1 && fx <= 1 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + // channels_last + if (s == 1 && fx <= 1 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (s == 1 && fx <= 1 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + } + + // 4x downsampling (inefficient). + if (p.up.x == 1 && p.up.y == 1 && p.down.x == 4 && p.down.y == 1) + { + // contiguous + if (s != 1 && fx <= 48 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (s != 1 && fx <= 32 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + // channels_last + if (s == 1 && fx <= 48 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 32,1,8, 1}; + if (s == 1 && fx <= 32 && fy <= 1) spec = {(void*)upfirdn2d_kernel_small, 32,1,8, 1}; + } + if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 4) + { + // contiguous + if (s != 1 && fx <= 1 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (s != 1 && fx <= 1 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + // channels_last + if (s == 1 && fx <= 1 && fy <= 48) spec = {(void*)upfirdn2d_kernel_small, 1,32,8, 1}; + if (s == 1 && fx <= 1 && fy <= 32) spec = {(void*)upfirdn2d_kernel_small, 1,32,8, 1}; + } + return spec; +} + +//------------------------------------------------------------------------ +// Template specializations. + +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel (const upfirdn2d_kernel_params& p); +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel (const upfirdn2d_kernel_params& p); +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/upfirdn2d.h b/eg3d/torch_utils/ops/upfirdn2d.h new file mode 100644 index 0000000000000000000000000000000000000000..d5de893d6489921d4689ac1e2cdb45da9a253f18 --- /dev/null +++ b/eg3d/torch_utils/ops/upfirdn2d.h @@ -0,0 +1,63 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include + +//------------------------------------------------------------------------ +// CUDA kernel parameters. + +struct upfirdn2d_kernel_params +{ + const void* x; + const float* f; + void* y; + + int2 up; + int2 down; + int2 pad0; + int flip; + float gain; + + int4 inSize; // [width, height, channel, batch] + int4 inStride; + int2 filterSize; // [width, height] + int2 filterStride; + int4 outSize; // [width, height, channel, batch] + int4 outStride; + int sizeMinor; + int sizeMajor; + + int loopMinor; + int loopMajor; + int loopX; + int launchMinor; + int launchMajor; +}; + +//------------------------------------------------------------------------ +// CUDA kernel specialization. + +struct upfirdn2d_kernel_spec +{ + void* kernel; + int tileOutW; + int tileOutH; + int loopMinor; + int loopX; +}; + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/eg3d/torch_utils/ops/upfirdn2d.py b/eg3d/torch_utils/ops/upfirdn2d.py new file mode 100644 index 0000000000000000000000000000000000000000..5d634714167043daf63ec7f643ddd85d98d926dc --- /dev/null +++ b/eg3d/torch_utils/ops/upfirdn2d.py @@ -0,0 +1,391 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Custom PyTorch ops for efficient resampling of 2D images.""" + +import os +import numpy as np +import torch + +from .. import custom_ops +from .. import misc +from . 
import conv2d_gradfix + +#---------------------------------------------------------------------------- + +_plugin = None + +def _init(): + global _plugin + if _plugin is None: + _plugin = custom_ops.get_plugin( + module_name='upfirdn2d_plugin', + sources=['upfirdn2d.cpp', 'upfirdn2d.cu'], + headers=['upfirdn2d.h'], + source_dir=os.path.dirname(__file__), + extra_cuda_cflags=['--use_fast_math'], + ) + return True + +def _parse_scaling(scaling): + if isinstance(scaling, int): + scaling = [scaling, scaling] + assert isinstance(scaling, (list, tuple)) + assert all(isinstance(x, int) for x in scaling) + sx, sy = scaling + assert sx >= 1 and sy >= 1 + return sx, sy + +def _parse_padding(padding): + if isinstance(padding, int): + padding = [padding, padding] + assert isinstance(padding, (list, tuple)) + assert all(isinstance(x, int) for x in padding) + if len(padding) == 2: + padx, pady = padding + padding = [padx, padx, pady, pady] + padx0, padx1, pady0, pady1 = padding + return padx0, padx1, pady0, pady1 + +def _get_filter_size(f): + if f is None: + return 1, 1 + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + fw = f.shape[-1] + fh = f.shape[0] + with misc.suppress_tracer_warnings(): + fw = int(fw) + fh = int(fh) + misc.assert_shape(f, [fh, fw][:f.ndim]) + assert fw >= 1 and fh >= 1 + return fw, fh + +#---------------------------------------------------------------------------- + +def setup_filter(f, device=torch.device('cpu'), normalize=True, flip_filter=False, gain=1, separable=None): + r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`. + + Args: + f: Torch tensor, numpy array, or python list of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), + `[]` (impulse), or + `None` (identity). + device: Result device (default: cpu). + normalize: Normalize the filter so that it retains the magnitude + for constant input signal (DC)? (default: True). + flip_filter: Flip the filter? (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + separable: Return a separable filter? (default: select automatically). + + Returns: + Float32 tensor of the shape + `[filter_height, filter_width]` (non-separable) or + `[filter_taps]` (separable). + """ + # Validate. + if f is None: + f = 1 + f = torch.as_tensor(f, dtype=torch.float32) + assert f.ndim in [0, 1, 2] + assert f.numel() > 0 + if f.ndim == 0: + f = f[np.newaxis] + + # Separable? + if separable is None: + separable = (f.ndim == 1 and f.numel() >= 8) + if f.ndim == 1 and not separable: + f = f.ger(f) + assert f.ndim == (1 if separable else 2) + + # Apply normalize, flip, gain, and device. + if normalize: + f /= f.sum() + if flip_filter: + f = f.flip(list(range(f.ndim))) + f = f * (gain ** (f.ndim / 2)) + f = f.to(device=device) + return f + +#---------------------------------------------------------------------------- + +def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl='cuda'): + r"""Pad, upsample, filter, and downsample a batch of 2D images. + + Performs the following sequence of operations for each channel: + + 1. Upsample the image by inserting N-1 zeros after each pixel (`up`). + + 2. Pad the image with the specified number of zeros on each side (`padding`). + Negative padding corresponds to cropping the image. + + 3. Convolve the image with the specified 2D FIR filter (`f`), shrinking it + so that the footprint of all output pixels lies within the input image. + + 4. Downsample the image by keeping every Nth pixel (`down`). 
+ + This sequence of operations bears close resemblance to scipy.signal.upfirdn(). + The fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports gradients of arbitrary order. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + up: Integer upsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + down: Integer downsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + assert isinstance(x, torch.Tensor) + assert impl in ['ref', 'cuda'] + if impl == 'cuda' and x.device.type == 'cuda' and _init(): + return _upfirdn2d_cuda(up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain).apply(x, f) + return _upfirdn2d_ref(x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain) + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1): + """Slow reference implementation of `upfirdn2d()` using standard PyTorch ops. + """ + # Validate arguments. + assert isinstance(x, torch.Tensor) and x.ndim == 4 + if f is None: + f = torch.ones([1, 1], dtype=torch.float32, device=x.device) + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + assert f.dtype == torch.float32 and not f.requires_grad + batch_size, num_channels, in_height, in_width = x.shape + upx, upy = _parse_scaling(up) + downx, downy = _parse_scaling(down) + padx0, padx1, pady0, pady1 = _parse_padding(padding) + + # Check that upsampled buffer is not smaller than the filter. + upW = in_width * upx + padx0 + padx1 + upH = in_height * upy + pady0 + pady1 + assert upW >= f.shape[-1] and upH >= f.shape[0] + + # Upsample by inserting zeros. + x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1]) + x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1]) + x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx]) + + # Pad or crop. + x = torch.nn.functional.pad(x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)]) + x = x[:, :, max(-pady0, 0) : x.shape[2] - max(-pady1, 0), max(-padx0, 0) : x.shape[3] - max(-padx1, 0)] + + # Setup filter. + f = f * (gain ** (f.ndim / 2)) + f = f.to(x.dtype) + if not flip_filter: + f = f.flip(list(range(f.ndim))) + + # Convolve with the filter. + f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim) + if f.ndim == 4: + x = conv2d_gradfix.conv2d(input=x, weight=f, groups=num_channels) + else: + x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels) + x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels) + + # Downsample by throwing away pixels. 
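A short usage sketch of `setup_filter()` with `upfirdn2d()` above and the `upsample2d()` wrapper that follows (the `'ref'` implementation runs in pure PyTorch, so no CUDA extension build is needed); the filter taps and shapes are example values, and the import path assumes `eg3d/` is on `PYTHONPATH`:

```python
import torch
from torch_utils.ops import upfirdn2d

x = torch.randn(2, 3, 16, 16)
f = upfirdn2d.setup_filter([1, 3, 3, 1])       # 4-tap binomial, normalized; stored as a 4x4 outer product

# 2x upsampling: insert zeros, pad, filter, keep every pixel.
y = upfirdn2d.upsample2d(x, f, up=2, impl='ref')
print(y.shape)                                  # torch.Size([2, 3, 32, 32])

# The same result spelled out through the generic op, using the padding and
# gain that upsample2d() derives for a 4-tap filter and up=2:
y2 = upfirdn2d.upfirdn2d(x, f, up=2, padding=(2, 1, 2, 1), gain=4, impl='ref')
print(torch.allclose(y, y2))                    # True
```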
+ x = x[:, :, ::downy, ::downx] + return x + +#---------------------------------------------------------------------------- + +_upfirdn2d_cuda_cache = dict() + +def _upfirdn2d_cuda(up=1, down=1, padding=0, flip_filter=False, gain=1): + """Fast CUDA implementation of `upfirdn2d()` using custom ops. + """ + # Parse arguments. + upx, upy = _parse_scaling(up) + downx, downy = _parse_scaling(down) + padx0, padx1, pady0, pady1 = _parse_padding(padding) + + # Lookup from cache. + key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter, gain) + if key in _upfirdn2d_cuda_cache: + return _upfirdn2d_cuda_cache[key] + + # Forward op. + class Upfirdn2dCuda(torch.autograd.Function): + @staticmethod + def forward(ctx, x, f): # pylint: disable=arguments-differ + assert isinstance(x, torch.Tensor) and x.ndim == 4 + if f is None: + f = torch.ones([1, 1], dtype=torch.float32, device=x.device) + if f.ndim == 1 and f.shape[0] == 1: + f = f.square().unsqueeze(0) # Convert separable-1 into full-1x1. + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + y = x + if f.ndim == 2: + y = _plugin.upfirdn2d(y, f, upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter, gain) + else: + y = _plugin.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1, padx0, padx1, 0, 0, flip_filter, 1.0) + y = _plugin.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy, 0, 0, pady0, pady1, flip_filter, gain) + ctx.save_for_backward(f) + ctx.x_shape = x.shape + return y + + @staticmethod + def backward(ctx, dy): # pylint: disable=arguments-differ + f, = ctx.saved_tensors + _, _, ih, iw = ctx.x_shape + _, _, oh, ow = dy.shape + fw, fh = _get_filter_size(f) + p = [ + fw - padx0 - 1, + iw * upx - ow * downx + padx0 - upx + 1, + fh - pady0 - 1, + ih * upy - oh * downy + pady0 - upy + 1, + ] + dx = None + df = None + + if ctx.needs_input_grad[0]: + dx = _upfirdn2d_cuda(up=down, down=up, padding=p, flip_filter=(not flip_filter), gain=gain).apply(dy, f) + + assert not ctx.needs_input_grad[1] + return dx, df + + # Add to cache. + _upfirdn2d_cuda_cache[key] = Upfirdn2dCuda + return Upfirdn2dCuda + +#---------------------------------------------------------------------------- + +def filter2d(x, f, padding=0, flip_filter=False, gain=1, impl='cuda'): + r"""Filter a batch of 2D images using the given 2D FIR filter. + + By default, the result is padded so that its shape matches the input. + User-specified padding is applied on top of that, with negative values + indicating cropping. Pixels outside the image are assumed to be zero. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + padding: Padding with respect to the output. Can be a single number or a + list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. 
+ """ + padx0, padx1, pady0, pady1 = _parse_padding(padding) + fw, fh = _get_filter_size(f) + p = [ + padx0 + fw // 2, + padx1 + (fw - 1) // 2, + pady0 + fh // 2, + pady1 + (fh - 1) // 2, + ] + return upfirdn2d(x, f, padding=p, flip_filter=flip_filter, gain=gain, impl=impl) + +#---------------------------------------------------------------------------- + +def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl='cuda'): + r"""Upsample a batch of 2D images using the given 2D FIR filter. + + By default, the result is padded so that its shape is a multiple of the input. + User-specified padding is applied on top of that, with negative values + indicating cropping. Pixels outside the image are assumed to be zero. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + up: Integer upsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the output. Can be a single number or a + list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + upx, upy = _parse_scaling(up) + padx0, padx1, pady0, pady1 = _parse_padding(padding) + fw, fh = _get_filter_size(f) + p = [ + padx0 + (fw + upx - 1) // 2, + padx1 + (fw - upx) // 2, + pady0 + (fh + upy - 1) // 2, + pady1 + (fh - upy) // 2, + ] + return upfirdn2d(x, f, up=up, padding=p, flip_filter=flip_filter, gain=gain*upx*upy, impl=impl) + +#---------------------------------------------------------------------------- + +def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl='cuda'): + r"""Downsample a batch of 2D images using the given 2D FIR filter. + + By default, the result is padded so that its shape is a fraction of the input. + User-specified padding is applied on top of that, with negative values + indicating cropping. Pixels outside the image are assumed to be zero. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + down: Integer downsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the input. Can be a single number or a + list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. 
+ """ + downx, downy = _parse_scaling(down) + padx0, padx1, pady0, pady1 = _parse_padding(padding) + fw, fh = _get_filter_size(f) + p = [ + padx0 + (fw - downx + 1) // 2, + padx1 + (fw - downx) // 2, + pady0 + (fh - downy + 1) // 2, + pady1 + (fh - downy) // 2, + ] + return upfirdn2d(x, f, down=down, padding=p, flip_filter=flip_filter, gain=gain, impl=impl) + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/persistence.py b/eg3d/torch_utils/persistence.py new file mode 100644 index 0000000000000000000000000000000000000000..1abf9cbf2c92a631ab1ac22fc1b0b382e22a0af0 --- /dev/null +++ b/eg3d/torch_utils/persistence.py @@ -0,0 +1,253 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Facilities for pickling Python code alongside other data. + +The pickled code is automatically imported into a separate Python module +during unpickling. This way, any previously exported pickles will remain +usable even if the original code is no longer available, or if the current +version of the code is not consistent with what was originally pickled.""" + +import sys +import pickle +import io +import inspect +import copy +import uuid +import types +import dnnlib + +#---------------------------------------------------------------------------- + +_version = 6 # internal version number +_decorators = set() # {decorator_class, ...} +_import_hooks = [] # [hook_function, ...] +_module_to_src_dict = dict() # {module: src, ...} +_src_to_module_dict = dict() # {src: module, ...} + +#---------------------------------------------------------------------------- + +def persistent_class(orig_class): + r"""Class decorator that extends a given class to save its source code + when pickled. + + Example: + + from torch_utils import persistence + + @persistence.persistent_class + class MyNetwork(torch.nn.Module): + def __init__(self, num_inputs, num_outputs): + super().__init__() + self.fc = MyLayer(num_inputs, num_outputs) + ... + + @persistence.persistent_class + class MyLayer(torch.nn.Module): + ... + + When pickled, any instance of `MyNetwork` and `MyLayer` will save its + source code alongside other internal state (e.g., parameters, buffers, + and submodules). This way, any previously exported pickle will remain + usable even if the class definitions have been modified or are no + longer available. + + The decorator saves the source code of the entire Python module + containing the decorated class. It does *not* save the source code of + any imported modules. Thus, the imported modules must be available + during unpickling, also including `torch_utils.persistence` itself. + + It is ok to call functions defined in the same module from the + decorated class. However, if the decorated class depends on other + classes defined in the same module, they must be decorated as well. + This is illustrated in the above example in the case of `MyLayer`. + + It is also possible to employ the decorator just-in-time before + calling the constructor. 
For example: + + cls = MyLayer + if want_to_make_it_persistent: + cls = persistence.persistent_class(cls) + layer = cls(num_inputs, num_outputs) + + As an additional feature, the decorator also keeps track of the + arguments that were used to construct each instance of the decorated + class. The arguments can be queried via `obj.init_args` and + `obj.init_kwargs`, and they are automatically pickled alongside other + object state. A typical use case is to first unpickle a previous + instance of a persistent class, and then upgrade it to use the latest + version of the source code: + + with open('old_pickle.pkl', 'rb') as f: + old_net = pickle.load(f) + new_net = MyNetwork(*old_obj.init_args, **old_obj.init_kwargs) + misc.copy_params_and_buffers(old_net, new_net, require_all=True) + """ + assert isinstance(orig_class, type) + if is_persistent(orig_class): + return orig_class + + assert orig_class.__module__ in sys.modules + orig_module = sys.modules[orig_class.__module__] + orig_module_src = _module_to_src(orig_module) + + class Decorator(orig_class): + _orig_module_src = orig_module_src + _orig_class_name = orig_class.__name__ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._init_args = copy.deepcopy(args) + self._init_kwargs = copy.deepcopy(kwargs) + assert orig_class.__name__ in orig_module.__dict__ + _check_pickleable(self.__reduce__()) + + @property + def init_args(self): + return copy.deepcopy(self._init_args) + + @property + def init_kwargs(self): + return dnnlib.EasyDict(copy.deepcopy(self._init_kwargs)) + + def __reduce__(self): + fields = list(super().__reduce__()) + fields += [None] * max(3 - len(fields), 0) + if fields[0] is not _reconstruct_persistent_obj: + meta = dict(type='class', version=_version, module_src=self._orig_module_src, class_name=self._orig_class_name, state=fields[2]) + fields[0] = _reconstruct_persistent_obj # reconstruct func + fields[1] = (meta,) # reconstruct args + fields[2] = None # state dict + return tuple(fields) + + Decorator.__name__ = orig_class.__name__ + _decorators.add(Decorator) + return Decorator + +#---------------------------------------------------------------------------- + +def is_persistent(obj): + r"""Test whether the given object or class is persistent, i.e., + whether it will save its source code when pickled. + """ + try: + if obj in _decorators: + return True + except TypeError: + pass + return type(obj) in _decorators # pylint: disable=unidiomatic-typecheck + +#---------------------------------------------------------------------------- + +def import_hook(hook): + r"""Register an import hook that is called whenever a persistent object + is being unpickled. A typical use case is to patch the pickled source + code to avoid errors and inconsistencies when the API of some imported + module has changed. + + The hook should have the following signature: + + hook(meta) -> modified meta + + `meta` is an instance of `dnnlib.EasyDict` with the following fields: + + type: Type of the persistent object, e.g. `'class'`. + version: Internal version number of `torch_utils.persistence`. + module_src Original source code of the Python module. + class_name: Class name in the original Python module. + state: Internal state of the object. + + Example: + + @persistence.import_hook + def wreck_my_network(meta): + if meta.class_name == 'MyNetwork': + print('MyNetwork is being imported. 
I will wreck it!') + meta.module_src = meta.module_src.replace("True", "False") + return meta + """ + assert callable(hook) + _import_hooks.append(hook) + +#---------------------------------------------------------------------------- + +def _reconstruct_persistent_obj(meta): + r"""Hook that is called internally by the `pickle` module to unpickle + a persistent object. + """ + meta = dnnlib.EasyDict(meta) + meta.state = dnnlib.EasyDict(meta.state) + for hook in _import_hooks: + meta = hook(meta) + assert meta is not None + + assert meta.version == _version + module = _src_to_module(meta.module_src) + + assert meta.type == 'class' + orig_class = module.__dict__[meta.class_name] + decorator_class = persistent_class(orig_class) + obj = decorator_class.__new__(decorator_class) + + setstate = getattr(obj, '__setstate__', None) + if callable(setstate): + setstate(meta.state) # pylint: disable=not-callable + else: + obj.__dict__.update(meta.state) + return obj + +#---------------------------------------------------------------------------- + +def _module_to_src(module): + r"""Query the source code of a given Python module. + """ + src = _module_to_src_dict.get(module, None) + if src is None: + src = inspect.getsource(module) + _module_to_src_dict[module] = src + _src_to_module_dict[src] = module + return src + +def _src_to_module(src): + r"""Get or create a Python module for the given source code. + """ + module = _src_to_module_dict.get(src, None) + if module is None: + module_name = "_imported_module_" + uuid.uuid4().hex + module = types.ModuleType(module_name) + sys.modules[module_name] = module + _module_to_src_dict[module] = src + _src_to_module_dict[src] = module + exec(src, module.__dict__) # pylint: disable=exec-used + return module + +#---------------------------------------------------------------------------- + +def _check_pickleable(obj): + r"""Check that the given object is pickleable, raising an exception if + it is not. This function is expected to be considerably more efficient + than actually pickling the object. + """ + def recurse(obj): + if isinstance(obj, (list, tuple, set)): + return [recurse(x) for x in obj] + if isinstance(obj, dict): + return [[recurse(x), recurse(y)] for x, y in obj.items()] + if isinstance(obj, (str, int, float, bool, bytes, bytearray)): + return None # Python primitive types are pickleable. + if f'{type(obj).__module__}.{type(obj).__name__}' in ['numpy.ndarray', 'torch.Tensor', 'torch.nn.parameter.Parameter']: + return None # NumPy arrays and PyTorch tensors are pickleable. + if is_persistent(obj): + return None # Persistent objects are pickleable, by virtue of the constructor check. + return obj + with io.BytesIO() as f: + pickle.dump(recurse(obj), f) + +#---------------------------------------------------------------------------- diff --git a/eg3d/torch_utils/training_stats.py b/eg3d/torch_utils/training_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..636dd7f9919632c84795265b7c472f1138c901b2 --- /dev/null +++ b/eg3d/torch_utils/training_stats.py @@ -0,0 +1,270 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
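A hedged sketch of the persistence mechanism described above: decorating a class records its constructor arguments and embeds the defining module's source into any pickle of its instances. The class and field names are made up; `eg3d/` is assumed to be on `PYTHONPATH` (for `dnnlib`), and the snippet should be run as a script so the module source can be retrieved:

```python
import pickle
import torch
from torch_utils import persistence

@persistence.persistent_class
class ToyLayer(torch.nn.Module):          # hypothetical example class
    def __init__(self, channels):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.zeros(channels))

layer = ToyLayer(channels=8)
print(layer.init_kwargs)                  # {'channels': 8}, recorded by the decorator

blob = pickle.dumps(layer)                # module source travels inside the pickle
restored = pickle.loads(blob)             # falls back to the embedded source if the class is unavailable
print(type(restored).__name__, restored.weight.shape)
```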
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Facilities for reporting and collecting training statistics across +multiple processes and devices. The interface is designed to minimize +synchronization overhead as well as the amount of boilerplate in user +code.""" + +import re +import numpy as np +import torch +import dnnlib + +from . import misc + +#---------------------------------------------------------------------------- + +_num_moments = 3 # [num_scalars, sum_of_scalars, sum_of_squares] +_reduce_dtype = torch.float32 # Data type to use for initial per-tensor reduction. +_counter_dtype = torch.float64 # Data type to use for the internal counters. +_rank = 0 # Rank of the current process. +_sync_device = None # Device to use for multiprocess communication. None = single-process. +_sync_called = False # Has _sync() been called yet? +_counters = dict() # Running counters on each device, updated by report(): name => device => torch.Tensor +_cumulative = dict() # Cumulative counters on the CPU, updated by _sync(): name => torch.Tensor + +#---------------------------------------------------------------------------- + +def init_multiprocessing(rank, sync_device): + r"""Initializes `torch_utils.training_stats` for collecting statistics + across multiple processes. + + This function must be called after + `torch.distributed.init_process_group()` and before `Collector.update()`. + The call is not necessary if multi-process collection is not needed. + + Args: + rank: Rank of the current process. + sync_device: PyTorch device to use for inter-process + communication, or None to disable multi-process + collection. Typically `torch.device('cuda', rank)`. + """ + global _rank, _sync_device + assert not _sync_called + _rank = rank + _sync_device = sync_device + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def report(name, value): + r"""Broadcasts the given set of scalars to all interested instances of + `Collector`, across device and process boundaries. + + This function is expected to be extremely cheap and can be safely + called from anywhere in the training loop, loss function, or inside a + `torch.nn.Module`. + + Warning: The current implementation expects the set of unique names to + be consistent across processes. Please make sure that `report()` is + called at least once for each unique name by each process, and in the + same order. If a given process has no scalars to broadcast, it can do + `report(name, [])` (empty list). + + Args: + name: Arbitrary string specifying the name of the statistic. + Averages are accumulated separately for each unique name. + value: Arbitrary set of scalars. Can be a list, tuple, + NumPy array, PyTorch tensor, or Python scalar. + + Returns: + The same `value` that was passed in. 
+ """ + if name not in _counters: + _counters[name] = dict() + + elems = torch.as_tensor(value) + if elems.numel() == 0: + return value + + elems = elems.detach().flatten().to(_reduce_dtype) + moments = torch.stack([ + torch.ones_like(elems).sum(), + elems.sum(), + elems.square().sum(), + ]) + assert moments.ndim == 1 and moments.shape[0] == _num_moments + moments = moments.to(_counter_dtype) + + device = moments.device + if device not in _counters[name]: + _counters[name][device] = torch.zeros_like(moments) + _counters[name][device].add_(moments) + return value + +#---------------------------------------------------------------------------- + +def report0(name, value): + r"""Broadcasts the given set of scalars by the first process (`rank = 0`), + but ignores any scalars provided by the other processes. + See `report()` for further details. + """ + report(name, value if _rank == 0 else []) + return value + +#---------------------------------------------------------------------------- + +class Collector: + r"""Collects the scalars broadcasted by `report()` and `report0()` and + computes their long-term averages (mean and standard deviation) over + user-defined periods of time. + + The averages are first collected into internal counters that are not + directly visible to the user. They are then copied to the user-visible + state as a result of calling `update()` and can then be queried using + `mean()`, `std()`, `as_dict()`, etc. Calling `update()` also resets the + internal counters for the next round, so that the user-visible state + effectively reflects averages collected between the last two calls to + `update()`. + + Args: + regex: Regular expression defining which statistics to + collect. The default is to collect everything. + keep_previous: Whether to retain the previous averages if no + scalars were collected on a given round + (default: True). + """ + def __init__(self, regex='.*', keep_previous=True): + self._regex = re.compile(regex) + self._keep_previous = keep_previous + self._cumulative = dict() + self._moments = dict() + self.update() + self._moments.clear() + + def names(self): + r"""Returns the names of all statistics broadcasted so far that + match the regular expression specified at construction time. + """ + return [name for name in _counters if self._regex.fullmatch(name)] + + def update(self): + r"""Copies current values of the internal counters to the + user-visible state and resets them for the next round. + + If `keep_previous=True` was specified at construction time, the + operation is skipped for statistics that have received no scalars + since the last update, retaining their previous averages. + + This method performs a number of GPU-to-CPU transfers and one + `torch.distributed.all_reduce()`. It is intended to be called + periodically in the main training loop, typically once every + N training steps. + """ + if not self._keep_previous: + self._moments.clear() + for name, cumulative in _sync(self.names()): + if name not in self._cumulative: + self._cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + delta = cumulative - self._cumulative[name] + self._cumulative[name].copy_(cumulative) + if float(delta[0]) != 0: + self._moments[name] = delta + + def _get_delta(self, name): + r"""Returns the raw moments that were accumulated for the given + statistic between the last two calls to `update()`, or zero if + no scalars were collected. 
+ """ + assert self._regex.fullmatch(name) + if name not in self._moments: + self._moments[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + return self._moments[name] + + def num(self, name): + r"""Returns the number of scalars that were accumulated for the given + statistic between the last two calls to `update()`, or zero if + no scalars were collected. + """ + delta = self._get_delta(name) + return int(delta[0]) + + def mean(self, name): + r"""Returns the mean of the scalars that were accumulated for the + given statistic between the last two calls to `update()`, or NaN if + no scalars were collected. + """ + delta = self._get_delta(name) + if int(delta[0]) == 0: + return float('nan') + return float(delta[1] / delta[0]) + + def std(self, name): + r"""Returns the standard deviation of the scalars that were + accumulated for the given statistic between the last two calls to + `update()`, or NaN if no scalars were collected. + """ + delta = self._get_delta(name) + if int(delta[0]) == 0 or not np.isfinite(float(delta[1])): + return float('nan') + if int(delta[0]) == 1: + return float(0) + mean = float(delta[1] / delta[0]) + raw_var = float(delta[2] / delta[0]) + return np.sqrt(max(raw_var - np.square(mean), 0)) + + def as_dict(self): + r"""Returns the averages accumulated between the last two calls to + `update()` as an `dnnlib.EasyDict`. The contents are as follows: + + dnnlib.EasyDict( + NAME = dnnlib.EasyDict(num=FLOAT, mean=FLOAT, std=FLOAT), + ... + ) + """ + stats = dnnlib.EasyDict() + for name in self.names(): + stats[name] = dnnlib.EasyDict(num=self.num(name), mean=self.mean(name), std=self.std(name)) + return stats + + def __getitem__(self, name): + r"""Convenience getter. + `collector[name]` is a synonym for `collector.mean(name)`. + """ + return self.mean(name) + +#---------------------------------------------------------------------------- + +def _sync(names): + r"""Synchronize the global cumulative counters across devices and + processes. Called internally by `Collector.update()`. + """ + if len(names) == 0: + return [] + global _sync_called + _sync_called = True + + # Collect deltas within current rank. + deltas = [] + device = _sync_device if _sync_device is not None else torch.device('cpu') + for name in names: + delta = torch.zeros([_num_moments], dtype=_counter_dtype, device=device) + for counter in _counters[name].values(): + delta.add_(counter.to(device)) + counter.copy_(torch.zeros_like(counter)) + deltas.append(delta) + deltas = torch.stack(deltas) + + # Sum deltas across ranks. + if _sync_device is not None: + torch.distributed.all_reduce(deltas) + + # Update cumulative values. + deltas = deltas.cpu() + for idx, name in enumerate(names): + if name not in _cumulative: + _cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + _cumulative[name].add_(deltas[idx]) + + # Return name-value pairs. + return [(name, _cumulative[name]) for name in names] + +#---------------------------------------------------------------------------- diff --git a/eg3d/train.py b/eg3d/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ea30ed58d06409d1d892b916bf44298d0acd3278 --- /dev/null +++ b/eg3d/train.py @@ -0,0 +1,415 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Train a GAN using the techniques described in the paper +"Efficient Geometry-aware 3D Generative Adversarial Networks." + +Code adapted from +"Alias-Free Generative Adversarial Networks".""" + +import os +import click +import re +import json +import tempfile +import torch + +import dnnlib +from training import training_loop +from metrics import metric_main +from torch_utils import training_stats +from torch_utils import custom_ops + +#---------------------------------------------------------------------------- + +def subprocess_fn(rank, c, temp_dir): + dnnlib.util.Logger(file_name=os.path.join(c.run_dir, 'log.txt'), file_mode='a', should_flush=True) + + # Init torch.distributed. + if c.num_gpus > 1: + init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init')) + if os.name == 'nt': + init_method = 'file:///' + init_file.replace('\\', '/') + torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=c.num_gpus) + else: + init_method = f'file://{init_file}' + torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=c.num_gpus) + + # Init torch_utils. + sync_device = torch.device('cuda', rank) if c.num_gpus > 1 else None + training_stats.init_multiprocessing(rank=rank, sync_device=sync_device) + if rank != 0: + custom_ops.verbosity = 'none' + + # Execute training loop. + training_loop.training_loop(rank=rank, **c) + +#---------------------------------------------------------------------------- + +def launch_training(c, desc, outdir, dry_run): + dnnlib.util.Logger(should_flush=True) + + # Pick output directory. + prev_run_dirs = [] + if os.path.isdir(outdir): + prev_run_dirs = [x for x in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, x))] + prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs] + prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None] + cur_run_id = max(prev_run_ids, default=-1) + 1 + c.run_dir = os.path.join(outdir, f'{cur_run_id:05d}-{desc}') + assert not os.path.exists(c.run_dir) + + # Print options. + print() + print('Training options:') + print(json.dumps(c, indent=2)) + print() + print(f'Output directory: {c.run_dir}') + print(f'Number of GPUs: {c.num_gpus}') + print(f'Batch size: {c.batch_size} images') + print(f'Training duration: {c.total_kimg} kimg') + print(f'Dataset path: {c.training_set_kwargs.path}') + print(f'Dataset size: {c.training_set_kwargs.max_size} images') + print(f'Dataset resolution: {c.training_set_kwargs.resolution}') + print(f'Dataset labels: {c.training_set_kwargs.use_labels}') + print(f'Dataset x-flips: {c.training_set_kwargs.xflip}') + print() + + # Dry run? + if dry_run: + print('Dry run; exiting.') + return + + # Create output directory. + print('Creating output directory...') + os.makedirs(c.run_dir) + with open(os.path.join(c.run_dir, 'training_options.json'), 'wt') as f: + json.dump(c, f, indent=2) + + # Launch processes. 
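+    # With a single GPU the training loop runs directly in this process; with
+    # more, one subprocess is spawned per GPU and every rank joins the same
+    # torch.distributed group through a file:// init method rooted in a shared
+    # temporary directory (see subprocess_fn above).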
+ print('Launching processes...') + torch.multiprocessing.set_start_method('spawn') + with tempfile.TemporaryDirectory() as temp_dir: + if c.num_gpus == 1: + subprocess_fn(rank=0, c=c, temp_dir=temp_dir) + else: + torch.multiprocessing.spawn(fn=subprocess_fn, args=(c, temp_dir), nprocs=c.num_gpus) + +#---------------------------------------------------------------------------- + +def init_dataset_kwargs(data, max_size): + try: + dataset_kwargs = dnnlib.EasyDict(class_name='training.dataset.ImageFolderDataset', path=data, use_labels=True, max_size=max_size, xflip=False) + dataset_obj = dnnlib.util.construct_class_by_name(**dataset_kwargs) # Subclass of training.dataset.Dataset. + dataset_kwargs.resolution = dataset_obj.resolution # Be explicit about resolution. + dataset_kwargs.use_labels = dataset_obj.has_labels # Be explicit about labels. + dataset_kwargs.max_size = len(dataset_obj) # Be explicit about dataset size. + return dataset_kwargs, dataset_obj.name + except IOError as err: + raise click.ClickException(f'--data: {err}') + +#---------------------------------------------------------------------------- + +def parse_comma_separated_list(s): + if isinstance(s, list): + return s + if s is None or s.lower() == 'none' or s == '': + return [] + return s.split(',') + +#---------------------------------------------------------------------------- + +@click.command() + +# Required. +@click.option('--outdir', help='Where to save the results', metavar='DIR', required=True) +@click.option('--cfg', help='Base configuration', type=str, required=True) +@click.option('--data', help='Training data', metavar='[ZIP|DIR]', type=str, required=True) +@click.option('--data_max_size',help='Training data max size', metavar='INT', type=click.IntRange(min=1), default=None) +@click.option('--gpus', help='Number of GPUs to use', metavar='INT', type=click.IntRange(min=1), required=True) +@click.option('--batch', help='Total batch size', metavar='INT', type=click.IntRange(min=1), required=True) +@click.option('--gamma', help='R1 regularization weight', metavar='FLOAT', type=click.FloatRange(min=0), required=True) + +# Optional features. +@click.option('--cond', help='Train conditional model', metavar='BOOL', type=bool, default=True, show_default=True) +@click.option('--mirror', help='Enable dataset x-flips', metavar='BOOL', type=bool, default=False, show_default=True) +@click.option('--aug', help='Augmentation mode', type=click.Choice(['noaug', 'ada', 'fixed']), default='noaug', show_default=True) +@click.option('--resume', help='Resume from given network pickle', metavar='[PATH|URL]', type=str) +@click.option('--freezed', help='Freeze first layers of D', metavar='INT', type=click.IntRange(min=0), default=0, show_default=True) + +# Misc hyperparameters. +@click.option('--p', help='Probability for --aug=fixed', metavar='FLOAT', type=click.FloatRange(min=0, max=1), default=0.2, show_default=True) +@click.option('--target', help='Target value for --aug=ada', metavar='FLOAT', type=click.FloatRange(min=0, max=1), default=0.6, show_default=True) +@click.option('--batch-gpu', help='Limit batch size per GPU', metavar='INT', type=click.IntRange(min=1)) +@click.option('--cbase', help='Capacity multiplier', metavar='INT', type=click.IntRange(min=1), default=32768, show_default=True) +@click.option('--cmax', help='Max. 
feature maps', metavar='INT', type=click.IntRange(min=1), default=512, show_default=True) +@click.option('--glr', help='G learning rate [default: varies]', metavar='FLOAT', type=click.FloatRange(min=0)) +@click.option('--dlr', help='D learning rate', metavar='FLOAT', type=click.FloatRange(min=0), default=0.002, show_default=True) +@click.option('--map-depth', help='Mapping network depth [default: varies]', metavar='INT', type=click.IntRange(min=1), default=2, show_default=True) +@click.option('--mbstd-group', help='Minibatch std group size', metavar='INT', type=click.IntRange(min=1), default=4, show_default=True) + +# Misc settings. +@click.option('--desc', help='String to include in result dir name', metavar='STR', type=str) +@click.option('--metrics', help='Quality metrics', metavar='[NAME|A,B,C|none]', type=parse_comma_separated_list, default='fid50k_full', show_default=True) +@click.option('--kimg', help='Total training duration', metavar='KIMG', type=click.IntRange(min=1), default=500, show_default=True) +@click.option('--tick', help='How often to print progress', metavar='KIMG', type=click.IntRange(min=1), default=1, show_default=True) +@click.option('--snap', help='How often to save snapshots', metavar='TICKS', type=click.IntRange(min=1), default=10, show_default=True) +@click.option('--seed', help='Random seed', metavar='INT', type=click.IntRange(min=0), default=0, show_default=True) +# @click.option('--fp32', help='Disable mixed-precision', metavar='BOOL', type=bool, default=False, show_default=True) +@click.option('--nobench', help='Disable cuDNN benchmarking', metavar='BOOL', type=bool, default=False, show_default=True) +@click.option('--freeze_dec_sr',help=' ', metavar='BOOL', type=bool, default=False, show_default=True) +@click.option('--workers', help='DataLoader worker processes', metavar='INT', type=click.IntRange(min=0), default=3, show_default=True) +@click.option('-n','--dry-run', help='Print training options and exit', is_flag=True) + +# @click.option('--sr_module', help='Superresolution module', metavar='STR', type=str, required=True) +@click.option('--neural_rendering_resolution_initial', help='Resolution to render at', metavar='INT', type=click.IntRange(min=1), default=64, required=False) +@click.option('--neural_rendering_resolution_final', help='Final resolution to render at, if blending', metavar='INT', type=click.IntRange(min=1), required=False, default=None) +@click.option('--neural_rendering_resolution_fade_kimg', help='Kimg to blend resolution over', metavar='INT', type=click.IntRange(min=0), required=False, default=1000, show_default=True) + +@click.option('--blur_fade_kimg', help='Blur over how many', metavar='INT', type=click.IntRange(min=1), required=False, default=200) +@click.option('--gen_pose_cond', help='If true, enable generator pose conditioning.', metavar='BOOL', type=bool, required=False, default=False) +@click.option('--c-scale', help='Scale factor for generator pose conditioning.', metavar='FLOAT', type=click.FloatRange(min=0), required=False, default=1) +@click.option('--c-noise', help='Add noise for generator pose conditioning.', metavar='FLOAT', type=click.FloatRange(min=0), required=False, default=0) +@click.option('--gpc_reg_prob', help='Strength of swapping regularization. None means no generator pose conditioning, i.e. 
condition with zeros.', metavar='FLOAT', type=click.FloatRange(min=0), required=False, default=0.5) +@click.option('--gpc_reg_fade_kimg', help='Length of swapping prob fade', metavar='INT', type=click.IntRange(min=0), required=False, default=1000) +@click.option('--disc_c_noise', help='Strength of discriminator pose conditioning regularization, in standard deviations.', metavar='FLOAT', type=click.FloatRange(min=0), required=False, default=0) +@click.option('--sr_noise_mode', help='Type of noise for superresolution', metavar='STR', type=click.Choice(['random', 'none']), required=False, default='none') +@click.option('--resume_blur', help='Enable to blur even on resume', metavar='BOOL', type=bool, required=False, default=False) +@click.option('--sr_num_fp16_res', help='Number of fp16 layers in superresolution', metavar='INT', type=click.IntRange(min=0), default=4, required=False, show_default=True) +@click.option('--g_num_fp16_res', help='Number of fp16 layers in generator', metavar='INT', type=click.IntRange(min=0), default=0, required=False, show_default=True) +@click.option('--d_num_fp16_res', help='Number of fp16 layers in discriminator', metavar='INT', type=click.IntRange(min=0), default=4, required=False, show_default=True) +@click.option('--sr_first_cutoff', help='First cutoff for AF superresolution', metavar='INT', type=click.IntRange(min=2), default=2, required=False, show_default=True) +@click.option('--sr_first_stopband', help='First cutoff for AF superresolution', metavar='FLOAT', type=click.FloatRange(min=2), default=2**2.1, required=False, show_default=True) +@click.option('--style_mixing_prob', help='Style-mixing regularization probability for training.', metavar='FLOAT', type=click.FloatRange(min=0, max=1), default=0, required=False, show_default=True) +@click.option('--sr-module', help='Superresolution module override', metavar='STR', type=str, required=False, default=None) +@click.option('--density_reg', help='Density regularization strength.', metavar='FLOAT', type=click.FloatRange(min=0), default=0.25, required=False, show_default=True) +@click.option('--density_reg_every', help='lazy density reg', metavar='int', type=click.FloatRange(min=1), default=4, required=False, show_default=True) +@click.option('--density_reg_p_dist', help='density regularization strength.', metavar='FLOAT', type=click.FloatRange(min=0), default=0.004, required=False, show_default=True) +@click.option('--reg_type', help='Type of regularization', metavar='STR', type=click.Choice(['l1', 'l1-alt', 'monotonic-detach', 'monotonic-fixed', 'total-variation']), required=False, default='l1') +@click.option('--decoder_lr_mul', help='decoder learning rate multiplier.', metavar='FLOAT', type=click.FloatRange(min=0), default=1, required=False, show_default=True) + +def main(**kwargs): + """Train a GAN using the techniques described in the paper + "Alias-Free Generative Adversarial Networks". + + Examples: + + \b + # Train StyleGAN3-T for AFHQv2 using 8 GPUs. + python train.py --outdir=~/training-runs --cfg=stylegan3-t --data=~/datasets/afhqv2-512x512.zip \\ + --gpus=8 --batch=32 --gamma=8.2 --mirror=1 + + \b + # Fine-tune StyleGAN3-R for MetFaces-U using 1 GPU, starting from the pre-trained FFHQ-U pickle. 
+ python train.py --outdir=~/training-runs --cfg=stylegan3-r --data=~/datasets/metfacesu-1024x1024.zip \\ + --gpus=8 --batch=32 --gamma=6.6 --mirror=1 --kimg=5000 --snap=5 \\ + --resume=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl + + \b + # Train StyleGAN2 for FFHQ at 1024x1024 resolution using 8 GPUs. + python train.py --outdir=~/training-runs --cfg=stylegan2 --data=~/datasets/ffhq-1024x1024.zip \\ + --gpus=8 --batch=32 --gamma=10 --mirror=1 --aug=noaug + """ + + # Initialize config. + opts = dnnlib.EasyDict(kwargs) # Command line arguments. + c = dnnlib.EasyDict() # Main config dict. + c.G_kwargs = dnnlib.EasyDict(class_name=None, z_dim=512, w_dim=512, mapping_kwargs=dnnlib.EasyDict()) + c.D_kwargs = dnnlib.EasyDict(class_name='training.networks_stylegan2.Discriminator', block_kwargs=dnnlib.EasyDict(), mapping_kwargs=dnnlib.EasyDict(), epilogue_kwargs=dnnlib.EasyDict()) + c.G_opt_kwargs = dnnlib.EasyDict(class_name='torch.optim.Adam', betas=[0,0.99], eps=1e-8) + c.D_opt_kwargs = dnnlib.EasyDict(class_name='torch.optim.Adam', betas=[0,0.99], eps=1e-8) + c.loss_kwargs = dnnlib.EasyDict(class_name='training.loss.StyleGAN2Loss') + c.data_loader_kwargs = dnnlib.EasyDict(pin_memory=True, prefetch_factor=2) + + # Training set. + c.training_set_kwargs, dataset_name = init_dataset_kwargs(data=opts.data, max_size=opts.data_max_size) + if opts.cond and not c.training_set_kwargs.use_labels: + raise click.ClickException('--cond=True requires labels specified in dataset.json') + c.training_set_kwargs.use_labels = opts.cond + c.training_set_kwargs.xflip = opts.mirror + + # Hyperparameters & settings. + c.freeze_dec_sr = opts.freeze_dec_sr + c.num_gpus = opts.gpus + c.batch_size = opts.batch + c.batch_gpu = opts.batch_gpu or opts.batch // opts.gpus + c.G_kwargs.channel_base = c.D_kwargs.channel_base = opts.cbase + c.G_kwargs.channel_max = c.D_kwargs.channel_max = opts.cmax + c.G_kwargs.mapping_kwargs.num_layers = opts.map_depth + c.D_kwargs.block_kwargs.freeze_layers = opts.freezed + c.D_kwargs.epilogue_kwargs.mbstd_group_size = opts.mbstd_group + c.loss_kwargs.r1_gamma = opts.gamma + c.G_opt_kwargs.lr = (0.002 if opts.cfg == 'stylegan2' else 0.0025) if opts.glr is None else opts.glr + c.D_opt_kwargs.lr = opts.dlr + c.metrics = opts.metrics + c.total_kimg = opts.kimg + c.kimg_per_tick = opts.tick + c.image_snapshot_ticks = c.network_snapshot_ticks = opts.snap + c.random_seed = c.training_set_kwargs.random_seed = opts.seed + c.data_loader_kwargs.num_workers = opts.workers + + # Sanity checks. + if c.batch_size % c.num_gpus != 0: + raise click.ClickException('--batch must be a multiple of --gpus') + if c.batch_size % (c.num_gpus * c.batch_gpu) != 0: + raise click.ClickException('--batch must be a multiple of --gpus times --batch-gpu') + if c.batch_gpu < c.D_kwargs.epilogue_kwargs.mbstd_group_size: + raise click.ClickException('--batch-gpu cannot be smaller than --mbstd') + if any(not metric_main.is_valid_metric(metric) for metric in c.metrics): + raise click.ClickException('\n'.join(['--metrics can only contain the following values:'] + metric_main.list_valid_metrics())) + + # Base configuration. + c.ema_kimg = c.batch_size * 10 / 32 + c.G_kwargs.class_name = 'training.triplane.TriPlaneGenerator' + c.D_kwargs.class_name = 'training.dual_discriminator.DualDiscriminator' + c.G_kwargs.fused_modconv_default = 'inference_only' # Speed up training by using regular convolutions instead of grouped convolutions. 
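+    # Note: TriPlaneGenerator renders a low-resolution neural rendering that a
+    # superresolution module upsamples, and DualDiscriminator is expected to see
+    # both the raw and the super-resolved image (dual_discrimination is enabled
+    # further below).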
+ c.loss_kwargs.filter_mode = 'antialiased' # Filter mode for raw images ['antialiased', 'none', float [0-1]] + c.D_kwargs.disc_c_noise = opts.disc_c_noise # Regularization for discriminator pose conditioning + + if opts.cfg =='cat': + sr_module = 'training.superresolution.SuperresolutionHybrid8XDC_afhq' + print("Hello") + else: + if c.training_set_kwargs.resolution == 512: + sr_module = 'training.superresolution.SuperresolutionHybrid8XDC' + elif c.training_set_kwargs.resolution == 256: + sr_module = 'training.superresolution.SuperresolutionHybrid4X' + elif c.training_set_kwargs.resolution == 128: + sr_module = 'training.superresolution.SuperresolutionHybrid2X' + else: + assert False, f"Unsupported resolution {c.training_set_kwargs.resolution}; make a new superresolution module" + + if opts.sr_module != None: + sr_module = opts.sr_module + + rendering_options = { + 'image_resolution': c.training_set_kwargs.resolution, + 'disparity_space_sampling': False, + 'clamp_mode': 'softplus', + 'superresolution_module': sr_module, + 'c_gen_conditioning_zero': not opts.gen_pose_cond, # if true, fill generator pose conditioning label with dummy zero vector + 'gpc_reg_prob': opts.gpc_reg_prob if opts.gen_pose_cond else None, + 'c_scale': opts.c_scale, # mutliplier for generator pose conditioning label + 'superresolution_noise_mode': opts.sr_noise_mode, # [random or none], whether to inject pixel noise into super-resolution layers + 'density_reg': opts.density_reg, # strength of density regularization + 'density_reg_p_dist': opts.density_reg_p_dist, # distance at which to sample perturbed points for density regularization + 'reg_type': opts.reg_type, # for experimenting with variations on density regularization + 'decoder_lr_mul': opts.decoder_lr_mul, # learning rate multiplier for decoder + 'sr_antialias': True, + } + + if opts.cfg == 'ffhq': + rendering_options.update({ + 'depth_resolution': 48, # number of uniform samples to take per ray. + 'depth_resolution_importance': 48, # number of importance samples to take per ray. + 'ray_start': 2.25, # near point along each ray to start taking samples. + 'ray_end': 3.3, # far point along each ray to stop taking samples. + 'box_warp': 1, # the side-length of the bounding box spanned by the tri-planes; box_warp=1 means [-0.5, -0.5, -0.5] -> [0.5, 0.5, 0.5]. + 'avg_camera_radius': 2.7, # used only in the visualizer to specify camera orbit radius. + 'avg_camera_pivot': [0, 0, 0.2], # used only in the visualizer to control center of camera rotation. 
+ }) + elif opts.cfg == 'afhq': + rendering_options.update({ + 'depth_resolution': 48, + 'depth_resolution_importance': 48, + 'ray_start': 2.25, + 'ray_end': 3.3, + 'box_warp': 1, + 'avg_camera_radius': 2.7, + 'avg_camera_pivot': [0, 0, -0.06], + }) + elif opts.cfg == 'cat': + rendering_options.update({ + 'depth_resolution': 48, + 'depth_resolution_importance': 48, + 'ray_start': 2.25, + 'ray_end': 3.3, + 'box_warp': 1, + 'avg_camera_radius': 2.7, + 'avg_camera_pivot': [0, 0, -0.06], + }) + elif opts.cfg == 'shapenet': + rendering_options.update({ + 'depth_resolution': 64, + 'depth_resolution_importance': 64, + 'ray_start': 0.1, + 'ray_end': 2.6, + 'box_warp': 1.6, + 'white_back': True, + 'avg_camera_radius': 1.7, + 'avg_camera_pivot': [0, 0, 0], + }) + else: + assert False, "Need to specify config" + + + + if opts.density_reg > 0: + c.G_reg_interval = opts.density_reg_every + c.G_kwargs.rendering_kwargs = rendering_options + c.G_kwargs.num_fp16_res = 0 + c.loss_kwargs.blur_init_sigma = 10 # Blur the images seen by the discriminator. + c.loss_kwargs.blur_fade_kimg = c.batch_size * opts.blur_fade_kimg / 32 # Fade out the blur during the first N kimg. + + c.loss_kwargs.gpc_reg_prob = opts.gpc_reg_prob if opts.gen_pose_cond else None + c.loss_kwargs.gpc_reg_fade_kimg = opts.gpc_reg_fade_kimg + c.loss_kwargs.dual_discrimination = True + c.loss_kwargs.neural_rendering_resolution_initial = opts.neural_rendering_resolution_initial + c.loss_kwargs.neural_rendering_resolution_final = opts.neural_rendering_resolution_final + c.loss_kwargs.neural_rendering_resolution_fade_kimg = opts.neural_rendering_resolution_fade_kimg + c.G_kwargs.sr_num_fp16_res = opts.sr_num_fp16_res + + c.G_kwargs.sr_kwargs = dnnlib.EasyDict(channel_base=opts.cbase, channel_max=opts.cmax, fused_modconv_default='inference_only') + + c.loss_kwargs.style_mixing_prob = opts.style_mixing_prob + + # Augmentation. + if opts.aug != 'noaug': + c.augment_kwargs = dnnlib.EasyDict(class_name='training.augment.AugmentPipe', xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1) + if opts.aug == 'ada': + c.ada_target = opts.target + if opts.aug == 'fixed': + c.augment_p = opts.p + + # Resume. + if opts.resume is not None: + c.resume_pkl = opts.resume + c.ada_kimg = 100 # Make ADA react faster at the beginning. + c.ema_rampup = None # Disable EMA rampup. + if not opts.resume_blur: + c.loss_kwargs.blur_init_sigma = 0 # Disable blur rampup. + c.loss_kwargs.gpc_reg_fade_kimg = 0 # Disable swapping rampup + + # Performance-related toggles. + # if opts.fp32: + # c.G_kwargs.num_fp16_res = c.D_kwargs.num_fp16_res = 0 + # c.G_kwargs.conv_clamp = c.D_kwargs.conv_clamp = None + c.G_kwargs.num_fp16_res = opts.g_num_fp16_res + c.G_kwargs.conv_clamp = 256 if opts.g_num_fp16_res > 0 else None + c.D_kwargs.num_fp16_res = opts.d_num_fp16_res + c.D_kwargs.conv_clamp = 256 if opts.d_num_fp16_res > 0 else None + + if opts.nobench: + c.cudnn_benchmark = False + + # Description string. + desc = f'{opts.cfg:s}-{dataset_name:s}-gpus{c.num_gpus:d}-batch{c.batch_size:d}-gamma{c.loss_kwargs.r1_gamma:g}' + if opts.desc is not None: + desc += f'-{opts.desc}' + + # Launch. 
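+    # Example invocation (a sketch only; the dataset path, GPU count and
+    # hyperparameter values are placeholders, not values prescribed by this
+    # script):
+    #   python train.py --outdir=training-runs --cfg=ffhq \
+    #       --data=datasets/ffhq-512x512.zip --gpus=8 --batch=32 --gamma=1
+    # With these options the run directory would be named something like
+    # 00000-ffhq-ffhq-512x512-gpus8-batch32-gamma1.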
+ launch_training(c=c, desc=desc, outdir=opts.outdir, dry_run=opts.dry_run) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/__init__.py b/eg3d/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/training/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# empty diff --git a/eg3d/training/augment.py b/eg3d/training/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..7b00a4ade50459c16e34fa4c132b2cb947cfff28 --- /dev/null +++ b/eg3d/training/augment.py @@ -0,0 +1,441 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Augmentation pipeline from the paper +"Training Generative Adversarial Networks with Limited Data". +Matches the original implementation by Karras et al. at +https://github.com/NVlabs/stylegan2-ada/blob/main/training/augment.py""" + +import numpy as np +import scipy.signal +import torch +from torch_utils import persistence +from torch_utils import misc +from torch_utils.ops import upfirdn2d +from torch_utils.ops import grid_sample_gradfix +from torch_utils.ops import conv2d_gradfix + +#---------------------------------------------------------------------------- +# Coefficients of various wavelet decomposition low-pass filters. 
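+#
+# These are FIR filter taps: AugmentPipe uses 'sym6' to build the orthogonal
+# lowpass filter for the geometric transformations (Hz_geom) and 'sym2' to
+# construct the four-band filter bank for image-space filtering (Hz_fbank).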
+ +wavelets = { + 'haar': [0.7071067811865476, 0.7071067811865476], + 'db1': [0.7071067811865476, 0.7071067811865476], + 'db2': [-0.12940952255092145, 0.22414386804185735, 0.836516303737469, 0.48296291314469025], + 'db3': [0.035226291882100656, -0.08544127388224149, -0.13501102001039084, 0.4598775021193313, 0.8068915093133388, 0.3326705529509569], + 'db4': [-0.010597401784997278, 0.032883011666982945, 0.030841381835986965, -0.18703481171888114, -0.02798376941698385, 0.6308807679295904, 0.7148465705525415, 0.23037781330885523], + 'db5': [0.003335725285001549, -0.012580751999015526, -0.006241490213011705, 0.07757149384006515, -0.03224486958502952, -0.24229488706619015, 0.13842814590110342, 0.7243085284385744, 0.6038292697974729, 0.160102397974125], + 'db6': [-0.00107730108499558, 0.004777257511010651, 0.0005538422009938016, -0.031582039318031156, 0.02752286553001629, 0.09750160558707936, -0.12976686756709563, -0.22626469396516913, 0.3152503517092432, 0.7511339080215775, 0.4946238903983854, 0.11154074335008017], + 'db7': [0.0003537138000010399, -0.0018016407039998328, 0.00042957797300470274, 0.012550998556013784, -0.01657454163101562, -0.03802993693503463, 0.0806126091510659, 0.07130921926705004, -0.22403618499416572, -0.14390600392910627, 0.4697822874053586, 0.7291320908465551, 0.39653931948230575, 0.07785205408506236], + 'db8': [-0.00011747678400228192, 0.0006754494059985568, -0.0003917403729959771, -0.00487035299301066, 0.008746094047015655, 0.013981027917015516, -0.04408825393106472, -0.01736930100202211, 0.128747426620186, 0.00047248457399797254, -0.2840155429624281, -0.015829105256023893, 0.5853546836548691, 0.6756307362980128, 0.3128715909144659, 0.05441584224308161], + 'sym2': [-0.12940952255092145, 0.22414386804185735, 0.836516303737469, 0.48296291314469025], + 'sym3': [0.035226291882100656, -0.08544127388224149, -0.13501102001039084, 0.4598775021193313, 0.8068915093133388, 0.3326705529509569], + 'sym4': [-0.07576571478927333, -0.02963552764599851, 0.49761866763201545, 0.8037387518059161, 0.29785779560527736, -0.09921954357684722, -0.012603967262037833, 0.0322231006040427], + 'sym5': [0.027333068345077982, 0.029519490925774643, -0.039134249302383094, 0.1993975339773936, 0.7234076904024206, 0.6339789634582119, 0.01660210576452232, -0.17532808990845047, -0.021101834024758855, 0.019538882735286728], + 'sym6': [0.015404109327027373, 0.0034907120842174702, -0.11799011114819057, -0.048311742585633, 0.4910559419267466, 0.787641141030194, 0.3379294217276218, -0.07263752278646252, -0.021060292512300564, 0.04472490177066578, 0.0017677118642428036, -0.007800708325034148], + 'sym7': [0.002681814568257878, -0.0010473848886829163, -0.01263630340325193, 0.03051551316596357, 0.0678926935013727, -0.049552834937127255, 0.017441255086855827, 0.5361019170917628, 0.767764317003164, 0.2886296317515146, -0.14004724044296152, -0.10780823770381774, 0.004010244871533663, 0.010268176708511255], + 'sym8': [-0.0033824159510061256, -0.0005421323317911481, 0.03169508781149298, 0.007607487324917605, -0.1432942383508097, -0.061273359067658524, 0.4813596512583722, 0.7771857517005235, 0.3644418948353314, -0.05194583810770904, -0.027219029917056003, 0.049137179673607506, 0.003808752013890615, -0.01495225833704823, -0.0003029205147213668, 0.0018899503327594609], +} + +#---------------------------------------------------------------------------- +# Helpers for constructing transformation matrices. 
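+#
+# Each helper returns a homogeneous transform (3x3 for 2D, 4x4 for 3D) that
+# broadcasts over batched tensor arguments, so transforms compose by matrix
+# multiplication. A minimal sketch (not code from this file):
+#   G_inv = scale2d_inv(s, s) @ rotate2d_inv(theta) @ translate2d_inv(tx, ty)
+# mirrors how AugmentPipe.forward() accumulates its inverse geometric
+# transform below.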
+ +def matrix(*rows, device=None): + assert all(len(row) == len(rows[0]) for row in rows) + elems = [x for row in rows for x in row] + ref = [x for x in elems if isinstance(x, torch.Tensor)] + if len(ref) == 0: + return misc.constant(np.asarray(rows), device=device) + assert device is None or device == ref[0].device + elems = [x if isinstance(x, torch.Tensor) else misc.constant(x, shape=ref[0].shape, device=ref[0].device) for x in elems] + return torch.stack(elems, dim=-1).reshape(ref[0].shape + (len(rows), -1)) + +def translate2d(tx, ty, **kwargs): + return matrix( + [1, 0, tx], + [0, 1, ty], + [0, 0, 1], + **kwargs) + +def translate3d(tx, ty, tz, **kwargs): + return matrix( + [1, 0, 0, tx], + [0, 1, 0, ty], + [0, 0, 1, tz], + [0, 0, 0, 1], + **kwargs) + +def scale2d(sx, sy, **kwargs): + return matrix( + [sx, 0, 0], + [0, sy, 0], + [0, 0, 1], + **kwargs) + +def scale3d(sx, sy, sz, **kwargs): + return matrix( + [sx, 0, 0, 0], + [0, sy, 0, 0], + [0, 0, sz, 0], + [0, 0, 0, 1], + **kwargs) + +def rotate2d(theta, **kwargs): + return matrix( + [torch.cos(theta), torch.sin(-theta), 0], + [torch.sin(theta), torch.cos(theta), 0], + [0, 0, 1], + **kwargs) + +def rotate3d(v, theta, **kwargs): + vx = v[..., 0]; vy = v[..., 1]; vz = v[..., 2] + s = torch.sin(theta); c = torch.cos(theta); cc = 1 - c + return matrix( + [vx*vx*cc+c, vx*vy*cc-vz*s, vx*vz*cc+vy*s, 0], + [vy*vx*cc+vz*s, vy*vy*cc+c, vy*vz*cc-vx*s, 0], + [vz*vx*cc-vy*s, vz*vy*cc+vx*s, vz*vz*cc+c, 0], + [0, 0, 0, 1], + **kwargs) + +def translate2d_inv(tx, ty, **kwargs): + return translate2d(-tx, -ty, **kwargs) + +def scale2d_inv(sx, sy, **kwargs): + return scale2d(1 / sx, 1 / sy, **kwargs) + +def rotate2d_inv(theta, **kwargs): + return rotate2d(-theta, **kwargs) + +#---------------------------------------------------------------------------- +# Versatile image augmentation pipeline from the paper +# "Training Generative Adversarial Networks with Limited Data". +# +# All augmentations are disabled by default; individual augmentations can +# be enabled by setting their probability multipliers to 1. + +@persistence.persistent_class +class AugmentPipe(torch.nn.Module): + def __init__(self, + xflip=0, rotate90=0, xint=0, xint_max=0.125, + scale=0, rotate=0, aniso=0, xfrac=0, scale_std=0.2, rotate_max=1, aniso_std=0.2, xfrac_std=0.125, + brightness=0, contrast=0, lumaflip=0, hue=0, saturation=0, brightness_std=0.2, contrast_std=0.5, hue_max=1, saturation_std=1, + imgfilter=0, imgfilter_bands=[1,1,1,1], imgfilter_std=1, + noise=0, cutout=0, noise_std=0.1, cutout_size=0.5, + ): + super().__init__() + self.register_buffer('p', torch.ones([])) # Overall multiplier for augmentation probability. + + # Pixel blitting. + self.xflip = float(xflip) # Probability multiplier for x-flip. + self.rotate90 = float(rotate90) # Probability multiplier for 90 degree rotations. + self.xint = float(xint) # Probability multiplier for integer translation. + self.xint_max = float(xint_max) # Range of integer translation, relative to image dimensions. + + # General geometric transformations. + self.scale = float(scale) # Probability multiplier for isotropic scaling. + self.rotate = float(rotate) # Probability multiplier for arbitrary rotation. + self.aniso = float(aniso) # Probability multiplier for anisotropic scaling. + self.xfrac = float(xfrac) # Probability multiplier for fractional translation. + self.scale_std = float(scale_std) # Log2 standard deviation of isotropic scaling. + self.rotate_max = float(rotate_max) # Range of arbitrary rotation, 1 = full circle. 
+ self.aniso_std = float(aniso_std) # Log2 standard deviation of anisotropic scaling. + self.xfrac_std = float(xfrac_std) # Standard deviation of frational translation, relative to image dimensions. + + # Color transformations. + self.brightness = float(brightness) # Probability multiplier for brightness. + self.contrast = float(contrast) # Probability multiplier for contrast. + self.lumaflip = float(lumaflip) # Probability multiplier for luma flip. + self.hue = float(hue) # Probability multiplier for hue rotation. + self.saturation = float(saturation) # Probability multiplier for saturation. + self.brightness_std = float(brightness_std) # Standard deviation of brightness. + self.contrast_std = float(contrast_std) # Log2 standard deviation of contrast. + self.hue_max = float(hue_max) # Range of hue rotation, 1 = full circle. + self.saturation_std = float(saturation_std) # Log2 standard deviation of saturation. + + # Image-space filtering. + self.imgfilter = float(imgfilter) # Probability multiplier for image-space filtering. + self.imgfilter_bands = list(imgfilter_bands) # Probability multipliers for individual frequency bands. + self.imgfilter_std = float(imgfilter_std) # Log2 standard deviation of image-space filter amplification. + + # Image-space corruptions. + self.noise = float(noise) # Probability multiplier for additive RGB noise. + self.cutout = float(cutout) # Probability multiplier for cutout. + self.noise_std = float(noise_std) # Standard deviation of additive RGB noise. + self.cutout_size = float(cutout_size) # Size of the cutout rectangle, relative to image dimensions. + + # Setup orthogonal lowpass filter for geometric augmentations. + self.register_buffer('Hz_geom', upfirdn2d.setup_filter(wavelets['sym6'])) + + # Construct filter bank for image-space filtering. + Hz_lo = np.asarray(wavelets['sym2']) # H(z) + Hz_hi = Hz_lo * ((-1) ** np.arange(Hz_lo.size)) # H(-z) + Hz_lo2 = np.convolve(Hz_lo, Hz_lo[::-1]) / 2 # H(z) * H(z^-1) / 2 + Hz_hi2 = np.convolve(Hz_hi, Hz_hi[::-1]) / 2 # H(-z) * H(-z^-1) / 2 + Hz_fbank = np.eye(4, 1) # Bandpass(H(z), b_i) + for i in range(1, Hz_fbank.shape[0]): + Hz_fbank = np.dstack([Hz_fbank, np.zeros_like(Hz_fbank)]).reshape(Hz_fbank.shape[0], -1)[:, :-1] + Hz_fbank = scipy.signal.convolve(Hz_fbank, [Hz_lo2]) + Hz_fbank[i, (Hz_fbank.shape[1] - Hz_hi2.size) // 2 : (Hz_fbank.shape[1] + Hz_hi2.size) // 2] += Hz_hi2 + self.register_buffer('Hz_fbank', torch.as_tensor(Hz_fbank, dtype=torch.float32)) + + def forward(self, images, debug_percentile=None): + assert isinstance(images, torch.Tensor) and images.ndim == 4 + batch_size, num_channels, height, width = images.shape + device = images.device + if debug_percentile is not None: + debug_percentile = torch.as_tensor(debug_percentile, dtype=torch.float32, device=device) + + # ------------------------------------- + # Select parameters for pixel blitting. + # ------------------------------------- + + # Initialize inverse homogeneous 2D transform: G_inv @ pixel_out ==> pixel_in + I_3 = torch.eye(3, device=device) + G_inv = I_3 + + # Apply x-flip with probability (xflip * strength). + if self.xflip > 0: + i = torch.floor(torch.rand([batch_size], device=device) * 2) + i = torch.where(torch.rand([batch_size], device=device) < self.xflip * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 2)) + G_inv = G_inv @ scale2d_inv(1 - 2 * i, 1) + + # Apply 90 degree rotations with probability (rotate90 * strength). 
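+        # (Same gating pattern as the x-flip above: a candidate value is drawn
+        # for every image in the batch and torch.where keeps it only for the
+        # images selected with probability rotate90 * p.)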
+ if self.rotate90 > 0: + i = torch.floor(torch.rand([batch_size], device=device) * 4) + i = torch.where(torch.rand([batch_size], device=device) < self.rotate90 * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 4)) + G_inv = G_inv @ rotate2d_inv(-np.pi / 2 * i) + + # Apply integer translation with probability (xint * strength). + if self.xint > 0: + t = (torch.rand([batch_size, 2], device=device) * 2 - 1) * self.xint_max + t = torch.where(torch.rand([batch_size, 1], device=device) < self.xint * self.p, t, torch.zeros_like(t)) + if debug_percentile is not None: + t = torch.full_like(t, (debug_percentile * 2 - 1) * self.xint_max) + G_inv = G_inv @ translate2d_inv(torch.round(t[:,0] * width), torch.round(t[:,1] * height)) + + # -------------------------------------------------------- + # Select parameters for general geometric transformations. + # -------------------------------------------------------- + + # Apply isotropic scaling with probability (scale * strength). + if self.scale > 0: + s = torch.exp2(torch.randn([batch_size], device=device) * self.scale_std) + s = torch.where(torch.rand([batch_size], device=device) < self.scale * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.scale_std)) + G_inv = G_inv @ scale2d_inv(s, s) + + # Apply pre-rotation with probability p_rot. + p_rot = 1 - torch.sqrt((1 - self.rotate * self.p).clamp(0, 1)) # P(pre OR post) = p + if self.rotate > 0: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.rotate_max + theta = torch.where(torch.rand([batch_size], device=device) < p_rot, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.full_like(theta, (debug_percentile * 2 - 1) * np.pi * self.rotate_max) + G_inv = G_inv @ rotate2d_inv(-theta) # Before anisotropic scaling. + + # Apply anisotropic scaling with probability (aniso * strength). + if self.aniso > 0: + s = torch.exp2(torch.randn([batch_size], device=device) * self.aniso_std) + s = torch.where(torch.rand([batch_size], device=device) < self.aniso * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.aniso_std)) + G_inv = G_inv @ scale2d_inv(s, 1 / s) + + # Apply post-rotation with probability p_rot. + if self.rotate > 0: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.rotate_max + theta = torch.where(torch.rand([batch_size], device=device) < p_rot, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.zeros_like(theta) + G_inv = G_inv @ rotate2d_inv(-theta) # After anisotropic scaling. + + # Apply fractional translation with probability (xfrac * strength). + if self.xfrac > 0: + t = torch.randn([batch_size, 2], device=device) * self.xfrac_std + t = torch.where(torch.rand([batch_size, 1], device=device) < self.xfrac * self.p, t, torch.zeros_like(t)) + if debug_percentile is not None: + t = torch.full_like(t, torch.erfinv(debug_percentile * 2 - 1) * self.xfrac_std) + G_inv = G_inv @ translate2d_inv(t[:,0] * width, t[:,1] * height) + + # ---------------------------------- + # Execute geometric transformations. + # ---------------------------------- + + # Execute if the transform is not identity. + if G_inv is not I_3: + + # Calculate padding. 
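+            # Transform the four image corners by G_inv to see how far sampling
+            # can reach, add a margin for the Hz_geom filter taps, and clamp the
+            # result so the reflection padding never exceeds the image size on
+            # any side.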
+ cx = (width - 1) / 2 + cy = (height - 1) / 2 + cp = matrix([-cx, -cy, 1], [cx, -cy, 1], [cx, cy, 1], [-cx, cy, 1], device=device) # [idx, xyz] + cp = G_inv @ cp.t() # [batch, xyz, idx] + Hz_pad = self.Hz_geom.shape[0] // 4 + margin = cp[:, :2, :].permute(1, 0, 2).flatten(1) # [xy, batch * idx] + margin = torch.cat([-margin, margin]).max(dim=1).values # [x0, y0, x1, y1] + margin = margin + misc.constant([Hz_pad * 2 - cx, Hz_pad * 2 - cy] * 2, device=device) + margin = margin.max(misc.constant([0, 0] * 2, device=device)) + margin = margin.min(misc.constant([width-1, height-1] * 2, device=device)) + mx0, my0, mx1, my1 = margin.ceil().to(torch.int32) + + # Pad image and adjust origin. + images = torch.nn.functional.pad(input=images, pad=[mx0,mx1,my0,my1], mode='reflect') + G_inv = translate2d((mx0 - mx1) / 2, (my0 - my1) / 2) @ G_inv + + # Upsample. + images = upfirdn2d.upsample2d(x=images, f=self.Hz_geom, up=2) + G_inv = scale2d(2, 2, device=device) @ G_inv @ scale2d_inv(2, 2, device=device) + G_inv = translate2d(-0.5, -0.5, device=device) @ G_inv @ translate2d_inv(-0.5, -0.5, device=device) + + # Execute transformation. + shape = [batch_size, num_channels, (height + Hz_pad * 2) * 2, (width + Hz_pad * 2) * 2] + G_inv = scale2d(2 / images.shape[3], 2 / images.shape[2], device=device) @ G_inv @ scale2d_inv(2 / shape[3], 2 / shape[2], device=device) + grid = torch.nn.functional.affine_grid(theta=G_inv[:,:2,:], size=shape, align_corners=False) + images = grid_sample_gradfix.grid_sample(images, grid) + + # Downsample and crop. + images = upfirdn2d.downsample2d(x=images, f=self.Hz_geom, down=2, padding=-Hz_pad*2, flip_filter=True) + + # -------------------------------------------- + # Select parameters for color transformations. + # -------------------------------------------- + + # Initialize homogeneous 3D transformation matrix: C @ color_in ==> color_out + I_4 = torch.eye(4, device=device) + C = I_4 + + # Apply brightness with probability (brightness * strength). + if self.brightness > 0: + b = torch.randn([batch_size], device=device) * self.brightness_std + b = torch.where(torch.rand([batch_size], device=device) < self.brightness * self.p, b, torch.zeros_like(b)) + if debug_percentile is not None: + b = torch.full_like(b, torch.erfinv(debug_percentile * 2 - 1) * self.brightness_std) + C = translate3d(b, b, b) @ C + + # Apply contrast with probability (contrast * strength). + if self.contrast > 0: + c = torch.exp2(torch.randn([batch_size], device=device) * self.contrast_std) + c = torch.where(torch.rand([batch_size], device=device) < self.contrast * self.p, c, torch.ones_like(c)) + if debug_percentile is not None: + c = torch.full_like(c, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.contrast_std)) + C = scale3d(c, c, c) @ C + + # Apply luma flip with probability (lumaflip * strength). + v = misc.constant(np.asarray([1, 1, 1, 0]) / np.sqrt(3), device=device) # Luma axis. + if self.lumaflip > 0: + i = torch.floor(torch.rand([batch_size, 1, 1], device=device) * 2) + i = torch.where(torch.rand([batch_size, 1, 1], device=device) < self.lumaflip * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 2)) + C = (I_4 - 2 * v.ger(v) * i) @ C # Householder reflection. + + # Apply hue rotation with probability (hue * strength). 
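+        # (Hue is modeled as a 3D rotation of the RGB color vector about the
+        # luma axis v defined above, so luma is preserved while chroma rotates.)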
+ if self.hue > 0 and num_channels > 1: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.hue_max + theta = torch.where(torch.rand([batch_size], device=device) < self.hue * self.p, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.full_like(theta, (debug_percentile * 2 - 1) * np.pi * self.hue_max) + C = rotate3d(v, theta) @ C # Rotate around v. + + # Apply saturation with probability (saturation * strength). + if self.saturation > 0 and num_channels > 1: + s = torch.exp2(torch.randn([batch_size, 1, 1], device=device) * self.saturation_std) + s = torch.where(torch.rand([batch_size, 1, 1], device=device) < self.saturation * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.saturation_std)) + C = (v.ger(v) + (I_4 - v.ger(v)) * s) @ C + + # ------------------------------ + # Execute color transformations. + # ------------------------------ + + # Execute if the transform is not identity. + if C is not I_4: + images = images.reshape([batch_size, num_channels, height * width]) + if num_channels == 3: + images = C[:, :3, :3] @ images + C[:, :3, 3:] + elif num_channels == 1: + C = C[:, :3, :].mean(dim=1, keepdims=True) + images = images * C[:, :, :3].sum(dim=2, keepdims=True) + C[:, :, 3:] + elif num_channels == 6: + images[:, :3] = C[:, :3, :3] @ images[:, :3] + C[:, :3, 3:] + images[:, 3:] = C[:, :3, :3] @ images[:, 3:] + C[:, :3, 3:] + else: + raise ValueError('Image must be RGB (3 channels) or L (1 channel)') + images = images.reshape([batch_size, num_channels, height, width]) + + # ---------------------- + # Image-space filtering. + # ---------------------- + + if self.imgfilter > 0: + num_bands = self.Hz_fbank.shape[0] + assert len(self.imgfilter_bands) == num_bands + expected_power = misc.constant(np.array([10, 1, 1, 1]) / 13, device=device) # Expected power spectrum (1/f). + + # Apply amplification for each band with probability (imgfilter * strength * band_strength). + g = torch.ones([batch_size, num_bands], device=device) # Global gain vector (identity). + for i, band_strength in enumerate(self.imgfilter_bands): + t_i = torch.exp2(torch.randn([batch_size], device=device) * self.imgfilter_std) + t_i = torch.where(torch.rand([batch_size], device=device) < self.imgfilter * self.p * band_strength, t_i, torch.ones_like(t_i)) + if debug_percentile is not None: + t_i = torch.full_like(t_i, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.imgfilter_std)) if band_strength > 0 else torch.ones_like(t_i) + t = torch.ones([batch_size, num_bands], device=device) # Temporary gain vector. + t[:, i] = t_i # Replace i'th element. + t = t / (expected_power * t.square()).sum(dim=-1, keepdims=True).sqrt() # Normalize power. + g = g * t # Accumulate into global gain. + + # Construct combined amplification filter. + Hz_prime = g @ self.Hz_fbank # [batch, tap] + Hz_prime = Hz_prime.unsqueeze(1).repeat([1, num_channels, 1]) # [batch, channels, tap] + Hz_prime = Hz_prime.reshape([batch_size * num_channels, 1, -1]) # [batch * channels, 1, tap] + + # Apply filter. 
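+            # The per-image amplification filter is separable: it is applied as
+            # a horizontal and then a vertical grouped convolution (one group per
+            # image-channel pair) over a reflect-padded copy of the batch.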
+ p = self.Hz_fbank.shape[1] // 2 + images = images.reshape([1, batch_size * num_channels, height, width]) + images = torch.nn.functional.pad(input=images, pad=[p,p,p,p], mode='reflect') + images = conv2d_gradfix.conv2d(input=images, weight=Hz_prime.unsqueeze(2), groups=batch_size*num_channels) + images = conv2d_gradfix.conv2d(input=images, weight=Hz_prime.unsqueeze(3), groups=batch_size*num_channels) + images = images.reshape([batch_size, num_channels, height, width]) + + # ------------------------ + # Image-space corruptions. + # ------------------------ + + # Apply additive RGB noise with probability (noise * strength). + if self.noise > 0: + sigma = torch.randn([batch_size, 1, 1, 1], device=device).abs() * self.noise_std + sigma = torch.where(torch.rand([batch_size, 1, 1, 1], device=device) < self.noise * self.p, sigma, torch.zeros_like(sigma)) + if debug_percentile is not None: + sigma = torch.full_like(sigma, torch.erfinv(debug_percentile) * self.noise_std) + images = images + torch.randn([batch_size, num_channels, height, width], device=device) * sigma + + # Apply cutout with probability (cutout * strength). + if self.cutout > 0: + size = torch.full([batch_size, 2, 1, 1, 1], self.cutout_size, device=device) + size = torch.where(torch.rand([batch_size, 1, 1, 1, 1], device=device) < self.cutout * self.p, size, torch.zeros_like(size)) + center = torch.rand([batch_size, 2, 1, 1, 1], device=device) + if debug_percentile is not None: + size = torch.full_like(size, self.cutout_size) + center = torch.full_like(center, debug_percentile) + coord_x = torch.arange(width, device=device).reshape([1, 1, 1, -1]) + coord_y = torch.arange(height, device=device).reshape([1, 1, -1, 1]) + mask_x = (((coord_x + 0.5) / width - center[:, 0]).abs() >= size[:, 0] / 2) + mask_y = (((coord_y + 0.5) / height - center[:, 1]).abs() >= size[:, 1] / 2) + mask = torch.logical_or(mask_x, mask_y).to(torch.float32) + images = images * mask + + return images + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/crosssection_utils.py b/eg3d/training/crosssection_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72d49f29534ae98b6c4cbd807677dcc55fd5e5ce --- /dev/null +++ b/eg3d/training/crosssection_utils.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
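+
+# Utility for visualizing the learned density field: sample_cross_section()
+# evaluates the generator's density (sigma) on a regular 2D grid of side
+# length `w` lying in the plane where the first coordinate axis is zero, and
+# returns the result as an [N, 1, resolution, resolution] map.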
+ +import torch + +def sample_cross_section(G, ws, resolution=256, w=1.2): + axis=0 + A, B = torch.meshgrid(torch.linspace(w/2, -w/2, resolution, device=ws.device), torch.linspace(-w/2, w/2, resolution, device=ws.device), indexing='ij') + A, B = A.reshape(-1, 1), B.reshape(-1, 1) + C = torch.zeros_like(A) + coordinates = [A, B] + coordinates.insert(axis, C) + coordinates = torch.cat(coordinates, dim=-1).expand(ws.shape[0], -1, -1) + + sigma = G.sample_mixed(coordinates, torch.randn_like(coordinates), ws)['sigma'] + return sigma.reshape(-1, 1, resolution, resolution) + +# if __name__ == '__main__': +# sample_crossection(None) \ No newline at end of file diff --git a/eg3d/training/dataset.py b/eg3d/training/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b4d7c4fb13d1541f9d11af92a76cc859d71f5547 --- /dev/null +++ b/eg3d/training/dataset.py @@ -0,0 +1,244 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Streaming images and labels from datasets created with dataset_tool.py.""" + +import os +import numpy as np +import zipfile +import PIL.Image +import json +import torch +import dnnlib + +try: + import pyspng +except ImportError: + pyspng = None + +#---------------------------------------------------------------------------- + +class Dataset(torch.utils.data.Dataset): + def __init__(self, + name, # Name of the dataset. + raw_shape, # Shape of the raw image data (NCHW). + max_size = None, # Artificially limit the size of the dataset. None = no limit. Applied before xflip. + use_labels = False, # Enable conditioning labels? False = label dimension is zero. + xflip = False, # Artificially double the size of the dataset via x-flips. Applied after max_size. + random_seed = 0, # Random seed to use when applying max_size. + ): + self._name = name + self._raw_shape = list(raw_shape) + self._use_labels = use_labels + self._raw_labels = None + self._label_shape = None + + # Apply max_size. + self._raw_idx = np.arange(self._raw_shape[0], dtype=np.int64) + if (max_size is not None) and (self._raw_idx.size > max_size): + np.random.RandomState(random_seed).shuffle(self._raw_idx) + self._raw_idx = np.sort(self._raw_idx[:max_size]) + + # Apply xflip. 
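+        # When enabled, every raw index is duplicated; the second copy is
+        # flagged in self._xflip and mirrored on load in __getitem__().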
+ self._xflip = np.zeros(self._raw_idx.size, dtype=np.uint8) + if xflip: + self._raw_idx = np.tile(self._raw_idx, 2) + self._xflip = np.concatenate([self._xflip, np.ones_like(self._xflip)]) + + def _get_raw_labels(self): + if self._raw_labels is None: + self._raw_labels = self._load_raw_labels() if self._use_labels else None + if self._raw_labels is None: + self._raw_labels = np.zeros([self._raw_shape[0], 0], dtype=np.float32) + assert isinstance(self._raw_labels, np.ndarray) + assert self._raw_labels.shape[0] == self._raw_shape[0] + assert self._raw_labels.dtype in [np.float32, np.int64] + if self._raw_labels.dtype == np.int64: + assert self._raw_labels.ndim == 1 + assert np.all(self._raw_labels >= 0) + self._raw_labels_std = self._raw_labels.std(0) + return self._raw_labels + + def close(self): # to be overridden by subclass + pass + + def _load_raw_image(self, raw_idx): # to be overridden by subclass + raise NotImplementedError + + def _load_raw_labels(self): # to be overridden by subclass + raise NotImplementedError + + def __getstate__(self): + return dict(self.__dict__, _raw_labels=None) + + def __del__(self): + try: + self.close() + except: + pass + + def __len__(self): + return self._raw_idx.size + + def __getitem__(self, idx): + image = self._load_raw_image(self._raw_idx[idx]) + assert isinstance(image, np.ndarray) + assert list(image.shape) == self.image_shape + assert image.dtype == np.uint8 + if self._xflip[idx]: + assert image.ndim == 3 # CHW + image = image[:, :, ::-1] + return image.copy(), self.get_label(idx) + + def get_label(self, idx): + label = self._get_raw_labels()[self._raw_idx[idx]] + if label.dtype == np.int64: + onehot = np.zeros(self.label_shape, dtype=np.float32) + onehot[label] = 1 + label = onehot + return label.copy() + + def get_details(self, idx): + d = dnnlib.EasyDict() + d.raw_idx = int(self._raw_idx[idx]) + d.xflip = (int(self._xflip[idx]) != 0) + d.raw_label = self._get_raw_labels()[d.raw_idx].copy() + return d + + def get_label_std(self): + return self._raw_labels_std + + @property + def name(self): + return self._name + + @property + def image_shape(self): + return list(self._raw_shape[1:]) + + @property + def num_channels(self): + assert len(self.image_shape) == 3 # CHW + return self.image_shape[0] + + @property + def resolution(self): + assert len(self.image_shape) == 3 # CHW + assert self.image_shape[1] == self.image_shape[2] + return self.image_shape[1] + + @property + def label_shape(self): + if self._label_shape is None: + raw_labels = self._get_raw_labels() + if raw_labels.dtype == np.int64: + self._label_shape = [int(np.max(raw_labels)) + 1] + else: + self._label_shape = raw_labels.shape[1:] + return list(self._label_shape) + + @property + def label_dim(self): + assert len(self.label_shape) == 1 + return self.label_shape[0] + + @property + def has_labels(self): + return any(x != 0 for x in self.label_shape) + + @property + def has_onehot_labels(self): + return self._get_raw_labels().dtype == np.int64 + +#---------------------------------------------------------------------------- + +class ImageFolderDataset(Dataset): + def __init__(self, + path, # Path to directory or zip. + resolution = None, # Ensure specific resolution, None = highest available. + **super_kwargs, # Additional arguments for the Dataset base class. 
+ ): + self._path = path + self._zipfile = None + + if os.path.isdir(self._path): + self._type = 'dir' + self._all_fnames = {os.path.relpath(os.path.join(root, fname), start=self._path) for root, _dirs, files in os.walk(self._path) for fname in files} + elif self._file_ext(self._path) == '.zip': + self._type = 'zip' + self._all_fnames = set(self._get_zipfile().namelist()) + else: + raise IOError('Path must point to a directory or zip') + + PIL.Image.init() + self._image_fnames = sorted(fname for fname in self._all_fnames if self._file_ext(fname) in PIL.Image.EXTENSION) + if len(self._image_fnames) == 0: + raise IOError('No image files found in the specified path') + + name = os.path.splitext(os.path.basename(self._path))[0] + raw_shape = [len(self._image_fnames)] + list(self._load_raw_image(0).shape) + if resolution is not None and (raw_shape[2] != resolution or raw_shape[3] != resolution): + raise IOError('Image files do not match the specified resolution') + super().__init__(name=name, raw_shape=raw_shape, **super_kwargs) + + @staticmethod + def _file_ext(fname): + return os.path.splitext(fname)[1].lower() + + def _get_zipfile(self): + assert self._type == 'zip' + if self._zipfile is None: + self._zipfile = zipfile.ZipFile(self._path) + return self._zipfile + + def _open_file(self, fname): + if self._type == 'dir': + return open(os.path.join(self._path, fname), 'rb') + if self._type == 'zip': + return self._get_zipfile().open(fname, 'r') + return None + + def close(self): + try: + if self._zipfile is not None: + self._zipfile.close() + finally: + self._zipfile = None + + def __getstate__(self): + return dict(super().__getstate__(), _zipfile=None) + + def _load_raw_image(self, raw_idx): + fname = self._image_fnames[raw_idx] + with self._open_file(fname) as f: + if pyspng is not None and self._file_ext(fname) == '.png': + image = pyspng.load(f.read()) + else: + image = np.array(PIL.Image.open(f)) + if image.ndim == 2: + image = image[:, :, np.newaxis] # HW => HWC + image = image.transpose(2, 0, 1) # HWC => CHW + return image + + def _load_raw_labels(self): + fname = 'dataset.json' + if fname not in self._all_fnames: + return None + with self._open_file(fname) as f: + labels = json.load(f)['labels'] + if labels is None: + return None + labels = dict(labels) + labels = [labels[fname.replace('\\', '/')] for fname in self._image_fnames] + labels = np.array(labels) + labels = labels.astype({1: np.int64, 2: np.float32}[labels.ndim]) + return labels + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/dual_discriminator.py b/eg3d/training/dual_discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..99bfb5a2a5b3b14c6824813b6977be86b43f7ccc --- /dev/null +++ b/eg3d/training/dual_discriminator.py @@ -0,0 +1,249 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +"""Discriminator architectures from the paper +"Efficient Geometry-aware 3D Generative Adversarial Networks".""" + +import numpy as np +import torch +from torch_utils import persistence +from torch_utils.ops import upfirdn2d +from training.networks_stylegan2 import DiscriminatorBlock, MappingNetwork, DiscriminatorEpilogue + +@persistence.persistent_class +class SingleDiscriminator(torch.nn.Module): + def __init__(self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + cmap_dim = None, # Dimensionality of mapped conditioning label, None = default. + sr_upsample_factor = 1, # Ignored for SingleDiscriminator + block_kwargs = {}, # Arguments for DiscriminatorBlock. + mapping_kwargs = {}, # Arguments for MappingNetwork. + epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue. + ): + super().__init__() + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)] + channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]} + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if cmap_dim is None: + cmap_dim = channels_dict[4] + if c_dim == 0: + cmap_dim = 0 + + common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp) + cur_layer_idx = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res] if res < img_resolution else 0 + tmp_channels = channels_dict[res] + out_channels = channels_dict[res // 2] + use_fp16 = (res >= fp16_resolution) + block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res, + first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs) + setattr(self, f'b{res}', block) + cur_layer_idx += block.num_layers + if c_dim > 0: + self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs) + self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs) + + def forward(self, img, c, update_emas=False, **block_kwargs): + img = img['image'] + + _ = update_emas # unused + x = None + for res in self.block_resolutions: + block = getattr(self, f'b{res}') + x, img = block(x, img, **block_kwargs) + + cmap = None + if self.c_dim > 0: + cmap = self.mapping(None, c) + x = self.b4(x, img, cmap) + return x + + def extra_repr(self): + return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}' + +#---------------------------------------------------------------------------- + +def filtered_resizing(image_orig_tensor, size, f, filter_mode='antialiased'): + if filter_mode == 'antialiased': + ada_filtered_64 = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False, antialias=True) + elif filter_mode == 'classic': + ada_filtered_64 = upfirdn2d.upsample2d(image_orig_tensor, f, up=2) + ada_filtered_64 
= torch.nn.functional.interpolate(ada_filtered_64, size=(size * 2 + 2, size * 2 + 2), mode='bilinear', align_corners=False) + ada_filtered_64 = upfirdn2d.downsample2d(ada_filtered_64, f, down=2, flip_filter=True, padding=-1) + elif filter_mode == 'none': + ada_filtered_64 = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False) + elif type(filter_mode) == float: + assert 0 < filter_mode < 1 + + filtered = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False, antialias=True) + aliased = torch.nn.functional.interpolate(image_orig_tensor, size=(size, size), mode='bilinear', align_corners=False, antialias=False) + ada_filtered_64 = (1 - filter_mode) * aliased + (filter_mode) * filtered + + return ada_filtered_64 + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class DualDiscriminator(torch.nn.Module): + def __init__(self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + cmap_dim = None, # Dimensionality of mapped conditioning label, None = default. + disc_c_noise = 0, # Corrupt camera parameters with X std dev of noise before disc. pose conditioning. + block_kwargs = {}, # Arguments for DiscriminatorBlock. + mapping_kwargs = {}, # Arguments for MappingNetwork. + epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue. 
+ ): + super().__init__() + img_channels *= 2 + + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)] + channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]} + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if cmap_dim is None: + cmap_dim = channels_dict[4] + if c_dim == 0: + cmap_dim = 0 + + common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp) + cur_layer_idx = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res] if res < img_resolution else 0 + tmp_channels = channels_dict[res] + out_channels = channels_dict[res // 2] + use_fp16 = (res >= fp16_resolution) + block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res, + first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs) + setattr(self, f'b{res}', block) + cur_layer_idx += block.num_layers + if c_dim > 0: + self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs) + self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + self.disc_c_noise = disc_c_noise + + def forward(self, img, c, update_emas=False, **block_kwargs): + image_raw = filtered_resizing(img['image_raw'], size=img['image'].shape[-1], f=self.resample_filter) + img = torch.cat([img['image'], image_raw], 1) + + _ = update_emas # unused + x = None + for res in self.block_resolutions: + block = getattr(self, f'b{res}') + x, img = block(x, img, **block_kwargs) + + cmap = None + if self.c_dim > 0: + if self.disc_c_noise > 0: c += torch.randn_like(c) * c.std(0) * self.disc_c_noise + cmap = self.mapping(None, c) + x = self.b4(x, img, cmap) + return x + + def extra_repr(self): + return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class DummyDualDiscriminator(torch.nn.Module): + def __init__(self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + cmap_dim = None, # Dimensionality of mapped conditioning label, None = default. + block_kwargs = {}, # Arguments for DiscriminatorBlock. + mapping_kwargs = {}, # Arguments for MappingNetwork. + epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue. 
+ ): + super().__init__() + img_channels *= 2 + + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)] + channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]} + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if cmap_dim is None: + cmap_dim = channels_dict[4] + if c_dim == 0: + cmap_dim = 0 + + common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp) + cur_layer_idx = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res] if res < img_resolution else 0 + tmp_channels = channels_dict[res] + out_channels = channels_dict[res // 2] + use_fp16 = (res >= fp16_resolution) + block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res, + first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs) + setattr(self, f'b{res}', block) + cur_layer_idx += block.num_layers + if c_dim > 0: + self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs) + self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + + self.raw_fade = 1 + + def forward(self, img, c, update_emas=False, **block_kwargs): + self.raw_fade = max(0, self.raw_fade - 1/(500000/32)) + + image_raw = filtered_resizing(img['image_raw'], size=img['image'].shape[-1], f=self.resample_filter) * self.raw_fade + img = torch.cat([img['image'], image_raw], 1) + + _ = update_emas # unused + x = None + for res in self.block_resolutions: + block = getattr(self, f'b{res}') + x, img = block(x, img, **block_kwargs) + + cmap = None + if self.c_dim > 0: + cmap = self.mapping(None, c) + x = self.b4(x, img, cmap) + return x + + def extra_repr(self): + return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}' + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/loss.py b/eg3d/training/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b2c637a6f81bb8d458449c355831c733fcb0cacd --- /dev/null +++ b/eg3d/training/loss.py @@ -0,0 +1,292 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
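# NOTE: StyleGAN2Loss below drives training through the phases 'Gmain'/'Greg'/
# 'Gboth' and 'Dmain'/'Dreg'/'Dboth'.  run_D() expects the same {'image',
# 'image_raw'} dict as the dual discriminator; real images are wrapped into that
# dict inside accumulate_gradients() via filtered_resizing().  Also note that the
# style-mixing branches inside the density-regularization blocks reference `z`
# and `c`, which are not defined in accumulate_gradients() (the surrounding code
# uses `gen_z` / `gen_c`); with the default style_mixing_prob=0 this path is
# never taken, but it would raise a NameError if style mixing were enabled.
# A minimal call sketch (illustrative only; `device`, `G`, `D`, and the batch
# tensors are assumed to exist):
#
#   loss = StyleGAN2Loss(device, G, D, r1_gamma=1.0, dual_discrimination=True)
#   loss.accumulate_gradients(phase='Gmain', real_img=real, real_c=real_c,
#                             gen_z=z, gen_c=c, gain=1.0, cur_nimg=0)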
+ +"""Loss functions.""" + +import numpy as np +import torch +from torch_utils import training_stats +from torch_utils.ops import conv2d_gradfix +from torch_utils.ops import upfirdn2d +from training.dual_discriminator import filtered_resizing + +#---------------------------------------------------------------------------- + +class Loss: + def accumulate_gradients(self, phase, real_img, real_c, gen_z, gen_c, gain, cur_nimg): # to be overridden by subclass + raise NotImplementedError() + +#---------------------------------------------------------------------------- + +class StyleGAN2Loss(Loss): + def __init__(self, device, G, D, augment_pipe=None, r1_gamma=10, style_mixing_prob=0, pl_weight=0, pl_batch_shrink=2, pl_decay=0.01, pl_no_weight_grad=False, blur_init_sigma=0, blur_fade_kimg=0, r1_gamma_init=0, r1_gamma_fade_kimg=0, neural_rendering_resolution_initial=64, neural_rendering_resolution_final=None, neural_rendering_resolution_fade_kimg=0, gpc_reg_fade_kimg=1000, gpc_reg_prob=None, dual_discrimination=False, filter_mode='antialiased'): + super().__init__() + self.device = device + self.G = G + self.D = D + self.augment_pipe = augment_pipe + self.r1_gamma = r1_gamma + self.style_mixing_prob = style_mixing_prob + self.pl_weight = pl_weight + self.pl_batch_shrink = pl_batch_shrink + self.pl_decay = pl_decay + self.pl_no_weight_grad = pl_no_weight_grad + self.pl_mean = torch.zeros([], device=device) + self.blur_init_sigma = blur_init_sigma + self.blur_fade_kimg = blur_fade_kimg + self.r1_gamma_init = r1_gamma_init + self.r1_gamma_fade_kimg = r1_gamma_fade_kimg + self.neural_rendering_resolution_initial = neural_rendering_resolution_initial + self.neural_rendering_resolution_final = neural_rendering_resolution_final + self.neural_rendering_resolution_fade_kimg = neural_rendering_resolution_fade_kimg + self.gpc_reg_fade_kimg = gpc_reg_fade_kimg + self.gpc_reg_prob = gpc_reg_prob + self.dual_discrimination = dual_discrimination + self.filter_mode = filter_mode + self.resample_filter = upfirdn2d.setup_filter([1,3,3,1], device=device) + self.blur_raw_target = True + assert self.gpc_reg_prob is None or (0 <= self.gpc_reg_prob <= 1) + + def run_G(self, z, c, swapping_prob, neural_rendering_resolution, update_emas=False): + if swapping_prob is not None: + c_swapped = torch.roll(c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand((c.shape[0], 1), device=c.device) < swapping_prob, c_swapped, c) + else: + c_gen_conditioning = torch.zeros_like(c) + + ws = self.G.mapping(z, c_gen_conditioning, update_emas=update_emas) + if self.style_mixing_prob > 0: + with torch.autograd.profiler.record_function('style_mixing'): + cutoff = torch.empty([], dtype=torch.int64, device=ws.device).random_(1, ws.shape[1]) + cutoff = torch.where(torch.rand([], device=ws.device) < self.style_mixing_prob, cutoff, torch.full_like(cutoff, ws.shape[1])) + ws[:, cutoff:] = self.G.mapping(torch.randn_like(z), c, update_emas=False)[:, cutoff:] + gen_output = self.G.synthesis(ws, c, neural_rendering_resolution=neural_rendering_resolution, update_emas=update_emas) + return gen_output, ws + + def run_D(self, img, c, blur_sigma=0, blur_sigma_raw=0, update_emas=False): + blur_size = np.floor(blur_sigma * 3) + if blur_size > 0: + with torch.autograd.profiler.record_function('blur'): + f = torch.arange(-blur_size, blur_size + 1, device=img['image'].device).div(blur_sigma).square().neg().exp2() + img['image'] = upfirdn2d.filter2d(img['image'], f / f.sum()) + + if self.augment_pipe is not None: + augmented_pair = 
self.augment_pipe(torch.cat([img['image'], + torch.nn.functional.interpolate(img['image_raw'], size=img['image'].shape[2:], mode='bilinear', antialias=True)], + dim=1)) + img['image'] = augmented_pair[:, :img['image'].shape[1]] + img['image_raw'] = torch.nn.functional.interpolate(augmented_pair[:, img['image'].shape[1]:], size=img['image_raw'].shape[2:], mode='bilinear', antialias=True) + + logits = self.D(img, c, update_emas=update_emas) + return logits + + def accumulate_gradients(self, phase, real_img, real_c, gen_z, gen_c, gain, cur_nimg): + assert phase in ['Gmain', 'Greg', 'Gboth', 'Dmain', 'Dreg', 'Dboth'] + if self.G.rendering_kwargs.get('density_reg', 0) == 0: + phase = {'Greg': 'none', 'Gboth': 'Gmain'}.get(phase, phase) + if self.r1_gamma == 0: + phase = {'Dreg': 'none', 'Dboth': 'Dmain'}.get(phase, phase) + blur_sigma = max(1 - cur_nimg / (self.blur_fade_kimg * 1e3), 0) * self.blur_init_sigma if self.blur_fade_kimg > 0 else 0 + r1_gamma = self.r1_gamma + + alpha = min(cur_nimg / (self.gpc_reg_fade_kimg * 1e3), 1) if self.gpc_reg_fade_kimg > 0 else 1 + swapping_prob = (1 - alpha) * 1 + alpha * self.gpc_reg_prob if self.gpc_reg_prob is not None else None + + if self.neural_rendering_resolution_final is not None: + alpha = min(cur_nimg / (self.neural_rendering_resolution_fade_kimg * 1e3), 1) + neural_rendering_resolution = int(np.rint(self.neural_rendering_resolution_initial * (1 - alpha) + self.neural_rendering_resolution_final * alpha)) + else: + neural_rendering_resolution = self.neural_rendering_resolution_initial + + real_img_raw = filtered_resizing(real_img, size=neural_rendering_resolution, f=self.resample_filter, filter_mode=self.filter_mode) + + if self.blur_raw_target: + blur_size = np.floor(blur_sigma * 3) + if blur_size > 0: + f = torch.arange(-blur_size, blur_size + 1, device=real_img_raw.device).div(blur_sigma).square().neg().exp2() + real_img_raw = upfirdn2d.filter2d(real_img_raw, f / f.sum()) + + real_img = {'image': real_img, 'image_raw': real_img_raw} + + # Gmain: Maximize logits for generated images. 
+ if phase in ['Gmain', 'Gboth']: + with torch.autograd.profiler.record_function('Gmain_forward'): + gen_img, _gen_ws = self.run_G(gen_z, gen_c, swapping_prob=swapping_prob, neural_rendering_resolution=neural_rendering_resolution) + gen_logits = self.run_D(gen_img, gen_c, blur_sigma=blur_sigma) + training_stats.report('Loss/scores/fake', gen_logits) + training_stats.report('Loss/signs/fake', gen_logits.sign()) + loss_Gmain = torch.nn.functional.softplus(-gen_logits) + training_stats.report('Loss/G/loss', loss_Gmain) + with torch.autograd.profiler.record_function('Gmain_backward'): + loss_Gmain.mean().mul(gain).backward() + + # Density Regularization + if phase in ['Greg', 'Gboth'] and self.G.rendering_kwargs.get('density_reg', 0) > 0 and self.G.rendering_kwargs['reg_type'] == 'l1': + if swapping_prob is not None: + c_swapped = torch.roll(gen_c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand([], device=gen_c.device) < swapping_prob, c_swapped, gen_c) + else: + c_gen_conditioning = torch.zeros_like(gen_c) + + ws = self.G.mapping(gen_z, c_gen_conditioning, update_emas=False) + if self.style_mixing_prob > 0: + with torch.autograd.profiler.record_function('style_mixing'): + cutoff = torch.empty([], dtype=torch.int64, device=ws.device).random_(1, ws.shape[1]) + cutoff = torch.where(torch.rand([], device=ws.device) < self.style_mixing_prob, cutoff, torch.full_like(cutoff, ws.shape[1])) + ws[:, cutoff:] = self.G.mapping(torch.randn_like(z), c, update_emas=False)[:, cutoff:] + initial_coordinates = torch.rand((ws.shape[0], 1000, 3), device=ws.device) * 2 - 1 + perturbed_coordinates = initial_coordinates + torch.randn_like(initial_coordinates) * self.G.rendering_kwargs['density_reg_p_dist'] + all_coordinates = torch.cat([initial_coordinates, perturbed_coordinates], dim=1) + sigma = self.G.sample_mixed(all_coordinates, torch.randn_like(all_coordinates), ws, update_emas=False)['sigma'] + sigma_initial = sigma[:, :sigma.shape[1]//2] + sigma_perturbed = sigma[:, sigma.shape[1]//2:] + + TVloss = torch.nn.functional.l1_loss(sigma_initial, sigma_perturbed) * self.G.rendering_kwargs['density_reg'] + TVloss.mul(gain).backward() + + # Alternative density regularization + if phase in ['Greg', 'Gboth'] and self.G.rendering_kwargs.get('density_reg', 0) > 0 and self.G.rendering_kwargs['reg_type'] == 'monotonic-detach': + if swapping_prob is not None: + c_swapped = torch.roll(gen_c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand([], device=gen_c.device) < swapping_prob, c_swapped, gen_c) + else: + c_gen_conditioning = torch.zeros_like(gen_c) + + ws = self.G.mapping(gen_z, c_gen_conditioning, update_emas=False) + + initial_coordinates = torch.rand((ws.shape[0], 2000, 3), device=ws.device) * 2 - 1 # Front + + perturbed_coordinates = initial_coordinates + torch.tensor([0, 0, -1], device=ws.device) * (1/256) * self.G.rendering_kwargs['box_warp'] # Behind + all_coordinates = torch.cat([initial_coordinates, perturbed_coordinates], dim=1) + sigma = self.G.sample_mixed(all_coordinates, torch.randn_like(all_coordinates), ws, update_emas=False)['sigma'] + sigma_initial = sigma[:, :sigma.shape[1]//2] + sigma_perturbed = sigma[:, sigma.shape[1]//2:] + + monotonic_loss = torch.relu(sigma_initial.detach() - sigma_perturbed).mean() * 10 + monotonic_loss.mul(gain).backward() + + + if swapping_prob is not None: + c_swapped = torch.roll(gen_c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand([], device=gen_c.device) < swapping_prob, c_swapped, gen_c) + else: + c_gen_conditioning = 
torch.zeros_like(gen_c) + + ws = self.G.mapping(gen_z, c_gen_conditioning, update_emas=False) + if self.style_mixing_prob > 0: + with torch.autograd.profiler.record_function('style_mixing'): + cutoff = torch.empty([], dtype=torch.int64, device=ws.device).random_(1, ws.shape[1]) + cutoff = torch.where(torch.rand([], device=ws.device) < self.style_mixing_prob, cutoff, torch.full_like(cutoff, ws.shape[1])) + ws[:, cutoff:] = self.G.mapping(torch.randn_like(z), c, update_emas=False)[:, cutoff:] + initial_coordinates = torch.rand((ws.shape[0], 1000, 3), device=ws.device) * 2 - 1 + perturbed_coordinates = initial_coordinates + torch.randn_like(initial_coordinates) * (1/256) * self.G.rendering_kwargs['box_warp'] + all_coordinates = torch.cat([initial_coordinates, perturbed_coordinates], dim=1) + sigma = self.G.sample_mixed(all_coordinates, torch.randn_like(all_coordinates), ws, update_emas=False)['sigma'] + sigma_initial = sigma[:, :sigma.shape[1]//2] + sigma_perturbed = sigma[:, sigma.shape[1]//2:] + + TVloss = torch.nn.functional.l1_loss(sigma_initial, sigma_perturbed) * self.G.rendering_kwargs['density_reg'] + TVloss.mul(gain).backward() + + # Alternative density regularization + if phase in ['Greg', 'Gboth'] and self.G.rendering_kwargs.get('density_reg', 0) > 0 and self.G.rendering_kwargs['reg_type'] == 'monotonic-fixed': + if swapping_prob is not None: + c_swapped = torch.roll(gen_c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand([], device=gen_c.device) < swapping_prob, c_swapped, gen_c) + else: + c_gen_conditioning = torch.zeros_like(gen_c) + + ws = self.G.mapping(gen_z, c_gen_conditioning, update_emas=False) + + initial_coordinates = torch.rand((ws.shape[0], 2000, 3), device=ws.device) * 2 - 1 # Front + + perturbed_coordinates = initial_coordinates + torch.tensor([0, 0, -1], device=ws.device) * (1/256) * self.G.rendering_kwargs['box_warp'] # Behind + all_coordinates = torch.cat([initial_coordinates, perturbed_coordinates], dim=1) + sigma = self.G.sample_mixed(all_coordinates, torch.randn_like(all_coordinates), ws, update_emas=False)['sigma'] + sigma_initial = sigma[:, :sigma.shape[1]//2] + sigma_perturbed = sigma[:, sigma.shape[1]//2:] + + monotonic_loss = torch.relu(sigma_initial - sigma_perturbed).mean() * 10 + monotonic_loss.mul(gain).backward() + + + if swapping_prob is not None: + c_swapped = torch.roll(gen_c.clone(), 1, 0) + c_gen_conditioning = torch.where(torch.rand([], device=gen_c.device) < swapping_prob, c_swapped, gen_c) + else: + c_gen_conditioning = torch.zeros_like(gen_c) + + ws = self.G.mapping(gen_z, c_gen_conditioning, update_emas=False) + if self.style_mixing_prob > 0: + with torch.autograd.profiler.record_function('style_mixing'): + cutoff = torch.empty([], dtype=torch.int64, device=ws.device).random_(1, ws.shape[1]) + cutoff = torch.where(torch.rand([], device=ws.device) < self.style_mixing_prob, cutoff, torch.full_like(cutoff, ws.shape[1])) + ws[:, cutoff:] = self.G.mapping(torch.randn_like(z), c, update_emas=False)[:, cutoff:] + initial_coordinates = torch.rand((ws.shape[0], 1000, 3), device=ws.device) * 2 - 1 + perturbed_coordinates = initial_coordinates + torch.randn_like(initial_coordinates) * (1/256) * self.G.rendering_kwargs['box_warp'] + all_coordinates = torch.cat([initial_coordinates, perturbed_coordinates], dim=1) + sigma = self.G.sample_mixed(all_coordinates, torch.randn_like(all_coordinates), ws, update_emas=False)['sigma'] + sigma_initial = sigma[:, :sigma.shape[1]//2] + sigma_perturbed = sigma[:, sigma.shape[1]//2:] + + TVloss = 
torch.nn.functional.l1_loss(sigma_initial, sigma_perturbed) * self.G.rendering_kwargs['density_reg'] + TVloss.mul(gain).backward() + + # Dmain: Minimize logits for generated images. + loss_Dgen = 0 + if phase in ['Dmain', 'Dboth']: + with torch.autograd.profiler.record_function('Dgen_forward'): + gen_img, _gen_ws = self.run_G(gen_z, gen_c, swapping_prob=swapping_prob, neural_rendering_resolution=neural_rendering_resolution, update_emas=True) + gen_logits = self.run_D(gen_img, gen_c, blur_sigma=blur_sigma, update_emas=True) + training_stats.report('Loss/scores/fake', gen_logits) + training_stats.report('Loss/signs/fake', gen_logits.sign()) + loss_Dgen = torch.nn.functional.softplus(gen_logits) + with torch.autograd.profiler.record_function('Dgen_backward'): + loss_Dgen.mean().mul(gain).backward() + + # Dmain: Maximize logits for real images. + # Dr1: Apply R1 regularization. + if phase in ['Dmain', 'Dreg', 'Dboth']: + name = 'Dreal' if phase == 'Dmain' else 'Dr1' if phase == 'Dreg' else 'Dreal_Dr1' + with torch.autograd.profiler.record_function(name + '_forward'): + real_img_tmp_image = real_img['image'].detach().requires_grad_(phase in ['Dreg', 'Dboth']) + real_img_tmp_image_raw = real_img['image_raw'].detach().requires_grad_(phase in ['Dreg', 'Dboth']) + real_img_tmp = {'image': real_img_tmp_image, 'image_raw': real_img_tmp_image_raw} + + real_logits = self.run_D(real_img_tmp, real_c, blur_sigma=blur_sigma) + training_stats.report('Loss/scores/real', real_logits) + training_stats.report('Loss/signs/real', real_logits.sign()) + + loss_Dreal = 0 + if phase in ['Dmain', 'Dboth']: + loss_Dreal = torch.nn.functional.softplus(-real_logits) + training_stats.report('Loss/D/loss', loss_Dgen + loss_Dreal) + + loss_Dr1 = 0 + if phase in ['Dreg', 'Dboth']: + if self.dual_discrimination: + with torch.autograd.profiler.record_function('r1_grads'), conv2d_gradfix.no_weight_gradients(): + r1_grads = torch.autograd.grad(outputs=[real_logits.sum()], inputs=[real_img_tmp['image'], real_img_tmp['image_raw']], create_graph=True, only_inputs=True) + r1_grads_image = r1_grads[0] + r1_grads_image_raw = r1_grads[1] + r1_penalty = r1_grads_image.square().sum([1,2,3]) + r1_grads_image_raw.square().sum([1,2,3]) + else: # single discrimination + with torch.autograd.profiler.record_function('r1_grads'), conv2d_gradfix.no_weight_gradients(): + r1_grads = torch.autograd.grad(outputs=[real_logits.sum()], inputs=[real_img_tmp['image']], create_graph=True, only_inputs=True) + r1_grads_image = r1_grads[0] + r1_penalty = r1_grads_image.square().sum([1,2,3]) + loss_Dr1 = r1_penalty * (r1_gamma / 2) + training_stats.report('Loss/r1_penalty', r1_penalty) + training_stats.report('Loss/D/reg', loss_Dr1) + + with torch.autograd.profiler.record_function(name + '_backward'): + (loss_Dreal + loss_Dr1).mean().mul(gain).backward() + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/networks_stylegan2.py b/eg3d/training/networks_stylegan2.py new file mode 100644 index 0000000000000000000000000000000000000000..bc562feb19422f1cb0752891495b79d697a71927 --- /dev/null +++ b/eg3d/training/networks_stylegan2.py @@ -0,0 +1,796 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Network architectures from the paper +"Analyzing and Improving the Image Quality of StyleGAN". +Matches the original implementation of configs E-F by Karras et al. at +https://github.com/NVlabs/stylegan2/blob/master/training/networks_stylegan2.py""" + +import numpy as np +import torch +from torch_utils import misc +from torch_utils import persistence +from torch_utils.ops import conv2d_resample +from torch_utils.ops import upfirdn2d +from torch_utils.ops import bias_act +from torch_utils.ops import fma + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def normalize_2nd_moment(x, dim=1, eps=1e-8): + return x * (x.square().mean(dim=dim, keepdim=True) + eps).rsqrt() + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def modulated_conv2d( + x, # Input tensor of shape [batch_size, in_channels, in_height, in_width]. + weight, # Weight tensor of shape [out_channels, in_channels, kernel_height, kernel_width]. + styles, # Modulation coefficients of shape [batch_size, in_channels]. + noise = None, # Optional noise tensor to add to the output activations. + up = 1, # Integer upsampling factor. + down = 1, # Integer downsampling factor. + padding = 0, # Padding with respect to the upsampled image. + resample_filter = None, # Low-pass filter to apply when resampling activations. Must be prepared beforehand by calling upfirdn2d.setup_filter(). + demodulate = True, # Apply weight demodulation? + flip_weight = True, # False = convolution, True = correlation (matches torch.nn.functional.conv2d). + fused_modconv = True, # Perform modulation, convolution, and demodulation as a single fused operation? +): + batch_size = x.shape[0] + out_channels, in_channels, kh, kw = weight.shape + misc.assert_shape(weight, [out_channels, in_channels, kh, kw]) # [OIkk] + misc.assert_shape(x, [batch_size, in_channels, None, None]) # [NIHW] + misc.assert_shape(styles, [batch_size, in_channels]) # [NI] + + # Pre-normalize inputs to avoid FP16 overflow. + if x.dtype == torch.float16 and demodulate: + weight = weight * (1 / np.sqrt(in_channels * kh * kw) / weight.norm(float('inf'), dim=[1,2,3], keepdim=True)) # max_Ikk + styles = styles / styles.norm(float('inf'), dim=1, keepdim=True) # max_I + + # Calculate per-sample weights and demodulation coefficients. + w = None + dcoefs = None + if demodulate or fused_modconv: + w = weight.unsqueeze(0) # [NOIkk] + w = w * styles.reshape(batch_size, 1, -1, 1, 1) # [NOIkk] + if demodulate: + dcoefs = (w.square().sum(dim=[2,3,4]) + 1e-8).rsqrt() # [NO] + if demodulate and fused_modconv: + w = w * dcoefs.reshape(batch_size, -1, 1, 1, 1) # [NOIkk] + + # Execute by scaling the activations before and after the convolution. + if not fused_modconv: + x = x * styles.to(x.dtype).reshape(batch_size, -1, 1, 1) + x = conv2d_resample.conv2d_resample(x=x, w=weight.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, flip_weight=flip_weight) + if demodulate and noise is not None: + x = fma.fma(x, dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1), noise.to(x.dtype)) + elif demodulate: + x = x * dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1) + elif noise is not None: + x = x.add_(noise.to(x.dtype)) + return x + + # Execute as one fused op using grouped convolution. 
+ with misc.suppress_tracer_warnings(): # this value will be treated as a constant + batch_size = int(batch_size) + misc.assert_shape(x, [batch_size, in_channels, None, None]) + x = x.reshape(1, -1, *x.shape[2:]) + w = w.reshape(-1, in_channels, kh, kw) + x = conv2d_resample.conv2d_resample(x=x, w=w.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, groups=batch_size, flip_weight=flip_weight) + x = x.reshape(batch_size, -1, *x.shape[2:]) + if noise is not None: + x = x.add_(noise) + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class FullyConnectedLayer(torch.nn.Module): + def __init__(self, + in_features, # Number of input features. + out_features, # Number of output features. + bias = True, # Apply additive bias before the activation function? + activation = 'linear', # Activation function: 'relu', 'lrelu', etc. + lr_multiplier = 1, # Learning rate multiplier. + bias_init = 0, # Initial value for the additive bias. + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.activation = activation + self.weight = torch.nn.Parameter(torch.randn([out_features, in_features]) / lr_multiplier) + self.bias = torch.nn.Parameter(torch.full([out_features], np.float32(bias_init))) if bias else None + self.weight_gain = lr_multiplier / np.sqrt(in_features) + self.bias_gain = lr_multiplier + + def forward(self, x): + w = self.weight.to(x.dtype) * self.weight_gain + b = self.bias + if b is not None: + b = b.to(x.dtype) + if self.bias_gain != 1: + b = b * self.bias_gain + + if self.activation == 'linear' and b is not None: + x = torch.addmm(b.unsqueeze(0), x, w.t()) + else: + x = x.matmul(w.t()) + x = bias_act.bias_act(x, b, act=self.activation) + return x + + def extra_repr(self): + return f'in_features={self.in_features:d}, out_features={self.out_features:d}, activation={self.activation:s}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class Conv2dLayer(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + kernel_size, # Width and height of the convolution kernel. + bias = True, # Apply additive bias before the activation function? + activation = 'linear', # Activation function: 'relu', 'lrelu', etc. + up = 1, # Integer upsampling factor. + down = 1, # Integer downsampling factor. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = None, # Clamp the output to +-X, None = disable clamping. + channels_last = False, # Expect the input to have memory_format=channels_last? + trainable = True, # Update the weights of this layer during training? 
+ ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.activation = activation + self.up = up + self.down = down + self.conv_clamp = conv_clamp + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.padding = kernel_size // 2 + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2)) + self.act_gain = bias_act.activation_funcs[activation].def_gain + + memory_format = torch.channels_last if channels_last else torch.contiguous_format + weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format) + bias = torch.zeros([out_channels]) if bias else None + if trainable: + self.weight = torch.nn.Parameter(weight) + self.bias = torch.nn.Parameter(bias) if bias is not None else None + else: + self.register_buffer('weight', weight) + if bias is not None: + self.register_buffer('bias', bias) + else: + self.bias = None + + def forward(self, x, gain=1): + w = self.weight * self.weight_gain + b = self.bias.to(x.dtype) if self.bias is not None else None + flip_weight = (self.up == 1) # slightly faster + x = conv2d_resample.conv2d_resample(x=x, w=w.to(x.dtype), f=self.resample_filter, up=self.up, down=self.down, padding=self.padding, flip_weight=flip_weight) + + act_gain = self.act_gain * gain + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + x = bias_act.bias_act(x, b, act=self.activation, gain=act_gain, clamp=act_clamp) + return x + + def extra_repr(self): + return ' '.join([ + f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, activation={self.activation:s},', + f'up={self.up}, down={self.down}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class MappingNetwork(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality, 0 = no latent. + c_dim, # Conditioning label (C) dimensionality, 0 = no label. + w_dim, # Intermediate latent (W) dimensionality. + num_ws, # Number of intermediate latents to output, None = do not broadcast. + num_layers = 8, # Number of mapping layers. + embed_features = None, # Label embedding dimensionality, None = same as w_dim. + layer_features = None, # Number of intermediate features in the mapping layers, None = same as w_dim. + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + lr_multiplier = 0.01, # Learning rate multiplier for the mapping layers. + w_avg_beta = 0.998, # Decay for tracking the moving average of W during training, None = do not track. 
+ ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + + if embed_features is None: + embed_features = w_dim + if c_dim == 0: + embed_features = 0 + if layer_features is None: + layer_features = w_dim + features_list = [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim] + + if c_dim > 0: + self.embed = FullyConnectedLayer(c_dim, embed_features) + for idx in range(num_layers): + in_features = features_list[idx] + out_features = features_list[idx + 1] + layer = FullyConnectedLayer(in_features, out_features, activation=activation, lr_multiplier=lr_multiplier) + setattr(self, f'fc{idx}', layer) + + if num_ws is not None and w_avg_beta is not None: + self.register_buffer('w_avg', torch.zeros([w_dim])) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False): + # Embed, normalize, and concat inputs. + x = None + with torch.autograd.profiler.record_function('input'): + if self.z_dim > 0: + misc.assert_shape(z, [None, self.z_dim]) + x = normalize_2nd_moment(z.to(torch.float32)) + if self.c_dim > 0: + misc.assert_shape(c, [None, self.c_dim]) + y = normalize_2nd_moment(self.embed(c.to(torch.float32))) + x = torch.cat([x, y], dim=1) if x is not None else y + + # Main layers. + for idx in range(self.num_layers): + layer = getattr(self, f'fc{idx}') + x = layer(x) + + # Update moving average of W. + if update_emas and self.w_avg_beta is not None: + with torch.autograd.profiler.record_function('update_w_avg'): + self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta)) + + # Broadcast. + if self.num_ws is not None: + with torch.autograd.profiler.record_function('broadcast'): + x = x.unsqueeze(1).repeat([1, self.num_ws, 1]) + + # Apply truncation. + if truncation_psi != 1: + with torch.autograd.profiler.record_function('truncate'): + assert self.w_avg_beta is not None + if self.num_ws is None or truncation_cutoff is None: + x = self.w_avg.lerp(x, truncation_psi) + else: + x[:, :truncation_cutoff] = self.w_avg.lerp(x[:, :truncation_cutoff], truncation_psi) + return x + + def extra_repr(self): + return f'z_dim={self.z_dim:d}, c_dim={self.c_dim:d}, w_dim={self.w_dim:d}, num_ws={self.num_ws:d}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisLayer(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this layer. + kernel_size = 3, # Convolution kernel size. + up = 1, # Integer upsampling factor. + use_noise = True, # Enable noise input? + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = None, # Clamp the output of convolution layers to +-X, None = disable clamping. + channels_last = False, # Use channels_last format for the weights? 
+ ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.w_dim = w_dim + self.resolution = resolution + self.up = up + self.use_noise = use_noise + self.activation = activation + self.conv_clamp = conv_clamp + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.padding = kernel_size // 2 + self.act_gain = bias_act.activation_funcs[activation].def_gain + + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + memory_format = torch.channels_last if channels_last else torch.contiguous_format + self.weight = torch.nn.Parameter(torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format)) + if use_noise: + self.register_buffer('noise_const', torch.randn([resolution, resolution])) + self.noise_strength = torch.nn.Parameter(torch.zeros([])) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + + def forward(self, x, w, noise_mode='random', fused_modconv=True, gain=1): + assert noise_mode in ['random', 'const', 'none'] + in_resolution = self.resolution // self.up + misc.assert_shape(x, [None, self.in_channels, in_resolution, in_resolution]) + styles = self.affine(w) + + noise = None + if self.use_noise and noise_mode == 'random': + noise = torch.randn([x.shape[0], 1, self.resolution, self.resolution], device=x.device) * self.noise_strength + if self.use_noise and noise_mode == 'const': + noise = self.noise_const * self.noise_strength + + flip_weight = (self.up == 1) # slightly faster + x = modulated_conv2d(x=x, weight=self.weight, styles=styles, noise=noise, up=self.up, + padding=self.padding, resample_filter=self.resample_filter, flip_weight=flip_weight, fused_modconv=fused_modconv) + + act_gain = self.act_gain * gain + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + x = bias_act.bias_act(x, self.bias.to(x.dtype), act=self.activation, gain=act_gain, clamp=act_clamp) + return x + + def extra_repr(self): + return ' '.join([ + f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, w_dim={self.w_dim:d},', + f'resolution={self.resolution:d}, up={self.up}, activation={self.activation:s}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class ToRGBLayer(torch.nn.Module): + def __init__(self, in_channels, out_channels, w_dim, kernel_size=1, conv_clamp=None, channels_last=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.w_dim = w_dim + self.conv_clamp = conv_clamp + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + memory_format = torch.channels_last if channels_last else torch.contiguous_format + self.weight = torch.nn.Parameter(torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format)) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2)) + + def forward(self, x, w, fused_modconv=True): + styles = self.affine(w) * self.weight_gain + x = modulated_conv2d(x=x, weight=self.weight, styles=styles, demodulate=False, fused_modconv=fused_modconv) + x = bias_act.bias_act(x, self.bias.to(x.dtype), clamp=self.conv_clamp) + return x + + def extra_repr(self): + return f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, w_dim={self.w_dim:d}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class 
+class SynthesisBlock(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels, 0 = first block. + out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this block. + img_channels, # Number of output color channels. + is_last, # Is this the last block? + architecture = 'skip', # Architecture: 'orig', 'skip', 'resnet'. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16 = False, # Use FP16 for this block? + fp16_channels_last = False, # Use channels-last memory format with FP16? + fused_modconv_default = True, # Default value of fused_modconv. 'inference_only' = True for inference, False for training. + **layer_kwargs, # Arguments for SynthesisLayer. + ): + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + self.in_channels = in_channels + self.w_dim = w_dim + self.resolution = resolution + self.img_channels = img_channels + self.is_last = is_last + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = (use_fp16 and fp16_channels_last) + self.fused_modconv_default = fused_modconv_default + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.num_conv = 0 + self.num_torgb = 0 + + if in_channels == 0: + self.const = torch.nn.Parameter(torch.randn([out_channels, resolution, resolution])) + + if in_channels != 0: + self.conv0 = SynthesisLayer(in_channels, out_channels, w_dim=w_dim, resolution=resolution, up=2, + resample_filter=resample_filter, conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs) + self.num_conv += 1 + + self.conv1 = SynthesisLayer(out_channels, out_channels, w_dim=w_dim, resolution=resolution, + conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs) + self.num_conv += 1 + + if is_last or architecture == 'skip': + self.torgb = ToRGBLayer(out_channels, img_channels, w_dim=w_dim, + conv_clamp=conv_clamp, channels_last=self.channels_last) + self.num_torgb += 1 + + if in_channels != 0 and architecture == 'resnet': + self.skip = Conv2dLayer(in_channels, out_channels, kernel_size=1, bias=False, up=2, + resample_filter=resample_filter, channels_last=self.channels_last) + + def forward(self, x, img, ws, force_fp32=False, fused_modconv=None, update_emas=False, **layer_kwargs): + _ = update_emas # unused + misc.assert_shape(ws, [None, self.num_conv + self.num_torgb, self.w_dim]) + w_iter = iter(ws.unbind(dim=1)) + if ws.device.type != 'cuda': + force_fp32 = True + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + memory_format = torch.channels_last if self.channels_last and not force_fp32 else torch.contiguous_format + if fused_modconv is None: + fused_modconv = self.fused_modconv_default + if fused_modconv == 'inference_only': + fused_modconv = (not self.training) + + # Input. + if self.in_channels == 0: + x = self.const.to(dtype=dtype, memory_format=memory_format) + x = x.unsqueeze(0).repeat([ws.shape[0], 1, 1, 1]) + else: + misc.assert_shape(x, [None, self.in_channels, self.resolution // 2, self.resolution // 2]) + x = x.to(dtype=dtype, memory_format=memory_format) + + # Main layers. 
+ if self.in_channels == 0: + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + elif self.architecture == 'resnet': + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, gain=np.sqrt(0.5), **layer_kwargs) + x = y.add_(x) + else: + x = self.conv0(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + + # ToRGB. + if img is not None: + misc.assert_shape(img, [None, self.img_channels, self.resolution // 2, self.resolution // 2]) + img = upfirdn2d.upsample2d(img, self.resample_filter) + if self.is_last or self.architecture == 'skip': + y = self.torgb(x, next(w_iter), fused_modconv=fused_modconv) + y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format) + img = img.add_(y) if img is not None else y + + assert x.dtype == dtype + assert img is None or img.dtype == torch.float32 + return x, img + + def extra_repr(self): + return f'resolution={self.resolution:d}, architecture={self.architecture:s}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisNetwork(torch.nn.Module): + def __init__(self, + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output image resolution. + img_channels, # Number of color channels. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + **block_kwargs, # Arguments for SynthesisBlock. + ): + assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0 + super().__init__() + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.num_fp16_res = num_fp16_res + self.block_resolutions = [2 ** i for i in range(2, self.img_resolution_log2 + 1)] + channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions} + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + self.num_ws = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res // 2] if res > 4 else 0 + out_channels = channels_dict[res] + use_fp16 = (res >= fp16_resolution) + is_last = (res == self.img_resolution) + block = SynthesisBlock(in_channels, out_channels, w_dim=w_dim, resolution=res, + img_channels=img_channels, is_last=is_last, use_fp16=use_fp16, **block_kwargs) + self.num_ws += block.num_conv + if is_last: + self.num_ws += block.num_torgb + setattr(self, f'b{res}', block) + + def forward(self, ws, **block_kwargs): + block_ws = [] + with torch.autograd.profiler.record_function('split_ws'): + misc.assert_shape(ws, [None, self.num_ws, self.w_dim]) + ws = ws.to(torch.float32) + w_idx = 0 + for res in self.block_resolutions: + block = getattr(self, f'b{res}') + block_ws.append(ws.narrow(1, w_idx, block.num_conv + block.num_torgb)) + w_idx += block.num_conv + + x = img = None + for res, cur_ws in zip(self.block_resolutions, block_ws): + block = getattr(self, f'b{res}') + x, img = block(x, img, cur_ws, **block_kwargs) + return img + + def extra_repr(self): + return ' '.join([ + f'w_dim={self.w_dim:d}, num_ws={self.num_ws:d},', + f'img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d},', + 
f'num_fp16_res={self.num_fp16_res:d}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class Generator(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality. + c_dim, # Conditioning label (C) dimensionality. + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output resolution. + img_channels, # Number of output color channels. + mapping_kwargs = {}, # Arguments for MappingNetwork. + **synthesis_kwargs, # Arguments for SynthesisNetwork. + ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + self.synthesis = SynthesisNetwork(w_dim=w_dim, img_resolution=img_resolution, img_channels=img_channels, **synthesis_kwargs) + self.num_ws = self.synthesis.num_ws + self.mapping = MappingNetwork(z_dim=z_dim, c_dim=c_dim, w_dim=w_dim, num_ws=self.num_ws, **mapping_kwargs) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs): + ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas) + img = self.synthesis(ws, update_emas=update_emas, **synthesis_kwargs) + return img + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class DiscriminatorBlock(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels, 0 = first block. + tmp_channels, # Number of intermediate channels. + out_channels, # Number of output channels. + resolution, # Resolution of this block. + img_channels, # Number of input color channels. + first_layer_idx, # Index of the first layer. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = None, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16 = False, # Use FP16 for this block? + fp16_channels_last = False, # Use channels-last memory format with FP16? + freeze_layers = 0, # Freeze-D: Number of layers to freeze. 
+ ): + assert in_channels in [0, tmp_channels] + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + self.in_channels = in_channels + self.resolution = resolution + self.img_channels = img_channels + self.first_layer_idx = first_layer_idx + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = (use_fp16 and fp16_channels_last) + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + + self.num_layers = 0 + def trainable_gen(): + while True: + layer_idx = self.first_layer_idx + self.num_layers + trainable = (layer_idx >= freeze_layers) + self.num_layers += 1 + yield trainable + trainable_iter = trainable_gen() + + if in_channels == 0 or architecture == 'skip': + self.fromrgb = Conv2dLayer(img_channels, tmp_channels, kernel_size=1, activation=activation, + trainable=next(trainable_iter), conv_clamp=conv_clamp, channels_last=self.channels_last) + + self.conv0 = Conv2dLayer(tmp_channels, tmp_channels, kernel_size=3, activation=activation, + trainable=next(trainable_iter), conv_clamp=conv_clamp, channels_last=self.channels_last) + + self.conv1 = Conv2dLayer(tmp_channels, out_channels, kernel_size=3, activation=activation, down=2, + trainable=next(trainable_iter), resample_filter=resample_filter, conv_clamp=conv_clamp, channels_last=self.channels_last) + + if architecture == 'resnet': + self.skip = Conv2dLayer(tmp_channels, out_channels, kernel_size=1, bias=False, down=2, + trainable=next(trainable_iter), resample_filter=resample_filter, channels_last=self.channels_last) + + def forward(self, x, img, force_fp32=False): + if (x if x is not None else img).device.type != 'cuda': + force_fp32 = True + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + memory_format = torch.channels_last if self.channels_last and not force_fp32 else torch.contiguous_format + + # Input. + if x is not None: + misc.assert_shape(x, [None, self.in_channels, self.resolution, self.resolution]) + x = x.to(dtype=dtype, memory_format=memory_format) + + # FromRGB. + if self.in_channels == 0 or self.architecture == 'skip': + misc.assert_shape(img, [None, self.img_channels, self.resolution, self.resolution]) + img = img.to(dtype=dtype, memory_format=memory_format) + y = self.fromrgb(img) + x = x + y if x is not None else y + img = upfirdn2d.downsample2d(img, self.resample_filter) if self.architecture == 'skip' else None + + # Main layers. + if self.architecture == 'resnet': + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x) + x = self.conv1(x, gain=np.sqrt(0.5)) + x = y.add_(x) + else: + x = self.conv0(x) + x = self.conv1(x) + + assert x.dtype == dtype + return x, img + + def extra_repr(self): + return f'resolution={self.resolution:d}, architecture={self.architecture:s}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class MinibatchStdLayer(torch.nn.Module): + def __init__(self, group_size, num_channels=1): + super().__init__() + self.group_size = group_size + self.num_channels = num_channels + + def forward(self, x): + N, C, H, W = x.shape + with misc.suppress_tracer_warnings(): # as_tensor results are registered as constants + G = torch.min(torch.as_tensor(self.group_size), torch.as_tensor(N)) if self.group_size is not None else N + F = self.num_channels + c = C // F + + y = x.reshape(G, -1, F, c, H, W) # [GnFcHW] Split minibatch N into n groups of size G, and channels C into F groups of size c. 
+ y = y - y.mean(dim=0) # [GnFcHW] Subtract mean over group. + y = y.square().mean(dim=0) # [nFcHW] Calc variance over group. + y = (y + 1e-8).sqrt() # [nFcHW] Calc stddev over group. + y = y.mean(dim=[2,3,4]) # [nF] Take average over channels and pixels. + y = y.reshape(-1, F, 1, 1) # [nF11] Add missing dimensions. + y = y.repeat(G, 1, H, W) # [NFHW] Replicate over group and pixels. + x = torch.cat([x, y], dim=1) # [NCHW] Append to input as new channels. + return x + + def extra_repr(self): + return f'group_size={self.group_size}, num_channels={self.num_channels:d}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class DiscriminatorEpilogue(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels. + cmap_dim, # Dimensionality of mapped conditioning label, 0 = no label. + resolution, # Resolution of this block. + img_channels, # Number of input color channels. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. + mbstd_group_size = 4, # Group size for the minibatch standard deviation layer, None = entire minibatch. + mbstd_num_channels = 1, # Number of features for the minibatch standard deviation layer, 0 = disable. + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + conv_clamp = None, # Clamp the output of convolution layers to +-X, None = disable clamping. + ): + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + self.in_channels = in_channels + self.cmap_dim = cmap_dim + self.resolution = resolution + self.img_channels = img_channels + self.architecture = architecture + + if architecture == 'skip': + self.fromrgb = Conv2dLayer(img_channels, in_channels, kernel_size=1, activation=activation) + self.mbstd = MinibatchStdLayer(group_size=mbstd_group_size, num_channels=mbstd_num_channels) if mbstd_num_channels > 0 else None + self.conv = Conv2dLayer(in_channels + mbstd_num_channels, in_channels, kernel_size=3, activation=activation, conv_clamp=conv_clamp) + self.fc = FullyConnectedLayer(in_channels * (resolution ** 2), in_channels, activation=activation) + self.out = FullyConnectedLayer(in_channels, 1 if cmap_dim == 0 else cmap_dim) + + def forward(self, x, img, cmap, force_fp32=False): + misc.assert_shape(x, [None, self.in_channels, self.resolution, self.resolution]) # [NCHW] + _ = force_fp32 # unused + dtype = torch.float32 + memory_format = torch.contiguous_format + + # FromRGB. + x = x.to(dtype=dtype, memory_format=memory_format) + if self.architecture == 'skip': + misc.assert_shape(img, [None, self.img_channels, self.resolution, self.resolution]) + img = img.to(dtype=dtype, memory_format=memory_format) + x = x + self.fromrgb(img) + + # Main layers. + if self.mbstd is not None: + x = self.mbstd(x) + x = self.conv(x) + x = self.fc(x.flatten(1)) + x = self.out(x) + + # Conditioning. + if self.cmap_dim > 0: + misc.assert_shape(cmap, [None, self.cmap_dim]) + x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim)) + + assert x.dtype == dtype + return x + + def extra_repr(self): + return f'resolution={self.resolution:d}, architecture={self.architecture:s}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class Discriminator(torch.nn.Module): + def __init__(self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'. 
+ channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + cmap_dim = None, # Dimensionality of mapped conditioning label, None = default. + block_kwargs = {}, # Arguments for DiscriminatorBlock. + mapping_kwargs = {}, # Arguments for MappingNetwork. + epilogue_kwargs = {}, # Arguments for DiscriminatorEpilogue. + ): + super().__init__() + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)] + channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]} + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if cmap_dim is None: + cmap_dim = channels_dict[4] + if c_dim == 0: + cmap_dim = 0 + + common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp) + cur_layer_idx = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res] if res < img_resolution else 0 + tmp_channels = channels_dict[res] + out_channels = channels_dict[res // 2] + use_fp16 = (res >= fp16_resolution) + block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res, + first_layer_idx=cur_layer_idx, use_fp16=use_fp16, **block_kwargs, **common_kwargs) + setattr(self, f'b{res}', block) + cur_layer_idx += block.num_layers + if c_dim > 0: + self.mapping = MappingNetwork(z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs) + self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, **epilogue_kwargs, **common_kwargs) + + def forward(self, img, c, update_emas=False, **block_kwargs): + _ = update_emas # unused + x = None + for res in self.block_resolutions: + block = getattr(self, f'b{res}') + x, img = block(x, img, **block_kwargs) + + cmap = None + if self.c_dim > 0: + cmap = self.mapping(None, c) + x = self.b4(x, img, cmap) + return x + + def extra_repr(self): + return f'c_dim={self.c_dim:d}, img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d}' + +#---------------------------------------------------------------------------- \ No newline at end of file diff --git a/eg3d/training/networks_stylegan3.py b/eg3d/training/networks_stylegan3.py new file mode 100644 index 0000000000000000000000000000000000000000..40e5508803feb7d4ebdd49ce140051f6a549cf9c --- /dev/null +++ b/eg3d/training/networks_stylegan3.py @@ -0,0 +1,517 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +"""Generator architecture from the paper +"Alias-Free Generative Adversarial Networks".""" + +import numpy as np +import scipy.signal +import scipy.optimize +import torch +from torch_utils import misc +from torch_utils import persistence +from torch_utils.ops import conv2d_gradfix +from torch_utils.ops import filtered_lrelu +from torch_utils.ops import bias_act + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def modulated_conv2d( + x, # Input tensor: [batch_size, in_channels, in_height, in_width] + w, # Weight tensor: [out_channels, in_channels, kernel_height, kernel_width] + s, # Style tensor: [batch_size, in_channels] + demodulate = True, # Apply weight demodulation? + padding = 0, # Padding: int or [padH, padW] + input_gain = None, # Optional scale factors for the input channels: [], [in_channels], or [batch_size, in_channels] +): + with misc.suppress_tracer_warnings(): # this value will be treated as a constant + batch_size = int(x.shape[0]) + out_channels, in_channels, kh, kw = w.shape + misc.assert_shape(w, [out_channels, in_channels, kh, kw]) # [OIkk] + misc.assert_shape(x, [batch_size, in_channels, None, None]) # [NIHW] + misc.assert_shape(s, [batch_size, in_channels]) # [NI] + + # Pre-normalize inputs. + if demodulate: + w = w * w.square().mean([1,2,3], keepdim=True).rsqrt() + s = s * s.square().mean().rsqrt() + + # Modulate weights. + w = w.unsqueeze(0) # [NOIkk] + w = w * s.unsqueeze(1).unsqueeze(3).unsqueeze(4) # [NOIkk] + + # Demodulate weights. + if demodulate: + dcoefs = (w.square().sum(dim=[2,3,4]) + 1e-8).rsqrt() # [NO] + w = w * dcoefs.unsqueeze(2).unsqueeze(3).unsqueeze(4) # [NOIkk] + + # Apply input scaling. + if input_gain is not None: + input_gain = input_gain.expand(batch_size, in_channels) # [NI] + w = w * input_gain.unsqueeze(1).unsqueeze(3).unsqueeze(4) # [NOIkk] + + # Execute as one fused op using grouped convolution. + x = x.reshape(1, -1, *x.shape[2:]) + w = w.reshape(-1, in_channels, kh, kw) + x = conv2d_gradfix.conv2d(input=x, weight=w.to(x.dtype), padding=padding, groups=batch_size) + x = x.reshape(batch_size, -1, *x.shape[2:]) + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class FullyConnectedLayer(torch.nn.Module): + def __init__(self, + in_features, # Number of input features. + out_features, # Number of output features. + activation = 'linear', # Activation function: 'relu', 'lrelu', etc. + bias = True, # Apply additive bias before the activation function? + lr_multiplier = 1, # Learning rate multiplier. + weight_init = 1, # Initial standard deviation of the weight tensor. + bias_init = 0, # Initial value of the additive bias. 
+ ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.activation = activation + self.weight = torch.nn.Parameter(torch.randn([out_features, in_features]) * (weight_init / lr_multiplier)) + bias_init = np.broadcast_to(np.asarray(bias_init, dtype=np.float32), [out_features]) + self.bias = torch.nn.Parameter(torch.from_numpy(bias_init / lr_multiplier)) if bias else None + self.weight_gain = lr_multiplier / np.sqrt(in_features) + self.bias_gain = lr_multiplier + + def forward(self, x): + w = self.weight.to(x.dtype) * self.weight_gain + b = self.bias + if b is not None: + b = b.to(x.dtype) + if self.bias_gain != 1: + b = b * self.bias_gain + if self.activation == 'linear' and b is not None: + x = torch.addmm(b.unsqueeze(0), x, w.t()) + else: + x = x.matmul(w.t()) + x = bias_act.bias_act(x, b, act=self.activation) + return x + + def extra_repr(self): + return f'in_features={self.in_features:d}, out_features={self.out_features:d}, activation={self.activation:s}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class MappingNetwork(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality. + c_dim, # Conditioning label (C) dimensionality, 0 = no labels. + w_dim, # Intermediate latent (W) dimensionality. + num_ws, # Number of intermediate latents to output. + num_layers = 2, # Number of mapping layers. + lr_multiplier = 0.01, # Learning rate multiplier for the mapping layers. + w_avg_beta = 0.998, # Decay for tracking the moving average of W during training. + ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + + # Construct layers. + self.embed = FullyConnectedLayer(self.c_dim, self.w_dim) if self.c_dim > 0 else None + features = [self.z_dim + (self.w_dim if self.c_dim > 0 else 0)] + [self.w_dim] * self.num_layers + for idx, in_features, out_features in zip(range(num_layers), features[:-1], features[1:]): + layer = FullyConnectedLayer(in_features, out_features, activation='lrelu', lr_multiplier=lr_multiplier) + setattr(self, f'fc{idx}', layer) + self.register_buffer('w_avg', torch.zeros([w_dim])) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False): + misc.assert_shape(z, [None, self.z_dim]) + if truncation_cutoff is None: + truncation_cutoff = self.num_ws + + # Embed, normalize, and concatenate inputs. + x = z.to(torch.float32) + x = x * (x.square().mean(1, keepdim=True) + 1e-8).rsqrt() + if self.c_dim > 0: + misc.assert_shape(c, [None, self.c_dim]) + y = self.embed(c.to(torch.float32)) + y = y * (y.square().mean(1, keepdim=True) + 1e-8).rsqrt() + x = torch.cat([x, y], dim=1) if x is not None else y + + # Execute layers. + for idx in range(self.num_layers): + x = getattr(self, f'fc{idx}')(x) + + # Update moving average of W. + if update_emas: + self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta)) + + # Broadcast and apply truncation. 
+ x = x.unsqueeze(1).repeat([1, self.num_ws, 1]) + if truncation_psi != 1: + x[:, :truncation_cutoff] = self.w_avg.lerp(x[:, :truncation_cutoff], truncation_psi) + return x + + def extra_repr(self): + return f'z_dim={self.z_dim:d}, c_dim={self.c_dim:d}, w_dim={self.w_dim:d}, num_ws={self.num_ws:d}' + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisInput(torch.nn.Module): + def __init__(self, + w_dim, # Intermediate latent (W) dimensionality. + channels, # Number of output channels. + size, # Output spatial size: int or [width, height]. + sampling_rate, # Output sampling rate. + bandwidth, # Output bandwidth. + ): + super().__init__() + self.w_dim = w_dim + self.channels = channels + self.size = np.broadcast_to(np.asarray(size), [2]) + self.sampling_rate = sampling_rate + self.bandwidth = bandwidth + + # Draw random frequencies from uniform 2D disc. + freqs = torch.randn([self.channels, 2]) + radii = freqs.square().sum(dim=1, keepdim=True).sqrt() + freqs /= radii * radii.square().exp().pow(0.25) + freqs *= bandwidth + phases = torch.rand([self.channels]) - 0.5 + + # Setup parameters and buffers. + self.weight = torch.nn.Parameter(torch.randn([self.channels, self.channels])) + self.affine = FullyConnectedLayer(w_dim, 4, weight_init=0, bias_init=[1,0,0,0]) + self.register_buffer('transform', torch.eye(3, 3)) # User-specified inverse transform wrt. resulting image. + self.register_buffer('freqs', freqs) + self.register_buffer('phases', phases) + + def forward(self, w): + # Introduce batch dimension. + transforms = self.transform.unsqueeze(0) # [batch, row, col] + freqs = self.freqs.unsqueeze(0) # [batch, channel, xy] + phases = self.phases.unsqueeze(0) # [batch, channel] + + # Apply learned transformation. + t = self.affine(w) # t = (r_c, r_s, t_x, t_y) + t = t / t[:, :2].norm(dim=1, keepdim=True) # t' = (r'_c, r'_s, t'_x, t'_y) + m_r = torch.eye(3, device=w.device).unsqueeze(0).repeat([w.shape[0], 1, 1]) # Inverse rotation wrt. resulting image. + m_r[:, 0, 0] = t[:, 0] # r'_c + m_r[:, 0, 1] = -t[:, 1] # r'_s + m_r[:, 1, 0] = t[:, 1] # r'_s + m_r[:, 1, 1] = t[:, 0] # r'_c + m_t = torch.eye(3, device=w.device).unsqueeze(0).repeat([w.shape[0], 1, 1]) # Inverse translation wrt. resulting image. + m_t[:, 0, 2] = -t[:, 2] # t'_x + m_t[:, 1, 2] = -t[:, 3] # t'_y + transforms = m_r @ m_t @ transforms # First rotate resulting image, then translate, and finally apply user-specified transform. + + # Transform frequencies. + phases = phases + (freqs @ transforms[:, :2, 2:]).squeeze(2) + freqs = freqs @ transforms[:, :2, :2] + + # Dampen out-of-band frequencies that may occur due to the user-specified transform. + amplitudes = (1 - (freqs.norm(dim=2) - self.bandwidth) / (self.sampling_rate / 2 - self.bandwidth)).clamp(0, 1) + + # Construct sampling grid. + theta = torch.eye(2, 3, device=w.device) + theta[0, 0] = 0.5 * self.size[0] / self.sampling_rate + theta[1, 1] = 0.5 * self.size[1] / self.sampling_rate + grids = torch.nn.functional.affine_grid(theta.unsqueeze(0), [1, 1, self.size[1], self.size[0]], align_corners=False) + + # Compute Fourier features. + x = (grids.unsqueeze(3) @ freqs.permute(0, 2, 1).unsqueeze(1).unsqueeze(2)).squeeze(3) # [batch, height, width, channel] + x = x + phases.unsqueeze(1).unsqueeze(2) + x = torch.sin(x * (np.pi * 2)) + x = x * amplitudes.unsqueeze(1).unsqueeze(2) + + # Apply trainable mapping. + weight = self.weight / np.sqrt(self.channels) + x = x @ weight.t() + + # Ensure correct shape. 
+ x = x.permute(0, 3, 1, 2) # [batch, channel, height, width] + misc.assert_shape(x, [w.shape[0], self.channels, int(self.size[1]), int(self.size[0])]) + return x + + def extra_repr(self): + return '\n'.join([ + f'w_dim={self.w_dim:d}, channels={self.channels:d}, size={list(self.size)},', + f'sampling_rate={self.sampling_rate:g}, bandwidth={self.bandwidth:g}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisLayer(torch.nn.Module): + def __init__(self, + w_dim, # Intermediate latent (W) dimensionality. + is_torgb, # Is this the final ToRGB layer? + is_critically_sampled, # Does this layer use critical sampling? + use_fp16, # Does this layer use FP16? + + # Input & output specifications. + in_channels, # Number of input channels. + out_channels, # Number of output channels. + in_size, # Input spatial size: int or [width, height]. + out_size, # Output spatial size: int or [width, height]. + in_sampling_rate, # Input sampling rate (s). + out_sampling_rate, # Output sampling rate (s). + in_cutoff, # Input cutoff frequency (f_c). + out_cutoff, # Output cutoff frequency (f_c). + in_half_width, # Input transition band half-width (f_h). + out_half_width, # Output Transition band half-width (f_h). + + # Hyperparameters. + conv_kernel = 3, # Convolution kernel size. Ignored for final the ToRGB layer. + filter_size = 6, # Low-pass filter size relative to the lower resolution when up/downsampling. + lrelu_upsampling = 2, # Relative sampling rate for leaky ReLU. Ignored for final the ToRGB layer. + use_radial_filters = False, # Use radially symmetric downsampling filter? Ignored for critically sampled layers. + conv_clamp = 256, # Clamp the output to [-X, +X], None = disable clamping. + magnitude_ema_beta = 0.999, # Decay rate for the moving average of input magnitudes. + ): + super().__init__() + self.w_dim = w_dim + self.is_torgb = is_torgb + self.is_critically_sampled = is_critically_sampled + self.use_fp16 = use_fp16 + self.in_channels = in_channels + self.out_channels = out_channels + self.in_size = np.broadcast_to(np.asarray(in_size), [2]) + self.out_size = np.broadcast_to(np.asarray(out_size), [2]) + self.in_sampling_rate = in_sampling_rate + self.out_sampling_rate = out_sampling_rate + self.tmp_sampling_rate = max(in_sampling_rate, out_sampling_rate) * (1 if is_torgb else lrelu_upsampling) + self.in_cutoff = in_cutoff + self.out_cutoff = out_cutoff + self.in_half_width = in_half_width + self.out_half_width = out_half_width + self.conv_kernel = 1 if is_torgb else conv_kernel + self.conv_clamp = conv_clamp + self.magnitude_ema_beta = magnitude_ema_beta + + # Setup parameters and buffers. + self.affine = FullyConnectedLayer(self.w_dim, self.in_channels, bias_init=1) + self.weight = torch.nn.Parameter(torch.randn([self.out_channels, self.in_channels, self.conv_kernel, self.conv_kernel])) + self.bias = torch.nn.Parameter(torch.zeros([self.out_channels])) + self.register_buffer('magnitude_ema', torch.ones([])) + + # Design upsampling filter. + self.up_factor = int(np.rint(self.tmp_sampling_rate / self.in_sampling_rate)) + assert self.in_sampling_rate * self.up_factor == self.tmp_sampling_rate + self.up_taps = filter_size * self.up_factor if self.up_factor > 1 and not self.is_torgb else 1 + self.register_buffer('up_filter', self.design_lowpass_filter( + numtaps=self.up_taps, cutoff=self.in_cutoff, width=self.in_half_width*2, fs=self.tmp_sampling_rate)) + + # Design downsampling filter. 
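+        # A radially symmetric (jinc) filter is used only when use_radial_filters is set and the layer is not critically sampled; otherwise the filter stays separable.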
+ self.down_factor = int(np.rint(self.tmp_sampling_rate / self.out_sampling_rate)) + assert self.out_sampling_rate * self.down_factor == self.tmp_sampling_rate + self.down_taps = filter_size * self.down_factor if self.down_factor > 1 and not self.is_torgb else 1 + self.down_radial = use_radial_filters and not self.is_critically_sampled + self.register_buffer('down_filter', self.design_lowpass_filter( + numtaps=self.down_taps, cutoff=self.out_cutoff, width=self.out_half_width*2, fs=self.tmp_sampling_rate, radial=self.down_radial)) + + # Compute padding. + pad_total = (self.out_size - 1) * self.down_factor + 1 # Desired output size before downsampling. + pad_total -= (self.in_size + self.conv_kernel - 1) * self.up_factor # Input size after upsampling. + pad_total += self.up_taps + self.down_taps - 2 # Size reduction caused by the filters. + pad_lo = (pad_total + self.up_factor) // 2 # Shift sample locations according to the symmetric interpretation (Appendix C.3). + pad_hi = pad_total - pad_lo + self.padding = [int(pad_lo[0]), int(pad_hi[0]), int(pad_lo[1]), int(pad_hi[1])] + + def forward(self, x, w, noise_mode='random', force_fp32=False, update_emas=False): + assert noise_mode in ['random', 'const', 'none'] # unused + misc.assert_shape(x, [None, self.in_channels, int(self.in_size[1]), int(self.in_size[0])]) + misc.assert_shape(w, [x.shape[0], self.w_dim]) + + # Track input magnitude. + if update_emas: + with torch.autograd.profiler.record_function('update_magnitude_ema'): + magnitude_cur = x.detach().to(torch.float32).square().mean() + self.magnitude_ema.copy_(magnitude_cur.lerp(self.magnitude_ema, self.magnitude_ema_beta)) + input_gain = self.magnitude_ema.rsqrt() + + # Execute affine layer. + styles = self.affine(w) + if self.is_torgb: + weight_gain = 1 / np.sqrt(self.in_channels * (self.conv_kernel ** 2)) + styles = styles * weight_gain + + # Execute modulated conv2d. + dtype = torch.float16 if (self.use_fp16 and not force_fp32 and x.device.type == 'cuda') else torch.float32 + x = modulated_conv2d(x=x.to(dtype), w=self.weight, s=styles, + padding=self.conv_kernel-1, demodulate=(not self.is_torgb), input_gain=input_gain) + + # Execute bias, filtered leaky ReLU, and clamping. + gain = 1 if self.is_torgb else np.sqrt(2) + slope = 1 if self.is_torgb else 0.2 + x = filtered_lrelu.filtered_lrelu(x=x, fu=self.up_filter, fd=self.down_filter, b=self.bias.to(x.dtype), + up=self.up_factor, down=self.down_factor, padding=self.padding, gain=gain, slope=slope, clamp=self.conv_clamp) + + # Ensure correct shape and dtype. + misc.assert_shape(x, [None, self.out_channels, int(self.out_size[1]), int(self.out_size[0])]) + assert x.dtype == dtype + return x + + @staticmethod + def design_lowpass_filter(numtaps, cutoff, width, fs, radial=False): + assert numtaps >= 1 + + # Identity filter. + if numtaps == 1: + return None + + # Separable Kaiser low-pass filter. + if not radial: + f = scipy.signal.firwin(numtaps=numtaps, cutoff=cutoff, width=width, fs=fs) + return torch.as_tensor(f, dtype=torch.float32) + + # Radially symmetric jinc-based filter. 
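+        # jinc (first-order Bessel) impulse response, windowed by a separable 2D Kaiser window and normalized to unit DC gain.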
+ x = (np.arange(numtaps) - (numtaps - 1) / 2) / fs + r = np.hypot(*np.meshgrid(x, x)) + f = scipy.special.j1(2 * cutoff * (np.pi * r)) / (np.pi * r) + beta = scipy.signal.kaiser_beta(scipy.signal.kaiser_atten(numtaps, width / (fs / 2))) + w = np.kaiser(numtaps, beta) + f *= np.outer(w, w) + f /= np.sum(f) + return torch.as_tensor(f, dtype=torch.float32) + + def extra_repr(self): + return '\n'.join([ + f'w_dim={self.w_dim:d}, is_torgb={self.is_torgb},', + f'is_critically_sampled={self.is_critically_sampled}, use_fp16={self.use_fp16},', + f'in_sampling_rate={self.in_sampling_rate:g}, out_sampling_rate={self.out_sampling_rate:g},', + f'in_cutoff={self.in_cutoff:g}, out_cutoff={self.out_cutoff:g},', + f'in_half_width={self.in_half_width:g}, out_half_width={self.out_half_width:g},', + f'in_size={list(self.in_size)}, out_size={list(self.out_size)},', + f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisNetwork(torch.nn.Module): + def __init__(self, + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output image resolution. + img_channels, # Number of color channels. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_layers = 14, # Total number of layers, excluding Fourier features and ToRGB. + num_critical = 2, # Number of critically sampled layers at the end. + first_cutoff = 2, # Cutoff frequency of the first layer (f_{c,0}). + first_stopband = 2**2.1, # Minimum stopband of the first layer (f_{t,0}). + last_stopband_rel = 2**0.3, # Minimum stopband of the last layer, expressed relative to the cutoff. + margin_size = 10, # Number of additional pixels outside the image. + output_scale = 0.25, # Scale factor for the output image. + num_fp16_res = 4, # Use FP16 for the N highest resolutions. + **layer_kwargs, # Arguments for SynthesisLayer. + ): + super().__init__() + self.w_dim = w_dim + self.num_ws = num_layers + 2 + self.img_resolution = img_resolution + self.img_channels = img_channels + self.num_layers = num_layers + self.num_critical = num_critical + self.margin_size = margin_size + self.output_scale = output_scale + self.num_fp16_res = num_fp16_res + + # Geometric progression of layer cutoffs and min. stopbands. + last_cutoff = self.img_resolution / 2 # f_{c,N} + last_stopband = last_cutoff * last_stopband_rel # f_{t,N} + exponents = np.minimum(np.arange(self.num_layers + 1) / (self.num_layers - self.num_critical), 1) + cutoffs = first_cutoff * (last_cutoff / first_cutoff) ** exponents # f_c[i] + stopbands = first_stopband * (last_stopband / first_stopband) ** exponents # f_t[i] + + # Compute remaining layer parameters. + sampling_rates = np.exp2(np.ceil(np.log2(np.minimum(stopbands * 2, self.img_resolution)))) # s[i] + half_widths = np.maximum(stopbands, sampling_rates / 2) - cutoffs # f_h[i] + sizes = sampling_rates + self.margin_size * 2 + sizes[-2:] = self.img_resolution + channels = np.rint(np.minimum((channel_base / 2) / cutoffs, channel_max)) + channels[-1] = self.img_channels + + # Construct layers. 
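+        # Fourier-feature input followed by num_layers + 1 synthesis layers; the final layer acts as ToRGB.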
+ self.input = SynthesisInput( + w_dim=self.w_dim, channels=int(channels[0]), size=int(sizes[0]), + sampling_rate=sampling_rates[0], bandwidth=cutoffs[0]) + self.layer_names = [] + for idx in range(self.num_layers + 1): + prev = max(idx - 1, 0) + is_torgb = (idx == self.num_layers) + is_critically_sampled = (idx >= self.num_layers - self.num_critical) + use_fp16 = (sampling_rates[idx] * (2 ** self.num_fp16_res) > self.img_resolution) + layer = SynthesisLayer( + w_dim=self.w_dim, is_torgb=is_torgb, is_critically_sampled=is_critically_sampled, use_fp16=use_fp16, + in_channels=int(channels[prev]), out_channels= int(channels[idx]), + in_size=int(sizes[prev]), out_size=int(sizes[idx]), + in_sampling_rate=int(sampling_rates[prev]), out_sampling_rate=int(sampling_rates[idx]), + in_cutoff=cutoffs[prev], out_cutoff=cutoffs[idx], + in_half_width=half_widths[prev], out_half_width=half_widths[idx], + **layer_kwargs) + name = f'L{idx}_{layer.out_size[0]}_{layer.out_channels}' + setattr(self, name, layer) + self.layer_names.append(name) + + def forward(self, ws, **layer_kwargs): + misc.assert_shape(ws, [None, self.num_ws, self.w_dim]) + ws = ws.to(torch.float32).unbind(dim=1) + + # Execute layers. + x = self.input(ws[0]) + for name, w in zip(self.layer_names, ws[1:]): + x = getattr(self, name)(x, w, **layer_kwargs) + if self.output_scale != 1: + x = x * self.output_scale + + # Ensure correct shape and dtype. + misc.assert_shape(x, [None, self.img_channels, self.img_resolution, self.img_resolution]) + x = x.to(torch.float32) + return x + + def extra_repr(self): + return '\n'.join([ + f'w_dim={self.w_dim:d}, num_ws={self.num_ws:d},', + f'img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d},', + f'num_layers={self.num_layers:d}, num_critical={self.num_critical:d},', + f'margin_size={self.margin_size:d}, num_fp16_res={self.num_fp16_res:d}']) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class Generator(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality. + c_dim, # Conditioning label (C) dimensionality. + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output resolution. + img_channels, # Number of output color channels. + mapping_kwargs = {}, # Arguments for MappingNetwork. + **synthesis_kwargs, # Arguments for SynthesisNetwork. 
+ ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + self.synthesis = SynthesisNetwork(w_dim=w_dim, img_resolution=img_resolution, img_channels=img_channels, **synthesis_kwargs) + self.num_ws = self.synthesis.num_ws + self.mapping = MappingNetwork(z_dim=z_dim, c_dim=c_dim, w_dim=w_dim, num_ws=self.num_ws, **mapping_kwargs) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs): + ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas) + img = self.synthesis(ws, update_emas=update_emas, **synthesis_kwargs) + return img + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/superresolution.py b/eg3d/training/superresolution.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa1425b1c692dcb6127489d5cb03d82015d596f --- /dev/null +++ b/eg3d/training/superresolution.py @@ -0,0 +1,322 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Superresolution network architectures from the paper +"Efficient Geometry-aware 3D Generative Adversarial Networks".""" + +import torch +from training.networks_stylegan2 import Conv2dLayer, SynthesisLayer, ToRGBLayer +from torch_utils.ops import upfirdn2d +from torch_utils import persistence +from torch_utils import misc + +from training.networks_stylegan2 import SynthesisBlock +import numpy as np +from training.networks_stylegan3 import SynthesisLayer as AFSynthesisLayer + + +#---------------------------------------------------------------------------- + +# for 512x512 generation +@persistence.persistent_class +class SuperresolutionHybrid8X(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, sr_antialias, + num_fp16_res=4, conv_clamp=None, channel_base=None, channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 512 + + use_fp16 = sr_num_fp16_res > 0 + self.input_resolution = 128 + self.sr_antialias = sr_antialias + self.block0 = SynthesisBlock(channels, 128, w_dim=512, resolution=256, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(128, 64, w_dim=512, resolution=512, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] != self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) 
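+        # After resampling to the SR input resolution, two style-modulated synthesis blocks (128 -> 256 -> 512) refine the features into the final RGB image.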
+ + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + +#---------------------------------------------------------------------------- + +# for 256x256 generation +@persistence.persistent_class +class SuperresolutionHybrid4X(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, sr_antialias, + num_fp16_res=4, conv_clamp=None, channel_base=None, channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 256 + use_fp16 = sr_num_fp16_res > 0 + self.sr_antialias = sr_antialias + self.input_resolution = 128 + self.block0 = SynthesisBlockNoUp(channels, 128, w_dim=512, resolution=128, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(128, 64, w_dim=512, resolution=256, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] < self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + +#---------------------------------------------------------------------------- + +# for 128 x 128 generation +@persistence.persistent_class +class SuperresolutionHybrid2X(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, sr_antialias, + num_fp16_res=4, conv_clamp=None, channel_base=None, channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 128 + + use_fp16 = sr_num_fp16_res > 0 + self.input_resolution = 64 + self.sr_antialias = sr_antialias + self.block0 = SynthesisBlockNoUp(channels, 128, w_dim=512, resolution=64, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(128, 64, w_dim=512, resolution=128, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] != self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + +#---------------------------------------------------------------------------- + +# TODO: Delete (here for backwards compatibility with old 256x256 models) +@persistence.persistent_class +class SuperresolutionHybridDeepfp32(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, + num_fp16_res=4, conv_clamp=None, channel_base=None, 
channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 256 + use_fp16 = sr_num_fp16_res > 0 + + self.input_resolution = 128 + self.block0 = SynthesisBlockNoUp(channels, 128, w_dim=512, resolution=128, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(128, 64, w_dim=512, resolution=256, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.register_buffer('resample_filter', upfirdn2d.setup_filter([1,3,3,1])) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] < self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False) + + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisBlockNoUp(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels, 0 = first block. + out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this block. + img_channels, # Number of output color channels. + is_last, # Is this the last block? + architecture = 'skip', # Architecture: 'orig', 'skip', 'resnet'. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16 = False, # Use FP16 for this block? + fp16_channels_last = False, # Use channels-last memory format with FP16? + fused_modconv_default = True, # Default value of fused_modconv. 'inference_only' = True for inference, False for training. + **layer_kwargs, # Arguments for SynthesisLayer. 
+ ): + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + self.in_channels = in_channels + self.w_dim = w_dim + self.resolution = resolution + self.img_channels = img_channels + self.is_last = is_last + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = (use_fp16 and fp16_channels_last) + self.fused_modconv_default = fused_modconv_default + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.num_conv = 0 + self.num_torgb = 0 + + if in_channels == 0: + self.const = torch.nn.Parameter(torch.randn([out_channels, resolution, resolution])) + + if in_channels != 0: + self.conv0 = SynthesisLayer(in_channels, out_channels, w_dim=w_dim, resolution=resolution, + conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs) + self.num_conv += 1 + + self.conv1 = SynthesisLayer(out_channels, out_channels, w_dim=w_dim, resolution=resolution, + conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs) + self.num_conv += 1 + + if is_last or architecture == 'skip': + self.torgb = ToRGBLayer(out_channels, img_channels, w_dim=w_dim, + conv_clamp=conv_clamp, channels_last=self.channels_last) + self.num_torgb += 1 + + if in_channels != 0 and architecture == 'resnet': + self.skip = Conv2dLayer(in_channels, out_channels, kernel_size=1, bias=False, up=2, + resample_filter=resample_filter, channels_last=self.channels_last) + + def forward(self, x, img, ws, force_fp32=False, fused_modconv=None, update_emas=False, **layer_kwargs): + _ = update_emas # unused + misc.assert_shape(ws, [None, self.num_conv + self.num_torgb, self.w_dim]) + w_iter = iter(ws.unbind(dim=1)) + if ws.device.type != 'cuda': + force_fp32 = True + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + memory_format = torch.channels_last if self.channels_last and not force_fp32 else torch.contiguous_format + if fused_modconv is None: + fused_modconv = self.fused_modconv_default + if fused_modconv == 'inference_only': + fused_modconv = (not self.training) + + # Input. + if self.in_channels == 0: + x = self.const.to(dtype=dtype, memory_format=memory_format) + x = x.unsqueeze(0).repeat([ws.shape[0], 1, 1, 1]) + else: + misc.assert_shape(x, [None, self.in_channels, self.resolution, self.resolution]) + x = x.to(dtype=dtype, memory_format=memory_format) + + # Main layers. + if self.in_channels == 0: + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + elif self.architecture == 'resnet': + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, gain=np.sqrt(0.5), **layer_kwargs) + x = y.add_(x) + else: + x = self.conv0(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + + # ToRGB. 
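+        # In this no-upsampling variant the skip image is added at the block's own resolution instead of being upsampled first.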
+ # if img is not None: + # misc.assert_shape(img, [None, self.img_channels, self.resolution // 2, self.resolution // 2]) + # img = upfirdn2d.upsample2d(img, self.resample_filter) + if self.is_last or self.architecture == 'skip': + y = self.torgb(x, next(w_iter), fused_modconv=fused_modconv) + y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format) + img = img.add_(y) if img is not None else y + + assert x.dtype == dtype + assert img is None or img.dtype == torch.float32 + return x, img + + def extra_repr(self): + return f'resolution={self.resolution:d}, architecture={self.architecture:s}' + + +#---------------------------------------------------------------------------- + +# for 512x512 generation +@persistence.persistent_class +class SuperresolutionHybrid8XDC(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, sr_antialias, + num_fp16_res=4, conv_clamp=None, channel_base=None, channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 512 + + use_fp16 = sr_num_fp16_res > 0 + self.input_resolution = 128 + self.sr_antialias = sr_antialias + self.block0 = SynthesisBlock(channels, 256, w_dim=512, resolution=256, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(256, 128, w_dim=512, resolution=512, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] != self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + + + +class SuperresolutionHybrid8XDC_afhq(torch.nn.Module): + def __init__(self, channels, img_resolution, sr_num_fp16_res, sr_antialias, + num_fp16_res=4, conv_clamp=None, channel_base=None, channel_max=None,# IGNORE + **block_kwargs): + super().__init__() + assert img_resolution == 512 + + use_fp16 = sr_num_fp16_res > 0 + self.input_resolution = 128 + self.sr_antialias = sr_antialias + self.block0 = SynthesisBlock(channels, 128, w_dim=512, resolution=256, + img_channels=3, is_last=False, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + self.block1 = SynthesisBlock(128, 64, w_dim=512, resolution=512, + img_channels=3, is_last=True, use_fp16=use_fp16, conv_clamp=(256 if use_fp16 else None), **block_kwargs) + + def forward(self, rgb, x, ws, **block_kwargs): + ws = ws[:, -1:, :].repeat(1, 3, 1) + + if x.shape[-1] != self.input_resolution: + x = torch.nn.functional.interpolate(x, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + rgb = torch.nn.functional.interpolate(rgb, size=(self.input_resolution, self.input_resolution), + mode='bilinear', align_corners=False, antialias=self.sr_antialias) + + x, rgb = self.block0(x, rgb, ws, **block_kwargs) + x, rgb = self.block1(x, rgb, ws, **block_kwargs) + return rgb + +#---------------------------------------------------------------------------- \ No newline at end of file diff --git a/eg3d/training/training_loop.py 
b/eg3d/training/training_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..1338bb7dd4359dd0f21679e059534aeed9780890 --- /dev/null +++ b/eg3d/training/training_loop.py @@ -0,0 +1,475 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +"""Main training loop.""" + +import os +import time +import copy +import json +import pickle +import psutil +import PIL.Image +import numpy as np +import torch +import dnnlib +from torch_utils import misc +from torch_utils import training_stats +from torch_utils.ops import conv2d_gradfix +from torch_utils.ops import grid_sample_gradfix + +import legacy +from metrics import metric_main +from camera_utils import LookAtPoseSampler +from training.crosssection_utils import sample_cross_section + +#---------------------------------------------------------------------------- + +def setup_snapshot_image_grid(training_set, random_seed=0): + rnd = np.random.RandomState(random_seed) + gw = np.clip(7680 // training_set.image_shape[2], 7, 32) + gh = np.clip(4320 // training_set.image_shape[1], 4, 32) + + # No labels => show random subset of training samples. + if not training_set.has_labels: + all_indices = list(range(len(training_set))) + rnd.shuffle(all_indices) + grid_indices = [all_indices[i % len(all_indices)] for i in range(gw * gh)] + + else: + # Group training samples by label. + label_groups = dict() # label => [idx, ...] + for idx in range(len(training_set)): + label = tuple(training_set.get_details(idx).raw_label.flat[::-1]) + if label not in label_groups: + label_groups[label] = [] + label_groups[label].append(idx) + + # Reorder. + label_order = list(label_groups.keys()) + rnd.shuffle(label_order) + for label in label_order: + rnd.shuffle(label_groups[label]) + + # Organize into grid. + grid_indices = [] + for y in range(gh): + label = label_order[y % len(label_order)] + indices = label_groups[label] + grid_indices += [indices[x % len(indices)] for x in range(gw)] + label_groups[label] = [indices[(i + gw) % len(indices)] for i in range(len(indices))] + + # Load data. + images, labels = zip(*[training_set[i] for i in grid_indices]) + return (gw, gh), np.stack(images), np.stack(labels) + +#---------------------------------------------------------------------------- + +def save_image_grid(img, fname, drange, grid_size): + lo, hi = drange + img = np.asarray(img, dtype=np.float32) + img = (img - lo) * (255 / (hi - lo)) + img = np.rint(img).clip(0, 255).astype(np.uint8) + + gw, gh = grid_size + _N, C, H, W = img.shape + img = img.reshape([gh, gw, C, H, W]) + img = img.transpose(0, 3, 1, 4, 2) + img = img.reshape([gh * H, gw * W, C]) + + assert C in [1, 3] + if C == 1: + PIL.Image.fromarray(img[:, :, 0], 'L').save(fname) + if C == 3: + PIL.Image.fromarray(img, 'RGB').save(fname) + +#---------------------------------------------------------------------------- + +def training_loop( + run_dir = '.', # Output directory. + training_set_kwargs = {}, # Options for training set. 
+ data_loader_kwargs = {}, # Options for torch.utils.data.DataLoader. + G_kwargs = {}, # Options for generator network. + D_kwargs = {}, # Options for discriminator network. + G_opt_kwargs = {}, # Options for generator optimizer. + D_opt_kwargs = {}, # Options for discriminator optimizer. + augment_kwargs = None, # Options for augmentation pipeline. None = disable. + loss_kwargs = {}, # Options for loss function. + metrics = [], # Metrics to evaluate during training. + random_seed = 0, # Global random seed. + num_gpus = 1, # Number of GPUs participating in the training. + rank = 0, # Rank of the current process in [0, num_gpus[. + batch_size = 4, # Total batch size for one training iteration. Can be larger than batch_gpu * num_gpus. + batch_gpu = 4, # Number of samples processed at a time by one GPU. + ema_kimg = 10, # Half-life of the exponential moving average (EMA) of generator weights. + ema_rampup = 0.05, # EMA ramp-up coefficient. None = no rampup. + G_reg_interval = None, # How often to perform regularization for G? None = disable lazy regularization. + D_reg_interval = 16, # How often to perform regularization for D? None = disable lazy regularization. + augment_p = 0, # Initial value of augmentation probability. + ada_target = None, # ADA target value. None = fixed p. + ada_interval = 4, # How often to perform ADA adjustment? + ada_kimg = 500, # ADA adjustment speed, measured in how many kimg it takes for p to increase/decrease by one unit. + total_kimg = 25000, # Total length of the training, measured in thousands of real images. + kimg_per_tick = 4, # Progress snapshot interval. + image_snapshot_ticks = 50, # How often to save image snapshots? None = disable. + network_snapshot_ticks = 50, # How often to save network snapshots? None = disable. + resume_pkl = None, # Network pickle to resume training from. + resume_kimg = 0, # First kimg to report when resuming training. + cudnn_benchmark = True, # Enable torch.backends.cudnn.benchmark? + abort_fn = None, # Callback function for determining whether to abort training. Must return consistent results across ranks. + progress_fn = None, # Callback function for updating training progress. Called for all ranks. + freeze_dec_sr = False, +): + # Initialize. + start_time = time.time() + device = torch.device('cuda', rank) + np.random.seed(random_seed * num_gpus + rank) + torch.manual_seed(random_seed * num_gpus + rank) + torch.backends.cudnn.benchmark = cudnn_benchmark # Improves training speed. + torch.backends.cuda.matmul.allow_tf32 = False # Improves numerical accuracy. + torch.backends.cudnn.allow_tf32 = False # Improves numerical accuracy. + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False # Improves numerical accuracy. + conv2d_gradfix.enabled = True # Improves training speed. # TODO: ENABLE + grid_sample_gradfix.enabled = False # Avoids errors with the augmentation pipe. + + # Load training set. 
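+    # Each rank samples its own shard of an infinite, seeded stream; the DataLoader yields batch_size // num_gpus images per process per iteration.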
+ if rank == 0: + print('Loading training set...') + training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs) # subclass of training.dataset.Dataset + training_set_sampler = misc.InfiniteSampler(dataset=training_set, rank=rank, num_replicas=num_gpus, seed=random_seed) + training_set_iterator = iter(torch.utils.data.DataLoader(dataset=training_set, sampler=training_set_sampler, batch_size=batch_size//num_gpus, **data_loader_kwargs)) + if rank == 0: + print() + print('Num images: ', len(training_set)) + print('Image shape:', training_set.image_shape) + print('Label shape:', training_set.label_shape) + print() + + # Construct networks. + if rank == 0: + print('Constructing networks...') + common_kwargs = dict(c_dim=training_set.label_dim, img_resolution=training_set.resolution, img_channels=training_set.num_channels) + G = dnnlib.util.construct_class_by_name(**G_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + G.register_buffer('dataset_label_std', torch.tensor(training_set.get_label_std()).to(device)) + D = dnnlib.util.construct_class_by_name(**D_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + G_ema = copy.deepcopy(G).eval() + if freeze_dec_sr: + for p in G.renderer.parameters(): + p.requires_grad_(False) + for p in G.ray_sampler.parameters(): + p.requires_grad_(False) + for p in G.superresolution.parameters(): + p.requires_grad_(False) + for p in G.decoder.parameters(): + p.requires_grad_(False) + + + # Resume from existing pickle. + if (resume_pkl is not None) and (rank == 0): + print(f'Resuming from "{resume_pkl}"') + with dnnlib.util.open_url(resume_pkl) as f: + resume_data = legacy.load_network_pkl(f) + for name, module in [('G', G), ('D', D), ('G_ema', G_ema)]: + misc.copy_params_and_buffers(resume_data[name], module, require_all=False) + + # Print network summary tables. + if rank == 0: + z = torch.empty([batch_gpu, G.z_dim], device=device) + c = torch.empty([batch_gpu, G.c_dim], device=device) + img = misc.print_module_summary(G, [z, c]) + misc.print_module_summary(D, [img, c]) + + # Setup augmentation. + if rank == 0: + print('Setting up augmentation...') + augment_pipe = None + ada_stats = None + if (augment_kwargs is not None) and (augment_p > 0 or ada_target is not None): + augment_pipe = dnnlib.util.construct_class_by_name(**augment_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + augment_pipe.p.copy_(torch.as_tensor(augment_p)) + if ada_target is not None: + ada_stats = training_stats.Collector(regex='Loss/signs/real') + + # Distribute across GPUs. + if rank == 0: + print(f'Distributing across {num_gpus} GPUs...') + for module in [G, D, G_ema, augment_pipe]: + if module is not None: + for param in misc.params_and_buffers(module): + if param.numel() > 0 and num_gpus > 1: + torch.distributed.broadcast(param, src=0) + + # Setup training phases. 
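+    # Each network gets either a single combined 'both' phase (no lazy regularization) or separate 'main' and 'reg' phases, the latter run every reg_interval steps with lr and betas adjusted by reg_interval / (reg_interval + 1).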
+ if rank == 0: + print('Setting up training phases...') + loss = dnnlib.util.construct_class_by_name(device=device, G=G, D=D, augment_pipe=augment_pipe, **loss_kwargs) # subclass of training.loss.Loss + phases = [] + for name, module, opt_kwargs, reg_interval in [('G', G, G_opt_kwargs, G_reg_interval), ('D', D, D_opt_kwargs, D_reg_interval)]: + if reg_interval is None: + opt = dnnlib.util.construct_class_by_name(params=module.parameters(), **opt_kwargs) # subclass of torch.optim.Optimizer + phases += [dnnlib.EasyDict(name=name+'both', module=module, opt=opt, interval=1)] + else: # Lazy regularization. + mb_ratio = reg_interval / (reg_interval + 1) + opt_kwargs = dnnlib.EasyDict(opt_kwargs) + opt_kwargs.lr = opt_kwargs.lr * mb_ratio + opt_kwargs.betas = [beta ** mb_ratio for beta in opt_kwargs.betas] + opt = dnnlib.util.construct_class_by_name(module.parameters(), **opt_kwargs) # subclass of torch.optim.Optimizer + phases += [dnnlib.EasyDict(name=name+'main', module=module, opt=opt, interval=1)] + phases += [dnnlib.EasyDict(name=name+'reg', module=module, opt=opt, interval=reg_interval)] + for phase in phases: + phase.start_event = None + phase.end_event = None + if rank == 0: + phase.start_event = torch.cuda.Event(enable_timing=True) + phase.end_event = torch.cuda.Event(enable_timing=True) + + # Export sample images. + grid_size = None + grid_z = None + grid_c = None + if rank == 0: + print('Exporting sample images...') + grid_size, images, labels = setup_snapshot_image_grid(training_set=training_set) + save_image_grid(images, os.path.join(run_dir, 'reals.png'), drange=[0,255], grid_size=grid_size) + grid_z = torch.randn([labels.shape[0], G.z_dim], device=device).split(batch_gpu) + grid_c = torch.from_numpy(labels).to(device).split(batch_gpu) + + # Initialize logs. + if rank == 0: + print('Initializing logs...') + stats_collector = training_stats.Collector(regex='.*') + stats_metrics = dict() + stats_jsonl = None + stats_tfevents = None + if rank == 0: + stats_jsonl = open(os.path.join(run_dir, 'stats.jsonl'), 'wt') + try: + import torch.utils.tensorboard as tensorboard + stats_tfevents = tensorboard.SummaryWriter(run_dir) + except ImportError as err: + print('Skipping tfevents export:', err) + + # Train. + if rank == 0: + print(f'Training for {total_kimg} kimg...') + print() + cur_nimg = resume_kimg * 1000 + cur_tick = 0 + tick_start_nimg = cur_nimg + tick_start_time = time.time() + maintenance_time = tick_start_time - start_time + batch_idx = 0 + if progress_fn is not None: + progress_fn(0, total_kimg) + while True: + + # Fetch training data. + with torch.autograd.profiler.record_function('data_fetch'): + phase_real_img, phase_real_c = next(training_set_iterator) + phase_real_img = (phase_real_img.to(device).to(torch.float32) / 127.5 - 1).split(batch_gpu) + phase_real_c = phase_real_c.to(device).split(batch_gpu) + all_gen_z = torch.randn([len(phases) * batch_size, G.z_dim], device=device) + all_gen_z = [phase_gen_z.split(batch_gpu) for phase_gen_z in all_gen_z.split(batch_size)] + all_gen_c = [training_set.get_label(np.random.randint(len(training_set))) for _ in range(len(phases) * batch_size)] + all_gen_c = torch.from_numpy(np.stack(all_gen_c)).pin_memory().to(device) + all_gen_c = [phase_gen_c.split(batch_gpu) for phase_gen_c in all_gen_c.split(batch_size)] + + # Execute training phases. 
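+    # Gradients are accumulated over batch_gpu-sized sub-batches, averaged across GPUs, and sanitized with nan_to_num before each optimizer step.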
+ for phase, phase_gen_z, phase_gen_c in zip(phases, all_gen_z, all_gen_c): + if batch_idx % phase.interval != 0: + continue + if phase.start_event is not None: + phase.start_event.record(torch.cuda.current_stream(device)) + + # Accumulate gradients. + phase.opt.zero_grad(set_to_none=True) + phase.module.requires_grad_(True) + for real_img, real_c, gen_z, gen_c in zip(phase_real_img, phase_real_c, phase_gen_z, phase_gen_c): + loss.accumulate_gradients(phase=phase.name, real_img=real_img, real_c=real_c, gen_z=gen_z, gen_c=gen_c, gain=phase.interval, cur_nimg=cur_nimg) + phase.module.requires_grad_(False) + + # Update weights. + with torch.autograd.profiler.record_function(phase.name + '_opt'): + params = [param for param in phase.module.parameters() if param.numel() > 0 and param.grad is not None] + if len(params) > 0: + flat = torch.cat([param.grad.flatten() for param in params]) + if num_gpus > 1: + torch.distributed.all_reduce(flat) + flat /= num_gpus + misc.nan_to_num(flat, nan=0, posinf=1e5, neginf=-1e5, out=flat) + grads = flat.split([param.numel() for param in params]) + for param, grad in zip(params, grads): + param.grad = grad.reshape(param.shape) + phase.opt.step() + + # Phase done. + if phase.end_event is not None: + phase.end_event.record(torch.cuda.current_stream(device)) + + # Update G_ema. + with torch.autograd.profiler.record_function('Gema'): + ema_nimg = ema_kimg * 1000 + if ema_rampup is not None: + ema_nimg = min(ema_nimg, cur_nimg * ema_rampup) + ema_beta = 0.5 ** (batch_size / max(ema_nimg, 1e-8)) + for p_ema, p in zip(G_ema.parameters(), G.parameters()): + p_ema.copy_(p.lerp(p_ema, ema_beta)) + for b_ema, b in zip(G_ema.buffers(), G.buffers()): + b_ema.copy_(b) + G_ema.neural_rendering_resolution = G.neural_rendering_resolution + G_ema.rendering_kwargs = G.rendering_kwargs.copy() + + # Update state. + cur_nimg += batch_size + batch_idx += 1 + + # Execute ADA heuristic. + if (ada_stats is not None) and (batch_idx % ada_interval == 0): + ada_stats.update() + adjust = np.sign(ada_stats['Loss/signs/real'] - ada_target) * (batch_size * ada_interval) / (ada_kimg * 1000) + augment_pipe.p.copy_((augment_pipe.p + adjust).max(misc.constant(0, device=device))) + + # Perform maintenance tasks once per tick. + done = (cur_nimg >= total_kimg * 1000) + if (not done) and (cur_tick != 0) and (cur_nimg < tick_start_nimg + kimg_per_tick * 1000): + continue + + # Print status line, accumulating the same information in training_stats. 
+ tick_end_time = time.time() + fields = [] + fields += [f"tick {training_stats.report0('Progress/tick', cur_tick):<5d}"] + fields += [f"kimg {training_stats.report0('Progress/kimg', cur_nimg / 1e3):<8.1f}"] + fields += [f"time {dnnlib.util.format_time(training_stats.report0('Timing/total_sec', tick_end_time - start_time)):<12s}"] + fields += [f"sec/tick {training_stats.report0('Timing/sec_per_tick', tick_end_time - tick_start_time):<7.1f}"] + fields += [f"sec/kimg {training_stats.report0('Timing/sec_per_kimg', (tick_end_time - tick_start_time) / (cur_nimg - tick_start_nimg) * 1e3):<7.2f}"] + fields += [f"maintenance {training_stats.report0('Timing/maintenance_sec', maintenance_time):<6.1f}"] + fields += [f"cpumem {training_stats.report0('Resources/cpu_mem_gb', psutil.Process(os.getpid()).memory_info().rss / 2**30):<6.2f}"] + fields += [f"gpumem {training_stats.report0('Resources/peak_gpu_mem_gb', torch.cuda.max_memory_allocated(device) / 2**30):<6.2f}"] + fields += [f"reserved {training_stats.report0('Resources/peak_gpu_mem_reserved_gb', torch.cuda.max_memory_reserved(device) / 2**30):<6.2f}"] + torch.cuda.reset_peak_memory_stats() + fields += [f"augment {training_stats.report0('Progress/augment', float(augment_pipe.p.cpu()) if augment_pipe is not None else 0):.3f}"] + training_stats.report0('Timing/total_hours', (tick_end_time - start_time) / (60 * 60)) + training_stats.report0('Timing/total_days', (tick_end_time - start_time) / (24 * 60 * 60)) + if rank == 0: + print(' '.join(fields)) + + # Check for abort. + if (not done) and (abort_fn is not None) and abort_fn(): + done = True + if rank == 0: + print() + print('Aborting...') + + # Save image snapshot. + if (rank == 0) and (image_snapshot_ticks is not None) and (done or cur_tick % image_snapshot_ticks == 0): + out = [G_ema(z=z, c=c, noise_mode='const') for z, c in zip(grid_z, grid_c)] + images = torch.cat([o['image'].cpu() for o in out]).numpy() + images_raw = torch.cat([o['image_raw'].cpu() for o in out]).numpy() + images_depth = -torch.cat([o['image_depth'].cpu() for o in out]).numpy() + save_image_grid(images, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}.png'), drange=[-1,1], grid_size=grid_size) + save_image_grid(images_raw, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_raw.png'), drange=[-1,1], grid_size=grid_size) + save_image_grid(images_depth, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_depth.png'), drange=[images_depth.min(), images_depth.max()], grid_size=grid_size) + + #-------------------- + # # Log forward-conditioned images + + # forward_cam2world_pose = LookAtPoseSampler.sample(3.14/2, 3.14/2, torch.tensor([0, 0, 0.2], device=device), radius=2.7, device=device) + # intrinsics = torch.tensor([[4.2647, 0, 0.5], [0, 4.2647, 0.5], [0, 0, 1]], device=device) + # forward_label = torch.cat([forward_cam2world_pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1) + + # grid_ws = [G_ema.mapping(z, forward_label.expand(z.shape[0], -1)) for z, c in zip(grid_z, grid_c)] + # out = [G_ema.synthesis(ws, c=c, noise_mode='const') for ws, c in zip(grid_ws, grid_c)] + + # images = torch.cat([o['image'].cpu() for o in out]).numpy() + # images_raw = torch.cat([o['image_raw'].cpu() for o in out]).numpy() + # images_depth = -torch.cat([o['image_depth'].cpu() for o in out]).numpy() + # save_image_grid(images, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_f.png'), drange=[-1,1], grid_size=grid_size) + # save_image_grid(images_raw, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_raw_f.png'), drange=[-1,1], grid_size=grid_size) 
+ # save_image_grid(images_depth, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_depth_f.png'), drange=[images_depth.min(), images_depth.max()], grid_size=grid_size) + + #-------------------- + # # Log Cross sections + + # grid_ws = [G_ema.mapping(z, c.expand(z.shape[0], -1)) for z, c in zip(grid_z, grid_c)] + # out = [sample_cross_section(G_ema, ws, w=G.rendering_kwargs['box_warp']) for ws, c in zip(grid_ws, grid_c)] + # crossections = torch.cat([o.cpu() for o in out]).numpy() + # save_image_grid(crossections, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}_crossection.png'), drange=[-50,100], grid_size=grid_size) + + # Save network snapshot. + snapshot_pkl = None + snapshot_data = None + if (network_snapshot_ticks is not None) and (done or cur_tick % network_snapshot_ticks == 0): + snapshot_data = dict(training_set_kwargs=dict(training_set_kwargs)) + for name, module in [('G', G), ('D', D), ('G_ema', G_ema), ('augment_pipe', augment_pipe)]: + if module is not None: + if num_gpus > 1: + misc.check_ddp_consistency(module, ignore_regex=r'.*\.[^.]+_(avg|ema)') + module = copy.deepcopy(module).eval().requires_grad_(False).cpu() + snapshot_data[name] = module + del module # conserve memory + snapshot_pkl = os.path.join(run_dir, f'network-snapshot-{cur_nimg//1000:06d}.pkl') + if rank == 0: + with open(snapshot_pkl, 'wb') as f: + pickle.dump(snapshot_data, f) + + # Evaluate metrics. + if (snapshot_data is not None) and (len(metrics) > 0): + if rank == 0: + print(run_dir) + print('Evaluating metrics...') + for metric in metrics: + result_dict = metric_main.calc_metric(metric=metric, G=snapshot_data['G_ema'], + dataset_kwargs=training_set_kwargs, num_gpus=num_gpus, rank=rank, device=device) + if rank == 0: + metric_main.report_metric(result_dict, run_dir=run_dir, snapshot_pkl=snapshot_pkl) + stats_metrics.update(result_dict.results) + del snapshot_data # conserve memory + + # Collect statistics. + for phase in phases: + value = [] + if (phase.start_event is not None) and (phase.end_event is not None): + phase.end_event.synchronize() + value = phase.start_event.elapsed_time(phase.end_event) + training_stats.report0('Timing/' + phase.name, value) + stats_collector.update() + stats_dict = stats_collector.as_dict() + + # Update logs. + timestamp = time.time() + if stats_jsonl is not None: + fields = dict(stats_dict, timestamp=timestamp) + stats_jsonl.write(json.dumps(fields) + '\n') + stats_jsonl.flush() + if stats_tfevents is not None: + global_step = int(cur_nimg / 1e3) + walltime = timestamp - start_time + for name, value in stats_dict.items(): + stats_tfevents.add_scalar(name, value.mean, global_step=global_step, walltime=walltime) + for name, value in stats_metrics.items(): + stats_tfevents.add_scalar(f'Metrics/{name}', value, global_step=global_step, walltime=walltime) + stats_tfevents.flush() + if progress_fn is not None: + progress_fn(cur_nimg // 1000, total_kimg) + + # Update state. + cur_tick += 1 + tick_start_nimg = cur_nimg + tick_start_time = time.time() + maintenance_time = tick_start_time - tick_end_time + if done: + break + + # Done. + if rank == 0: + print() + print('Exiting...') + +#---------------------------------------------------------------------------- diff --git a/eg3d/training/triplane.py b/eg3d/training/triplane.py new file mode 100644 index 0000000000000000000000000000000000000000..1b7d48dec681a7b3040c19317dbaf26f6ff7e3cf --- /dev/null +++ b/eg3d/training/triplane.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import torch +from torch_utils import persistence +from training.networks_stylegan2 import Generator as StyleGAN2Backbone +from training.volumetric_rendering.renderer import ImportanceRenderer +from training.volumetric_rendering.ray_sampler import RaySampler +import dnnlib + +@persistence.persistent_class +class TriPlaneGenerator(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality. + c_dim, # Conditioning label (C) dimensionality. + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output resolution. + img_channels, # Number of output color channels. + sr_num_fp16_res = 0, + mapping_kwargs = {}, # Arguments for MappingNetwork. + rendering_kwargs = {}, + sr_kwargs = {}, + **synthesis_kwargs, # Arguments for SynthesisNetwork. + ): + super().__init__() + self.z_dim=z_dim + self.c_dim=c_dim + self.w_dim=w_dim + self.img_resolution=img_resolution + self.img_channels=img_channels + self.renderer = ImportanceRenderer() + self.ray_sampler = RaySampler() + self.backbone = StyleGAN2Backbone(z_dim, c_dim, w_dim, img_resolution=256, img_channels=32*3, mapping_kwargs=mapping_kwargs, **synthesis_kwargs) + self.superresolution = dnnlib.util.construct_class_by_name(class_name=rendering_kwargs['superresolution_module'], channels=32, img_resolution=img_resolution, sr_num_fp16_res=sr_num_fp16_res, sr_antialias=rendering_kwargs['sr_antialias'], **sr_kwargs) + self.decoder = OSGDecoder(32, {'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1), 'decoder_output_dim': 32}) + self.neural_rendering_resolution = 64 + self.rendering_kwargs = rendering_kwargs + + self._last_planes = None + + def mapping(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False): + if self.rendering_kwargs['c_gen_conditioning_zero']: + c = torch.zeros_like(c) + return self.backbone.mapping(z, c * self.rendering_kwargs.get('c_scale', 0), truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas) + + def synthesis(self, ws, c, neural_rendering_resolution=None, update_emas=False, cache_backbone=False, use_cached_backbone=False, **synthesis_kwargs): + cam2world_matrix = c[:, :16].view(-1, 4, 4) + intrinsics = c[:, 16:25].view(-1, 3, 3) + + if neural_rendering_resolution is None: + neural_rendering_resolution = self.neural_rendering_resolution + else: + self.neural_rendering_resolution = neural_rendering_resolution + + # Create a batch of rays for volume rendering + ray_origins, ray_directions = self.ray_sampler(cam2world_matrix, intrinsics, neural_rendering_resolution) + + # Create triplanes by running StyleGAN backbone + N, M, _ = ray_origins.shape + if use_cached_backbone and self._last_planes is not None: + planes = self._last_planes + else: + planes = self.backbone.synthesis(ws, update_emas=update_emas, **synthesis_kwargs) + if cache_backbone: + self._last_planes = planes + + # Reshape output into three 32-channel planes + planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1]) + + # Perform volume rendering + feature_samples, depth_samples, 
weights_samples = self.renderer(planes, self.decoder, ray_origins, ray_directions, self.rendering_kwargs) # channels last + + # Reshape into 'raw' neural-rendered image + H = W = self.neural_rendering_resolution + feature_image = feature_samples.permute(0, 2, 1).reshape(N, feature_samples.shape[-1], H, W).contiguous() + depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W) + + # Run superresolution to get final image + rgb_image = feature_image[:, :3] + sr_image = self.superresolution(rgb_image, feature_image, ws, noise_mode=self.rendering_kwargs['superresolution_noise_mode'], **{k:synthesis_kwargs[k] for k in synthesis_kwargs.keys() if k != 'noise_mode'}) + + return {'image': sr_image, 'image_raw': rgb_image, 'image_depth': depth_image} + + def sample(self, coordinates, directions, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs): + # Compute RGB features, density for arbitrary 3D coordinates. Mostly used for extracting shapes. + ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas) + planes = self.backbone.synthesis(ws, update_emas=update_emas, **synthesis_kwargs) + planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1]) + return self.renderer.run_model(planes, self.decoder, coordinates, directions, self.rendering_kwargs) + + def sample_mixed(self, coordinates, directions, ws, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs): + # Same as sample, but expects latent vectors 'ws' instead of Gaussian noise 'z' + planes = self.backbone.synthesis(ws, update_emas = update_emas, **synthesis_kwargs) + planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1]) + return self.renderer.run_model(planes, self.decoder, coordinates, directions, self.rendering_kwargs) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, neural_rendering_resolution=None, update_emas=False, cache_backbone=False, use_cached_backbone=False, **synthesis_kwargs): + # Render a batch of generated images. 
+ ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas) + return self.synthesis(ws, c, update_emas=update_emas, neural_rendering_resolution=neural_rendering_resolution, cache_backbone=cache_backbone, use_cached_backbone=use_cached_backbone, **synthesis_kwargs) + + +from training.networks_stylegan2 import FullyConnectedLayer + +class OSGDecoder(torch.nn.Module): + def __init__(self, n_features, options): + super().__init__() + self.hidden_dim = 64 + + self.net = torch.nn.Sequential( + FullyConnectedLayer(n_features, self.hidden_dim, lr_multiplier=options['decoder_lr_mul']), + torch.nn.Softplus(), + FullyConnectedLayer(self.hidden_dim, 1 + options['decoder_output_dim'], lr_multiplier=options['decoder_lr_mul']) + ) + + def forward(self, sampled_features, ray_directions): + # Aggregate features + sampled_features = sampled_features.mean(1) + x = sampled_features + + N, M, C = x.shape + x = x.view(N*M, C) + + x = self.net(x) + x = x.view(N, M, -1) + rgb = torch.sigmoid(x[..., 1:])*(1 + 2*0.001) - 0.001 # Uses sigmoid clamping from MipNeRF + sigma = x[..., 0:1] + return {'rgb': rgb, 'sigma': sigma} diff --git a/eg3d/training/volumetric_rendering/__init__.py b/eg3d/training/volumetric_rendering/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..daba66567a95beabb103f7996198a9675ab20b4a --- /dev/null +++ b/eg3d/training/volumetric_rendering/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# empty \ No newline at end of file diff --git a/eg3d/training/volumetric_rendering/math_utils.py b/eg3d/training/volumetric_rendering/math_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4cf9d2b811e0acbc7923bc9126e010b52cb1a8af --- /dev/null +++ b/eg3d/training/volumetric_rendering/math_utils.py @@ -0,0 +1,118 @@ +# MIT License + +# Copyright (c) 2022 Petr Kellnhofer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
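An aside on the `OSGDecoder` defined above: its colour head uses the sigmoid clamping trick from MipNeRF rather than a plain sigmoid, so the network can reach pure black or white with finite logits. The snippet below is a standalone sketch of that arithmetic, not part of the patch; `clamp_rgb` and `eps` are names introduced here for illustration.

```python
# Minimal sketch of the sigmoid clamping used by OSGDecoder above.
# It maps raw MLP outputs to a range slightly wider than [0, 1], so the network
# can hit the extremes without pushing the pre-activation to +/- infinity.
import torch

def clamp_rgb(raw: torch.Tensor, eps: float = 0.001) -> torch.Tensor:
    # sigmoid(raw) lies in (0, 1); the affine rescale widens it to (-eps, 1 + eps)
    return torch.sigmoid(raw) * (1 + 2 * eps) - eps

raw = torch.tensor([-10.0, 0.0, 10.0])
print(clamp_rgb(raw))  # ~[-0.0010, 0.5000, 1.0010]: extremes reachable with finite logits
```

The density head, by contrast, is left unclamped here; it is passed through the ray marcher's shifted softplus instead.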
+ +import torch + +def transform_vectors(matrix: torch.Tensor, vectors4: torch.Tensor) -> torch.Tensor: + """ + Left-multiplies MxM @ NxM. Returns NxM. + """ + res = torch.matmul(vectors4, matrix.T) + return res + + +def normalize_vecs(vectors: torch.Tensor) -> torch.Tensor: + """ + Normalize vector lengths. + """ + return vectors / (torch.norm(vectors, dim=-1, keepdim=True)) + +def torch_dot(x: torch.Tensor, y: torch.Tensor): + """ + Dot product of two tensors. + """ + return (x * y).sum(-1) + + +def get_ray_limits_box(rays_o: torch.Tensor, rays_d: torch.Tensor, box_side_length): + """ + Author: Petr Kellnhofer + Intersects rays with the [-1, 1] NDC volume. + Returns min and max distance of entry. + Returns -1 for no intersection. + https://www.scratchapixel.com/lessons/3d-basic-rendering/minimal-ray-tracer-rendering-simple-shapes/ray-box-intersection + """ + o_shape = rays_o.shape + rays_o = rays_o.detach().reshape(-1, 3) + rays_d = rays_d.detach().reshape(-1, 3) + + + bb_min = [-1*(box_side_length/2), -1*(box_side_length/2), -1*(box_side_length/2)] + bb_max = [1*(box_side_length/2), 1*(box_side_length/2), 1*(box_side_length/2)] + bounds = torch.tensor([bb_min, bb_max], dtype=rays_o.dtype, device=rays_o.device) + is_valid = torch.ones(rays_o.shape[:-1], dtype=bool, device=rays_o.device) + + # Precompute inverse for stability. + invdir = 1 / rays_d + sign = (invdir < 0).long() + + # Intersect with YZ plane. + tmin = (bounds.index_select(0, sign[..., 0])[..., 0] - rays_o[..., 0]) * invdir[..., 0] + tmax = (bounds.index_select(0, 1 - sign[..., 0])[..., 0] - rays_o[..., 0]) * invdir[..., 0] + + # Intersect with XZ plane. + tymin = (bounds.index_select(0, sign[..., 1])[..., 1] - rays_o[..., 1]) * invdir[..., 1] + tymax = (bounds.index_select(0, 1 - sign[..., 1])[..., 1] - rays_o[..., 1]) * invdir[..., 1] + + # Resolve parallel rays. + is_valid[torch.logical_or(tmin > tymax, tymin > tmax)] = False + + # Use the shortest intersection. + tmin = torch.max(tmin, tymin) + tmax = torch.min(tmax, tymax) + + # Intersect with XY plane. + tzmin = (bounds.index_select(0, sign[..., 2])[..., 2] - rays_o[..., 2]) * invdir[..., 2] + tzmax = (bounds.index_select(0, 1 - sign[..., 2])[..., 2] - rays_o[..., 2]) * invdir[..., 2] + + # Resolve parallel rays. + is_valid[torch.logical_or(tmin > tzmax, tzmin > tmax)] = False + + # Use the shortest intersection. + tmin = torch.max(tmin, tzmin) + tmax = torch.min(tmax, tzmax) + + # Mark invalid. + tmin[torch.logical_not(is_valid)] = -1 + tmax[torch.logical_not(is_valid)] = -2 + + return tmin.reshape(*o_shape[:-1], 1), tmax.reshape(*o_shape[:-1], 1) + + +def linspace(start: torch.Tensor, stop: torch.Tensor, num: int): + """ + Creates a tensor of shape [num, *start.shape] whose values are evenly spaced from start to end, inclusive. + Replicates but the multi-dimensional bahaviour of numpy.linspace in PyTorch. 
+ """ + # create a tensor of 'num' steps from 0 to 1 + steps = torch.arange(num, dtype=torch.float32, device=start.device) / (num - 1) + + # reshape the 'steps' tensor to [-1, *([1]*start.ndim)] to allow for broadcastings + # - using 'steps.reshape([-1, *([1]*start.ndim)])' would be nice here but torchscript + # "cannot statically infer the expected size of a list in this contex", hence the code below + for i in range(start.ndim): + steps = steps.unsqueeze(-1) + + # the output starts at 'start' and increments until 'stop' in each dimension + out = start[None] + steps * (stop - start)[None] + + return out diff --git a/eg3d/training/volumetric_rendering/ray_marcher.py b/eg3d/training/volumetric_rendering/ray_marcher.py new file mode 100644 index 0000000000000000000000000000000000000000..c2c427f7499adf3d2a456d2a1f2d2724daa04621 --- /dev/null +++ b/eg3d/training/volumetric_rendering/ray_marcher.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +""" +The ray marcher takes the raw output of the implicit representation and uses the volume rendering equation to produce composited colors and depths. +Based off of the implementation in MipNeRF (this one doesn't do any cone tracing though!) +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MipRayMarcher2(nn.Module): + def __init__(self): + super().__init__() + + + def run_forward(self, colors, densities, depths, rendering_options): + deltas = depths[:, :, 1:] - depths[:, :, :-1] + colors_mid = (colors[:, :, :-1] + colors[:, :, 1:]) / 2 + densities_mid = (densities[:, :, :-1] + densities[:, :, 1:]) / 2 + depths_mid = (depths[:, :, :-1] + depths[:, :, 1:]) / 2 + + + if rendering_options['clamp_mode'] == 'softplus': + densities_mid = F.softplus(densities_mid - 1) # activation bias of -1 makes things initialize better + else: + assert False, "MipRayMarcher only supports `clamp_mode`=`softplus`!" 
+ + density_delta = densities_mid * deltas + + alpha = 1 - torch.exp(-density_delta) + + alpha_shifted = torch.cat([torch.ones_like(alpha[:, :, :1]), 1-alpha + 1e-10], -2) + weights = alpha * torch.cumprod(alpha_shifted, -2)[:, :, :-1] + + composite_rgb = torch.sum(weights * colors_mid, -2) + weight_total = weights.sum(2) + composite_depth = torch.sum(weights * depths_mid, -2) / weight_total + + # clip the composite to min/max range of depths + composite_depth = torch.nan_to_num(composite_depth, float('inf')) + composite_depth = torch.clamp(composite_depth, torch.min(depths), torch.max(depths)) + + if rendering_options.get('white_back', False): + composite_rgb = composite_rgb + 1 - weight_total + + composite_rgb = composite_rgb * 2 - 1 # Scale to (-1, 1) + + return composite_rgb, composite_depth, weights + + + def forward(self, colors, densities, depths, rendering_options): + composite_rgb, composite_depth, weights = self.run_forward(colors, densities, depths, rendering_options) + + return composite_rgb, composite_depth, weights \ No newline at end of file diff --git a/eg3d/training/volumetric_rendering/ray_sampler.py b/eg3d/training/volumetric_rendering/ray_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..00dd07b908497bd07bbe0e394d9eac38acce2b50 --- /dev/null +++ b/eg3d/training/volumetric_rendering/ray_sampler.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +""" +The ray sampler is a module that takes in camera matrices and resolution and batches of rays. +Expects cam2world matrices that use the OpenCV camera coordinate system conventions. +""" + +import torch + +class RaySampler(torch.nn.Module): + def __init__(self): + super().__init__() + self.ray_origins_h, self.ray_directions, self.depths, self.image_coords, self.rendering_options = None, None, None, None, None + + + def forward(self, cam2world_matrix, intrinsics, resolution): + """ + Create batches of rays and return origins and directions. 
+ + cam2world_matrix: (N, 4, 4) + intrinsics: (N, 3, 3) + resolution: int + + ray_origins: (N, M, 3) + ray_dirs: (N, M, 2) + """ + N, M = cam2world_matrix.shape[0], resolution**2 + cam_locs_world = cam2world_matrix[:, :3, 3] + fx = intrinsics[:, 0, 0] + fy = intrinsics[:, 1, 1] + cx = intrinsics[:, 0, 2] + cy = intrinsics[:, 1, 2] + sk = intrinsics[:, 0, 1] + + uv = torch.stack(torch.meshgrid(torch.arange(resolution, dtype=torch.float32, device=cam2world_matrix.device), torch.arange(resolution, dtype=torch.float32, device=cam2world_matrix.device), indexing='ij')) * (1./resolution) + (0.5/resolution) + uv = uv.flip(0).reshape(2, -1).transpose(1, 0) + uv = uv.unsqueeze(0).repeat(cam2world_matrix.shape[0], 1, 1) + + x_cam = uv[:, :, 0].view(N, -1) + y_cam = uv[:, :, 1].view(N, -1) + z_cam = torch.ones((N, M), device=cam2world_matrix.device) + + x_lift = (x_cam - cx.unsqueeze(-1) + cy.unsqueeze(-1)*sk.unsqueeze(-1)/fy.unsqueeze(-1) - sk.unsqueeze(-1)*y_cam/fy.unsqueeze(-1)) / fx.unsqueeze(-1) * z_cam + y_lift = (y_cam - cy.unsqueeze(-1)) / fy.unsqueeze(-1) * z_cam + + cam_rel_points = torch.stack((x_lift, y_lift, z_cam, torch.ones_like(z_cam)), dim=-1) + + world_rel_points = torch.bmm(cam2world_matrix, cam_rel_points.permute(0, 2, 1)).permute(0, 2, 1)[:, :, :3] + + ray_dirs = world_rel_points - cam_locs_world[:, None, :] + ray_dirs = torch.nn.functional.normalize(ray_dirs, dim=2) + + ray_origins = cam_locs_world.unsqueeze(1).repeat(1, ray_dirs.shape[1], 1) + + return ray_origins, ray_dirs \ No newline at end of file diff --git a/eg3d/training/volumetric_rendering/renderer.py b/eg3d/training/volumetric_rendering/renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..a27aea61be0cc91a8cab14082252ca203b772d2d --- /dev/null +++ b/eg3d/training/volumetric_rendering/renderer.py @@ -0,0 +1,253 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +""" +The renderer is a module that takes in rays, decides where to sample along each +ray, and computes pixel colors using the volume rendering equation. +""" + +import math +import torch +import torch.nn as nn + +from training.volumetric_rendering.ray_marcher import MipRayMarcher2 +from training.volumetric_rendering import math_utils + +def generate_planes(): + """ + Defines planes by the three vectors that form the "axes" of the + plane. Should work with arbitrary number of planes and planes of + arbitrary orientation. + """ + return torch.tensor([[[1, 0, 0], + [0, 1, 0], + [0, 0, 1]], + [[1, 0, 0], + [0, 0, 1], + [0, 1, 0]], + [[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]], dtype=torch.float32) + +def project_onto_planes(planes, coordinates): + """ + Does a projection of a 3D point onto a batch of 2D planes, + returning 2D plane coordinates. 
+ + Takes plane axes of shape n_planes, 3, 3 + # Takes coordinates of shape N, M, 3 + # returns projections of shape N*n_planes, M, 2 + """ + N, M, C = coordinates.shape + n_planes, _, _ = planes.shape + coordinates = coordinates.unsqueeze(1).expand(-1, n_planes, -1, -1).reshape(N*n_planes, M, 3) + inv_planes = torch.linalg.inv(planes).unsqueeze(0).expand(N, -1, -1, -1).reshape(N*n_planes, 3, 3) + projections = torch.bmm(coordinates, inv_planes) + return projections[..., :2] + +def sample_from_planes(plane_axes, plane_features, coordinates, mode='bilinear', padding_mode='zeros', box_warp=None): + assert padding_mode == 'zeros' + N, n_planes, C, H, W = plane_features.shape + _, M, _ = coordinates.shape + plane_features = plane_features.view(N*n_planes, C, H, W) + + coordinates = (2/box_warp) * coordinates # TODO: add specific box bounds + + projected_coordinates = project_onto_planes(plane_axes, coordinates).unsqueeze(1) + output_features = torch.nn.functional.grid_sample(plane_features, projected_coordinates.float(), mode=mode, padding_mode=padding_mode, align_corners=False).permute(0, 3, 2, 1).reshape(N, n_planes, M, C) + return output_features + +def sample_from_3dgrid(grid, coordinates): + """ + Expects coordinates in shape (batch_size, num_points_per_batch, 3) + Expects grid in shape (1, channels, H, W, D) + (Also works if grid has batch size) + Returns sampled features of shape (batch_size, num_points_per_batch, feature_channels) + """ + batch_size, n_coords, n_dims = coordinates.shape + sampled_features = torch.nn.functional.grid_sample(grid.expand(batch_size, -1, -1, -1, -1), + coordinates.reshape(batch_size, 1, 1, -1, n_dims), + mode='bilinear', padding_mode='zeros', align_corners=False) + N, C, H, W, D = sampled_features.shape + sampled_features = sampled_features.permute(0, 4, 3, 2, 1).reshape(N, H*W*D, C) + return sampled_features + +class ImportanceRenderer(torch.nn.Module): + def __init__(self): + super().__init__() + self.ray_marcher = MipRayMarcher2() + self.plane_axes = generate_planes() + + def forward(self, planes, decoder, ray_origins, ray_directions, rendering_options): + self.plane_axes = self.plane_axes.to(ray_origins.device) + + if rendering_options['ray_start'] == rendering_options['ray_end'] == 'auto': + ray_start, ray_end = math_utils.get_ray_limits_box(ray_origins, ray_directions, box_side_length=rendering_options['box_warp']) + is_ray_valid = ray_end > ray_start + if torch.any(is_ray_valid).item(): + ray_start[~is_ray_valid] = ray_start[is_ray_valid].min() + ray_end[~is_ray_valid] = ray_start[is_ray_valid].max() + depths_coarse = self.sample_stratified(ray_origins, ray_start, ray_end, rendering_options['depth_resolution'], rendering_options['disparity_space_sampling']) + else: + # Create stratified depth samples + depths_coarse = self.sample_stratified(ray_origins, rendering_options['ray_start'], rendering_options['ray_end'], rendering_options['depth_resolution'], rendering_options['disparity_space_sampling']) + + batch_size, num_rays, samples_per_ray, _ = depths_coarse.shape + + # Coarse Pass + sample_coordinates = (ray_origins.unsqueeze(-2) + depths_coarse * ray_directions.unsqueeze(-2)).reshape(batch_size, -1, 3) + sample_directions = ray_directions.unsqueeze(-2).expand(-1, -1, samples_per_ray, -1).reshape(batch_size, -1, 3) + + + out = self.run_model(planes, decoder, sample_coordinates, sample_directions, rendering_options) + colors_coarse = out['rgb'] + densities_coarse = out['sigma'] + colors_coarse = colors_coarse.reshape(batch_size, num_rays, 
samples_per_ray, colors_coarse.shape[-1]) + densities_coarse = densities_coarse.reshape(batch_size, num_rays, samples_per_ray, 1) + + # Fine Pass + N_importance = rendering_options['depth_resolution_importance'] + if N_importance > 0: + _, _, weights = self.ray_marcher(colors_coarse, densities_coarse, depths_coarse, rendering_options) + + depths_fine = self.sample_importance(depths_coarse, weights, N_importance) + + sample_directions = ray_directions.unsqueeze(-2).expand(-1, -1, N_importance, -1).reshape(batch_size, -1, 3) + sample_coordinates = (ray_origins.unsqueeze(-2) + depths_fine * ray_directions.unsqueeze(-2)).reshape(batch_size, -1, 3) + + out = self.run_model(planes, decoder, sample_coordinates, sample_directions, rendering_options) + colors_fine = out['rgb'] + densities_fine = out['sigma'] + colors_fine = colors_fine.reshape(batch_size, num_rays, N_importance, colors_fine.shape[-1]) + densities_fine = densities_fine.reshape(batch_size, num_rays, N_importance, 1) + + all_depths, all_colors, all_densities = self.unify_samples(depths_coarse, colors_coarse, densities_coarse, + depths_fine, colors_fine, densities_fine) + + # Aggregate + rgb_final, depth_final, weights = self.ray_marcher(all_colors, all_densities, all_depths, rendering_options) + else: + rgb_final, depth_final, weights = self.ray_marcher(colors_coarse, densities_coarse, depths_coarse, rendering_options) + + + return rgb_final, depth_final, weights.sum(2) + + def run_model(self, planes, decoder, sample_coordinates, sample_directions, options): + sampled_features = sample_from_planes(self.plane_axes, planes, sample_coordinates, padding_mode='zeros', box_warp=options['box_warp']) + + out = decoder(sampled_features, sample_directions) + if options.get('density_noise', 0) > 0: + out['sigma'] += torch.randn_like(out['sigma']) * options['density_noise'] + return out + + def sort_samples(self, all_depths, all_colors, all_densities): + _, indices = torch.sort(all_depths, dim=-2) + all_depths = torch.gather(all_depths, -2, indices) + all_colors = torch.gather(all_colors, -2, indices.expand(-1, -1, -1, all_colors.shape[-1])) + all_densities = torch.gather(all_densities, -2, indices.expand(-1, -1, -1, 1)) + return all_depths, all_colors, all_densities + + def unify_samples(self, depths1, colors1, densities1, depths2, colors2, densities2): + all_depths = torch.cat([depths1, depths2], dim = -2) + all_colors = torch.cat([colors1, colors2], dim = -2) + all_densities = torch.cat([densities1, densities2], dim = -2) + + _, indices = torch.sort(all_depths, dim=-2) + all_depths = torch.gather(all_depths, -2, indices) + all_colors = torch.gather(all_colors, -2, indices.expand(-1, -1, -1, all_colors.shape[-1])) + all_densities = torch.gather(all_densities, -2, indices.expand(-1, -1, -1, 1)) + + return all_depths, all_colors, all_densities + + def sample_stratified(self, ray_origins, ray_start, ray_end, depth_resolution, disparity_space_sampling=False): + """ + Return depths of approximately uniformly spaced samples along rays. + """ + N, M, _ = ray_origins.shape + if disparity_space_sampling: + depths_coarse = torch.linspace(0, + 1, + depth_resolution, + device=ray_origins.device).reshape(1, 1, depth_resolution, 1).repeat(N, M, 1, 1) + depth_delta = 1/(depth_resolution - 1) + depths_coarse += torch.rand_like(depths_coarse) * depth_delta + depths_coarse = 1./(1./ray_start * (1. 
- depths_coarse) + 1./ray_end * depths_coarse) + else: + if type(ray_start) == torch.Tensor: + depths_coarse = math_utils.linspace(ray_start, ray_end, depth_resolution).permute(1,2,0,3) + depth_delta = (ray_end - ray_start) / (depth_resolution - 1) + depths_coarse += torch.rand_like(depths_coarse) * depth_delta[..., None] + else: + depths_coarse = torch.linspace(ray_start, ray_end, depth_resolution, device=ray_origins.device).reshape(1, 1, depth_resolution, 1).repeat(N, M, 1, 1) + depth_delta = (ray_end - ray_start)/(depth_resolution - 1) + depths_coarse += torch.rand_like(depths_coarse) * depth_delta + + return depths_coarse + + def sample_importance(self, z_vals, weights, N_importance): + """ + Return depths of importance sampled points along rays. See NeRF importance sampling for more. + """ + with torch.no_grad(): + batch_size, num_rays, samples_per_ray, _ = z_vals.shape + + z_vals = z_vals.reshape(batch_size * num_rays, samples_per_ray) + weights = weights.reshape(batch_size * num_rays, -1) # -1 to account for loss of 1 sample in MipRayMarcher + + # smooth weights + weights = torch.nn.functional.max_pool1d(weights.unsqueeze(1).float(), 2, 1, padding=1) + weights = torch.nn.functional.avg_pool1d(weights, 2, 1).squeeze() + weights = weights + 0.01 + + z_vals_mid = 0.5 * (z_vals[: ,:-1] + z_vals[: ,1:]) + importance_z_vals = self.sample_pdf(z_vals_mid, weights[:, 1:-1], + N_importance).detach().reshape(batch_size, num_rays, N_importance, 1) + return importance_z_vals + + def sample_pdf(self, bins, weights, N_importance, det=False, eps=1e-5): + """ + Sample @N_importance samples from @bins with distribution defined by @weights. + Inputs: + bins: (N_rays, N_samples_+1) where N_samples_ is "the number of coarse samples per ray - 2" + weights: (N_rays, N_samples_) + N_importance: the number of samples to draw from the distribution + det: deterministic or not + eps: a small number to prevent division by zero + Outputs: + samples: the sampled samples + """ + N_rays, N_samples_ = weights.shape + weights = weights + eps # prevent division by zero (don't do inplace op!) + pdf = weights / torch.sum(weights, -1, keepdim=True) # (N_rays, N_samples_) + cdf = torch.cumsum(pdf, -1) # (N_rays, N_samples), cumulative distribution function + cdf = torch.cat([torch.zeros_like(cdf[: ,:1]), cdf], -1) # (N_rays, N_samples_+1) + # padded to 0~1 inclusive + + if det: + u = torch.linspace(0, 1, N_importance, device=bins.device) + u = u.expand(N_rays, N_importance) + else: + u = torch.rand(N_rays, N_importance, device=bins.device) + u = u.contiguous() + + inds = torch.searchsorted(cdf, u, right=True) + below = torch.clamp_min(inds-1, 0) + above = torch.clamp_max(inds, N_samples_) + + inds_sampled = torch.stack([below, above], -1).view(N_rays, 2*N_importance) + cdf_g = torch.gather(cdf, 1, inds_sampled).view(N_rays, N_importance, 2) + bins_g = torch.gather(bins, 1, inds_sampled).view(N_rays, N_importance, 2) + + denom = cdf_g[...,1]-cdf_g[...,0] + denom[denom 0: + self._defer_rendering -= 1 + elif self.args.pkl is not None: + self._async_renderer.set_args(**self.args) + result = self._async_renderer.get_result() + if result is not None: + self.result = result + + # Display. 
+ max_w = self.content_width - self.pane_w + max_h = self.content_height + pos = np.array([self.pane_w + max_w / 2, max_h / 2]) + if 'image' in self.result: + if self._tex_img is not self.result.image: + self._tex_img = self.result.image + if self._tex_obj is None or not self._tex_obj.is_compatible(image=self._tex_img): + self._tex_obj = gl_utils.Texture(image=self._tex_img, bilinear=False, mipmap=False) + else: + self._tex_obj.update(self._tex_img) + zoom = min(max_w / self._tex_obj.width, max_h / self._tex_obj.height) + # print(zoom) + zoom = np.floor(zoom) if zoom >= 1 else zoom + # zoom = 1 + self._tex_obj.draw(pos=pos, zoom=zoom, align=0.5, rint=True) + if 'error' in self.result: + self.print_error(self.result.error) + if 'message' not in self.result: + self.result.message = str(self.result.error) + if 'message' in self.result: + tex = text_utils.get_texture(self.result.message, size=self.font_size, max_width=max_w, max_height=max_h, outline=2) + tex.draw(pos=pos, align=0.5, rint=True, color=1) + + # End frame. + self._adjust_font_size() + imgui.end() + self.end_frame() + +#---------------------------------------------------------------------------- + +class AsyncRenderer: + def __init__(self): + self._closed = False + self._is_async = False + self._cur_args = None + self._cur_result = None + self._cur_stamp = 0 + self._renderer_obj = None + self._args_queue = None + self._result_queue = None + self._process = None + + def close(self): + self._closed = True + self._renderer_obj = None + if self._process is not None: + self._process.terminate() + self._process = None + self._args_queue = None + self._result_queue = None + + @property + def is_async(self): + return self._is_async + + def set_async(self, is_async): + self._is_async = is_async + + def set_args(self, **args): + assert not self._closed + if args != self._cur_args: + if self._is_async: + self._set_args_async(**args) + else: + self._set_args_sync(**args) + self._cur_args = args + + def _set_args_async(self, **args): + if self._process is None: + self._args_queue = multiprocessing.Queue() + self._result_queue = multiprocessing.Queue() + try: + multiprocessing.set_start_method('spawn') + except RuntimeError: + pass + self._process = multiprocessing.Process(target=self._process_fn, args=(self._args_queue, self._result_queue), daemon=True) + self._process.start() + self._args_queue.put([args, self._cur_stamp]) + + def _set_args_sync(self, **args): + if self._renderer_obj is None: + self._renderer_obj = renderer.Renderer() + self._cur_result = self._renderer_obj.render(**args) + + def get_result(self): + assert not self._closed + if self._result_queue is not None: + while self._result_queue.qsize() > 0: + result, stamp = self._result_queue.get() + if stamp == self._cur_stamp: + self._cur_result = result + return self._cur_result + + def clear_result(self): + assert not self._closed + self._cur_args = None + self._cur_result = None + self._cur_stamp += 1 + + @staticmethod + def _process_fn(args_queue, result_queue): + renderer_obj = renderer.Renderer() + cur_args = None + cur_stamp = None + while True: + args, stamp = args_queue.get() + while args_queue.qsize() > 0: + args, stamp = args_queue.get() + if args != cur_args or stamp != cur_stamp: + result = renderer_obj.render(**args) + if 'error' in result: + result.error = renderer.CapturedException(result.error) + result_queue.put([result, stamp]) + cur_args = args + cur_stamp = stamp + +#---------------------------------------------------------------------------- + 
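`AsyncRenderer` above keeps the GUI responsive by rendering in a spawned child process: requests go into a queue, the worker drains the queue so only the newest arguments are rendered, and each request carries a stamp so results that arrive after `clear_result()` are discarded. The sketch below distills that pattern; the worker body is a placeholder, not the repo's `renderer.Renderer`.

```python
# Illustrative sketch of the AsyncRenderer pattern: a spawned worker process,
# an args queue drained so only the most recent request is rendered, and a
# "stamp" so stale results can be ignored.
import multiprocessing as mp

def _worker(args_queue: mp.Queue, result_queue: mp.Queue) -> None:
    while True:
        args, stamp = args_queue.get()
        while not args_queue.empty():        # skip ahead to the newest request
            args, stamp = args_queue.get()
        result = {"echo": args}              # placeholder for an expensive render
        result_queue.put((result, stamp))

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)  # same start method as the GUI code
    args_q, result_q = mp.Queue(), mp.Queue()
    proc = mp.Process(target=_worker, args=(args_q, result_q), daemon=True)
    proc.start()

    cur_stamp = 0
    args_q.put(({"seed": 1}, cur_stamp))
    result, stamp = result_q.get()
    if stamp == cur_stamp:                    # only accept results for the current stamp
        print(result)
    proc.terminate()
```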
+@click.command() +@click.argument('pkls', metavar='PATH', nargs=-1) +@click.option('--capture-dir', help='Where to save screenshot captures', metavar='PATH', default=None) +@click.option('--browse-dir', help='Specify model path for the \'Browse...\' button', metavar='PATH') +def main( + pkls, + capture_dir, + browse_dir +): + """Interactive model visualizer. + + Optional PATH argument can be used specify which .pkl file to load. + """ + viz = Visualizer(capture_dir=capture_dir) + + if browse_dir is not None: + viz.pickle_widget.search_dirs = [browse_dir] + + # List pickles. + pretrained = [ + 'https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/ffhq512-128.pkl', + 'https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/afhqcats512-128.pkl', + 'https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/ffhqrebalanced512-64.pkl', + 'https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/ffhqrebalanced512-128.pkl', + 'https://api.ngc.nvidia.com/v2/models/nvidia/research/eg3d/versions/1/files/shapenetcars128-64.pkl', + ] + + # Populate recent pickles list with pretrained model URLs. + for url in pretrained: + viz.add_recent_pickle(url) + + # Run. + while not viz.should_close(): + viz.draw_frame() + viz.close() + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/__init__.py b/eg3d/viz/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfebd04f47e6f6b1b44984c14c23b57d56f72240 --- /dev/null +++ b/eg3d/viz/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# empty diff --git a/eg3d/viz/backbone_cache_widget.py b/eg3d/viz/backbone_cache_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..71f3fb444bf48fa948f15054fc8b3aac73b3e1a5 --- /dev/null +++ b/eg3d/viz/backbone_cache_widget.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
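The widget files that follow all share one contract with the renderer: each widget writes plain values into `viz.args`, and expensive intermediate state is reused when only the camera changes. The toy sketch below mirrors the plane-caching logic of `TriPlaneGenerator.synthesis` (`cache_backbone`, `use_cached_backbone`, `_last_planes`), which the `BackboneCacheWidget` defined next toggles via `viz.args.do_backbone_caching`; `ToyGenerator` is a stand-in introduced here, not code from the patch.

```python
# Toy illustration of the plane-caching contract in TriPlaneGenerator.synthesis:
# when use_cached_backbone is set and planes are cached, the StyleGAN backbone is
# skipped and the stored tri-planes are reused while only the camera moves.
class ToyGenerator:
    def __init__(self):
        self._last_planes = None
        self.backbone_runs = 0

    def _run_backbone(self, ws):
        self.backbone_runs += 1
        return f"planes({ws})"            # stand-in for the 3 x 32-channel planes

    def synthesis(self, ws, cache_backbone=False, use_cached_backbone=False):
        if use_cached_backbone and self._last_planes is not None:
            planes = self._last_planes    # reuse: only the viewpoint changed
        else:
            planes = self._run_backbone(ws)
            if cache_backbone:
                self._last_planes = planes
        return planes

g = ToyGenerator()
g.synthesis("w0", cache_backbone=True)          # first call runs the backbone
g.synthesis("w0", use_cached_backbone=True)     # reuse while orbiting the camera
print(g.backbone_runs)  # 1
```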
+ +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class BackboneCacheWidget: + def __init__(self, viz): + self.viz = viz + self.cache_backbone = True + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + + if show: + imgui.text('Cache Backbone') + imgui.same_line(viz.label_w + viz.spacing * 4) + _clicked, self.cache_backbone = imgui.checkbox('##backbonecache', self.cache_backbone) + imgui.same_line(viz.label_w + viz.spacing * 10) + imgui.text('Note that when enabled, you may be unable to view intermediate backbone weights below') + + viz.args.do_backbone_caching = self.cache_backbone + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/capture_widget.py b/eg3d/viz/capture_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..70f214ffae20209795cfb32148a88f4e09091fad --- /dev/null +++ b/eg3d/viz/capture_widget.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import os +import re +import numpy as np +import imgui +import PIL.Image +from gui_utils import imgui_utils +from . import renderer + +#---------------------------------------------------------------------------- + +class CaptureWidget: + def __init__(self, viz): + self.viz = viz + self.path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '_screenshots')) + self.dump_image = False + self.dump_gui = False + self.defer_frames = 0 + self.disabled_time = 0 + + def dump_png(self, image): + viz = self.viz + try: + _height, _width, channels = image.shape + assert channels in [1, 3] + assert image.dtype == np.uint8 + os.makedirs(self.path, exist_ok=True) + file_id = 0 + for entry in os.scandir(self.path): + if entry.is_file(): + match = re.fullmatch(r'(\d+).*', entry.name) + if match: + file_id = max(file_id, int(match.group(1)) + 1) + if channels == 1: + pil_image = PIL.Image.fromarray(image[:, :, 0], 'L') + else: + pil_image = PIL.Image.fromarray(image, 'RGB') + pil_image.save(os.path.join(self.path, f'{file_id:05d}.png')) + except: + viz.result.error = renderer.CapturedException() + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + if show: + with imgui_utils.grayed_out(self.disabled_time != 0): + imgui.text('Capture') + imgui.same_line(viz.label_w) + _changed, self.path = imgui_utils.input_text('##path', self.path, 1024, + flags=(imgui.INPUT_TEXT_AUTO_SELECT_ALL | imgui.INPUT_TEXT_ENTER_RETURNS_TRUE), + width=(-1 - viz.button_w * 2 - viz.spacing * 2), + help_text='PATH') + if imgui.is_item_hovered() and not imgui.is_item_active() and self.path != '': + imgui.set_tooltip(self.path) + imgui.same_line() + if imgui_utils.button('Save image', width=viz.button_w, enabled=(self.disabled_time == 0 and 'image' in viz.result)): + self.dump_image = True + self.defer_frames = 2 + self.disabled_time = 0.5 + imgui.same_line() + if imgui_utils.button('Save GUI', width=-1, 
enabled=(self.disabled_time == 0)): + self.dump_gui = True + self.defer_frames = 2 + self.disabled_time = 0.5 + + self.disabled_time = max(self.disabled_time - viz.frame_delta, 0) + if self.defer_frames > 0: + self.defer_frames -= 1 + elif self.dump_image: + if 'image' in viz.result: + self.dump_png(viz.result.image) + self.dump_image = False + elif self.dump_gui: + viz.capture_next_frame() + self.dump_gui = False + captured_frame = viz.pop_captured_frame() + if captured_frame is not None: + self.dump_png(captured_frame) + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/conditioning_pose_widget.py b/eg3d/viz/conditioning_pose_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..875490104b7e292c01625eb83404aed26f7b70a1 --- /dev/null +++ b/eg3d/viz/conditioning_pose_widget.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import numpy as np +import imgui +import dnnlib +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class ConditioningPoseWidget: + def __init__(self, viz): + self.viz = viz + self.pose = dnnlib.EasyDict(yaw=0, pitch=0, anim=False, speed=0.25) + self.pose_def = dnnlib.EasyDict(self.pose) + + def drag(self, dx, dy): + viz = self.viz + self.pose.yaw += -dx / viz.font_size * 3e-2 + self.pose.pitch += -dy / viz.font_size * 3e-2 + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + if show: + imgui.text('Cond Pose') + imgui.same_line(viz.label_w) + yaw = self.pose.yaw + pitch = self.pose.pitch + with imgui_utils.item_width(viz.font_size * 5): + changed, (new_yaw, new_pitch) = imgui.input_float2('##frac', yaw, pitch, format='%+.2f', flags=imgui.INPUT_TEXT_ENTER_RETURNS_TRUE) + if changed: + self.pose.yaw = new_yaw + self.pose.pitch = new_pitch + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.spacing * 2) + _clicked, dragging, dx, dy = imgui_utils.drag_button('Drag', width=viz.button_w) + if dragging: + self.drag(dx, dy) + imgui.same_line() + snapped = dnnlib.EasyDict(self.pose, yaw=round(self.pose.yaw, 1), pitch=round(self.pose.pitch, 1)) + if imgui_utils.button('Snap', width=viz.button_w, enabled=(self.pose != snapped)): + self.pose = snapped + imgui.same_line() + if imgui_utils.button('Reset', width=-1, enabled=(self.pose != self.pose_def)): + self.pose = dnnlib.EasyDict(self.pose_def) + + viz.args.conditioning_yaw = self.pose.yaw + viz.args.conditioning_pitch = self.pose.pitch + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/latent_widget.py b/eg3d/viz/latent_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..30ce50c4dd37125934152d9db57d88e36c845f5b --- /dev/null +++ b/eg3d/viz/latent_widget.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import numpy as np +import imgui +import dnnlib +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class LatentWidget: + def __init__(self, viz): + self.viz = viz + self.latent = dnnlib.EasyDict(x=1, y=0, anim=False, speed=0.25) + self.latent_def = dnnlib.EasyDict(self.latent) + self.step_y = 100 + + def drag(self, dx, dy): + viz = self.viz + self.latent.x += dx / viz.font_size * 4e-2 + self.latent.y += dy / viz.font_size * 4e-2 + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + if show: + imgui.text('Latent') + imgui.same_line(viz.label_w) + seed = round(self.latent.x) + round(self.latent.y) * self.step_y + with imgui_utils.item_width(viz.font_size * 8): + changed, seed = imgui.input_int('##seed', seed, step=0) + if changed: + self.latent.x = seed + self.latent.y = 0 + imgui.same_line(viz.label_w + viz.font_size * 8 + viz.spacing) + frac_x = self.latent.x - round(self.latent.x) + frac_y = self.latent.y - round(self.latent.y) + with imgui_utils.item_width(viz.font_size * 5): + changed, (new_frac_x, new_frac_y) = imgui.input_float2('##frac', frac_x, frac_y, format='%+.2f', flags=imgui.INPUT_TEXT_ENTER_RETURNS_TRUE) + if changed: + self.latent.x += new_frac_x - frac_x + self.latent.y += new_frac_y - frac_y + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.spacing * 2) + _clicked, dragging, dx, dy = imgui_utils.drag_button('Drag', width=viz.button_w) + if dragging: + self.drag(dx, dy) + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.button_w + viz.spacing * 3) + _clicked, self.latent.anim = imgui.checkbox('Anim', self.latent.anim) + imgui.same_line(round(viz.font_size * 28.7)) + with imgui_utils.item_width(-2 - viz.button_w * 2 - viz.spacing * 2), imgui_utils.grayed_out(not self.latent.anim): + changed, speed = imgui.slider_float('##speed', self.latent.speed, -5, 5, format='Speed %.3f', power=3) + if changed: + self.latent.speed = speed + imgui.same_line() + snapped = dnnlib.EasyDict(self.latent, x=round(self.latent.x), y=round(self.latent.y)) + if imgui_utils.button('Snap', width=viz.button_w, enabled=(self.latent != snapped)): + self.latent = snapped + imgui.same_line() + if imgui_utils.button('Reset', width=-1, enabled=(self.latent != self.latent_def)): + self.latent = dnnlib.EasyDict(self.latent_def) + + if self.latent.anim: + self.latent.x += viz.frame_delta * self.latent.speed + viz.args.w0_seeds = [] # [[seed, weight], ...] 
+ for ofs_x, ofs_y in [[0, 0], [1, 0], [0, 1], [1, 1]]: + seed_x = np.floor(self.latent.x) + ofs_x + seed_y = np.floor(self.latent.y) + ofs_y + seed = (int(seed_x) + int(seed_y) * self.step_y) & ((1 << 32) - 1) + weight = (1 - abs(self.latent.x - seed_x)) * (1 - abs(self.latent.y - seed_y)) + if weight > 0: + viz.args.w0_seeds.append([seed, weight]) + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/layer_widget.py b/eg3d/viz/layer_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..6da25858046af66acbf8521a441d9787a9869137 --- /dev/null +++ b/eg3d/viz/layer_widget.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class LayerWidget: + def __init__(self, viz): + self.viz = viz + self.prev_layers = None + self.cur_layer = None + self.sel_channels = 3 + self.base_channel = 0 + self.img_scale_db = 0 + self.img_normalize = False + self.fft_show = False + self.fft_all = True + self.fft_range_db = 50 + self.fft_beta = 8 + self.refocus = False + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + layers = viz.result.get('layers', []) + if self.prev_layers != layers: + self.prev_layers = layers + self.refocus = True + layer = ([layer for layer in layers if layer.name == self.cur_layer] + [None])[0] + if layer is None and len(layers) > 0: + layer = layers[-1] + self.cur_layer = layer.name + num_channels = layer.shape[1] if layer is not None else 0 + base_channel_max = max(num_channels - self.sel_channels, 0) + + if show: + bg_color = [0.16, 0.29, 0.48, 0.2] + dim_color = list(imgui.get_style().colors[imgui.COLOR_TEXT]) + dim_color[-1] *= 0.5 + + # Begin list. + width = viz.font_size * 28 + height = imgui.get_text_line_height_with_spacing() * 12 + viz.spacing + imgui.push_style_var(imgui.STYLE_FRAME_PADDING, [0, 0]) + imgui.push_style_color(imgui.COLOR_CHILD_BACKGROUND, *bg_color) + imgui.push_style_color(imgui.COLOR_HEADER, 0, 0, 0, 0) + imgui.push_style_color(imgui.COLOR_HEADER_HOVERED, 0.16, 0.29, 0.48, 0.5) + imgui.push_style_color(imgui.COLOR_HEADER_ACTIVE, 0.16, 0.29, 0.48, 0.9) + imgui.begin_child('##list', width=width, height=height, border=True, flags=imgui.WINDOW_ALWAYS_VERTICAL_SCROLLBAR) + + # List items. + for layer in layers: + selected = (self.cur_layer == layer.name) + _opened, selected = imgui.selectable(f'##{layer.name}_selectable', selected) + imgui.same_line(viz.spacing) + _clicked, selected = imgui.checkbox(f'{layer.name}##radio', selected) + if selected: + self.cur_layer = layer.name + if self.refocus: + imgui.set_scroll_here() + viz.skip_frame() # Focus will change on next frame. 
+ self.refocus = False + imgui.same_line(width - viz.font_size * 13) + imgui.text_colored('x'.join(str(x) for x in layer.shape[2:]), *dim_color) + imgui.same_line(width - viz.font_size * 8) + imgui.text_colored(str(layer.shape[1]), *dim_color) + imgui.same_line(width - viz.font_size * 5) + imgui.text_colored(layer.dtype, *dim_color) + + # End list. + if len(layers) == 0: + imgui.text_colored('No layers found', *dim_color) + imgui.end_child() + imgui.pop_style_color(4) + imgui.pop_style_var(1) + + # Begin options. + imgui.same_line() + imgui.begin_child('##options', width=-1, height=height, border=False) + + # RGB & normalize. + rgb = (self.sel_channels == 3) + _clicked, rgb = imgui.checkbox('RGB', rgb) + self.sel_channels = 3 if rgb else 1 + imgui.same_line(viz.font_size * 4) + _clicked, self.img_normalize = imgui.checkbox('Normalize', self.img_normalize) + imgui.same_line(imgui.get_content_region_max()[0] - 1 - viz.button_w) + if imgui_utils.button('Reset##img_flags', width=-1, enabled=(self.sel_channels != 3 or self.img_normalize)): + self.sel_channels = 3 + self.img_normalize = False + + # Image scale. + with imgui_utils.item_width(-1 - viz.button_w - viz.spacing): + _changed, self.img_scale_db = imgui.slider_float('##scale', self.img_scale_db, min_value=-40, max_value=40, format='Scale %+.1f dB') + imgui.same_line() + if imgui_utils.button('Reset##scale', width=-1, enabled=(self.img_scale_db != 0)): + self.img_scale_db = 0 + + # Base channel. + self.base_channel = min(max(self.base_channel, 0), base_channel_max) + narrow_w = imgui.get_text_line_height_with_spacing() + with imgui_utils.grayed_out(base_channel_max == 0): + with imgui_utils.item_width(-1 - viz.button_w - narrow_w * 2 - viz.spacing * 3): + _changed, self.base_channel = imgui.drag_int('##channel', self.base_channel, change_speed=0.05, min_value=0, max_value=base_channel_max, format=f'Channel %d/{num_channels}') + imgui.same_line() + if imgui_utils.button('-##channel', width=narrow_w): + self.base_channel -= 1 + imgui.same_line() + if imgui_utils.button('+##channel', width=narrow_w): + self.base_channel += 1 + imgui.same_line() + self.base_channel = min(max(self.base_channel, 0), base_channel_max) + if imgui_utils.button('Reset##channel', width=-1, enabled=(self.base_channel != 0 and base_channel_max > 0)): + self.base_channel = 0 + + # Stats. + stats = viz.result.get('stats', None) + stats = [f'{stats[idx]:g}' if stats is not None else 'N/A' for idx in range(6)] + rows = [ + ['Statistic', 'All channels', 'Selected'], + ['Mean', stats[0], stats[1]], + ['Std', stats[2], stats[3]], + ['Max', stats[4], stats[5]], + ] + height = imgui.get_text_line_height_with_spacing() * len(rows) + viz.spacing + imgui.push_style_color(imgui.COLOR_CHILD_BACKGROUND, *bg_color) + imgui.begin_child('##stats', width=-1, height=height, border=True) + for y, cols in enumerate(rows): + for x, col in enumerate(cols): + if x != 0: + imgui.same_line(viz.font_size * (4 + (x - 1) * 6)) + if x == 0 or y == 0: + imgui.text_colored(col, *dim_color) + else: + imgui.text(col) + imgui.end_child() + imgui.pop_style_color(1) + + # FFT & all. 
+ _clicked, self.fft_show = imgui.checkbox('FFT', self.fft_show) + imgui.same_line(viz.font_size * 4) + with imgui_utils.grayed_out(not self.fft_show or base_channel_max == 0): + _clicked, self.fft_all = imgui.checkbox('All channels', self.fft_all) + imgui.same_line(imgui.get_content_region_max()[0] - 1 - viz.button_w) + with imgui_utils.grayed_out(not self.fft_show): + if imgui_utils.button('Reset##fft_flags', width=-1, enabled=(self.fft_show or not self.fft_all)): + self.fft_show = False + self.fft_all = True + + # FFT range. + with imgui_utils.grayed_out(not self.fft_show): + with imgui_utils.item_width(-1 - viz.button_w - viz.spacing): + _changed, self.fft_range_db = imgui.slider_float('##fft_range_db', self.fft_range_db, min_value=0.1, max_value=100, format='Range +-%.1f dB') + imgui.same_line() + if imgui_utils.button('Reset##fft_range_db', width=-1, enabled=(self.fft_range_db != 50)): + self.fft_range_db = 50 + + # FFT beta. + with imgui_utils.grayed_out(not self.fft_show): + with imgui_utils.item_width(-1 - viz.button_w - viz.spacing): + _changed, self.fft_beta = imgui.slider_float('##fft_beta', self.fft_beta, min_value=0, max_value=50, format='Kaiser beta %.2f', power=2.63) + imgui.same_line() + if imgui_utils.button('Reset##fft_beta', width=-1, enabled=(self.fft_beta != 8)): + self.fft_beta = 8 + + # End options. + imgui.end_child() + + self.base_channel = min(max(self.base_channel, 0), base_channel_max) + viz.args.layer_name = self.cur_layer if len(layers) > 0 and self.cur_layer != layers[-1].name else None + viz.args.update(sel_channels=self.sel_channels, base_channel=self.base_channel, img_scale_db=self.img_scale_db, img_normalize=self.img_normalize) + viz.args.fft_show = self.fft_show + if self.fft_show: + viz.args.update(fft_all=self.fft_all, fft_range_db=self.fft_range_db, fft_beta=self.fft_beta) + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/performance_widget.py b/eg3d/viz/performance_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..deb208a741bf14dd57c70012fa23486902d31427 --- /dev/null +++ b/eg3d/viz/performance_widget.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
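performance_widget.py below keeps short rolling buffers of GUI and render frame times and reports their mean as milliseconds and FPS. A minimal standalone sketch of that reduction, using numpy and made-up frame times for illustration:

import numpy as np

gui_times = [float('nan')] * 57 + [0.016, 0.017, 0.018]   # seconds per frame; NaN until the buffer fills

valid = [x for x in gui_times if x > 0]                   # NaN comparisons are False, so unfilled slots drop out
t = np.mean(valid) if len(valid) > 0 else 0
print(f'{t*1e3:.1f} ms' if t > 0 else 'N/A')              # ~17.0 ms
print(f'{1/t:.1f} FPS' if t > 0 else 'N/A')               # ~58.8 FPS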
+ +import array +import numpy as np +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class PerformanceWidget: + def __init__(self, viz): + self.viz = viz + self.gui_times = [float('nan')] * 60 + self.render_times = [float('nan')] * 30 + self.fps_limit = 60 + self.use_vsync = False + self.is_async = False + self.force_fp32 = False + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + self.gui_times = self.gui_times[1:] + [viz.frame_delta] + if 'render_time' in viz.result: + self.render_times = self.render_times[1:] + [viz.result.render_time] + del viz.result.render_time + + if show: + imgui.text('GUI') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 8): + imgui.plot_lines('##gui_times', array.array('f', self.gui_times), scale_min=0) + imgui.same_line(viz.label_w + viz.font_size * 9) + t = [x for x in self.gui_times if x > 0] + t = np.mean(t) if len(t) > 0 else 0 + imgui.text(f'{t*1e3:.1f} ms' if t > 0 else 'N/A') + imgui.same_line(viz.label_w + viz.font_size * 14) + imgui.text(f'{1/t:.1f} FPS' if t > 0 else 'N/A') + imgui.same_line(viz.label_w + viz.font_size * 18 + viz.spacing * 3) + with imgui_utils.item_width(viz.font_size * 6): + _changed, self.fps_limit = imgui.input_int('FPS limit', self.fps_limit, flags=imgui.INPUT_TEXT_ENTER_RETURNS_TRUE) + self.fps_limit = min(max(self.fps_limit, 5), 1000) + imgui.same_line(imgui.get_content_region_max()[0] - 1 - viz.button_w * 2 - viz.spacing) + _clicked, self.use_vsync = imgui.checkbox('Vertical sync', self.use_vsync) + + if show: + imgui.text('Render') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 8): + imgui.plot_lines('##render_times', array.array('f', self.render_times), scale_min=0) + imgui.same_line(viz.label_w + viz.font_size * 9) + t = [x for x in self.render_times if x > 0] + t = np.mean(t) if len(t) > 0 else 0 + imgui.text(f'{t*1e3:.1f} ms' if t > 0 else 'N/A') + imgui.same_line(viz.label_w + viz.font_size * 14) + imgui.text(f'{1/t:.1f} FPS' if t > 0 else 'N/A') + imgui.same_line(viz.label_w + viz.font_size * 18 + viz.spacing * 3) + _clicked, self.is_async = imgui.checkbox('Separate process', self.is_async) + imgui.same_line(imgui.get_content_region_max()[0] - 1 - viz.button_w * 2 - viz.spacing) + _clicked, self.force_fp32 = imgui.checkbox('Force FP32', self.force_fp32) + + viz.set_fps_limit(self.fps_limit) + viz.set_vsync(self.use_vsync) + viz.set_async(self.is_async) + viz.args.force_fp32 = self.force_fp32 + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/pickle_widget.py b/eg3d/viz/pickle_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..e85a8592aa700b551204b92827c11902b1b54851 --- /dev/null +++ b/eg3d/viz/pickle_widget.py @@ -0,0 +1,172 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
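In pickle_widget.py below, resolve_pkl() accepts a URL, a .pkl path, or a training-run directory; for a directory it globs network-snapshot-*.pkl and keeps the lexicographically last match, i.e. the newest snapshot. A minimal sketch of that rule, with a hypothetical run directory for illustration:

import glob
import os

run_dir = 'training-runs/00000-ffhq'  # hypothetical run directory
pkls = sorted(glob.glob(os.path.join(run_dir, 'network-snapshot-*.pkl')))
latest = pkls[-1] if pkls else None   # zero-padded kimg counts make lexicographic order match numeric order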
+
+import glob
+import os
+import re
+
+import dnnlib
+import imgui
+import numpy as np
+from gui_utils import imgui_utils
+
+from . import renderer
+
+#----------------------------------------------------------------------------
+
+def _locate_results(pattern):
+    return pattern
+
+#----------------------------------------------------------------------------
+
+class PickleWidget:
+    def __init__(self, viz):
+        self.viz            = viz
+        self.search_dirs    = []
+        self.cur_pkl        = None
+        self.user_pkl       = ''
+        self.recent_pkls    = []
+        self.browse_cache   = dict() # {tuple(path, ...): [dnnlib.EasyDict(), ...], ...}
+        self.browse_refocus = False
+        self.load('', ignore_errors=True)
+
+    def add_recent(self, pkl, ignore_errors=False):
+        try:
+            resolved = self.resolve_pkl(pkl)
+            if resolved not in self.recent_pkls:
+                self.recent_pkls.append(resolved)
+        except:
+            if not ignore_errors:
+                raise
+
+    def load(self, pkl, ignore_errors=False):
+        viz = self.viz
+        viz.clear_result()
+        viz.skip_frame() # The input field will change on next frame.
+        try:
+            resolved = self.resolve_pkl(pkl)
+            name = resolved.replace('\\', '/').split('/')[-1]
+            self.cur_pkl = resolved
+            self.user_pkl = resolved
+            viz.result.message = f'Loading {name}...'
+            viz.defer_rendering()
+            if resolved in self.recent_pkls:
+                self.recent_pkls.remove(resolved)
+            self.recent_pkls.insert(0, resolved)
+        except:
+            self.cur_pkl = None
+            self.user_pkl = pkl
+            if pkl == '':
+                viz.result = dnnlib.EasyDict(message='No network pickle loaded')
+            else:
+                viz.result = dnnlib.EasyDict(error=renderer.CapturedException())
+            if not ignore_errors:
+                raise
+
+    @imgui_utils.scoped_by_object_id
+    def __call__(self, show=True):
+        viz = self.viz
+        recent_pkls = [pkl for pkl in self.recent_pkls if pkl != self.user_pkl]
+        if show:
+            imgui.text('Pickle')
+            imgui.same_line(viz.label_w)
+            changed, self.user_pkl = imgui_utils.input_text('##pkl', self.user_pkl, 1024,
+                flags=(imgui.INPUT_TEXT_AUTO_SELECT_ALL | imgui.INPUT_TEXT_ENTER_RETURNS_TRUE),
+                width=(-1 - viz.button_w * 2 - viz.spacing * 2),
+                help_text='<PATH> | <URL> | <RUN_DIR> | <RUN_ID> | <RUN_ID>/<KIMG>.pkl')
+            if changed:
+                self.load(self.user_pkl, ignore_errors=True)
+            if imgui.is_item_hovered() and not imgui.is_item_active() and self.user_pkl != '':
+                imgui.set_tooltip(self.user_pkl)
+            imgui.same_line()
+            if imgui_utils.button('Recent...', width=viz.button_w, enabled=(len(recent_pkls) != 0)):
+                imgui.open_popup('recent_pkls_popup')
+            imgui.same_line()
+            if imgui_utils.button('Browse...', enabled=len(self.search_dirs) > 0, width=-1):
+                imgui.open_popup('browse_pkls_popup')
+                self.browse_cache.clear()
+                self.browse_refocus = True
+
+        if imgui.begin_popup('recent_pkls_popup'):
+            for pkl in recent_pkls:
+                clicked, _state = imgui.menu_item(pkl)
+                if clicked:
+                    self.load(pkl, ignore_errors=True)
+            imgui.end_popup()
+
+        if imgui.begin_popup('browse_pkls_popup'):
+            def recurse(parents):
+                key = tuple(parents)
+                items = self.browse_cache.get(key, None)
+                if items is None:
+                    items = self.list_runs_and_pkls(parents)
+                    self.browse_cache[key] = items
+                for item in items:
+                    if item.type == 'run' and imgui.begin_menu(item.name):
+                        recurse([item.path])
+                        imgui.end_menu()
+                    if item.type == 'pkl':
+                        clicked, _state = imgui.menu_item(item.name)
+                        if clicked:
+                            self.load(item.path, ignore_errors=True)
+                if len(items) == 0:
+                    with imgui_utils.grayed_out():
+                        imgui.menu_item('No results found')
+            recurse(self.search_dirs)
+            if self.browse_refocus:
+                imgui.set_scroll_here()
+                viz.skip_frame() # Focus will change on next frame.
+ self.browse_refocus = False + imgui.end_popup() + + paths = viz.pop_drag_and_drop_paths() + if paths is not None and len(paths) >= 1: + self.load(paths[0], ignore_errors=True) + + viz.args.pkl = self.cur_pkl + + def list_runs_and_pkls(self, parents): + items = [] + run_regex = re.compile(r'\d+-.*') + pkl_regex = re.compile(r'network-snapshot-\d+\.pkl') + for parent in set(parents): + if os.path.isdir(parent): + for entry in os.scandir(parent): + if entry.is_dir() and run_regex.fullmatch(entry.name): + items.append(dnnlib.EasyDict(type='run', name=entry.name, path=os.path.join(parent, entry.name))) + if entry.is_file() and pkl_regex.fullmatch(entry.name): + items.append(dnnlib.EasyDict(type='pkl', name=entry.name, path=os.path.join(parent, entry.name))) + + items = sorted(items, key=lambda item: (item.name.replace('_', ' '), item.path)) + return items + + def resolve_pkl(self, pattern): + assert isinstance(pattern, str) + assert pattern != '' + + # URL => return as is. + if dnnlib.util.is_url(pattern): + return pattern + + # Short-hand pattern => locate. + path = _locate_results(pattern) + + # Run dir => pick the last saved snapshot. + if os.path.isdir(path): + pkl_files = sorted(glob.glob(os.path.join(path, 'network-snapshot-*.pkl'))) + if len(pkl_files) == 0: + raise IOError(f'No network pickle found in "{path}"') + path = pkl_files[-1] + + # Normalize. + path = os.path.abspath(path) + return path + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/pose_widget.py b/eg3d/viz/pose_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb1f1715e1021adf928df2b931a1f23d336275f --- /dev/null +++ b/eg3d/viz/pose_widget.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
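pose_widget.py below only accumulates yaw/pitch offsets and a look-at point; renderer.py (further down in this patch) turns them into a camera-to-world matrix via LookAtPoseSampler.sample(3.14/2 + yaw, 3.14/2 + pitch, pivot, radius). A minimal sketch of that hand-off, assuming eg3d's camera_utils is importable and using the widget's FFHQ defaults:

import torch
from camera_utils import LookAtPoseSampler

yaw, pitch = 0.2, -0.1                 # radians, as accumulated by PoseWidget.drag()
pivot = torch.tensor([0.0, 0.0, 0.2])  # FFHQ default look-at point from the widget
cam2world = LookAtPoseSampler.sample(3.14 / 2 + yaw, 3.14 / 2 + pitch, pivot, radius=2.7)
print(cam2world.shape)                 # expected (1, 4, 4); renderer.py flattens this into the 25-dim camera params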
+ +import numpy as np +import imgui +import dnnlib +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class PoseWidget: + def __init__(self, viz): + self.viz = viz + self.pose = dnnlib.EasyDict(yaw=0, pitch=0, anim=False, speed=0.25) + self.pose_def = dnnlib.EasyDict(self.pose) + + self.lookat_point_choice = 0 + self.lookat_point_option = ['auto', 'ffhq', 'shapenet', 'afhq', 'manual'] + self.lookat_point_labels = ['Auto Detect', 'FFHQ Default', 'Shapenet Default', 'AFHQ Default', 'Manual'] + self.lookat_point = (0.0, 0.0, 0.2) + + def drag(self, dx, dy): + viz = self.viz + self.pose.yaw += -dx / viz.font_size * 3e-2 + self.pose.pitch += -dy / viz.font_size * 3e-2 + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + if show: + imgui.text('Pose') + imgui.same_line(viz.label_w) + yaw = self.pose.yaw + pitch = self.pose.pitch + with imgui_utils.item_width(viz.font_size * 5): + changed, (new_yaw, new_pitch) = imgui.input_float2('##pose', yaw, pitch, format='%+.2f', flags=imgui.INPUT_TEXT_ENTER_RETURNS_TRUE) + if changed: + self.pose.yaw = new_yaw + self.pose.pitch = new_pitch + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.spacing * 2) + _clicked, dragging, dx, dy = imgui_utils.drag_button('Drag', width=viz.button_w) + if dragging: + self.drag(dx, dy) + imgui.same_line() + snapped = dnnlib.EasyDict(self.pose, yaw=round(self.pose.yaw, 1), pitch=round(self.pose.pitch, 1)) + if imgui_utils.button('Snap', width=viz.button_w, enabled=(self.pose != snapped)): + self.pose = snapped + imgui.same_line() + if imgui_utils.button('Reset', width=-1, enabled=(self.pose != self.pose_def)): + self.pose = dnnlib.EasyDict(self.pose_def) + + # New line starts here + imgui.text('LookAt Point') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 8): + _clicked, self.lookat_point_choice = imgui.combo('', self.lookat_point_choice, self.lookat_point_labels) + lookat_point = self.lookat_point_option[self.lookat_point_choice] + if lookat_point == 'auto': + self.lookat_point = None + if lookat_point == 'ffhq': + self.lookat_point = (0.0, 0.0, 0.2) + changes_enabled=False + if lookat_point == 'shapenet': + self.lookat_point = (0.0, 0.0, 0.0) + changes_enabled=False + if lookat_point == 'afhq': + self.lookat_point = (0.0, 0.0, 0.0) + changes_enabled=False + if lookat_point == 'manual': + if self.lookat_point is None: + self.lookat_point = (0.0, 0.0, 0.0) + changes_enabled=True + if lookat_point != 'auto': + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.spacing * 2) + with imgui_utils.item_width(viz.font_size * 16): + with imgui_utils.grayed_out(not changes_enabled): + _changed, self.lookat_point = imgui.input_float3('##lookat', *self.lookat_point, format='%.2f', flags=(imgui.INPUT_TEXT_READ_ONLY if not changes_enabled else 0)) + + + viz.args.yaw = self.pose.yaw + viz.args.pitch = self.pose.pitch + + viz.args.lookat_point = self.lookat_point + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/render_depth_sample_widget.py b/eg3d/viz/render_depth_sample_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..27c48f748e23d465c6200687c8280541df2f28b9 --- /dev/null +++ b/eg3d/viz/render_depth_sample_widget.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class RenderDepthSampleWidget: + def __init__(self, viz): + self.viz = viz + self.depth_mult = 2 + self.depth_importance_mult = 2 + self.render_types = [.5, 1, 2, 4] + self.labels = ['0.5x', '1x', '2x', '4x'] + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + + if show: + imgui.text('Render Type') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 4): + _clicked, self.depth_mult = imgui.combo('Depth Sample Multiplier', self.depth_mult, self.labels) + imgui.same_line(viz.label_w + viz.font_size * 16 + viz.spacing * 2) + with imgui_utils.item_width(viz.font_size * 4): + _clicked, self.depth_importance_mult = imgui.combo('Depth Sample Importance Multiplier', self.depth_importance_mult, self.labels) + + viz.args.depth_mult = self.render_types[self.depth_mult] + viz.args.depth_importance_mult = self.render_types[self.depth_importance_mult] + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/render_type_widget.py b/eg3d/viz/render_type_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..6332ef42245603c1a9618612e2302eb33e6b0e11 --- /dev/null +++ b/eg3d/viz/render_type_widget.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class RenderTypeWidget: + def __init__(self, viz): + self.viz = viz + self.render_type = 0 + self.render_types = ['image', 'image_depth', 'image_raw'] + self.labels = ['RGB Image', 'Depth Image', 'Neural Rendered Image'] + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + + if show: + imgui.text('Render Type') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 10): + _clicked, self.render_type = imgui.combo('', self.render_type, self.labels) + + viz.args.render_type = self.render_types[self.render_type] + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/renderer.py b/eg3d/viz/renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..c11f05a9e26aabc3a7bfadfd2ad71942973d3ec8 --- /dev/null +++ b/eg3d/viz/renderer.py @@ -0,0 +1,448 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import sys +import copy +import traceback +import numpy as np +import torch +import torch.fft +import torch.nn +import matplotlib.cm +import dnnlib +from torch_utils.ops import upfirdn2d +import legacy # pylint: disable=import-error + +from camera_utils import LookAtPoseSampler + + + +#---------------------------------------------------------------------------- + +class CapturedException(Exception): + def __init__(self, msg=None): + if msg is None: + _type, value, _traceback = sys.exc_info() + assert value is not None + if isinstance(value, CapturedException): + msg = str(value) + else: + msg = traceback.format_exc() + assert isinstance(msg, str) + super().__init__(msg) + +#---------------------------------------------------------------------------- + +class CaptureSuccess(Exception): + def __init__(self, out): + super().__init__() + self.out = out + +#---------------------------------------------------------------------------- + +def _sinc(x): + y = (x * np.pi).abs() + z = torch.sin(y) / y.clamp(1e-30, float('inf')) + return torch.where(y < 1e-30, torch.ones_like(x), z) + +def _lanczos_window(x, a): + x = x.abs() / a + return torch.where(x < 1, _sinc(x), torch.zeros_like(x)) + +#---------------------------------------------------------------------------- + +def _construct_affine_bandlimit_filter(mat, a=3, amax=16, aflt=64, up=4, cutoff_in=1, cutoff_out=1): + assert a <= amax < aflt + mat = torch.as_tensor(mat).to(torch.float32) + + # Construct 2D filter taps in input & output coordinate spaces. + taps = ((torch.arange(aflt * up * 2 - 1, device=mat.device) + 1) / up - aflt).roll(1 - aflt * up) + yi, xi = torch.meshgrid(taps, taps) + xo, yo = (torch.stack([xi, yi], dim=2) @ mat[:2, :2].t()).unbind(2) + + # Convolution of two oriented 2D sinc filters. + fi = _sinc(xi * cutoff_in) * _sinc(yi * cutoff_in) + fo = _sinc(xo * cutoff_out) * _sinc(yo * cutoff_out) + f = torch.fft.ifftn(torch.fft.fftn(fi) * torch.fft.fftn(fo)).real + + # Convolution of two oriented 2D Lanczos windows. + wi = _lanczos_window(xi, a) * _lanczos_window(yi, a) + wo = _lanczos_window(xo, a) * _lanczos_window(yo, a) + w = torch.fft.ifftn(torch.fft.fftn(wi) * torch.fft.fftn(wo)).real + + # Construct windowed FIR filter. + f = f * w + + # Finalize. + c = (aflt - amax) * up + f = f.roll([aflt * up - 1] * 2, dims=[0,1])[c:-c, c:-c] + f = torch.nn.functional.pad(f, [0, 1, 0, 1]).reshape(amax * 2, up, amax * 2, up) + f = f / f.sum([0,2], keepdim=True) / (up ** 2) + f = f.reshape(amax * 2 * up, amax * 2 * up)[:-1, :-1] + return f + +#---------------------------------------------------------------------------- + +def _apply_affine_transformation(x, mat, up=4, **filter_kwargs): + _N, _C, H, W = x.shape + mat = torch.as_tensor(mat).to(dtype=torch.float32, device=x.device) + + # Construct filter. + f = _construct_affine_bandlimit_filter(mat, up=up, **filter_kwargs) + assert f.ndim == 2 and f.shape[0] == f.shape[1] and f.shape[0] % 2 == 1 + p = f.shape[0] // 2 + + # Construct sampling grid. 
+ theta = mat.inverse() + theta[:2, 2] *= 2 + theta[0, 2] += 1 / up / W + theta[1, 2] += 1 / up / H + theta[0, :] *= W / (W + p / up * 2) + theta[1, :] *= H / (H + p / up * 2) + theta = theta[:2, :3].unsqueeze(0).repeat([x.shape[0], 1, 1]) + g = torch.nn.functional.affine_grid(theta, x.shape, align_corners=False) + + # Resample image. + y = upfirdn2d.upsample2d(x=x, f=f, up=up, padding=p) + z = torch.nn.functional.grid_sample(y, g, mode='bilinear', padding_mode='zeros', align_corners=False) + + # Form mask. + m = torch.zeros_like(y) + c = p * 2 + 1 + m[:, :, c:-c, c:-c] = 1 + m = torch.nn.functional.grid_sample(m, g, mode='nearest', padding_mode='zeros', align_corners=False) + return z, m + +#---------------------------------------------------------------------------- + +class Renderer: + def __init__(self): + self._device = torch.device('cuda') + self._pkl_data = dict() # {pkl: dict | CapturedException, ...} + self._networks = dict() # {cache_key: torch.nn.Module, ...} + self._pinned_bufs = dict() # {(shape, dtype): torch.Tensor, ...} + self._cmaps = dict() # {name: torch.Tensor, ...} + self._is_timing = False + self._start_event = torch.cuda.Event(enable_timing=True) + self._end_event = torch.cuda.Event(enable_timing=True) + self._net_layers = dict() # {cache_key: [dnnlib.EasyDict, ...], ...} + self._last_model_input = None + + def render(self, **args): + self._is_timing = True + self._start_event.record(torch.cuda.current_stream(self._device)) + res = dnnlib.EasyDict() + try: + self._render_impl(res, **args) + except: + res.error = CapturedException() + self._end_event.record(torch.cuda.current_stream(self._device)) + if 'image' in res: + res.image = self.to_cpu(res.image).numpy() + if 'stats' in res: + res.stats = self.to_cpu(res.stats).numpy() + if 'error' in res: + res.error = str(res.error) + if self._is_timing: + self._end_event.synchronize() + res.render_time = self._start_event.elapsed_time(self._end_event) * 1e-3 + self._is_timing = False + return res + + def get_network(self, pkl, key, **tweak_kwargs): + data = self._pkl_data.get(pkl, None) + if data is None: + print(f'Loading "{pkl}"... ', end='', flush=True) + try: + with dnnlib.util.open_url(pkl, verbose=False) as f: + data = legacy.load_network_pkl(f) + print('Done.') + except: + data = CapturedException() + print('Failed!') + self._pkl_data[pkl] = data + self._ignore_timing() + if isinstance(data, CapturedException): + raise data + + orig_net = data[key] + cache_key = (orig_net, self._device, tuple(sorted(tweak_kwargs.items()))) + net = self._networks.get(cache_key, None) + if net is None: + try: + net = copy.deepcopy(orig_net) + net = self._tweak_network(net, **tweak_kwargs) + net.to(self._device) + except: + net = CapturedException() + self._networks[cache_key] = net + self._ignore_timing() + if isinstance(net, CapturedException): + raise net + return net + + def _tweak_network(self, net): + # Print diagnostics. 
+ + RELOAD_MODULES = False + if RELOAD_MODULES: + from training.triplane import TriPlaneGenerator + from torch_utils import misc + print("Reloading Modules!") + net_new = TriPlaneGenerator(*net.init_args, **net.init_kwargs).eval().requires_grad_(False).to(self._device) + misc.copy_params_and_buffers(net, net_new, require_all=True) + net_new.neural_rendering_resolution = net.neural_rendering_resolution + net_new.rendering_kwargs = net.rendering_kwargs + net = net_new + # net.rendering_kwargs['ray_start'] = 'auto' + # net.rendering_kwargs['ray_end'] = 'auto' + # net.rendering_kwargs['avg_camera_pivot'] = [0, 0, 0] + + return net + + def _get_pinned_buf(self, ref): + key = (tuple(ref.shape), ref.dtype) + buf = self._pinned_bufs.get(key, None) + if buf is None: + buf = torch.empty(ref.shape, dtype=ref.dtype).pin_memory() + self._pinned_bufs[key] = buf + return buf + + def to_device(self, buf): + return self._get_pinned_buf(buf).copy_(buf).to(self._device) + + def to_cpu(self, buf): + return self._get_pinned_buf(buf).copy_(buf).clone() + + def _ignore_timing(self): + self._is_timing = False + + def _apply_cmap(self, x, name='viridis'): + cmap = self._cmaps.get(name, None) + if cmap is None: + cmap = matplotlib.cm.get_cmap(name) + cmap = cmap(np.linspace(0, 1, num=1024), bytes=True)[:, :3] + cmap = self.to_device(torch.from_numpy(cmap)) + self._cmaps[name] = cmap + hi = cmap.shape[0] - 1 + x = (x * hi + 0.5).clamp(0, hi).to(torch.int64) + x = torch.nn.functional.embedding(x, cmap) + return x + + def _render_impl(self, res, + pkl = None, + w0_seeds = [[0, 1]], + stylemix_idx = [], + stylemix_seed = 0, + trunc_psi = 1, + trunc_cutoff = 0, + random_seed = 0, + noise_mode = 'const', + force_fp32 = False, + layer_name = None, + sel_channels = 3, + base_channel = 0, + img_scale_db = 0, + img_normalize = False, + fft_show = False, + fft_all = True, + fft_range_db = 50, + fft_beta = 8, + input_transform = None, + untransform = False, + + yaw = 0, + pitch = 0, + lookat_point = (0, 0, 0.2), + conditioning_yaw = 0, + conditioning_pitch = 0, + focal_length = 4.2647, + render_type = 'image', + + do_backbone_caching = False, + + depth_mult = 1, + depth_importance_mult = 1, + ): + # Dig up network details. + G = self.get_network(pkl, 'G_ema').eval().requires_grad_(False).to('cuda') + res.img_resolution = G.img_resolution + res.num_ws = G.backbone.num_ws + res.has_noise = any('noise_const' in name for name, _buf in G.backbone.named_buffers()) + res.has_input_transform = (hasattr(G.backbone, 'input') and hasattr(G.backbone.input, 'transform')) + + # set G rendering kwargs + if 'depth_resolution_default' not in G.rendering_kwargs: + G.rendering_kwargs['depth_resolution_default'] = G.rendering_kwargs['depth_resolution'] + G.rendering_kwargs['depth_resolution_importance_default'] = G.rendering_kwargs['depth_resolution_importance'] + + G.rendering_kwargs['depth_resolution'] = int(G.rendering_kwargs['depth_resolution_default'] * depth_mult) + G.rendering_kwargs['depth_resolution_importance'] = int(G.rendering_kwargs['depth_resolution_importance_default'] * depth_importance_mult) + + # Set input transform. + if res.has_input_transform: + m = np.eye(3) + try: + if input_transform is not None: + m = np.linalg.inv(np.asarray(input_transform)) + except np.linalg.LinAlgError: + res.error = CapturedException() + G.synthesis.input.transform.copy_(torch.from_numpy(m)) + + # Generate random latents. 
+ all_seeds = [seed for seed, _weight in w0_seeds] + [stylemix_seed] + all_seeds = list(set(all_seeds)) + all_zs = np.zeros([len(all_seeds), G.z_dim], dtype=np.float32) + all_cs = np.zeros([len(all_seeds), G.c_dim], dtype=np.float32) + for idx, seed in enumerate(all_seeds): + rnd = np.random.RandomState(seed) + all_zs[idx] = rnd.randn(G.z_dim) + if lookat_point is None: + camera_pivot = torch.tensor(G.rendering_kwargs.get('avg_camera_pivot', (0, 0, 0))) + else: + # override lookat point provided + camera_pivot = torch.tensor(lookat_point) + camera_radius = G.rendering_kwargs.get('avg_camera_radius', 2.7) + forward_cam2world_pose = LookAtPoseSampler.sample(3.14/2 + conditioning_yaw, 3.14/2 + conditioning_pitch, camera_pivot, radius=camera_radius) + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]]) + conditioning_params = torch.cat([forward_cam2world_pose.reshape(16), intrinsics.reshape(9)], 0) + all_cs[idx, :] = conditioning_params.numpy() + + + # Run mapping network. + # w_avg = G.mapping.w_avg + w_avg = G.backbone.mapping.w_avg + all_zs = self.to_device(torch.from_numpy(all_zs)) + all_cs = self.to_device(torch.from_numpy(all_cs)) + all_ws = G.mapping(z=all_zs, c=all_cs, truncation_psi=trunc_psi, truncation_cutoff=trunc_cutoff) - w_avg + all_ws = dict(zip(all_seeds, all_ws)) + + # Calculate final W. + w = torch.stack([all_ws[seed] * weight for seed, weight in w0_seeds]).sum(dim=0, keepdim=True) + stylemix_idx = [idx for idx in stylemix_idx if 0 <= idx < G.backbone.num_ws] + if len(stylemix_idx) > 0: + w[:, stylemix_idx] = all_ws[stylemix_seed][np.newaxis, stylemix_idx] + w += w_avg + + # Run synthesis network. + synthesis_kwargs = dnnlib.EasyDict(noise_mode=noise_mode, force_fp32=force_fp32, cache_backbone=do_backbone_caching) + torch.manual_seed(random_seed) + + # Set camera params + pose = LookAtPoseSampler.sample(3.14/2 + yaw, 3.14/2 + pitch, camera_pivot, radius=camera_radius) + intrinsics = torch.tensor([[focal_length, 0, 0.5], [0, focal_length, 0.5], [0, 0, 1]]) + c = torch.cat([pose.reshape(-1, 16), intrinsics.reshape(-1, 9)], 1).to(w.device) + + # Backbone caching + if do_backbone_caching and self._last_model_input is not None and torch.all(self._last_model_input == w): + synthesis_kwargs.use_cached_backbone = True + else: + synthesis_kwargs.use_cached_backbone = False + self._last_model_input = w + out, layers = self.run_synthesis_net(G, w, c, capture_layer=layer_name, **synthesis_kwargs) + + # Update layer list. + cache_key = (G.synthesis, tuple(sorted(synthesis_kwargs.items()))) + if cache_key not in self._net_layers: + if layer_name is not None: + torch.manual_seed(random_seed) + _out, layers = self.run_synthesis_net(G, w, c, **synthesis_kwargs) + self._net_layers[cache_key] = layers + res.layers = self._net_layers[cache_key] + + # Untransform. + if untransform and res.has_input_transform: + out, _mask = _apply_affine_transformation(out.to(torch.float32), G.synthesis.input.transform, amax=6) # Override amax to hit the fast path in upfirdn2d. + + # Select channels and compute statistics. + if type(out) == dict: + # is model output. 
query render type + out = out[render_type][0].to(torch.float32) + else: + out = out[0].to(torch.float32) + + if sel_channels > out.shape[0]: + sel_channels = 1 + base_channel = max(min(base_channel, out.shape[0] - sel_channels), 0) + sel = out[base_channel : base_channel + sel_channels] + res.stats = torch.stack([ + out.mean(), sel.mean(), + out.std(), sel.std(), + out.norm(float('inf')), sel.norm(float('inf')), + ]) + + # normalize if type is 'image_depth' + if render_type == 'image_depth': + out -= out.min() + out /= out.max() + + out -= .5 + out *= -2 + + # Scale and convert to uint8. + img = sel + if img_normalize: + img = img / img.norm(float('inf'), dim=[1,2], keepdim=True).clip(1e-8, 1e8) + img = img * (10 ** (img_scale_db / 20)) + img = (img * 127.5 + 128).clamp(0, 255).to(torch.uint8).permute(1, 2, 0) + res.image = img + + # FFT. + if fft_show: + sig = out if fft_all else sel + sig = sig.to(torch.float32) + sig = sig - sig.mean(dim=[1,2], keepdim=True) + sig = sig * torch.kaiser_window(sig.shape[1], periodic=False, beta=fft_beta, device=self._device)[None, :, None] + sig = sig * torch.kaiser_window(sig.shape[2], periodic=False, beta=fft_beta, device=self._device)[None, None, :] + fft = torch.fft.fftn(sig, dim=[1,2]).abs().square().sum(dim=0) + fft = fft.roll(shifts=[fft.shape[0] // 2, fft.shape[1] // 2], dims=[0,1]) + fft = (fft / fft.mean()).log10() * 10 # dB + fft = self._apply_cmap((fft / fft_range_db + 1) / 2) + res.image = torch.cat([img.expand_as(fft), fft], dim=1) + + @staticmethod + def run_synthesis_net(net, *args, capture_layer=None, **kwargs): # => out, layers + submodule_names = {mod: name for name, mod in net.named_modules()} + unique_names = set() + layers = [] + + def module_hook(module, _inputs, outputs): + outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs] + outputs = [out for out in outputs if isinstance(out, torch.Tensor) and out.ndim in [4, 5]] + for idx, out in enumerate(outputs): + if out.ndim == 5: # G-CNN => remove group dimension. + out = out.mean(2) + name = submodule_names[module] + if name == '': + name = 'output' + if len(outputs) > 1: + name += f':{idx}' + if name in unique_names: + suffix = 2 + while f'{name}_{suffix}' in unique_names: + suffix += 1 + name += f'_{suffix}' + unique_names.add(name) + shape = [int(x) for x in out.shape] + dtype = str(out.dtype).split('.')[-1] + layers.append(dnnlib.EasyDict(name=name, shape=shape, dtype=dtype)) + if name == capture_layer: + raise CaptureSuccess(out) + + hooks = [module.register_forward_hook(module_hook) for module in net.modules()] + try: + out = net.synthesis(*args, **kwargs) + except CaptureSuccess as e: + out = e.out + for hook in hooks: + hook.remove() + return out, layers + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/stylemix_widget.py b/eg3d/viz/stylemix_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..0b84d6426b27bc890cfcf7e74a74ce0569d77847 --- /dev/null +++ b/eg3d/viz/stylemix_widget.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class StyleMixingWidget: + def __init__(self, viz): + self.viz = viz + self.seed_def = 1000 + self.seed = self.seed_def + self.animate = False + self.enables = [] + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + num_ws = viz.result.get('num_ws', 0) + num_enables = viz.result.get('num_ws', 18) + self.enables += [False] * max(num_enables - len(self.enables), 0) + + if show: + imgui.text('Stylemix') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 8), imgui_utils.grayed_out(num_ws == 0): + _changed, self.seed = imgui.input_int('##seed', self.seed) + imgui.same_line(viz.label_w + viz.font_size * 8 + viz.spacing) + with imgui_utils.grayed_out(num_ws == 0): + _clicked, self.animate = imgui.checkbox('Anim', self.animate) + + pos2 = imgui.get_content_region_max()[0] - 1 - viz.button_w + pos1 = pos2 - imgui.get_text_line_height() - viz.spacing + pos0 = viz.label_w + viz.font_size * 12 + imgui.push_style_var(imgui.STYLE_FRAME_PADDING, [0, 0]) + for idx in range(num_enables): + imgui.same_line(round(pos0 + (pos1 - pos0) * (idx / (num_enables - 1)))) + if idx == 0: + imgui.set_cursor_pos_y(imgui.get_cursor_pos_y() + 3) + with imgui_utils.grayed_out(num_ws == 0): + _clicked, self.enables[idx] = imgui.checkbox(f'##{idx}', self.enables[idx]) + if imgui.is_item_hovered(): + imgui.set_tooltip(f'{idx}') + imgui.pop_style_var(1) + + imgui.same_line(pos2) + imgui.set_cursor_pos_y(imgui.get_cursor_pos_y() - 3) + with imgui_utils.grayed_out(num_ws == 0): + if imgui_utils.button('Reset', width=-1, enabled=(self.seed != self.seed_def or self.animate or any(self.enables[:num_enables]))): + self.seed = self.seed_def + self.animate = False + self.enables = [False] * num_enables + + if any(self.enables[:num_ws]): + viz.args.stylemix_idx = [idx for idx, enable in enumerate(self.enables) if enable] + viz.args.stylemix_seed = self.seed & ((1 << 32) - 1) + if self.animate: + self.seed += 1 + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/trunc_noise_widget.py b/eg3d/viz/trunc_noise_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..cf19c238f8934a01ef3e99247da3981f938c5336 --- /dev/null +++ b/eg3d/viz/trunc_noise_widget.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
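trunc_noise_widget.py below exposes trunc_psi and trunc_cutoff, which renderer.py forwards to G.mapping(). A minimal sketch of what the two parameters mean for the W+ latents (illustrative only; the actual blending happens inside the mapping network):

import torch

def truncate_ws(w, w_avg, psi, cutoff):
    # Blend the first `cutoff` W vectors toward the average W; psi=1 is a no-op, psi=0 snaps them onto w_avg.
    w = w.clone()
    w[:cutoff] = w_avg + psi * (w[:cutoff] - w_avg)
    return w

w_trunc = truncate_ws(torch.randn(14, 512), torch.zeros(512), psi=0.7, cutoff=7)  # widget defaults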
+ +import imgui +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class TruncationNoiseWidget: + def __init__(self, viz): + self.viz = viz + self.prev_num_ws = 0 + self.trunc_psi = 0.7 + self.trunc_cutoff = 7 + self.noise_enable = True + self.noise_seed = 0 + self.noise_anim = False + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + num_ws = viz.result.get('num_ws', 0) + has_noise = viz.result.get('has_noise', False) + if num_ws > 0 and num_ws != self.prev_num_ws: + if self.trunc_cutoff > num_ws or self.trunc_cutoff == self.prev_num_ws: + self.trunc_cutoff = num_ws + self.prev_num_ws = num_ws + + if show: + imgui.text('Truncate') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 10), imgui_utils.grayed_out(num_ws == 0): + _changed, self.trunc_psi = imgui.slider_float('##psi', self.trunc_psi, -1, 2, format='Psi %.2f') + imgui.same_line() + if num_ws == 0: + imgui_utils.button('Cutoff 0', width=(viz.font_size * 8 + viz.spacing), enabled=False) + else: + with imgui_utils.item_width(viz.font_size * 8 + viz.spacing): + changed, new_cutoff = imgui.slider_int('##cutoff', self.trunc_cutoff, 0, num_ws, format='Cutoff %d') + if changed: + self.trunc_cutoff = min(max(new_cutoff, 0), num_ws) + + with imgui_utils.grayed_out(not has_noise): + imgui.same_line() + _clicked, self.noise_enable = imgui.checkbox('Noise##enable', self.noise_enable) + imgui.same_line(viz.font_size * 28.7) + with imgui_utils.grayed_out(not self.noise_enable): + with imgui_utils.item_width(-3 - viz.button_w - viz.spacing - viz.font_size * 4): + _changed, self.noise_seed = imgui.input_int('##seed', self.noise_seed) + imgui.same_line(spacing=0) + _clicked, self.noise_anim = imgui.checkbox('Anim##noise', self.noise_anim) + + is_def_trunc = (self.trunc_psi == 1 and self.trunc_cutoff == num_ws) + is_def_noise = (self.noise_enable and self.noise_seed == 0 and not self.noise_anim) + with imgui_utils.grayed_out(is_def_trunc and not has_noise): + imgui.same_line(imgui.get_content_region_max()[0] - 1 - viz.button_w) + if imgui_utils.button('Reset', width=-1, enabled=(not is_def_trunc or not is_def_noise)): + self.prev_num_ws = num_ws + self.trunc_psi = 0.7 + self.trunc_cutoff = 7 + self.noise_enable = True + self.noise_seed = 0 + self.noise_anim = False + + if self.noise_anim: + self.noise_seed += 1 + viz.args.update(trunc_psi=self.trunc_psi, trunc_cutoff=self.trunc_cutoff, random_seed=self.noise_seed) + viz.args.noise_mode = ('none' if not self.noise_enable else 'const' if self.noise_seed == 0 else 'random') + +#---------------------------------------------------------------------------- diff --git a/eg3d/viz/zoom_widget.py b/eg3d/viz/zoom_widget.py new file mode 100644 index 0000000000000000000000000000000000000000..ff749b0e69a01fdafa605fd1f82bfc3980a9f7dd --- /dev/null +++ b/eg3d/viz/zoom_widget.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
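zoom_widget.py below maps the FOV slider onto the focal_length argument consumed by renderer.py via focal = 1 / (tan(fov * pi / 360) * 1.414). A quick numeric check of that formula (numpy assumed):

import numpy as np

def fov_to_focal(fov_degrees):
    # Half-angle in radians, scaled by ~sqrt(2), matching the expression at the end of zoom_widget.py.
    return float(1 / (np.tan(fov_degrees * np.pi / 360) * 1.414))

print(fov_to_focal(18.837))  # ~4.26, consistent with the renderer's default focal_length of 4.2647
print(fov_to_focal(45.0))    # wider FOV gives a shorter focal length (~1.71)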
+ +from inspect import formatargvalues +import numpy as np +import imgui +import dnnlib +from gui_utils import imgui_utils + +#---------------------------------------------------------------------------- + +class ZoomWidget: + def __init__(self, viz): + self.viz = viz + self.fov = 18.837 + self.fov_default = 18.837 + + @imgui_utils.scoped_by_object_id + def __call__(self, show=True): + viz = self.viz + if show: + imgui.text('FOV') + imgui.same_line(viz.label_w) + with imgui_utils.item_width(viz.font_size * 10): + _changed, self.fov = imgui.slider_float('##fov', self.fov, 12, 45, format='%.2f Degrees') + + imgui.same_line(viz.label_w + viz.font_size * 13 + viz.button_w + viz.spacing * 3) + snapped = round(self.fov) + if imgui_utils.button('Snap', width=viz.button_w, enabled=(self.fov != snapped)): + self.fov = snapped + imgui.same_line() + if imgui_utils.button('Reset', width=-1, enabled=(abs(self.fov - self.fov_default)) > .01): + self.fov = self.fov_default + + viz.args.focal_length = float(1 / (np.tan(self.fov * 3.14159 / 360) * 1.414)) +#---------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..4d0e8267551b0b209ccf69ccd89392f20891b63b --- /dev/null +++ b/environment.yml @@ -0,0 +1,42 @@ +name: datid3d +channels: + - pytorch + - nvidia + - conda-forge +dependencies: + - python >= 3.8 + - pip + - numpy>=1.20 + - click>=8.0 + - pillow=9.3.0 + - scipy=1.7.1 + - cudatoolkit=11.6 + - cudnn=8.3.2 + - pytorch=1.12.1=py3.9_cuda11.6_cudnn8.3.2_0 + - torchvision=0.13.1 + - requests=2.26.0 + - tqdm=4.62.2 + - ninja=1.10.2 + - matplotlib=3.4.2 + - imageio=2.9.0 + - x264 + - ffmpeg=4.3.2 + - pip: + - imgui==1.3.0 + - glfw==2.2.0 + - pyopengl==3.1.5 + - imageio-ffmpeg==0.4.3 + - tensorflow==2.11.0 + - pyspng + - psutil + - mrcfile + - tensorboard + - transformers + - diffusers[torch] + - face_alignment + - mtcnn + - kornia + - dominate + - gdown + - trimesh + - gradio diff --git a/input_imgs/input.png b/input_imgs/input.png new file mode 100644 index 0000000000000000000000000000000000000000..4593c56c06c61fa55b87515e3ddec8b8abd5058e Binary files /dev/null and b/input_imgs/input.png differ diff --git a/pose_estimation/3dface2idr.py b/pose_estimation/3dface2idr.py new file mode 100644 index 0000000000000000000000000000000000000000..be20f705ecfffdd0c4973d49ea2c1a5446b8e3f3 --- /dev/null +++ b/pose_estimation/3dface2idr.py @@ -0,0 +1,130 @@ +import numpy as np +import os +import torch +import json +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--in_root', type=str, default="", help='process folder') + parser.add_argument('--out_root', type=str, default="output", help='output folder') + args = parser.parse_args() + in_root = args.in_root + + def compute_rotation(angles): + """ + Return: + rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat + + Parameters: + angles -- torch.tensor, size (B, 3), radian + """ + + batch_size = angles.shape[0] + ones = torch.ones([batch_size, 1]) + zeros = torch.zeros([batch_size, 1]) + x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:], + + rot_x = torch.cat([ + ones, zeros, zeros, + zeros, torch.cos(x), -torch.sin(x), + zeros, torch.sin(x), torch.cos(x) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_y = torch.cat([ + torch.cos(y), zeros, torch.sin(y), + zeros, ones, zeros, + -torch.sin(y), zeros, torch.cos(y) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_z = torch.cat([ + 
torch.cos(z), -torch.sin(z), zeros, + torch.sin(z), torch.cos(z), zeros, + zeros, zeros, ones + ], dim=1).reshape([batch_size, 3, 3]) + + rot = rot_z @ rot_y @ rot_x + return rot.permute(0, 2, 1)[0] + + npys = sorted([x for x in os.listdir(in_root) if x.endswith(".npy")]) + + mode = 1 #1 = IDR, 2 = LSX + outAll={} + + for src_filename in npys: + src = os.path.join(in_root, src_filename) + + print(src) + dict_load=np.load(src, allow_pickle=True) + + angle = dict_load.item()['angle'] + trans = dict_load.item()['trans'][0] + R = compute_rotation(torch.from_numpy(angle)).numpy() + + trans[2] += -10 + c = -np.dot(R, trans) + pose = np.eye(4) + pose[:3, :3] = R + + c *= 0.27 # factor to match tripleganger + c[1] += 0.006 # offset to align to tripleganger + c[2] += 0.161 # offset to align to tripleganger + c = c/np.linalg.norm(c)*2.7 ##yiqian教我放到半球上去 + pose[0,3] = c[0] + pose[1,3] = c[1] + pose[2,3] = c[2] + + focal = 2985.29 # = 1015*1024/224*(300/466.285)# + pp = 512#112 + w = 1024#224 + h = 1024#224 + + if mode==1: + count = 0 + K = np.eye(3) + K[0][0] = focal + K[1][1] = focal + K[0][2] = w/2.0 + K[1][2] = h/2.0 + K = K.tolist() + + Rot = np.eye(3) + Rot[0, 0] = 1 + Rot[1, 1] = -1 + Rot[2, 2] = -1 + pose[:3, :3] = np.dot(pose[:3, :3], Rot) + + pose = pose.tolist() + out = {} + out["intrinsics"] = K + out["pose"] = pose + out["angle"] = (angle * [1, -1, 1]).flatten().tolist() + outAll[src_filename.replace(".npy", ".png")] = out + + elif mode==2: + + dst = os.path.join(in_root, src_filename.replace(".npy", "_lscam.txt")) + outCam = open(dst, "w") + outCam.write("#focal length\n") + outCam.write(str(focal) + " " + str(focal) + "\n") + + outCam.write("#principal point\n") + outCam.write(str(pp) + " " + str(pp) + "\n") + + outCam.write("#resolution\n") + outCam.write(str(w) + " " + str(h) + "\n") + + outCam.write("#distortion coeffs\n") + outCam.write("0 0 0 0\n") + + + outCam.write("MATRIX :\n") + for r in range(4): + outCam.write(str(pose[r, 0]) + " " + str(pose[r, 1]) + " " + str(pose[r, 2]) + " " + str(pose[r, 3]) + "\n") + + outCam.close() + + if mode == 1: + dst = os.path.join(args.out_root, "cameras.json") + with open(dst, "w") as outfile: + json.dump(outAll, outfile, indent=4) diff --git a/pose_estimation/BFM.zip b/pose_estimation/BFM.zip new file mode 100644 index 0000000000000000000000000000000000000000..6816a3f2b241b25b2c0b9c3425d9e81e4552e481 --- /dev/null +++ b/pose_estimation/BFM.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccdb5844a45d80f43b79b54337a19908a30d4881c43dc3fa0da67a9b1ecd0015 +size 404095246 diff --git a/pose_estimation/BFM/.gitkeep b/pose_estimation/BFM/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/BFM/01_MorphableModel.mat b/pose_estimation/BFM/01_MorphableModel.mat new file mode 100644 index 0000000000000000000000000000000000000000..f251485b55d35adac0ad4f1622a47d7a39a1502c --- /dev/null +++ b/pose_estimation/BFM/01_MorphableModel.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b1f0742db356a3b1568a8365a06f5b0fe0ab687ac1c3068c803666cbd4d8e2 +size 240875364 diff --git a/pose_estimation/BFM/BFM_exp_idx.mat b/pose_estimation/BFM/BFM_exp_idx.mat new file mode 100644 index 0000000000000000000000000000000000000000..5b214a5f8afbc038e6959f7f72141e448e89fb3b --- /dev/null +++ b/pose_estimation/BFM/BFM_exp_idx.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:62752a2cab3eea148569fb07e367e03535b4ee04aa71ea1a9aed36486d26c612 +size 91931 diff --git a/pose_estimation/BFM/BFM_front_idx.mat b/pose_estimation/BFM/BFM_front_idx.mat new file mode 100644 index 0000000000000000000000000000000000000000..29d82e79f8b2558a5bf1956ab9e1261d49c2c8dd --- /dev/null +++ b/pose_estimation/BFM/BFM_front_idx.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d285dd018563113496127df9c364800183172adb4d3e802f726085dab66b087 +size 44880 diff --git a/pose_estimation/BFM/BFM_model_front.mat b/pose_estimation/BFM/BFM_model_front.mat new file mode 100644 index 0000000000000000000000000000000000000000..4370da9cf59f5a7266fe5a3a8fbfa0a3538e4bbc --- /dev/null +++ b/pose_estimation/BFM/BFM_model_front.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e081114e0f809fa996ada8eae52887447871f5f89c97321d2c9b766700a505 +size 127170280 diff --git a/pose_estimation/BFM/Exp_Pca.bin b/pose_estimation/BFM/Exp_Pca.bin new file mode 100644 index 0000000000000000000000000000000000000000..3c1785e6abc52b13e54a573f9f3ebc099915b1e0 --- /dev/null +++ b/pose_estimation/BFM/Exp_Pca.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f31380e6cbdaf2aeec698db220bac4f221946e4d551d88c092d47ec49b1726 +size 51086404 diff --git a/pose_estimation/BFM/facemodel_info.mat b/pose_estimation/BFM/facemodel_info.mat new file mode 100644 index 0000000000000000000000000000000000000000..c2e0a3521fc040e59e07fc09384fc140234f006f --- /dev/null +++ b/pose_estimation/BFM/facemodel_info.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529398f76619ae7e22f43c25dd60a2473bcc2bcc8c894fd9c613c68624ce1c04 +size 738861 diff --git a/pose_estimation/BFM/select_vertex_id.mat b/pose_estimation/BFM/select_vertex_id.mat new file mode 100644 index 0000000000000000000000000000000000000000..feadeff96a0b8e0619461f64a9bdc9e761b14c80 --- /dev/null +++ b/pose_estimation/BFM/select_vertex_id.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6877a7d634330f25bf1e81bc062b6507ee53ea183838e471fa21b613048fa36b +size 62299 diff --git a/pose_estimation/BFM/similarity_Lm3D_all.mat b/pose_estimation/BFM/similarity_Lm3D_all.mat new file mode 100644 index 0000000000000000000000000000000000000000..9f5b0bd4ecffb926128a29cb1bbf9d9081c3d4e7 --- /dev/null +++ b/pose_estimation/BFM/similarity_Lm3D_all.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b83ce6e35c50ddc3e97603650cef4970320c157e75c241c844f29c1dcba65a +size 994 diff --git a/pose_estimation/BFM/std_exp.txt b/pose_estimation/BFM/std_exp.txt new file mode 100644 index 0000000000000000000000000000000000000000..767b8de4ea1ca78b6f22b98ff2dee4fa345500bb --- /dev/null +++ b/pose_estimation/BFM/std_exp.txt @@ -0,0 +1 @@ +453980 257264 263068 211890 135873 184721 47055.6 72732 62787.4 106226 56708.5 51439.8 34887.1 44378.7 51813.4 31030.7 23354.9 23128.1 19400 21827.6 22767.7 22057.4 19894.3 16172.8 17142.7 10035.3 14727.5 12972.5 10763.8 8953.93 8682.62 8941.81 6342.3 5205.3 7065.65 6083.35 6678.88 4666.63 5082.89 5134.76 4908.16 3964.93 3739.95 3180.09 2470.45 1866.62 1624.71 2423.74 1668.53 1471.65 1194.52 782.102 815.044 835.782 834.937 744.496 575.146 633.76 705.685 753.409 620.306 673.326 766.189 619.866 559.93 357.264 396.472 556.849 455.048 460.592 400.735 326.702 279.428 291.535 326.584 305.664 287.816 283.642 276.19 \ No newline at end of file diff --git a/pose_estimation/ConfigModels/faceparsing_model.pth 
b/pose_estimation/ConfigModels/faceparsing_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca57f3257ca7715bc340d065764bc249d985c287 --- /dev/null +++ b/pose_estimation/ConfigModels/faceparsing_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567 +size 53289463 diff --git a/pose_estimation/DataProcess/BiSeNet.py b/pose_estimation/DataProcess/BiSeNet.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4dee8a4964f9fea7f24d76c7195bcf3ed13424 --- /dev/null +++ b/pose_estimation/DataProcess/BiSeNet.py @@ -0,0 +1,280 @@ + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from resnet import Resnet18 +# from modules.bn import InPlaceABNSync as BatchNorm2d + + +class ConvBNReLU(nn.Module): + def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args, **kwargs): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2d(in_chan, + out_chan, + kernel_size = ks, + stride = stride, + padding = padding, + bias = False) + self.bn = nn.BatchNorm2d(out_chan) + self.init_weight() + + def forward(self, x): + x = self.conv(x) + x = F.relu(self.bn(x)) + return x + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + +class BiSeNetOutput(nn.Module): + def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs): + super(BiSeNetOutput, self).__init__() + self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1) + self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False) + self.init_weight() + + def forward(self, x): + x = self.conv(x) + x = self.conv_out(x) + return x + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def get_params(self): + wd_params, nowd_params = [], [] + for name, module in self.named_modules(): + if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d): + wd_params.append(module.weight) + if not module.bias is None: + nowd_params.append(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nowd_params += list(module.parameters()) + return wd_params, nowd_params + + +class AttentionRefinementModule(nn.Module): + def __init__(self, in_chan, out_chan, *args, **kwargs): + super(AttentionRefinementModule, self).__init__() + self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1) + self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size= 1, bias=False) + self.bn_atten = nn.BatchNorm2d(out_chan) + self.sigmoid_atten = nn.Sigmoid() + self.init_weight() + + def forward(self, x): + feat = self.conv(x) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv_atten(atten) + atten = self.bn_atten(atten) + atten = self.sigmoid_atten(atten) + out = torch.mul(feat, atten) + return out + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + +class ContextPath(nn.Module): + def __init__(self, *args, **kwargs): + super(ContextPath, self).__init__() + self.resnet = Resnet18() + self.arm16 = AttentionRefinementModule(256, 128) + self.arm32 = AttentionRefinementModule(512, 128) + self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) 
+ self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) + self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0) + + self.init_weight() + + def forward(self, x): + H0, W0 = x.size()[2:] + feat8, feat16, feat32 = self.resnet(x) + H8, W8 = feat8.size()[2:] + H16, W16 = feat16.size()[2:] + H32, W32 = feat32.size()[2:] + + avg = F.avg_pool2d(feat32, feat32.size()[2:]) + avg = self.conv_avg(avg) + avg_up = F.interpolate(avg, (H32, W32), mode='nearest') + + feat32_arm = self.arm32(feat32) + feat32_sum = feat32_arm + avg_up + feat32_up = F.interpolate(feat32_sum, (H16, W16), mode='nearest') + feat32_up = self.conv_head32(feat32_up) + + feat16_arm = self.arm16(feat16) + feat16_sum = feat16_arm + feat32_up + feat16_up = F.interpolate(feat16_sum, (H8, W8), mode='nearest') + feat16_up = self.conv_head16(feat16_up) + + return feat8, feat16_up, feat32_up # x8, x8, x16 + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def get_params(self): + wd_params, nowd_params = [], [] + for name, module in self.named_modules(): + if isinstance(module, (nn.Linear, nn.Conv2d)): + wd_params.append(module.weight) + if not module.bias is None: + nowd_params.append(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nowd_params += list(module.parameters()) + return wd_params, nowd_params + + +### This is not used, since I replace this with the resnet feature with the same size +class SpatialPath(nn.Module): + def __init__(self, *args, **kwargs): + super(SpatialPath, self).__init__() + self.conv1 = ConvBNReLU(3, 64, ks=7, stride=2, padding=3) + self.conv2 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1) + self.conv3 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1) + self.conv_out = ConvBNReLU(64, 128, ks=1, stride=1, padding=0) + self.init_weight() + + def forward(self, x): + feat = self.conv1(x) + feat = self.conv2(feat) + feat = self.conv3(feat) + feat = self.conv_out(feat) + return feat + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def get_params(self): + wd_params, nowd_params = [], [] + for name, module in self.named_modules(): + if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d): + wd_params.append(module.weight) + if not module.bias is None: + nowd_params.append(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nowd_params += list(module.parameters()) + return wd_params, nowd_params + + +class FeatureFusionModule(nn.Module): + def __init__(self, in_chan, out_chan, *args, **kwargs): + super(FeatureFusionModule, self).__init__() + self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0) + self.conv1 = nn.Conv2d(out_chan, + out_chan//4, + kernel_size = 1, + stride = 1, + padding = 0, + bias = False) + self.conv2 = nn.Conv2d(out_chan//4, + out_chan, + kernel_size = 1, + stride = 1, + padding = 0, + bias = False) + self.relu = nn.ReLU(inplace=True) + self.sigmoid = nn.Sigmoid() + self.init_weight() + + def forward(self, fsp, fcp): + fcat = torch.cat([fsp, fcp], dim=1) + feat = self.convblk(fcat) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv1(atten) + atten = self.relu(atten) + atten = self.conv2(atten) + atten = self.sigmoid(atten) + feat_atten = torch.mul(feat, atten) + feat_out = feat_atten + feat + return feat_out + + def init_weight(self): + for ly in 
self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def get_params(self): + wd_params, nowd_params = [], [] + for name, module in self.named_modules(): + if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d): + wd_params.append(module.weight) + if not module.bias is None: + nowd_params.append(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nowd_params += list(module.parameters()) + return wd_params, nowd_params + + +class BiSeNet(nn.Module): + def __init__(self, n_classes, *args, **kwargs): + super(BiSeNet, self).__init__() + self.cp = ContextPath() + ## here self.sp is deleted + self.ffm = FeatureFusionModule(256, 256) + self.conv_out = BiSeNetOutput(256, 256, n_classes) + self.conv_out16 = BiSeNetOutput(128, 64, n_classes) + self.conv_out32 = BiSeNetOutput(128, 64, n_classes) + self.init_weight() + + def forward(self, x): + H, W = x.size()[2:] + feat_res8, feat_cp8, feat_cp16 = self.cp(x) # here return res3b1 feature + feat_sp = feat_res8 # use res3b1 feature to replace spatial path feature + feat_fuse = self.ffm(feat_sp, feat_cp8) + + feat_out = self.conv_out(feat_fuse) + feat_out16 = self.conv_out16(feat_cp8) + feat_out32 = self.conv_out32(feat_cp16) + + feat_out = F.interpolate(feat_out, (H, W), mode='bilinear', align_corners=True) + feat_out16 = F.interpolate(feat_out16, (H, W), mode='bilinear', align_corners=True) + feat_out32 = F.interpolate(feat_out32, (H, W), mode='bilinear', align_corners=True) + return feat_out, feat_out16, feat_out32 + + def init_weight(self): + for ly in self.children(): + if isinstance(ly, nn.Conv2d): + nn.init.kaiming_normal_(ly.weight, a=1) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def get_params(self): + wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params = [], [], [], [] + for name, child in self.named_children(): + child_wd_params, child_nowd_params = child.get_params() + if isinstance(child, FeatureFusionModule) or isinstance(child, BiSeNetOutput): + lr_mul_wd_params += child_wd_params + lr_mul_nowd_params += child_nowd_params + else: + wd_params += child_wd_params + nowd_params += child_nowd_params + return wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params + + +if __name__ == "__main__": + net = BiSeNet(19) + net.cuda() + net.eval() + in_ten = torch.randn(16, 3, 640, 480).cuda() + out, out16, out32 = net(in_ten) + print(out.shape) + + net.get_params() diff --git a/pose_estimation/DataProcess/Gen_HeadMask.py b/pose_estimation/DataProcess/Gen_HeadMask.py new file mode 100644 index 0000000000000000000000000000000000000000..b2604f9936d590f943ac9fbf4dd78342211b17e0 --- /dev/null +++ b/pose_estimation/DataProcess/Gen_HeadMask.py @@ -0,0 +1,93 @@ +import os +import sys +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +import torch +from BiSeNet import BiSeNet +from torchvision.transforms import transforms +import cv2 +from tqdm import tqdm +import numpy as np +from glob import glob +from correct_head_mask import correct_hair_mask +import argparse + + +class GenHeadMask(object): + def __init__(self, gpu_id) -> None: + super().__init__() + + self.device = torch.device("cuda:%s" % gpu_id) + self.model_path = "ConfigModels/faceparsing_model.pth" + + self.init_model() + self.lut = np.zeros((256, ), dtype=np.uint8) + self.lut[1:14] = 1 + self.lut[17] = 2 + + # ['skin', 'l_brow', 'r_brow', 'l_eye', 'r_eye', 'eye_g', 'l_ear', 'r_ear', 'ear_r', + # 'nose', 'mouth', 'u_lip', 'l_lip', 'neck', 
'neck_l', 'cloth', 'hair', 'hat'] + + def init_model(self): + n_classes = 19 + net = BiSeNet(n_classes=n_classes).to(self.device) + net.load_state_dict(torch.load(self.model_path, map_location=self.device)) + self.net = net + self.net.eval() + + self.to_tensor = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + + + def main_process(self, img_dir): + img_path_list = [x for x in glob("%s/*.png" % img_dir) if "mask" not in x] + img_path_list = img_path_list + [x for x in glob("%s/*.jpg" % img_dir) if "mask" not in x] + if len(img_path_list) == 0: + print("Dir: %s does include any .png and .jpg images." % img_dir) + exit(0) + img_path_list.sort() + loop_bar = tqdm(img_path_list) + loop_bar.set_description("Generate head masks") + for img_path in loop_bar: + save_path = img_path[:-4] + "_mask.png" + + bgr_img = cv2.imread(img_path) + img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB) + img = self.to_tensor(img) + img = img.unsqueeze(0) + img = img.to(self.device) + with torch.set_grad_enabled(False): + pred_res = self.net(img) + out = pred_res[0] + + res = out.squeeze(0).cpu().numpy().argmax(0) + res = res.astype(np.uint8) + cv2.LUT(res, self.lut, res) + + res = correct_hair_mask(res) + res[res != 0] = 255 + # temp_img = bgr_img.copy() + # temp_img[res == 0] = 255 + + # res = np.concatenate([bgr_img, temp_img], axis=1) + cv2.imwrite(save_path, res) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='The code for generating head mask images.') + parser.add_argument("--gpu_id", type=int, default=0) + parser.add_argument("--img_dir", type=str, required=True) + args = parser.parse_args() + + gpu_id = args.gpu_id + img_dir = args.img_dir + + # assert len(sys.argv) == 2 + # img_dir = sys.argv[1] + + tt = GenHeadMask(gpu_id=gpu_id) + tt.main_process(img_dir=img_dir) + \ No newline at end of file diff --git a/pose_estimation/DataProcess/Gen_Landmark.py b/pose_estimation/DataProcess/Gen_Landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..f7db90695affb63dd4bbef406df3e993084be0fd --- /dev/null +++ b/pose_estimation/DataProcess/Gen_Landmark.py @@ -0,0 +1,61 @@ +import face_alignment +import cv2 +import os +from os.path import join +import numpy as np +from tqdm import tqdm +import json +from glob import glob +import argparse + + +class Gen2DLandmarks(object): + def __init__(self) -> None: + super().__init__() + # + # print(face_alignment) + # print(face_alignment.FaceAlignment) + # print(face_alignment.LandmarksType) + self.fa_func = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False) + # self.fa_func = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False) + + + def main_process(self, img_dir): + + img_path_list = [x for x in sorted(glob("%s/*.png" % img_dir)) if "mask" not in x] + img_path_list = img_path_list + [x for x in sorted(glob("%s/*.jpg" % img_dir)) if "mask" not in x] + #img_path_list = img_path_list[27:] + if len(img_path_list) == 0: + print("Dir: %s does include any .png and .jpg images." 
% img_dir) + exit(0) + + img_path_list.sort() + + for img_path in tqdm(img_path_list, desc="Generate facial landmarks"): + + img_bgr = cv2.imread(img_path) + img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) + res = self.fa_func.get_landmarks(img_rgb) + + if res is None: + print("Warning: can't predict the landmark info of %s" % img_path) + + # base_name = img_path[img_path.rfind("/") + 1:-4] + save_path = img_path[:-4] + "_lm2d.txt" + preds = res[0] + with open(save_path, "w") as f: + for tt in preds: + f.write("%f %f\n"%(tt[0],tt[1])) + + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='The code for generating facial landmarks.') + # parser.add_argument("--gpu_id", type=int, default=0) + parser.add_argument("--img_dir", type=str, required=True) + args = parser.parse_args() + + tt = Gen2DLandmarks() + tt.main_process(args.img_dir) + \ No newline at end of file diff --git a/pose_estimation/DataProcess/__init__.py b/pose_estimation/DataProcess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/DataProcess/correct_head_mask.py b/pose_estimation/DataProcess/correct_head_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..04c8f955c808b6022daa0a07010be452e5fcc8e8 --- /dev/null +++ b/pose_estimation/DataProcess/correct_head_mask.py @@ -0,0 +1,78 @@ +import cv2 +import numpy as np + + +def erosion_hair_region(img_c1u, num_iter): + + fil = np.array([[ 0, 0.25, 0], + [ 0.25, -1.0, 0.25], + [ 0, 0.25, 0]]) + + ddepth = cv2.CV_32FC1 + temp_img = img_c1u.copy() + + temp_img[temp_img == 1] = 3 + temp_img[temp_img == 2] = 1 + temp_img[temp_img == 3] = 2 + # cv2.imwrite("./temp_res/trans.png", temp_img) + # exit(0) + + img_f = temp_img.astype(np.float32) + + for _ in range(num_iter): + + img_res = cv2.filter2D(img_f, ddepth, fil, borderType=cv2.BORDER_CONSTANT) + mask_reion = (img_c1u == 2) * (img_res < -0.01) + + img_f[mask_reion] = 0.0 + # cv2.imwrite("./temp_res/temp.pfm", img_f) + # cv2.imwrite("./temp_res/img_res.pfm", img_res) + # exit(0) + # img_c1u[mask_reion] = 0 + # temp_img[mask_reion] = 0 + + res = img_f.astype(np.uint8) + res[res == 1] = 3 + res[res == 2] = 1 + res[res == 3] = 2 + + return res + + +def extract_max_region(label_img, tar_value): + mask_img = np.zeros_like(label_img) + mask_img[label_img == tar_value] = 1 + num_labels, label_img = cv2.connectedComponents(mask_img, connectivity=8) + + max_label = -1 + max_area = -1.0 + + for i in range(1, num_labels): + cur_area = np.sum(label_img == i) + if cur_area > max_area: + max_label = i + max_area = cur_area + + label_img[label_img != max_label] = 0 + label_img[label_img == max_label] = 255 + return label_img + + +def remover_free_block(img_c1u): + temp_img = img_c1u.copy() + + temp_img[temp_img > 0.5] = 1 + label_img = extract_max_region(temp_img, 1) + + img_c1u[label_img != 255] = 0 + + return img_c1u + + +def correct_hair_mask(mask_img): + + mask_img = remover_free_block(mask_img) + mask_img = erosion_hair_region(mask_img, 7) + mask_img = remover_free_block(mask_img) + + return mask_img \ No newline at end of file diff --git a/pose_estimation/DataProcess/resnet.py b/pose_estimation/DataProcess/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2bf95130e9815ba378cb6f73207068b81a04b9 --- /dev/null +++ b/pose_estimation/DataProcess/resnet.py @@ -0,0 +1,109 @@ +#!/usr/bin/python +# -*- encoding: utf-8 -*- + +import torch +import torch.nn as nn +import 
torch.nn.functional as F +import torch.utils.model_zoo as modelzoo + +# from modules.bn import InPlaceABNSync as BatchNorm2d + +resnet18_url = 'https://download.pytorch.org/models/resnet18-5c106cde.pth' + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, in_chan, out_chan, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(in_chan, out_chan, stride) + self.bn1 = nn.BatchNorm2d(out_chan) + self.conv2 = conv3x3(out_chan, out_chan) + self.bn2 = nn.BatchNorm2d(out_chan) + self.relu = nn.ReLU(inplace=True) + self.downsample = None + if in_chan != out_chan or stride != 1: + self.downsample = nn.Sequential( + nn.Conv2d(in_chan, out_chan, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(out_chan), + ) + + def forward(self, x): + residual = self.conv1(x) + residual = F.relu(self.bn1(residual)) + residual = self.conv2(residual) + residual = self.bn2(residual) + + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + out = shortcut + residual + out = self.relu(out) + return out + + +def create_layer_basic(in_chan, out_chan, bnum, stride=1): + layers = [BasicBlock(in_chan, out_chan, stride=stride)] + for i in range(bnum-1): + layers.append(BasicBlock(out_chan, out_chan, stride=1)) + return nn.Sequential(*layers) + + +class Resnet18(nn.Module): + def __init__(self): + super(Resnet18, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1) + self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2) + self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2) + self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2) + self.init_weight() + + def forward(self, x): + x = self.conv1(x) + x = F.relu(self.bn1(x)) + x = self.maxpool(x) + + x = self.layer1(x) + feat8 = self.layer2(x) # 1/8 + feat16 = self.layer3(feat8) # 1/16 + feat32 = self.layer4(feat16) # 1/32 + return feat8, feat16, feat32 + + def init_weight(self): + state_dict = modelzoo.load_url(resnet18_url) + self_state_dict = self.state_dict() + for k, v in state_dict.items(): + if 'fc' in k: continue + self_state_dict.update({k: v}) + self.load_state_dict(self_state_dict) + + def get_params(self): + wd_params, nowd_params = [], [] + for name, module in self.named_modules(): + if isinstance(module, (nn.Linear, nn.Conv2d)): + wd_params.append(module.weight) + if not module.bias is None: + nowd_params.append(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nowd_params += list(module.parameters()) + return wd_params, nowd_params + + +if __name__ == "__main__": + net = Resnet18() + x = torch.randn(16, 3, 224, 224) + out = net(x) + print(out[0].size()) + print(out[1].size()) + print(out[2].size()) + net.get_params() diff --git a/pose_estimation/__init__.py b/pose_estimation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/align_roll.py b/pose_estimation/align_roll.py new file mode 100644 index 0000000000000000000000000000000000000000..20c485f402997690613ec466beb1cc96015ed39d --- /dev/null +++ b/pose_estimation/align_roll.py @@ -0,0 +1,152 @@ +import multiprocessing +import os +import re +import 
sys +import requests +import html +import hashlib +import PIL.Image +import PIL.ImageFile +import numpy as np +import scipy.ndimage +import threading +import queue +import time +import json +import uuid +import glob +import argparse +import itertools +import shutil +from collections import OrderedDict, defaultdict +import cv2 +from tqdm import tqdm +import multiprocessing +import scipy.io + +PIL.ImageFile.LOAD_TRUNCATED_IMAGES = True # avoid "Decompressed Data Too Large" error + +#---------------------------------------------------------------------------- +import sys +name = sys.argv[1] +custom_folder = sys.argv[2] +temp_folder = sys.argv[3] +lm_file = open('%s/%s_lm2d.txt'%(custom_folder,name),'r') +lm = np.zeros((68,2),dtype=np.float32) +lines = lm_file.readlines() +for i in range(68): + lm[i,0] = lines[i].strip().split(' ')[0] + lm[i,1] = lines[i].strip().split(' ')[1] +#print(lm) +#load_model = scipy.io.loadmat('/disk1/jiaxin/eg3d/eg3d/eg3d-pose-detection/align1500_test/epoch_20_000000/jenny.mat') + +#import pdb;pdb.set_trace() +#json_spec = dict(file_url='https://drive.google.com/uc?id=16N0RV4fHI6joBuKbQAoG34V_cQk7vxSA', file_path='ffhq-dataset-v2.json', file_size=267793842, file_md5='425ae20f06a4da1d4dc0f46d40ba5fd6') + +#---------------------------------------------------------------------------- + +def process_image(lm):#item_idx, item, dst_dir="realign1500", output_size=1500, transform_size=4096, enable_padding=True): + + output_size = 1300 + transform_size =4096 + enable_padding = True + + + + # Parse landmarks. + # pylint: disable=unused-variable + + lm_chin = lm[0 : 17] # left-right + lm_eyebrow_left = lm[17 : 22] # left-right + lm_eyebrow_right = lm[22 : 27] # left-right + lm_nose = lm[27 : 31] # top-down + lm_nostrils = lm[31 : 36] # top-down + lm_eye_left = lm[36 : 42] # left-clockwise + lm_eye_right = lm[42 : 48] # left-clockwise + lm_mouth_outer = lm[48 : 60] # left-clockwise + lm_mouth_inner = lm[60 : 68] # left-clockwise + + # Calculate auxiliary vectors. + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + mouth_left = lm_mouth_outer[0] + mouth_right = lm_mouth_outer[6] + mouth_avg = (mouth_left + mouth_right) * 0.5 + eye_to_mouth = mouth_avg - eye_avg + + # Choose oriented crop rectangle. + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + x /= np.hypot(*x) + x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8) + y = np.flipud(x) * [-1, 1] + q_scale = 1.8 + x = q_scale * x + y = q_scale * y + c = eye_avg + eye_to_mouth * 0.1 + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + qsize = np.hypot(*x) * 2 + + # Load in-the-wild image. + src_file ='%s/%s.jpg'%(custom_folder,name) + if not os.path.exists(src_file): + src_file ='%s/%s.png'%(custom_folder,name) + img = PIL.Image.open(src_file) + print(img.size) + import time + + # Shrink. + start_time = time.time() + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink))) + img = img.resize(rsize, PIL.Image.ANTIALIAS) + quad /= shrink + qsize /= shrink + # print("shrink--- %s seconds ---" % (time.time() - start_time)) + + # Crop. 
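+ # Crop to the oriented quad's bounding box plus a small border so the padding and QUAD transform below operate on a much smaller image.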
+ start_time = time.time() + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1])) + if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]: + img = img.crop(crop) + quad -= crop[0:2] + # print("crop--- %s seconds ---" % (time.time() - start_time)) + + # Pad. + start_time = time.time() + pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + h, w, _ = img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w-1-x) / pad[2]), 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h-1-y) / pad[3])) + low_res = cv2.resize(img, (0,0), fx=0.1, fy=0.1, interpolation = cv2.INTER_AREA) + blur = qsize * 0.02*0.1 + low_res = scipy.ndimage.gaussian_filter(low_res, [blur, blur, 0]) + low_res = cv2.resize(low_res, (img.shape[1], img.shape[0]), interpolation = cv2.INTER_LANCZOS4) + img += (low_res - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + median = cv2.resize(img, (0,0), fx=0.1, fy=0.1, interpolation = cv2.INTER_AREA) + median = np.median(median, axis=(0,1)) + img += (median - img) * np.clip(mask, 0.0, 1.0) + img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB') + quad += pad[:2] + # print("pad--- %s seconds ---" % (time.time() - start_time)) + + # Transform. + start_time = time.time() + img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR) + if output_size < transform_size: + img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS) + # print("transform--- %s seconds ---" % (time.time() - start_time)) + + # Save aligned image. 
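+ # The aligned face is saved as '<temp_folder>/<name>.png', which process_test_images.py (invoked from extract_pose.py) picks up in the next stage.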
+ os.makedirs('%s/'%(temp_folder),exist_ok=True) + img.save('%s/%s.png'%(temp_folder,name)) + +process_image(lm) \ No newline at end of file diff --git a/pose_estimation/batch_mtcnn.py b/pose_estimation/batch_mtcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..28c77e289fcc0d6a5f2838d0b08b06d0f184399c --- /dev/null +++ b/pose_estimation/batch_mtcnn.py @@ -0,0 +1,80 @@ +import argparse +import cv2 +import os +from mtcnn import MTCNN +import random +from tqdm import tqdm +import numpy as np + +detector = MTCNN() +parser = argparse.ArgumentParser() +parser.add_argument('--in_root', type=str, default="", help='process folder') +args = parser.parse_args() +in_root = args.in_root + +out_root = os.path.join(in_root, "debug") +out_detection = os.path.join(in_root, "detections") +if not os.path.exists(out_root): + os.makedirs(out_root) +if not os.path.exists(out_detection): + os.makedirs(out_detection) + +imgs = sorted([x for x in os.listdir(in_root) if x.endswith(".jpg") or x.endswith(".png")]) +random.shuffle(imgs) +for img in tqdm(imgs): + src = os.path.join(in_root, img) + dst = os.path.join(out_detection, img.replace(".jpg", ".txt").replace(".png", ".txt")) + + if not os.path.exists(dst): + image = cv2.cvtColor(cv2.imread(src), cv2.COLOR_BGR2RGB) + print(image.shape) + result = detector.detect_faces(image) + + if len(result)>0: + index = 0 + if len(result)>1: # if multiple faces, take the biggest face + # size = -100000 + lowest_dist = float('Inf') + for r in range(len(result)): + # print(result[r]["box"][0], result[r]["box"][1]) + face_pos = np.array(result[r]["box"][:2]) + np.array(result[r]["box"][2:])/2 + + dist_from_center = np.linalg.norm(face_pos - np.array([1500./2, 1500./2])) + if dist_from_center < lowest_dist: + lowest_dist = dist_from_center + index=r + + + # size_ = result[r]["box"][2] + result[r]["box"][3] + # if size < size_: + # size = size_ + # index = r + + # Result is an array with all the bounding boxes detected. We know that for 'ivan.jpg' there is only one. 
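+ # Use the face selected above; when its confidence exceeds 0.9, its five keypoints (eyes, nose, mouth corners) are written to detections/<name>.txt for the cropping and pose-fitting steps that follow.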
+ bounding_box = result[index]['box'] + keypoints = result[index]['keypoints'] + if result[index]["confidence"] > 0.9: + + cv2.rectangle(image, + (bounding_box[0], bounding_box[1]), + (bounding_box[0]+bounding_box[2], bounding_box[1] + bounding_box[3]), + (0,155,255), + 2) + + cv2.circle(image,(keypoints['left_eye']), 2, (0,155,255), 2) + cv2.circle(image,(keypoints['right_eye']), 2, (0,155,255), 2) + cv2.circle(image,(keypoints['nose']), 2, (0,155,255), 2) + cv2.circle(image,(keypoints['mouth_left']), 2, (0,155,255), 2) + cv2.circle(image,(keypoints['mouth_right']), 2, (0,155,255), 2) + + dst = os.path.join(out_root, img) + # cv2.imwrite(dst, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) + + dst = os.path.join(out_detection, img.replace(".jpg", ".txt").replace(".png", ".txt")) + outLand = open(dst, "w") + outLand.write(str(float(keypoints['left_eye'][0])) + " " + str(float(keypoints['left_eye'][1])) + "\n") + outLand.write(str(float(keypoints['right_eye'][0])) + " " + str(float(keypoints['right_eye'][1])) + "\n") + outLand.write(str(float(keypoints['nose'][0])) + " " + str(float(keypoints['nose'][1])) + "\n") + outLand.write(str(float(keypoints['mouth_left'][0])) + " " + str(float(keypoints['mouth_left'][1])) + "\n") + outLand.write(str(float(keypoints['mouth_right'][0])) + " " + str(float(keypoints['mouth_right'][1])) + "\n") + outLand.close() \ No newline at end of file diff --git a/pose_estimation/check_pose.py b/pose_estimation/check_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..d537c99aa892f6e6cebe64e37149d900344637ec --- /dev/null +++ b/pose_estimation/check_pose.py @@ -0,0 +1,44 @@ +import numpy as np +import scipy.io +def euler2rot(euler): + sin, cos = np.sin, np.cos + phi, theta, psi = euler[0], euler[1], euler[2] + R1 = np.array([[1, 0, 0], + [0, cos(phi), sin(phi)], + [0, -sin(phi), cos(phi)]]) + R2 = np.array([[cos(theta), 0, -sin(theta)], + [0, 1, 0], + [sin(theta), 0, cos(theta)]]) + R3 = np.array([[cos(psi), sin(psi), 0], + [-sin(psi), cos(psi), 0], + [0, 0, 1]]) + R = R1 @ R2 @ R3 + return R + + +import json +import os +from glob import glob +import sys + +temp_folder = sys.argv[1] +output_folder = sys.argv[2] + +import shutil +pose_template = np.load('util/pose_template.npy') +glob_names= sorted(glob('%s/cropped_images/*.png'%(temp_folder))) + +for name_all in glob_names: + + if os.path.isfile('%s/cropped_images/cameras.json'%(temp_folder)): + with open('%s/cropped_images/cameras.json'%(temp_folder), 'r') as file: + labels = json.load(file)#['labels'] + predict_pose = labels + name = os.path.basename(name_all)[:-4] + pose= np.array(predict_pose[name+'.png']['pose']).reshape(16) + pose_template[:16] = pose + + np.save('%s/'%(output_folder)+name+'.npy',pose_template) + shutil.copy('%s/cropped_images/'%(temp_folder)+name+'.png','%s/'%(output_folder)+name+'.png') + + diff --git a/pose_estimation/checkpoints/pretrained/epoch_20.pth b/pose_estimation/checkpoints/pretrained/epoch_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..97ebd6753f7ca4bcd39d3b82e7109b66a2dbc1fb --- /dev/null +++ b/pose_estimation/checkpoints/pretrained/epoch_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d17a6b23457b521801baae583cb6a58f7238fe6721fc3d65d76407460e9149b +size 288860037 diff --git a/pose_estimation/checkpoints/pretrained/test_opt.txt b/pose_estimation/checkpoints/pretrained/test_opt.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e5333d2b9533a3c9a02a4d923c81e4bc7546257 --- /dev/null 
+++ b/pose_estimation/checkpoints/pretrained/test_opt.txt @@ -0,0 +1,32 @@ +----------------- Options --------------- + add_image: True + bfm_folder: BFM + bfm_model: BFM_model_front.mat + camera_d: 10.0 + center: 112.0 + checkpoints_dir: ./checkpoints + dataset_mode: None + ddp_port: 12355 + display_per_batch: True + epoch: 20 [default: latest] + eval_batch_nums: inf + focal: 1015.0 + gpu_ids: 0 + img_folder: ./../test_runs/manip_3D_recon_gradio_2023-06-20_08:07:05/1_align_result/ [default: examples] + init_path: checkpoints/init_model/resnet50-0676ba61.pth + isTrain: False [default: None] + model: facerecon + name: pretrained [default: face_recon] + net_recon: resnet50 + phase: test + skip_model: False + start: 0 + suffix: + use_ddp: False [default: True] + use_last_fc: False + verbose: False + vis_batch_nums: 1 + world_size: 1 + z_far: 15.0 + z_near: 5.0 +----------------- End ------------------- diff --git a/pose_estimation/crop_images.py b/pose_estimation/crop_images.py new file mode 100644 index 0000000000000000000000000000000000000000..92c9fab2823c3fdda8cc637fd891ff1f03d57d4a --- /dev/null +++ b/pose_estimation/crop_images.py @@ -0,0 +1,132 @@ +import argparse +import os +import json + +import numpy as np +from PIL import Image +from tqdm import tqdm + +# calculating least square problem for image alignment +def POS(xp, x): + npts = xp.shape[1] + + A = np.zeros([2*npts, 8]) + + A[0:2*npts-1:2, 0:3] = x.transpose() + A[0:2*npts-1:2, 3] = 1 + + A[1:2*npts:2, 4:7] = x.transpose() + A[1:2*npts:2, 7] = 1 + + b = np.reshape(xp.transpose(), [2*npts, 1]) + + k, _, _, _ = np.linalg.lstsq(A, b) + + R1 = k[0:3] + R2 = k[4:7] + sTx = k[3] + sTy = k[7] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2 + t = np.stack([sTx, sTy], axis=0) + + return t, s + +def extract_5p(lm): + lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 + lm5p = np.stack([lm[lm_idx[0], :], np.mean(lm[lm_idx[[1, 2]], :], 0), np.mean( + lm[lm_idx[[3, 4]], :], 0), lm[lm_idx[5], :], lm[lm_idx[6], :]], axis=0) + lm5p = lm5p[[1, 2, 0, 3, 4], :] + return lm5p + +# resize and crop images for face reconstruction +def resize_n_crop_img(img, lm, t, s, target_size=1024., mask=None): + w0, h0 = img.size + w = (w0*s).astype(np.int32) + h = (h0*s).astype(np.int32) + left = (w/2 - target_size/2 + float((t[0] - w0/2)*s)).astype(np.int32) + right = left + target_size + up = (h/2 - target_size/2 + float((h0/2 - t[1])*s)).astype(np.int32) + below = up + target_size + img = img.resize((w, h), resample=Image.LANCZOS) + img = img.crop((left, up, right, below)) + + if mask is not None: + mask = mask.resize((w, h), resample=Image.LANCZOS) + mask = mask.crop((left, up, right, below)) + + lm = np.stack([lm[:, 0] - t[0] + w0/2, lm[:, 1] - + t[1] + h0/2], axis=1)*s + lm = lm - np.reshape( + np.array([(w/2 - target_size/2), (h/2-target_size/2)]), [1, 2]) + return img, lm, mask + + +# utils for face reconstruction +def align_img(img, lm, lm3D, mask=None, target_size=1024., rescale_factor=466.285): + """ + Return: + transparams --numpy.array (raw_W, raw_H, scale, tx, ty) + img_new --PIL.Image (target_size, target_size, 3) + lm_new --numpy.array (68, 2), y direction is opposite to v direction + mask_new --PIL.Image (target_size, target_size) + + Parameters: + img --PIL.Image (raw_H, raw_W, 3) + lm --numpy.array (68, 2), y direction is opposite to v direction + lm3D --numpy.array (5, 3) + mask --PIL.Image (raw_H, raw_W, 3) + """ + + w0, h0 = img.size + if lm.shape[0] != 5: + lm5p = extract_5p(lm) + else: + lm5p = lm + + # calculate translation and scale 
factors using 5 facial landmarks and standard landmarks of a 3D face + t, s = POS(lm5p.transpose(), lm3D.transpose()) + s = rescale_factor/s + + # processing the image + img_new, lm_new, mask_new = resize_n_crop_img(img, lm, t, s, target_size=target_size, mask=mask) + #img_new = img.resize((1024,1024),resample=Image.LANCZOS) + #lm_new = lm*1024.0/512.0 + #mask_new=None + # img.save("/home/koki/Projects/Deep3DFaceRecon_pytorch/checkpoints/pretrained/results/iphone/epoch_20_000000/img_new.jpg") + trans_params = np.array([w0, h0, s, t[0][0], t[1][0]]) + lm_new *= 224/1024.0 + img_new_low = img_new.resize((224, 224), resample=Image.LANCZOS) + + return trans_params, img_new_low, lm_new, mask_new, img_new + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--indir', type=str, required=True) + parser.add_argument('--outdir', type=str, required=True) + parser.add_argument('--compress_level', type=int, default=0) + args = parser.parse_args() + + with open(os.path.join(args.indir, 'cropping_params.json')) as f: + cropping_params = json.load(f) + + os.makedirs(args.outdir, exist_ok=True) + + for im_path, cropping_dict in tqdm(cropping_params.items()): + im = Image.open(os.path.join(args.indir, im_path)).convert('RGB') + + _, H = im.size + lm = np.array(cropping_dict['lm']) + lm = lm.reshape([-1, 2]) + lm[:, -1] = H - 1 - lm[:, -1] + + _, im_pil, lm, _, im_high = align_img(im, lm, np.array(cropping_dict['lm3d_std']), rescale_factor=cropping_dict['rescale_factor']) + + left = int(im_high.size[0]/2 - cropping_dict['center_crop_size']/2) + upper = int(im_high.size[1]/2 - cropping_dict['center_crop_size']/2) + right = left + cropping_dict['center_crop_size'] + lower = upper + cropping_dict['center_crop_size'] + im_cropped = im_high.crop((left, upper, right,lower)) + im_cropped = im_cropped.resize((cropping_dict['output_size'], cropping_dict['output_size']), resample=Image.LANCZOS) + + im_cropped.save(os.path.join(args.outdir, os.path.basename(im_path)), compress_level=args.compress_level) \ No newline at end of file diff --git a/pose_estimation/data/__init__.py b/pose_estimation/data/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..56fe212697cebaa24c1e914cbc5d1a6c18a379ff --- /dev/null +++ b/pose_estimation/data/__init__.py @@ -0,0 +1,116 @@ +"""This package includes all the modules related to data loading and preprocessing + + To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset. + You need to implement four functions: + -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). + -- <__len__>: return the size of dataset. + -- <__getitem__>: get a data point from data loader. + -- : (optionally) add dataset-specific options and set default options. + +Now you can use the dataset class by specifying flag '--dataset_mode dummy'. +See our template dataset class 'template_dataset.py' for more details. +""" +import numpy as np +import importlib +import torch.utils.data +from data.base_dataset import BaseDataset + + +def find_dataset_using_name(dataset_name): + """Import the module "data/[dataset_name]_dataset.py". + + In the file, the class called DatasetNameDataset() will + be instantiated. It has to be a subclass of BaseDataset, + and it is case-insensitive. + """ + dataset_filename = "data." 
+ dataset_name + "_dataset" + datasetlib = importlib.import_module(dataset_filename) + + dataset = None + target_dataset_name = dataset_name.replace('_', '') + 'dataset' + for name, cls in datasetlib.__dict__.items(): + if name.lower() == target_dataset_name.lower() \ + and issubclass(cls, BaseDataset): + dataset = cls + + if dataset is None: + raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." % (dataset_filename, target_dataset_name)) + + return dataset + + +def get_option_setter(dataset_name): + """Return the static method of the dataset class.""" + dataset_class = find_dataset_using_name(dataset_name) + return dataset_class.modify_commandline_options + + +def create_dataset(opt, rank=0): + """Create a dataset given the option. + + This function wraps the class CustomDatasetDataLoader. + This is the main interface between this package and 'train.py'/'test.py' + + Example: + >>> from data import create_dataset + >>> dataset = create_dataset(opt) + """ + data_loader = CustomDatasetDataLoader(opt, rank=rank) + dataset = data_loader.load_data() + return dataset + +class CustomDatasetDataLoader(): + """Wrapper class of Dataset class that performs multi-threaded data loading""" + + def __init__(self, opt, rank=0): + """Initialize this class + + Step 1: create a dataset instance given the name [dataset_mode] + Step 2: create a multi-threaded data loader. + """ + self.opt = opt + dataset_class = find_dataset_using_name(opt.dataset_mode) + self.dataset = dataset_class(opt) + self.sampler = None + print("rank %d %s dataset [%s] was created" % (rank, self.dataset.name, type(self.dataset).__name__)) + if opt.use_ddp and opt.isTrain: + world_size = opt.world_size + self.sampler = torch.utils.data.distributed.DistributedSampler( + self.dataset, + num_replicas=world_size, + rank=rank, + shuffle=not opt.serial_batches + ) + self.dataloader = torch.utils.data.DataLoader( + self.dataset, + sampler=self.sampler, + num_workers=int(opt.num_threads / world_size), + batch_size=int(opt.batch_size / world_size), + drop_last=True) + else: + self.dataloader = torch.utils.data.DataLoader( + self.dataset, + batch_size=opt.batch_size, + shuffle=(not opt.serial_batches) and opt.isTrain, + num_workers=int(opt.num_threads), + drop_last=True + ) + + def set_epoch(self, epoch): + self.dataset.current_epoch = epoch + if self.sampler is not None: + self.sampler.set_epoch(epoch) + + def load_data(self): + return self + + def __len__(self): + """Return the number of data in the dataset""" + return min(len(self.dataset), self.opt.max_dataset_size) + + def __iter__(self): + """Return a batch of data""" + for i, data in enumerate(self.dataloader): + if i * self.opt.batch_size >= self.opt.max_dataset_size: + break + yield data diff --git a/pose_estimation/data/base_dataset.py b/pose_estimation/data/base_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..5f4a1fa8f76aa94a0a5ca945788c402bf67e2d06 --- /dev/null +++ b/pose_estimation/data/base_dataset.py @@ -0,0 +1,125 @@ +"""This module implements an abstract base class (ABC) 'BaseDataset' for datasets. + +It also includes common transformation functions (e.g., get_transform, __scale_width), which can be later used in subclasses. 
+""" +import random +import numpy as np +import torch.utils.data as data +from PIL import Image +import torchvision.transforms as transforms +from abc import ABC, abstractmethod + + +class BaseDataset(data.Dataset, ABC): + """This class is an abstract base class (ABC) for datasets. + + To create a subclass, you need to implement the following four functions: + -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). + -- <__len__>: return the size of dataset. + -- <__getitem__>: get a data point. + -- : (optionally) add dataset-specific options and set default options. + """ + + def __init__(self, opt): + """Initialize the class; save the options in the class + + Parameters: + opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + self.opt = opt + # self.root = opt.dataroot + self.current_epoch = 0 + + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new dataset-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + return parser + + @abstractmethod + def __len__(self): + """Return the total number of images in the dataset.""" + return 0 + + @abstractmethod + def __getitem__(self, index): + """Return a data point and its metadata information. + + Parameters: + index - - a random integer for data indexing + + Returns: + a dictionary of data with their names. It ususally contains the data itself and its metadata information. + """ + pass + + +def get_transform(grayscale=False): + transform_list = [] + if grayscale: + transform_list.append(transforms.Grayscale(1)) + transform_list += [transforms.ToTensor()] + return transforms.Compose(transform_list) + +def get_affine_mat(opt, size): + shift_x, shift_y, scale, rot_angle, flip = 0., 0., 1., 0., False + w, h = size + + if 'shift' in opt.preprocess: + shift_pixs = int(opt.shift_pixs) + shift_x = random.randint(-shift_pixs, shift_pixs) + shift_y = random.randint(-shift_pixs, shift_pixs) + if 'scale' in opt.preprocess: + scale = 1 + opt.scale_delta * (2 * random.random() - 1) + if 'rot' in opt.preprocess: + rot_angle = opt.rot_angle * (2 * random.random() - 1) + rot_rad = -rot_angle * np.pi/180 + if 'flip' in opt.preprocess: + flip = random.random() > 0.5 + + shift_to_origin = np.array([1, 0, -w//2, 0, 1, -h//2, 0, 0, 1]).reshape([3, 3]) + flip_mat = np.array([-1 if flip else 1, 0, 0, 0, 1, 0, 0, 0, 1]).reshape([3, 3]) + shift_mat = np.array([1, 0, shift_x, 0, 1, shift_y, 0, 0, 1]).reshape([3, 3]) + rot_mat = np.array([np.cos(rot_rad), np.sin(rot_rad), 0, -np.sin(rot_rad), np.cos(rot_rad), 0, 0, 0, 1]).reshape([3, 3]) + scale_mat = np.array([scale, 0, 0, 0, scale, 0, 0, 0, 1]).reshape([3, 3]) + shift_to_center = np.array([1, 0, w//2, 0, 1, h//2, 0, 0, 1]).reshape([3, 3]) + + affine = shift_to_center @ scale_mat @ rot_mat @ shift_mat @ flip_mat @ shift_to_origin + affine_inv = np.linalg.inv(affine) + return affine, affine_inv, flip + +def apply_img_affine(img, affine_inv, method=Image.LANCZOS): + return img.transform(img.size, Image.AFFINE, data=affine_inv.flatten()[:6], resample=Image.LANCZOS) + +def apply_lm_affine(landmark, affine, flip, size): + _, h = size + lm = landmark.copy() + lm[:, 1] = h - 1 - lm[:, 1] + lm = np.concatenate((lm, np.ones([lm.shape[0], 1])), -1) + lm = lm @ np.transpose(affine) + lm[:, :2] = 
lm[:, :2] / lm[:, 2:] + lm = lm[:, :2] + lm[:, 1] = h - 1 - lm[:, 1] + if flip: + lm_ = lm.copy() + lm_[:17] = lm[16::-1] + lm_[17:22] = lm[26:21:-1] + lm_[22:27] = lm[21:16:-1] + lm_[31:36] = lm[35:30:-1] + lm_[36:40] = lm[45:41:-1] + lm_[40:42] = lm[47:45:-1] + lm_[42:46] = lm[39:35:-1] + lm_[46:48] = lm[41:39:-1] + lm_[48:55] = lm[54:47:-1] + lm_[55:60] = lm[59:54:-1] + lm_[60:65] = lm[64:59:-1] + lm_[65:68] = lm[67:64:-1] + lm = lm_ + return lm diff --git a/pose_estimation/data/flist_dataset.py b/pose_estimation/data/flist_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..c0b6945c80aa756074a5d3c02b9443b15ddcfc57 --- /dev/null +++ b/pose_estimation/data/flist_dataset.py @@ -0,0 +1,125 @@ +"""This script defines the custom dataset for Deep3DFaceRecon_pytorch +""" + +import os.path +from data.base_dataset import BaseDataset, get_transform, get_affine_mat, apply_img_affine, apply_lm_affine +from data.image_folder import make_dataset +from PIL import Image +import random +import util.util as util +import numpy as np +import json +import torch +from scipy.io import loadmat, savemat +import pickle +from util.preprocess import align_img, estimate_norm +from util.load_mats import load_lm3d + + +def default_flist_reader(flist): + """ + flist format: impath label\nimpath label\n ...(same to caffe's filelist) + """ + imlist = [] + with open(flist, 'r') as rf: + for line in rf.readlines(): + impath = line.strip() + imlist.append(impath) + + return imlist + +def jason_flist_reader(flist): + with open(flist, 'r') as fp: + info = json.load(fp) + return info + +def parse_label(label): + return torch.tensor(np.array(label).astype(np.float32)) + + +class FlistDataset(BaseDataset): + """ + It requires one directories to host training images '/path/to/data/train' + You can train the model with the dataset flag '--dataroot /path/to/data'. + """ + + def __init__(self, opt): + """Initialize this dataset class. + + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + BaseDataset.__init__(self, opt) + + self.lm3d_std = load_lm3d(opt.bfm_folder) + + msk_names = default_flist_reader(opt.flist) + self.msk_paths = [os.path.join(opt.data_root, i) for i in msk_names] + + self.size = len(self.msk_paths) + self.opt = opt + + self.name = 'train' if opt.isTrain else 'val' + if '_' in opt.flist: + self.name += '_' + opt.flist.split(os.sep)[-1].split('_')[0] + + + def __getitem__(self, index): + """Return a data point and its metadata information. 
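+ The image and landmark paths are derived from the mask path: the image drops the 'mask/' directory, and the landmark file replaces 'mask' with 'landmarks' and uses a '.txt' extension.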
+ + Parameters: + index (int) -- a random integer for data indexing + + Returns a dictionary that contains A, B, A_paths and B_paths + img (tensor) -- an image in the input domain + msk (tensor) -- its corresponding attention mask + lm (tensor) -- its corresponding 3d landmarks + im_paths (str) -- image paths + aug_flag (bool) -- a flag used to tell whether its raw or augmented + """ + msk_path = self.msk_paths[index % self.size] # make sure index is within then range + img_path = msk_path.replace('mask/', '') + lm_path = '.'.join(msk_path.replace('mask', 'landmarks').split('.')[:-1]) + '.txt' + + raw_img = Image.open(img_path).convert('RGB') + raw_msk = Image.open(msk_path).convert('RGB') + raw_lm = np.loadtxt(lm_path).astype(np.float32) + + _, img, lm, msk = align_img(raw_img, raw_lm, self.lm3d_std, raw_msk) + + aug_flag = self.opt.use_aug and self.opt.isTrain + if aug_flag: + img, lm, msk = self._augmentation(img, lm, self.opt, msk) + + _, H = img.size + M = estimate_norm(lm, H) + transform = get_transform() + img_tensor = transform(img) + msk_tensor = transform(msk)[:1, ...] + lm_tensor = parse_label(lm) + M_tensor = parse_label(M) + + + return {'imgs': img_tensor, + 'lms': lm_tensor, + 'msks': msk_tensor, + 'M': M_tensor, + 'im_paths': img_path, + 'aug_flag': aug_flag, + 'dataset': self.name} + + def _augmentation(self, img, lm, opt, msk=None): + affine, affine_inv, flip = get_affine_mat(opt, img.size) + img = apply_img_affine(img, affine_inv) + lm = apply_lm_affine(lm, affine, flip, img.size) + if msk is not None: + msk = apply_img_affine(msk, affine_inv, method=Image.BILINEAR) + return img, lm, msk + + + + + def __len__(self): + """Return the total number of images in the dataset. + """ + return self.size diff --git a/pose_estimation/data/image_folder.py b/pose_estimation/data/image_folder.py new file mode 100755 index 0000000000000000000000000000000000000000..efadc2ecbe2fb4b53b78230aba25ec505eff0e55 --- /dev/null +++ b/pose_estimation/data/image_folder.py @@ -0,0 +1,66 @@ +"""A modified image folder class + +We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py) +so that this class can load images from both current directory and its subdirectories. 
+""" +import numpy as np +import torch.utils.data as data + +from PIL import Image +import os +import os.path + +IMG_EXTENSIONS = [ + '.jpg', '.JPG', '.jpeg', '.JPEG', + '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', + '.tif', '.TIF', '.tiff', '.TIFF', +] + + +def is_image_file(filename): + return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) + + +def make_dataset(dir, max_dataset_size=float("inf")): + images = [] + assert os.path.isdir(dir) or os.path.islink(dir), '%s is not a valid directory' % dir + + for root, _, fnames in sorted(os.walk(dir, followlinks=True)): + for fname in fnames: + if is_image_file(fname): + path = os.path.join(root, fname) + images.append(path) + return images[:min(max_dataset_size, len(images))] + + +def default_loader(path): + return Image.open(path).convert('RGB') + + +class ImageFolder(data.Dataset): + + def __init__(self, root, transform=None, return_paths=False, + loader=default_loader): + imgs = make_dataset(root) + if len(imgs) == 0: + raise(RuntimeError("Found 0 images in: " + root + "\n" + "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) + + self.root = root + self.imgs = imgs + self.transform = transform + self.return_paths = return_paths + self.loader = loader + + def __getitem__(self, index): + path = self.imgs[index] + img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + if self.return_paths: + return img, path + else: + return img + + def __len__(self): + return len(self.imgs) diff --git a/pose_estimation/data/template_dataset.py b/pose_estimation/data/template_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..bfdf16be2a8a834b204c45d88c86857b37b9bd25 --- /dev/null +++ b/pose_estimation/data/template_dataset.py @@ -0,0 +1,75 @@ +"""Dataset class template + +This module provides a template for users to implement custom datasets. +You can specify '--dataset_mode template' to use this dataset. +The class name should be consistent with both the filename and its dataset_mode option. +The filename should be _dataset.py +The class name should be Dataset.py +You need to implement the following functions: + -- : Add dataset-specific options and rewrite default values for existing options. + -- <__init__>: Initialize this dataset class. + -- <__getitem__>: Return a data point and its metadata information. + -- <__len__>: Return the number of images. +""" +from data.base_dataset import BaseDataset, get_transform +# from data.image_folder import make_dataset +# from PIL import Image + + +class TemplateDataset(BaseDataset): + """A template dataset class for you to implement custom datasets.""" + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new dataset-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + parser.add_argument('--new_dataset_option', type=float, default=1.0, help='new dataset option') + parser.set_defaults(max_dataset_size=10, new_dataset_option=2.0) # specify dataset-specific default values + return parser + + def __init__(self, opt): + """Initialize this dataset class. + + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + + A few things can be done here. 
+ - save the options (have been done in BaseDataset) + - get image paths and meta information of the dataset. + - define the image transformation. + """ + # save the option and dataset root + BaseDataset.__init__(self, opt) + # get the image paths of your dataset; + self.image_paths = [] # You can call sorted(make_dataset(self.root, opt.max_dataset_size)) to get all the image paths under the directory self.root + # define the default transform function. You can use ; You can also define your custom transform function + self.transform = get_transform(opt) + + def __getitem__(self, index): + """Return a data point and its metadata information. + + Parameters: + index -- a random integer for data indexing + + Returns: + a dictionary of data with their names. It usually contains the data itself and its metadata information. + + Step 1: get a random image path: e.g., path = self.image_paths[index] + Step 2: load your data from the disk: e.g., image = Image.open(path).convert('RGB'). + Step 3: convert your data to a PyTorch tensor. You can use helpder functions such as self.transform. e.g., data = self.transform(image) + Step 4: return a data point as a dictionary. + """ + path = 'temp' # needs to be a string + data_A = None # needs to be a tensor + data_B = None # needs to be a tensor + return {'data_A': data_A, 'data_B': data_B, 'path': path} + + def __len__(self): + """Return the total number of images.""" + return len(self.image_paths) diff --git a/pose_estimation/extract_pose.py b/pose_estimation/extract_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..3c2ee0d98383a9e19a616e47a7318a9d9d592e88 --- /dev/null +++ b/pose_estimation/extract_pose.py @@ -0,0 +1,23 @@ +import os +import shutil +import sys +from glob import glob + +gpu_id = sys.argv[1] +custom_folder = sys.argv[2] +temp_folder = sys.argv[3] +output_folder = sys.argv[4] + +name_list = [x for x in sorted(glob("%s/*.png"%(custom_folder))) if 'mask' not in x] +os.system('CUDA_VISIBLE_DEVICES=%s python DataProcess/Gen_HeadMask.py --img_dir ./%s/'%(gpu_id,custom_folder)) +os.system('CUDA_VISIBLE_DEVICES=%s python DataProcess/Gen_Landmark.py --img_dir ./%s/'%(gpu_id,custom_folder)) +for name_all in name_list: + name = os.path.basename(name_all)[:-4] + os.system('python align_roll.py %s %s %s'%(name, custom_folder,temp_folder)) + +os.system('CUDA_VISIBLE_DEVICES=%s python process_test_images.py --input_dir ./%s/ --gpu=%s'%(gpu_id,temp_folder,gpu_id)) + +os.system('python check_pose.py %s %s '%(temp_folder,output_folder)) + +##example +#python extract_pose.py 0 custom_imgs_folder temp_folder output_folder \ No newline at end of file diff --git a/pose_estimation/models/__init__.py b/pose_estimation/models/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..fc01113da66ff042bd1807b5bfdb70c4bce8d14c --- /dev/null +++ b/pose_estimation/models/__init__.py @@ -0,0 +1,67 @@ +"""This package contains modules related to objective functions, optimizations, and network architectures. + +To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. +You need to implement the following five functions: + -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). + -- : unpack data from dataset and apply preprocessing. + -- : produce intermediate results. + -- : calculate loss, gradients, and update network weights. + -- : (optionally) add model-specific options and set default options. 
+ +In the function <__init__>, you need to define four lists: + -- self.loss_names (str list): specify the training losses that you want to plot and save. + -- self.model_names (str list): define networks used in our training. + -- self.visual_names (str list): specify the images that you want to display and save. + -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. + +Now you can use the model class by specifying flag '--model dummy'. +See our template model class 'template_model.py' for more details. +""" + +import importlib +from models.base_model import BaseModel + + +def find_model_using_name(model_name): + """Import the module "models/[model_name]_model.py". + + In the file, the class called DatasetNameModel() will + be instantiated. It has to be a subclass of BaseModel, + and it is case-insensitive. + """ + model_filename = "models." + model_name + "_model" + modellib = importlib.import_module(model_filename) + model = None + target_model_name = model_name.replace('_', '') + 'model' + for name, cls in modellib.__dict__.items(): + if name.lower() == target_model_name.lower() \ + and issubclass(cls, BaseModel): + model = cls + + if model is None: + print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) + exit(0) + + return model + + +def get_option_setter(model_name): + """Return the static method of the model class.""" + model_class = find_model_using_name(model_name) + return model_class.modify_commandline_options + + +def create_model(opt): + """Create a model given the option. + + This function warps the class CustomDatasetDataLoader. + This is the main interface between this package and 'train.py'/'test.py' + + Example: + >>> from models import create_model + >>> model = create_model(opt) + """ + model = find_model_using_name(opt.model) + instance = model(opt) + print("model [%s] was created" % type(instance).__name__) + return instance diff --git a/pose_estimation/models/arcface_torch/README.md b/pose_estimation/models/arcface_torch/README.md new file mode 100755 index 0000000000000000000000000000000000000000..2ee63a861229b68873561fa39bfa7c9a8b53b947 --- /dev/null +++ b/pose_estimation/models/arcface_torch/README.md @@ -0,0 +1,164 @@ +# Distributed Arcface Training in Pytorch + +This is a deep learning library that makes face recognition efficient, and effective, which can train tens of millions +identity on a single server. + +## Requirements + +- Install [pytorch](http://pytorch.org) (torch>=1.6.0), our doc for [install.md](docs/install.md). +- `pip install -r requirements.txt`. +- Download the dataset + from [https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_) + . + +## How to Training + +To train a model, run `train.py` with the path to the configs: + +### 1. Single node, 8 GPUs: + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r50 +``` + +### 2. 
Multiple nodes, each node 8 GPUs: + +Node 0: + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py train.py configs/ms1mv3_r50 +``` + +Node 1: + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py train.py configs/ms1mv3_r50 +``` + +### 3.Training resnet2060 with 8 GPUs: + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r2060.py +``` + +## Model Zoo + +- The models are available for non-commercial research purposes only. +- All models can be found in here. +- [Baidu Yun Pan](https://pan.baidu.com/s/1CL-l4zWqsI1oDuEEYVhj-g): e8pw +- [onedrive](https://1drv.ms/u/s!AswpsDO2toNKq0lWY69vN58GR6mw?e=p9Ov5d) + +### Performance on [**ICCV2021-MFR**](http://iccv21-mfr.com/) + +ICCV2021-MFR testset consists of non-celebrities so we can ensure that it has very few overlap with public available face +recognition training set, such as MS1M and CASIA as they mostly collected from online celebrities. +As the result, we can evaluate the FAIR performance for different algorithms. + +For **ICCV2021-MFR-ALL** set, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.000001(e-6). The +globalised multi-racial testset contains 242,143 identities and 1,624,305 images. + +For **ICCV2021-MFR-MASK** set, TAR is measured on mask-to-nonmask 1:1 protocal, with FAR less than 0.0001(e-4). +Mask testset contains 6,964 identities, 6,964 masked images and 13,928 non-masked images. +There are totally 13,928 positive pairs and 96,983,824 negative pairs. + +| Datasets | backbone | Training throughout | Size / MB | **ICCV2021-MFR-MASK** | **ICCV2021-MFR-ALL** | +| :---: | :--- | :--- | :--- |:--- |:--- | +| MS1MV3 | r18 | - | 91 | **47.85** | **68.33** | +| Glint360k | r18 | 8536 | 91 | **53.32** | **72.07** | +| MS1MV3 | r34 | - | 130 | **58.72** | **77.36** | +| Glint360k | r34 | 6344 | 130 | **65.10** | **83.02** | +| MS1MV3 | r50 | 5500 | 166 | **63.85** | **80.53** | +| Glint360k | r50 | 5136 | 166 | **70.23** | **87.08** | +| MS1MV3 | r100 | - | 248 | **69.09** | **84.31** | +| Glint360k | r100 | 3332 | 248 | **75.57** | **90.66** | +| MS1MV3 | mobilefacenet | 12185 | 7.8 | **41.52** | **65.26** | +| Glint360k | mobilefacenet | 11197 | 7.8 | **44.52** | **66.48** | + +### Performance on IJB-C and Verification Datasets + +| Datasets | backbone | IJBC(1e-05) | IJBC(1e-04) | agedb30 | cfp_fp | lfw | log | +| :---: | :--- | :--- | :--- | :--- |:--- |:--- |:--- | +| MS1MV3 | r18 | 92.07 | 94.66 | 97.77 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r18_fp16/training.log)| +| MS1MV3 | r34 | 94.10 | 95.90 | 98.10 | 98.67 | 99.80 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r34_fp16/training.log)| +| MS1MV3 | r50 | 94.79 | 96.46 | 98.35 | 98.96 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r50_fp16/training.log)| +| MS1MV3 | r100 | 95.31 | 96.81 | 98.48 | 99.06 | 99.85 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r100_fp16/training.log)| +| MS1MV3 | **r2060**| 95.34 | 97.11 | 98.67 | 99.24 | 99.87 
+| Glint360k |r18-0.1 | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log)|
+| Glint360k |r34-0.1 | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log)|
+| Glint360k |r50-0.1 | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log)|
+| Glint360k |r100-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|
+
+[comment]: <> (More details see [model.md](docs/modelzoo.md) in docs.)
+
+
+## [Speed Benchmark](docs/speed_benchmark.md)
+
+**Arcface Torch** can train large-scale face recognition training sets efficiently and quickly. When the number of
+classes in the training set is greater than 300K and the training is sufficient, the Partial FC sampling strategy reaches the same
+accuracy with several times faster training and a smaller GPU memory footprint.
+Partial FC is a sparse variant of the model-parallel architecture for large-scale face recognition. Partial FC uses a
+sparse softmax, where each batch dynamically samples a subset of class centers for training. In each iteration, only a
+sparse part of the parameters is updated, which greatly reduces GPU memory and computation. With Partial FC,
+we can scale the training set to 29 million identities, the largest to date. Partial FC also supports multi-machine distributed
+training and mixed precision training.
+
+![Image text](https://github.com/anxiangsir/insightface_arcface_log/blob/master/partial_fc_v2.png)
+
+See [speed_benchmark.md](docs/speed_benchmark.md) in docs for more details.
+
+### 1. Training speed of different parallel methods (samples / second), Tesla V100 32GB * 8. (Larger is better)
+
+`-` means training failed because of GPU memory limitations.
+
+| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
+| :--- | :--- | :--- | :--- |
+|125000 | 4681 | 4824 | 5004 |
+|1400000 | **1672** | 3043 | 4738 |
+|5500000 | **-** | **1389** | 3975 |
+|8000000 | **-** | **-** | 3565 |
+|16000000 | **-** | **-** | 2679 |
+|29000000 | **-** | **-** | **1855** |
+
+### 2. GPU memory cost of different parallel methods (MB per GPU), Tesla V100 32GB * 8. (Smaller is better)
+
+| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
+| :--- | :--- | :--- | :--- |
+|125000 | 7358 | 5306 | 4868 |
+|1400000 | 32252 | 11178 | 6056 |
+|5500000 | **-** | 32188 | 9854 |
+|8000000 | **-** | **-** | 12310 |
+|16000000 | **-** | **-** | 19950 |
+|29000000 | **-** | **-** | 32324 |
+
+## Evaluation on ICCV2021-MFR and IJB-C
+
+See [eval.md](docs/eval.md) in docs for more details.
+
+## Test
+
+We have tested many versions of PyTorch. Please create an issue if you run into trouble.
+ +- [x] torch 1.6.0 +- [x] torch 1.7.1 +- [x] torch 1.8.0 +- [x] torch 1.9.0 + +## Citation + +``` +@inproceedings{deng2019arcface, + title={Arcface: Additive angular margin loss for deep face recognition}, + author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={4690--4699}, + year={2019} +} +@inproceedings{an2020partical_fc, + title={Partial FC: Training 10 Million Identities on a Single Machine}, + author={An, Xiang and Zhu, Xuhan and Xiao, Yang and Wu, Lan and Zhang, Ming and Gao, Yuan and Qin, Bin and + Zhang, Debing and Fu Ying}, + booktitle={Arxiv 2010.05222}, + year={2020} +} +``` diff --git a/pose_estimation/models/arcface_torch/__init__.py b/pose_estimation/models/arcface_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/arcface_torch/backbones/__init__.py b/pose_estimation/models/arcface_torch/backbones/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..55bd4c5d1889a1a998b52eb56793bbc1eef1b691 --- /dev/null +++ b/pose_estimation/models/arcface_torch/backbones/__init__.py @@ -0,0 +1,25 @@ +from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 +from .mobilefacenet import get_mbf + + +def get_model(name, **kwargs): + # resnet + if name == "r18": + return iresnet18(False, **kwargs) + elif name == "r34": + return iresnet34(False, **kwargs) + elif name == "r50": + return iresnet50(False, **kwargs) + elif name == "r100": + return iresnet100(False, **kwargs) + elif name == "r200": + return iresnet200(False, **kwargs) + elif name == "r2060": + from .iresnet2060 import iresnet2060 + return iresnet2060(False, **kwargs) + elif name == "mbf": + fp16 = kwargs.get("fp16", False) + num_features = kwargs.get("num_features", 512) + return get_mbf(fp16=fp16, num_features=num_features) + else: + raise ValueError() \ No newline at end of file diff --git a/pose_estimation/models/arcface_torch/backbones/iresnet.py b/pose_estimation/models/arcface_torch/backbones/iresnet.py new file mode 100755 index 0000000000000000000000000000000000000000..c6d3b9c240c24687d432197f976ee01fbf423216 --- /dev/null +++ b/pose_estimation/models/arcface_torch/backbones/iresnet.py @@ -0,0 +1,187 @@ +import torch +from torch import nn + +__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) 
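+        # final BN completes the BN-Conv-BN-PReLU-Conv-BN ordering of this residual unit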
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class IResNet(nn.Module): + fc_scale = 7 * 7 + def __init__(self, + block, layers, dropout=0, num_features=512, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.PReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation)) + + return nn.Sequential(*layers) + + def forward(self, x): + with torch.cuda.amp.autocast(self.fp16): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x.float() if self.fp16 else x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: 
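+        # no pretrained weights are bundled with this port, so requesting them is rejected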
+ raise ValueError() + return model + + +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained, + progress, **kwargs) + + +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained, + progress, **kwargs) + + +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained, + progress, **kwargs) + + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained, + progress, **kwargs) + diff --git a/pose_estimation/models/arcface_torch/backbones/iresnet2060.py b/pose_estimation/models/arcface_torch/backbones/iresnet2060.py new file mode 100755 index 0000000000000000000000000000000000000000..21d1122144d207637d2444cba1f68fe630c89f31 --- /dev/null +++ b/pose_estimation/models/arcface_torch/backbones/iresnet2060.py @@ -0,0 +1,176 @@ +import torch +from torch import nn + +assert torch.__version__ >= "1.8.1" +from torch.utils.checkpoint import checkpoint_sequential + +__all__ = ['iresnet2060'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, ) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, ) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, ) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class IResNet(nn.Module): + fc_scale = 7 * 7 + + def __init__(self, + block, layers, dropout=0, num_features=512, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplanes, 
eps=1e-05) + self.prelu = nn.PReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, ) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation)) + + return nn.Sequential(*layers) + + def checkpoint(self, func, num_seg, x): + if self.training: + return checkpoint_sequential(func, num_seg, x) + else: + return func(x) + + def forward(self, x): + with torch.cuda.amp.autocast(self.fp16): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.checkpoint(self.layer2, 20, x) + x = self.checkpoint(self.layer3, 100, x) + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x.float() if self.fp16 else x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model + + +def iresnet2060(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet2060', IBasicBlock, [3, 128, 1024 - 128, 3], pretrained, progress, **kwargs) diff --git a/pose_estimation/models/arcface_torch/backbones/mobilefacenet.py b/pose_estimation/models/arcface_torch/backbones/mobilefacenet.py new file mode 100755 index 0000000000000000000000000000000000000000..87731491d76f9ff61cc70e57bb3f18c54fae308c --- /dev/null +++ b/pose_estimation/models/arcface_torch/backbones/mobilefacenet.py @@ -0,0 +1,130 @@ +''' +Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py +Original author cavalleria +''' + +import torch.nn as nn +from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module +import torch + + +class Flatten(Module): + def forward(self, x): + return x.view(x.size(0), -1) + + +class ConvBlock(Module): + def __init__(self, in_c, out_c, kernel=(1, 
1), stride=(1, 1), padding=(0, 0), groups=1): + super(ConvBlock, self).__init__() + self.layers = nn.Sequential( + Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False), + BatchNorm2d(num_features=out_c), + PReLU(num_parameters=out_c) + ) + + def forward(self, x): + return self.layers(x) + + +class LinearBlock(Module): + def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): + super(LinearBlock, self).__init__() + self.layers = nn.Sequential( + Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False), + BatchNorm2d(num_features=out_c) + ) + + def forward(self, x): + return self.layers(x) + + +class DepthWise(Module): + def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): + super(DepthWise, self).__init__() + self.residual = residual + self.layers = nn.Sequential( + ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)), + ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride), + LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + ) + + def forward(self, x): + short_cut = None + if self.residual: + short_cut = x + x = self.layers(x) + if self.residual: + output = short_cut + x + else: + output = x + return output + + +class Residual(Module): + def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): + super(Residual, self).__init__() + modules = [] + for _ in range(num_block): + modules.append(DepthWise(c, c, True, kernel, stride, padding, groups)) + self.layers = Sequential(*modules) + + def forward(self, x): + return self.layers(x) + + +class GDC(Module): + def __init__(self, embedding_size): + super(GDC, self).__init__() + self.layers = nn.Sequential( + LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)), + Flatten(), + Linear(512, embedding_size, bias=False), + BatchNorm1d(embedding_size)) + + def forward(self, x): + return self.layers(x) + + +class MobileFaceNet(Module): + def __init__(self, fp16=False, num_features=512): + super(MobileFaceNet, self).__init__() + scale = 2 + self.fp16 = fp16 + self.layers = nn.Sequential( + ConvBlock(3, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1)), + ConvBlock(64 * scale, 64 * scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64), + DepthWise(64 * scale, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128), + Residual(64 * scale, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), + DepthWise(64 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256), + Residual(128 * scale, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), + DepthWise(128 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512), + Residual(128 * scale, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), + ) + self.conv_sep = ConvBlock(128 * scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) + self.features = GDC(num_features) + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is 
not None: + m.bias.data.zero_() + + def forward(self, x): + with torch.cuda.amp.autocast(self.fp16): + x = self.layers(x) + x = self.conv_sep(x.float() if self.fp16 else x) + x = self.features(x) + return x + + +def get_mbf(fp16, num_features): + return MobileFaceNet(fp16, num_features) \ No newline at end of file diff --git a/pose_estimation/models/arcface_torch/configs/3millions.py b/pose_estimation/models/arcface_torch/configs/3millions.py new file mode 100755 index 0000000000000000000000000000000000000000..c9edc2f1414e35f93abfd3dfe11a61f1f406580e --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/3millions.py @@ -0,0 +1,23 @@ +from easydict import EasyDict as edict + +# configs for test speed + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "synthetic" +config.num_classes = 300 * 10000 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = [] diff --git a/pose_estimation/models/arcface_torch/configs/3millions_pfc.py b/pose_estimation/models/arcface_torch/configs/3millions_pfc.py new file mode 100755 index 0000000000000000000000000000000000000000..77caafdbb300d8109d5bfdb844f131710ef81f20 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/3millions_pfc.py @@ -0,0 +1,23 @@ +from easydict import EasyDict as edict + +# configs for test speed + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "synthetic" +config.num_classes = 300 * 10000 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = [] diff --git a/pose_estimation/models/arcface_torch/configs/__init__.py b/pose_estimation/models/arcface_torch/configs/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/arcface_torch/configs/base.py b/pose_estimation/models/arcface_torch/configs/base.py new file mode 100755 index 0000000000000000000000000000000000000000..78e4b36a9142b649ec39a8c59331bb2557f2ad57 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/base.py @@ -0,0 +1,56 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = "ms1mv3_arcface_r50" + +config.dataset = "ms1m-retinaface-t1" +config.embedding_size = 512 +config.sample_rate = 1 +config.fp16 = False +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +if config.dataset == "emore": + config.rec = "/train_tmp/faces_emore" + config.num_classes = 85742 + config.num_image = 5822653 + config.num_epoch = 16 + config.warmup_epoch = -1 + config.decay_epoch = [8, 14, ] + config.val_targets = ["lfw", ] + +elif config.dataset == "ms1m-retinaface-t1": + config.rec = "/train_tmp/ms1m-retinaface-t1" + config.num_classes = 93431 + config.num_image = 5179510 + config.num_epoch = 
25 + config.warmup_epoch = -1 + config.decay_epoch = [11, 17, 22] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +elif config.dataset == "glint360k": + config.rec = "/train_tmp/glint360k" + config.num_classes = 360232 + config.num_image = 17091657 + config.num_epoch = 20 + config.warmup_epoch = -1 + config.decay_epoch = [8, 12, 15, 18] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +elif config.dataset == "webface": + config.rec = "/train_tmp/faces_webface_112x112" + config.num_classes = 10572 + config.num_image = "forget" + config.num_epoch = 34 + config.warmup_epoch = -1 + config.decay_epoch = [20, 28, 32] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/glint360k_mbf.py b/pose_estimation/models/arcface_torch/configs/glint360k_mbf.py new file mode 100755 index 0000000000000000000000000000000000000000..46ae777cc97af41a531cba4e5d1ff31f2efcb468 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/glint360k_mbf.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 2e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/glint360k" +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/glint360k_r100.py b/pose_estimation/models/arcface_torch/configs/glint360k_r100.py new file mode 100755 index 0000000000000000000000000000000000000000..93d0701c0094517cec147c382b005e8063938548 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/glint360k_r100.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/glint360k" +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/glint360k_r18.py b/pose_estimation/models/arcface_torch/configs/glint360k_r18.py new file mode 100755 index 0000000000000000000000000000000000000000..7a8db34cd547e8e667103c93585296e47a894e97 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/glint360k_r18.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r18" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/glint360k" 
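+# RecordIO dataset root; MXFaceDataset expects train.rec and train.idx inside this directory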
+config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/glint360k_r34.py b/pose_estimation/models/arcface_torch/configs/glint360k_r34.py new file mode 100755 index 0000000000000000000000000000000000000000..fda2701758a839a7161d09c25f0ca3d26033baff --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/glint360k_r34.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r34" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/glint360k" +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/glint360k_r50.py b/pose_estimation/models/arcface_torch/configs/glint360k_r50.py new file mode 100755 index 0000000000000000000000000000000000000000..37e7922f1f63284e356dcc45a5f979f9c105f25e --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/glint360k_r50.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/glint360k" +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/ms1mv3_mbf.py b/pose_estimation/models/arcface_torch/configs/ms1mv3_mbf.py new file mode 100755 index 0000000000000000000000000000000000000000..b8a00d6305eeda5a94788017afc1cda0d4a4cd2a --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/ms1mv3_mbf.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 2e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 20, 25] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/ms1mv3_r18.py b/pose_estimation/models/arcface_torch/configs/ms1mv3_r18.py new file mode 100755 index 0000000000000000000000000000000000000000..eb4e0d31f1aedf4590628d394e1606920fefb5c9 --- /dev/null +++ 
b/pose_estimation/models/arcface_torch/configs/ms1mv3_r18.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r18" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/ms1mv3_r2060.py b/pose_estimation/models/arcface_torch/configs/ms1mv3_r2060.py new file mode 100755 index 0000000000000000000000000000000000000000..23ad81e082c4b6390b67b164d0ceb84bb0635684 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/ms1mv3_r2060.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r2060" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 64 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/ms1mv3_r34.py b/pose_estimation/models/arcface_torch/configs/ms1mv3_r34.py new file mode 100755 index 0000000000000000000000000000000000000000..5f78337a3d1f9eb6e9145eb5093618796c6842d2 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/ms1mv3_r34.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r34" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/ms1mv3_r50.py b/pose_estimation/models/arcface_torch/configs/ms1mv3_r50.py new file mode 100755 index 0000000000000000000000000000000000000000..08ba55dbbea6df0afffddbb3d1ed173efad99604 --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/ms1mv3_r50.py @@ -0,0 +1,26 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size 
is 512 + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/pose_estimation/models/arcface_torch/configs/speed.py b/pose_estimation/models/arcface_torch/configs/speed.py new file mode 100755 index 0000000000000000000000000000000000000000..45e95237da65e44f35a172c25ac6dc4e313e4eae --- /dev/null +++ b/pose_estimation/models/arcface_torch/configs/speed.py @@ -0,0 +1,23 @@ +from easydict import EasyDict as edict + +# configs for test speed + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.rec = "synthetic" +config.num_classes = 100 * 10000 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = [] diff --git a/pose_estimation/models/arcface_torch/dataset.py b/pose_estimation/models/arcface_torch/dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..96bbb8bb6da99122f350bc8e1a6390245840e32b --- /dev/null +++ b/pose_estimation/models/arcface_torch/dataset.py @@ -0,0 +1,124 @@ +import numbers +import os +import queue as Queue +import threading + +import mxnet as mx +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + +class MXFaceDataset(Dataset): + def __init__(self, root_dir, local_rank): + super(MXFaceDataset, self).__init__() + self.transform = transforms.Compose( + [transforms.ToPILImage(), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + self.root_dir = root_dir + self.local_rank = local_rank + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + self.imgrec = 
mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + s = self.imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + if header.flag > 0: + self.header0 = (int(header.label[0]), int(header.label[1])) + self.imgidx = np.array(range(1, int(header.label[0]))) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + + def __getitem__(self, index): + idx = self.imgidx[index] + s = self.imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + label = torch.tensor(label, dtype=torch.long) + sample = mx.image.imdecode(img).asnumpy() + if self.transform is not None: + sample = self.transform(sample) + return sample, label + + def __len__(self): + return len(self.imgidx) + + +class SyntheticDataset(Dataset): + def __init__(self, local_rank): + super(SyntheticDataset, self).__init__() + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) + img = np.transpose(img, (2, 0, 1)) + img = torch.from_numpy(img).squeeze(0).float() + img = ((img / 255) - 0.5) / 0.5 + self.img = img + self.label = 1 + + def __getitem__(self, index): + return self.img, self.label + + def __len__(self): + return 1000000 diff --git a/pose_estimation/models/arcface_torch/docs/eval.md b/pose_estimation/models/arcface_torch/docs/eval.md new file mode 100755 index 0000000000000000000000000000000000000000..dd1d9e257367b6422680966198646c45e5a2671d --- /dev/null +++ b/pose_estimation/models/arcface_torch/docs/eval.md @@ -0,0 +1,31 @@ +## Eval on ICCV2021-MFR + +coming soon. + + +## Eval IJBC +You can eval ijbc with pytorch or onnx. + + +1. Eval IJBC With Onnx +```shell +CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50 +``` + +2. 
Eval IJBC With Pytorch +```shell +CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ +--model-prefix ms1mv3_arcface_r50/backbone.pth \ +--image-path IJB_release/IJBC \ +--result-dir ms1mv3_arcface_r50 \ +--batch-size 128 \ +--job ms1mv3_arcface_r50 \ +--target IJBC \ +--network iresnet50 +``` + +## Inference + +```shell +python inference.py --weight ms1mv3_arcface_r50/backbone.pth --network r50 +``` diff --git a/pose_estimation/models/arcface_torch/docs/install.md b/pose_estimation/models/arcface_torch/docs/install.md new file mode 100755 index 0000000000000000000000000000000000000000..6314a40441285e9236438e468caf8b71a407531a --- /dev/null +++ b/pose_estimation/models/arcface_torch/docs/install.md @@ -0,0 +1,51 @@ +## v1.8.0 +### Linux and Windows +```shell +# CUDA 11.0 +pip --default-timeout=100 install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 10.2 +pip --default-timeout=100 install torch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 + +# CPU only +pip --default-timeout=100 install torch==1.8.0+cpu torchvision==0.9.0+cpu torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html + +``` + + +## v1.7.1 +### Linux and Windows +```shell +# CUDA 11.0 +pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 10.2 +pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 + +# CUDA 10.1 +pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 9.2 +pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CPU only +pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html +``` + + +## v1.6.0 + +### Linux and Windows +```shell +# CUDA 10.2 +pip install torch==1.6.0 torchvision==0.7.0 + +# CUDA 10.1 +pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 9.2 +pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html + +# CPU only +pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +``` \ No newline at end of file diff --git a/pose_estimation/models/arcface_torch/docs/modelzoo.md b/pose_estimation/models/arcface_torch/docs/modelzoo.md new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/arcface_torch/docs/speed_benchmark.md b/pose_estimation/models/arcface_torch/docs/speed_benchmark.md new file mode 100755 index 0000000000000000000000000000000000000000..055aee0defe2c43a523ced48260242f0f99b7cea --- /dev/null +++ b/pose_estimation/models/arcface_torch/docs/speed_benchmark.md @@ -0,0 +1,93 @@ +## Test Training Speed + +- Test Commands + +You need to use the following two commands to test the Partial FC training performance. +The number of identites is **3 millions** (synthetic data), turn mixed precision training on, backbone is resnet50, +batch size is 1024. 
+```shell +# Model Parallel +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions +# Partial FC 0.1 +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions_pfc +``` + +- GPU Memory + +``` +# (Model Parallel) gpustat -i +[0] Tesla V100-SXM2-32GB | 64'C, 94 % | 30338 / 32510 MB +[1] Tesla V100-SXM2-32GB | 60'C, 99 % | 28876 / 32510 MB +[2] Tesla V100-SXM2-32GB | 60'C, 99 % | 28872 / 32510 MB +[3] Tesla V100-SXM2-32GB | 69'C, 99 % | 28872 / 32510 MB +[4] Tesla V100-SXM2-32GB | 66'C, 99 % | 28888 / 32510 MB +[5] Tesla V100-SXM2-32GB | 60'C, 99 % | 28932 / 32510 MB +[6] Tesla V100-SXM2-32GB | 68'C, 100 % | 28916 / 32510 MB +[7] Tesla V100-SXM2-32GB | 65'C, 99 % | 28860 / 32510 MB + +# (Partial FC 0.1) gpustat -i +[0] Tesla V100-SXM2-32GB | 60'C, 95 % | 10488 / 32510 MB │······················· +[1] Tesla V100-SXM2-32GB | 60'C, 97 % | 10344 / 32510 MB │······················· +[2] Tesla V100-SXM2-32GB | 61'C, 95 % | 10340 / 32510 MB │······················· +[3] Tesla V100-SXM2-32GB | 66'C, 95 % | 10340 / 32510 MB │······················· +[4] Tesla V100-SXM2-32GB | 65'C, 94 % | 10356 / 32510 MB │······················· +[5] Tesla V100-SXM2-32GB | 61'C, 95 % | 10400 / 32510 MB │······················· +[6] Tesla V100-SXM2-32GB | 68'C, 96 % | 10384 / 32510 MB │······················· +[7] Tesla V100-SXM2-32GB | 64'C, 95 % | 10328 / 32510 MB │······················· +``` + +- Training Speed + +```python +# (Model Parallel) trainging.log +Training: Speed 2271.33 samples/sec Loss 1.1624 LearningRate 0.2000 Epoch: 0 Global Step: 100 +Training: Speed 2269.94 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150 +Training: Speed 2272.67 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200 +Training: Speed 2266.55 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 250 +Training: Speed 2272.54 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300 + +# (Partial FC 0.1) trainging.log +Training: Speed 5299.56 samples/sec Loss 1.0965 LearningRate 0.2000 Epoch: 0 Global Step: 100 +Training: Speed 5296.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150 +Training: Speed 5304.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200 +Training: Speed 5274.43 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 250 +Training: Speed 5300.10 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300 +``` + +In this test case, Partial FC 0.1 only use1 1/3 of the GPU memory of the model parallel, +and the training speed is 2.5 times faster than the model parallel. + + +## Speed Benchmark + +1. Training speed of different parallel methods (samples/second), Tesla V100 32GB * 8. (Larger is better) + +| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 | +| :--- | :--- | :--- | :--- | +|125000 | 4681 | 4824 | 5004 | +|250000 | 4047 | 4521 | 4976 | +|500000 | 3087 | 4013 | 4900 | +|1000000 | 2090 | 3449 | 4803 | +|1400000 | 1672 | 3043 | 4738 | +|2000000 | - | 2593 | 4626 | +|4000000 | - | 1748 | 4208 | +|5500000 | - | 1389 | 3975 | +|8000000 | - | - | 3565 | +|16000000 | - | - | 2679 | +|29000000 | - | - | 1855 | + +2. GPU memory cost of different parallel methods (GB per GPU), Tesla V100 32GB * 8. 
(Smaller is better) + +| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 | +| :--- | :--- | :--- | :--- | +|125000 | 7358 | 5306 | 4868 | +|250000 | 9940 | 5826 | 5004 | +|500000 | 14220 | 7114 | 5202 | +|1000000 | 23708 | 9966 | 5620 | +|1400000 | 32252 | 11178 | 6056 | +|2000000 | - | 13978 | 6472 | +|4000000 | - | 23238 | 8284 | +|5500000 | - | 32188 | 9854 | +|8000000 | - | - | 12310 | +|16000000 | - | - | 19950 | +|29000000 | - | - | 32324 | diff --git a/pose_estimation/models/arcface_torch/eval/__init__.py b/pose_estimation/models/arcface_torch/eval/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/arcface_torch/eval/verification.py b/pose_estimation/models/arcface_torch/eval/verification.py new file mode 100755 index 0000000000000000000000000000000000000000..253343b83dbf9d1bd154d14ec068e098bf0968db --- /dev/null +++ b/pose_estimation/models/arcface_torch/eval/verification.py @@ -0,0 +1,407 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
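+
+# Protocol implemented below: embeddings are L2-normalised, compared with squared
+# Euclidean distance, and scored with 10-fold cross-validation (accuracy at the
+# per-fold best threshold, plus VAL at a target FAR of 1e-3 in `evaluate`).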
+ + +import datetime +import os +import pickle + +import mxnet as mx +import numpy as np +import sklearn +import torch +from mxnet import ndarray as nd +from scipy import interpolate +from sklearn.decomposition import PCA +from sklearn.model_selection import KFold + + +class LFold: + def __init__(self, n_splits=2, shuffle=False): + self.n_splits = n_splits + if self.n_splits > 1: + self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) + + def split(self, indices): + if self.n_splits > 1: + return self.k_fold.split(indices) + else: + return [(indices, indices)] + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = 
np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + # print(true_accept, false_accept) + # print(n_same, n_diff) + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + +@torch.no_grad() +def load_bin(path, image_size): + try: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f) # py2 + except UnicodeDecodeError as e: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f, encoding='bytes') # py3 + data_list = [] + for flip in [0, 1]: + data = torch.empty((len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for idx in range(len(issame_list) * 2): + _bin = bins[idx] + img = mx.image.imdecode(_bin) + if img.shape[1] != image_size[0]: + img = mx.image.resize_short(img, image_size[0]) + img = nd.transpose(img, axes=(2, 0, 1)) + for flip in [0, 1]: + if flip == 1: + img = mx.ndarray.flip(data=img, axis=2) + data_list[flip][idx][:] = torch.from_numpy(img.asnumpy()) + if idx % 1000 == 0: + print('loading bin', idx) + print(data_list[0].shape) + return data_list, issame_list + +@torch.no_grad() +def test(data_set, backbone, batch_size, nfolds=10): + print('testing verification..') + data_list = data_set[0] + issame_list = data_set[1] + embeddings_list = [] + time_consumed = 0.0 + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = data[bb - batch_size: bb] + time0 = datetime.datetime.now() + img = ((_data / 255) - 0.5) / 0.5 + net_out: torch.Tensor = backbone(img) + _embeddings = net_out.detach().cpu().numpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], 
_embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + acc1 = 0.0 + std1 = 0.0 + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + print(embeddings.shape) + print('infer time', time_consumed) + _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +def dumpR(data_set, + backbone, + batch_size, + name='', + data_extra=None, + label_shape=None): + print('dump verification embedding..') + data_list = data_set[0] + issame_list = data_set[1] + embeddings_list = [] + time_consumed = 0.0 + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data,), label=(_label,)) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label,)) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + actual_issame = np.asarray(issame_list) + outname = os.path.join('temp.bin') + with open(outname, 'wb') as f: + pickle.dump((embeddings, issame_list), + f, + protocol=pickle.HIGHEST_PROTOCOL) + + +# if __name__ == '__main__': +# +# parser = argparse.ArgumentParser(description='do verification') +# # general +# parser.add_argument('--data-dir', default='', help='') +# parser.add_argument('--model', +# default='../model/softmax,50', +# help='path to load model.') +# parser.add_argument('--target', +# default='lfw,cfp_ff,cfp_fp,agedb_30', +# help='test targets.') +# parser.add_argument('--gpu', default=0, type=int, help='gpu id') +# parser.add_argument('--batch-size', default=32, type=int, help='') +# parser.add_argument('--max', default='', type=str, help='') +# parser.add_argument('--mode', default=0, type=int, help='') +# parser.add_argument('--nfolds', default=10, type=int, help='') +# args = parser.parse_args() +# image_size = [112, 112] +# print('image_size', image_size) +# ctx = mx.gpu(args.gpu) +# nets = [] +# vec = args.model.split(',') +# prefix = args.model.split(',')[0] +# epochs = [] +# if len(vec) == 1: +# pdir = os.path.dirname(prefix) +# for fname in os.listdir(pdir): +# if not fname.endswith('.params'): +# continue +# _file = os.path.join(pdir, fname) +# if _file.startswith(prefix): +# epoch = int(fname.split('.')[0].split('-')[1]) +# epochs.append(epoch) +# epochs = sorted(epochs, reverse=True) +# if len(args.max) > 0: +# _max = [int(x) for x in args.max.split(',')] +# assert len(_max) == 2 +# if len(epochs) > _max[1]: +# epochs = epochs[_max[0]:_max[1]] +# +# else: +# 
epochs = [int(x) for x in vec[1].split('|')] +# print('model number', len(epochs)) +# time0 = datetime.datetime.now() +# for epoch in epochs: +# print('loading', prefix, epoch) +# sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) +# # arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) +# all_layers = sym.get_internals() +# sym = all_layers['fc1_output'] +# model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) +# # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) +# model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], +# image_size[1]))]) +# model.set_params(arg_params, aux_params) +# nets.append(model) +# time_now = datetime.datetime.now() +# diff = time_now - time0 +# print('model loading time', diff.total_seconds()) +# +# ver_list = [] +# ver_name_list = [] +# for name in args.target.split(','): +# path = os.path.join(args.data_dir, name + ".bin") +# if os.path.exists(path): +# print('loading.. ', name) +# data_set = load_bin(path, image_size) +# ver_list.append(data_set) +# ver_name_list.append(name) +# +# if args.mode == 0: +# for i in range(len(ver_list)): +# results = [] +# for model in nets: +# acc1, std1, acc2, std2, xnorm, embeddings_list = test( +# ver_list[i], model, args.batch_size, args.nfolds) +# print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) +# print('[%s]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], acc1, std1)) +# print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], acc2, std2)) +# results.append(acc2) +# print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) +# elif args.mode == 1: +# raise ValueError +# else: +# model = nets[0] +# dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/pose_estimation/models/arcface_torch/eval_ijbc.py b/pose_estimation/models/arcface_torch/eval_ijbc.py new file mode 100755 index 0000000000000000000000000000000000000000..9c5a650d486d18eb02d6f60d448fc3b315261f5d --- /dev/null +++ b/pose_estimation/models/arcface_torch/eval_ijbc.py @@ -0,0 +1,483 @@ +# coding: utf-8 + +import os +import pickle + +import matplotlib +import pandas as pd + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import timeit +import sklearn +import argparse +import cv2 +import numpy as np +import torch +from skimage import transform as trans +from backbones import get_model +from sklearn.metrics import roc_curve, auc + +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from pathlib import Path + +import sys +import warnings + +sys.path.insert(0, "../") +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser(description='do ijb test') +# general +parser.add_argument('--model-prefix', default='', help='path to load model.') +parser.add_argument('--image-path', default='', type=str, help='') +parser.add_argument('--result-dir', default='.', type=str, help='') +parser.add_argument('--batch-size', default=128, type=int, help='') +parser.add_argument('--network', default='iresnet50', type=str, help='') +parser.add_argument('--job', default='insightface', type=str, help='job name') +parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') +args = parser.parse_args() + +target = args.target +model_path = args.model_prefix +image_path = args.image_path +result_dir = args.result_dir +gpu_id = None +use_norm_score = True # if Ture, TestMode(N1) +use_detector_score = True # if 
Ture, TestMode(D1) +use_flip_test = True # if Ture, TestMode(F1) +job = args.job +batch_size = args.batch_size + + +class Embedding(object): + def __init__(self, prefix, data_shape, batch_size=1): + image_size = (112, 112) + self.image_size = image_size + weight = torch.load(prefix) + resnet = get_model(args.network, dropout=0, fp16=False).cuda() + resnet.load_state_dict(weight) + model = torch.nn.DataParallel(resnet) + self.model = model + self.model.eval() + src = np.array([ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]], dtype=np.float32) + src[:, 0] += 8.0 + self.src = src + self.batch_size = batch_size + self.data_shape = data_shape + + def get(self, rimg, landmark): + + assert landmark.shape[0] == 68 or landmark.shape[0] == 5 + assert landmark.shape[1] == 2 + if landmark.shape[0] == 68: + landmark5 = np.zeros((5, 2), dtype=np.float32) + landmark5[0] = (landmark[36] + landmark[39]) / 2 + landmark5[1] = (landmark[42] + landmark[45]) / 2 + landmark5[2] = landmark[30] + landmark5[3] = landmark[48] + landmark5[4] = landmark[54] + else: + landmark5 = landmark + tform = trans.SimilarityTransform() + tform.estimate(landmark5, self.src) + M = tform.params[0:2, :] + img = cv2.warpAffine(rimg, + M, (self.image_size[1], self.image_size[0]), + borderValue=0.0) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img_flip = np.fliplr(img) + img = np.transpose(img, (2, 0, 1)) # 3*112*112, RGB + img_flip = np.transpose(img_flip, (2, 0, 1)) + input_blob = np.zeros((2, 3, self.image_size[1], self.image_size[0]), dtype=np.uint8) + input_blob[0] = img + input_blob[1] = img_flip + return input_blob + + @torch.no_grad() + def forward_db(self, batch_data): + imgs = torch.Tensor(batch_data).cuda() + imgs.div_(255).sub_(0.5).div_(0.5) + feat = self.model(imgs) + feat = feat.reshape([self.batch_size, 2 * feat.shape[1]]) + return feat.cpu().numpy() + + +# 将一个list尽量均分成n份,限制len(list)==n,份数大于原list内元素个数则分配空list[] +def divideIntoNstrand(listTemp, n): + twoList = [[] for i in range(n)] + for i, e in enumerate(listTemp): + twoList[i % n].append(e) + return twoList + + +def read_template_media_list(path): + # ijb_meta = np.loadtxt(path, dtype=str) + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +# In[ ]: + + +def read_template_pair_list(path): + # pairs = np.loadtxt(path, dtype=str) + pairs = pd.read_csv(path, sep=' ', header=None).values + # print(pairs.shape) + # print(pairs[:, 0].astype(np.int)) + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +# In[ ]: + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# In[ ]: + + +def get_image_feature(img_path, files_list, model_path, epoch, gpu_id): + batch_size = args.batch_size + data_shape = (3, 112, 112) + + files = files_list + print('files:', len(files)) + rare_size = len(files) % batch_size + faceness_scores = [] + batch = 0 + img_feats = np.empty((len(files), 1024), dtype=np.float32) + + batch_data = np.empty((2 * batch_size, 3, 112, 112)) + embedding = Embedding(model_path, data_shape, batch_size) + for img_index, each_line in enumerate(files[:len(files) - rare_size]): + name_lmk_score = each_line.strip().split(' ') + img_name = os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in 
name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + input_blob = embedding.get(img, lmk) + + batch_data[2 * (img_index - batch * batch_size)][:] = input_blob[0] + batch_data[2 * (img_index - batch * batch_size) + 1][:] = input_blob[1] + if (img_index + 1) % batch_size == 0: + print('batch', batch) + img_feats[batch * batch_size:batch * batch_size + + batch_size][:] = embedding.forward_db(batch_data) + batch += 1 + faceness_scores.append(name_lmk_score[-1]) + + batch_data = np.empty((2 * rare_size, 3, 112, 112)) + embedding = Embedding(model_path, data_shape, rare_size) + for img_index, each_line in enumerate(files[len(files) - rare_size:]): + name_lmk_score = each_line.strip().split(' ') + img_name = os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + input_blob = embedding.get(img, lmk) + batch_data[2 * img_index][:] = input_blob[0] + batch_data[2 * img_index + 1][:] = input_blob[1] + if (img_index + 1) % rare_size == 0: + print('batch', batch) + img_feats[len(files) - + rare_size:][:] = embedding.forward_db(batch_data) + batch += 1 + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + # img_feats = np.ones( (len(files), 1024), dtype=np.float32) * 0.01 + # faceness_scores = np.ones( (len(files), ), dtype=np.float32 ) + return img_feats, faceness_scores + + +# In[ ]: + + +def image2template_feature(img_feats=None, templates=None, medias=None): + # ========================================================== + # 1. face image feature l2 normalization. img_feats:[number_image x feats_dim] + # 2. compute media feature. + # 3. compute template feature. + # ========================================================== + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + + for count_template, uqt in enumerate(unique_templates): + + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, + return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [ + np.mean(face_norm_feats[ind_m], axis=0, keepdims=True) + ] + media_norm_feats = np.array(media_norm_feats) + # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True)) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + # template_norm_feats = template_feats / np.sqrt(np.sum(template_feats ** 2, -1, keepdims=True)) + template_norm_feats = sklearn.preprocessing.normalize(template_feats) + # print(template_norm_feats.shape) + return template_norm_feats, unique_templates + + +# In[ ]: + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + # ========================================================== + # Compute set-to-set Similarity Score. 
+ # ========================================================== + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + + score = np.zeros((len(p1),)) # save cosine distance between pairs + + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +# In[ ]: +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def read_score(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# # Step1: Load Meta Data + +# In[ ]: + +assert target == 'IJBC' or target == 'IJBB' + +# ============================================================= +# load image and template relationships for template feature embedding +# tid --> template id, mid --> media id +# format: +# image_name tid mid +# ============================================================= +start = timeit.default_timer() +templates, medias = read_template_media_list( + os.path.join('%s/meta' % image_path, + '%s_face_tid_mid.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# In[ ]: + +# ============================================================= +# load template pairs for template-to-template verification +# tid : template id, label : 1/0 +# format: +# tid_1 tid_2 label +# ============================================================= +start = timeit.default_timer() +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % image_path, + '%s_template_pair_label.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) + +# # Step 2: Get Image Features + +# In[ ]: + +# ============================================================= +# load image features +# format: +# img_feats: [image_num x feats_dim] (227630, 512) +# ============================================================= +start = timeit.default_timer() +img_path = '%s/loose_crop' % image_path +img_list_path = '%s/meta/%s_name_5pts_score.txt' % (image_path, target.lower()) +img_list = open(img_list_path) +files = img_list.readlines() +# files_list = divideIntoNstrand(files, rank_size) +files_list = files + +# img_feats +# for i in range(rank_size): +img_feats, faceness_scores = get_image_feature(img_path, files_list, + model_path, 0, gpu_id) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) +print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], + img_feats.shape[1])) + +# # Step3: Get Template Features + +# In[ ]: + +# ============================================================= +# compute template features from image features. +# ============================================================= +start = timeit.default_timer() +# ========================================================== +# Norm feature before aggregation into template feature? +# Feature norm from embedding network and faceness score are able to decrease weights for noise samples (not face). +# ========================================================== +# 1. FaceScore (Feature Norm) +# 2. FaceScore (Detector) + +if use_flip_test: + # concat --- F1 + # img_input_feats = img_feats + # add --- F2 + img_input_feats = img_feats[:, 0:img_feats.shape[1] // + 2] + img_feats[:, img_feats.shape[1] // 2:] +else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + +if use_norm_score: + img_input_feats = img_input_feats +else: + # normalise features to remove norm information + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats ** 2, -1, keepdims=True)) + +if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] +else: + img_input_feats = img_input_feats + +template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# # Step 4: Get Template Similarity Scores + +# In[ ]: + +# ============================================================= +# compute verification scores between template pairs. +# ============================================================= +start = timeit.default_timer() +score = verification(template_norm_feats, unique_templates, p1, p2) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) + +# In[ ]: +save_path = os.path.join(result_dir, args.job) +# save_path = result_dir + '/%s_result' % target + +if not os.path.exists(save_path): + os.makedirs(save_path) + +score_save_file = os.path.join(save_path, "%s.npy" % target.lower()) +np.save(score_save_file, score) + +# # Step 5: Get ROC Curves and TPR@FPR Table + +# In[ ]: + +files = [score_save_file] +methods = [] +scores = [] +for file in files: + methods.append(Path(file).stem) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10 ** -6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +fig.savefig(os.path.join(save_path, '%s.pdf' % target.lower())) +print(tpr_fpr_table) diff --git a/pose_estimation/models/arcface_torch/inference.py b/pose_estimation/models/arcface_torch/inference.py new file mode 100755 index 0000000000000000000000000000000000000000..3e5156e8d649954837e397c2ff15ec29995e7502 --- /dev/null +++ b/pose_estimation/models/arcface_torch/inference.py @@ -0,0 +1,35 @@ +import argparse + +import cv2 +import numpy as np +import torch + +from backbones import get_model + + +@torch.no_grad() +def inference(weight, name, img): + if img is None: + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8) + else: + img = cv2.imread(img) + img = cv2.resize(img, (112, 112)) + + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, (2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + img.div_(255).sub_(0.5).div_(0.5) + net = get_model(name, fp16=False) + net.load_state_dict(torch.load(weight)) + net.eval() + feat = net(img).numpy() + print(feat) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='PyTorch ArcFace Training') + parser.add_argument('--network', type=str, default='r50', help='backbone network') + parser.add_argument('--weight', type=str, default='') + parser.add_argument('--img', type=str, default=None) + args = parser.parse_args() + inference(args.weight, args.network, args.img) diff --git a/pose_estimation/models/arcface_torch/losses.py b/pose_estimation/models/arcface_torch/losses.py new file mode 100755 index 0000000000000000000000000000000000000000..87aeaa107af4d53f5a6132b3739d5cafdcded7fc --- /dev/null +++ b/pose_estimation/models/arcface_torch/losses.py @@ -0,0 +1,42 @@ +import torch +from torch import nn + + +def get_loss(name): + if name == "cosface": + return CosFace() + elif name == "arcface": + 
return ArcFace() + else: + raise ValueError() + + +class CosFace(nn.Module): + def __init__(self, s=64.0, m=0.40): + super(CosFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine, label): + index = torch.where(label != -1)[0] + m_hot = torch.zeros(index.size()[0], cosine.size()[1], device=cosine.device) + m_hot.scatter_(1, label[index, None], self.m) + cosine[index] -= m_hot + ret = cosine * self.s + return ret + + +class ArcFace(nn.Module): + def __init__(self, s=64.0, m=0.5): + super(ArcFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine: torch.Tensor, label): + index = torch.where(label != -1)[0] + m_hot = torch.zeros(index.size()[0], cosine.size()[1], device=cosine.device) + m_hot.scatter_(1, label[index, None], self.m) + cosine.acos_() + cosine[index] += m_hot + cosine.cos_().mul_(self.s) + return cosine diff --git a/pose_estimation/models/arcface_torch/onnx_helper.py b/pose_estimation/models/arcface_torch/onnx_helper.py new file mode 100755 index 0000000000000000000000000000000000000000..ca922ca6d410655029e459cf8fd1c323d276c34c --- /dev/null +++ b/pose_estimation/models/arcface_torch/onnx_helper.py @@ -0,0 +1,250 @@ +from __future__ import division +import datetime +import os +import os.path as osp +import glob +import numpy as np +import cv2 +import sys +import onnxruntime +import onnx +import argparse +from onnx import numpy_helper +from insightface.data import get_image + +class ArcFaceORT: + def __init__(self, model_path, cpu=False): + self.model_path = model_path + # providers = None will use available provider, for onnxruntime-gpu it will be "CUDAExecutionProvider" + self.providers = ['CPUExecutionProvider'] if cpu else None + + #input_size is (w,h), return error message, return None if success + def check(self, track='cfat', test_img = None): + #default is cfat + max_model_size_mb=1024 + max_feat_dim=512 + max_time_cost=15 + if track.startswith('ms1m'): + max_model_size_mb=1024 + max_feat_dim=512 + max_time_cost=10 + elif track.startswith('glint'): + max_model_size_mb=1024 + max_feat_dim=1024 + max_time_cost=20 + elif track.startswith('cfat'): + max_model_size_mb = 1024 + max_feat_dim = 512 + max_time_cost = 15 + elif track.startswith('unconstrained'): + max_model_size_mb=1024 + max_feat_dim=1024 + max_time_cost=30 + else: + return "track not found" + + if not os.path.exists(self.model_path): + return "model_path not exists" + if not os.path.isdir(self.model_path): + return "model_path should be directory" + onnx_files = [] + for _file in os.listdir(self.model_path): + if _file.endswith('.onnx'): + onnx_files.append(osp.join(self.model_path, _file)) + if len(onnx_files)==0: + return "do not have onnx files" + self.model_file = sorted(onnx_files)[-1] + print('use onnx-model:', self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, providers=self.providers) + except: + return "load onnx failed" + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('input-shape:', input_shape) + if len(input_shape)!=4: + return "length of input_shape should be 4" + if not isinstance(input_shape[0], str): + #return "input_shape[0] should be str to support batch-inference" + print('reset input-shape[0] to None') + model = onnx.load(self.model_file) + model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + new_model_file = osp.join(self.model_path, 'zzzzrefined.onnx') + onnx.save(model, new_model_file) + self.model_file = new_model_file + print('use new onnx-model:', self.model_file) 
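+            # Editor's note: the batch dimension of the graph input was just made
+            # symbolic and the model re-saved as 'zzzzrefined.onnx', so a single
+            # ONNX Runtime session can serve arbitrary batch sizes. The session is
+            # re-created below from that refined file before any further checks.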
+ try: + session = onnxruntime.InferenceSession(self.model_file, providers=self.providers) + except: + return "load onnx failed" + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('new-input-shape:', input_shape) + + self.image_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + outputs = session.get_outputs() + output_names = [] + for o in outputs: + output_names.append(o.name) + #print(o.name, o.shape) + if len(output_names)!=1: + return "number of output nodes should be 1" + self.session = session + self.input_name = input_name + self.output_names = output_names + #print(self.output_names) + model = onnx.load(self.model_file) + graph = model.graph + if len(graph.node)<8: + return "too small onnx graph" + + input_size = (112,112) + self.crop = None + if track=='cfat': + crop_file = osp.join(self.model_path, 'crop.txt') + if osp.exists(crop_file): + lines = open(crop_file,'r').readlines() + if len(lines)!=6: + return "crop.txt should contain 6 lines" + lines = [int(x) for x in lines] + self.crop = lines[:4] + input_size = tuple(lines[4:6]) + if input_size!=self.image_size: + return "input-size is inconsistant with onnx model input, %s vs %s"%(input_size, self.image_size) + + self.model_size_mb = os.path.getsize(self.model_file) / float(1024*1024) + if self.model_size_mb > max_model_size_mb: + return "max model size exceed, given %.3f-MB"%self.model_size_mb + + input_mean = None + input_std = None + if track=='cfat': + pn_file = osp.join(self.model_path, 'pixel_norm.txt') + if osp.exists(pn_file): + lines = open(pn_file,'r').readlines() + if len(lines)!=2: + return "pixel_norm.txt should contain 2 lines" + input_mean = float(lines[0]) + input_std = float(lines[1]) + if input_mean is not None or input_std is not None: + if input_mean is None or input_std is None: + return "please set input_mean and input_std simultaneously" + else: + find_sub = False + find_mul = False + for nid, node in enumerate(graph.node[:8]): + print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul') or node.name.startswith('Div'): + find_mul = True + if find_sub and find_mul: + print("find sub and mul") + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + for initn in graph.initializer: + weight_array = numpy_helper.to_array(initn) + dt = weight_array.dtype + if dt.itemsize<4: + return 'invalid weight type - (%s:%s)' % (initn.name, dt.name) + if test_img is None: + test_img = get_image('Tom_Hanks_54745') + test_img = cv2.resize(test_img, self.image_size) + else: + test_img = cv2.resize(test_img, self.image_size) + feat, cost = self.benchmark(test_img) + batch_result = self.check_batch(test_img) + batch_result_sum = float(np.sum(batch_result)) + if batch_result_sum in [float('inf'), -float('inf')] or batch_result_sum != batch_result_sum: + print(batch_result) + print(batch_result_sum) + return "batch result output contains NaN!" 
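+        # Editor's note: the remaining checks below enforce the track limits declared
+        # at the top of check(): the feature must be 2-D (batch, dim), its dimension
+        # may not exceed max_feat_dim, and the per-image latency from benchmark(),
+        # converted to milliseconds, must stay under max_time_cost.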
+ + if len(feat.shape) < 2: + return "the shape of the feature must be two, but get {}".format(str(feat.shape)) + + if feat.shape[1] > max_feat_dim: + return "max feat dim exceed, given %d"%feat.shape[1] + self.feat_dim = feat.shape[1] + cost_ms = cost*1000 + if cost_ms>max_time_cost: + return "max time cost exceed, given %.4f"%cost_ms + self.cost_ms = cost_ms + print('check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f'%(self.model_size_mb, self.feat_dim, self.cost_ms, self.input_mean, self.input_std)) + return None + + def check_batch(self, img): + if not isinstance(img, list): + imgs = [img, ] * 32 + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + if nimg.shape[0] != self.image_size[1] or nimg.shape[1] != self.image_size[0]: + nimg = cv2.resize(nimg, self.image_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages( + images=imgs, scalefactor=1.0 / self.input_std, size=self.image_size, + mean=(self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + + def meta_info(self): + return {'model-size-mb':self.model_size_mb, 'feature-dim':self.feat_dim, 'infer': self.cost_ms} + + + def forward(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.image_size + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages(imgs, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + return net_out + + def benchmark(self, img): + input_size = self.image_size + if self.crop is not None: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + img = nimg + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + costs = [] + for _ in range(50): + ta = datetime.datetime.now() + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + tb = datetime.datetime.now() + cost = (tb-ta).total_seconds() + costs.append(cost) + costs = sorted(costs) + cost = costs[5] + return net_out, cost + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + # general + parser.add_argument('workdir', help='submitted work dir', type=str) + parser.add_argument('--track', help='track name, for different challenge', type=str, default='cfat') + args = parser.parse_args() + handler = ArcFaceORT(args.workdir) + err = handler.check(args.track) + print('err:', err) diff --git a/pose_estimation/models/arcface_torch/onnx_ijbc.py b/pose_estimation/models/arcface_torch/onnx_ijbc.py new file mode 100755 index 0000000000000000000000000000000000000000..05b50bfad4b4cf38903b89f596263a8e29a50d3e --- /dev/null +++ b/pose_estimation/models/arcface_torch/onnx_ijbc.py @@ -0,0 +1,267 @@ +import argparse +import os +import pickle +import timeit + +import cv2 +import mxnet as mx +import numpy as np +import pandas as pd +import prettytable +import skimage.transform +from 
sklearn.metrics import roc_curve +from sklearn.preprocessing import normalize + +from onnx_helper import ArcFaceORT + +SRC = np.array( + [ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]] + , dtype=np.float32) +SRC[:, 0] += 8.0 + + +class AlignedDataSet(mx.gluon.data.Dataset): + def __init__(self, root, lines, align=True): + self.lines = lines + self.root = root + self.align = align + + def __len__(self): + return len(self.lines) + + def __getitem__(self, idx): + each_line = self.lines[idx] + name_lmk_score = each_line.strip().split(' ') + name = os.path.join(self.root, name_lmk_score[0]) + img = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB) + landmark5 = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32).reshape((5, 2)) + st = skimage.transform.SimilarityTransform() + st.estimate(landmark5, SRC) + img = cv2.warpAffine(img, st.params[0:2, :], (112, 112), borderValue=0.0) + img_1 = np.expand_dims(img, 0) + img_2 = np.expand_dims(np.fliplr(img), 0) + output = np.concatenate((img_1, img_2), axis=0).astype(np.float32) + output = np.transpose(output, (0, 3, 1, 2)) + output = mx.nd.array(output) + return output + + +def extract(model_root, dataset): + model = ArcFaceORT(model_path=model_root) + model.check() + feat_mat = np.zeros(shape=(len(dataset), 2 * model.feat_dim)) + + def batchify_fn(data): + return mx.nd.concat(*data, dim=0) + + data_loader = mx.gluon.data.DataLoader( + dataset, 128, last_batch='keep', num_workers=4, + thread_pool=True, prefetch=16, batchify_fn=batchify_fn) + num_iter = 0 + for batch in data_loader: + batch = batch.asnumpy() + batch = (batch - model.input_mean) / model.input_std + feat = model.session.run(model.output_names, {model.input_name: batch})[0] + feat = np.reshape(feat, (-1, model.feat_dim * 2)) + feat_mat[128 * num_iter: 128 * num_iter + feat.shape[0], :] = feat + num_iter += 1 + if num_iter % 50 == 0: + print(num_iter) + return feat_mat + + +def read_template_media_list(path): + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +def image2template_feature(img_feats=None, + templates=None, + medias=None): + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + for count_template, uqt in enumerate(unique_templates): + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), ] + media_norm_feats = np.array(media_norm_feats) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + 
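+    # Editor's note: the aggregation order is mean over images within a media, then
+    # sum over medias within a template, then L2-normalization below. For example,
+    # two images of one media with features [1, 0] and [0, 1] give a media mean of
+    # [0.5, 0.5], a template sum of [0.5, 0.5], and a normalized template feature of
+    # roughly [0.7071, 0.7071].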
template_norm_feats = normalize(template_feats) + return template_norm_feats, unique_templates + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) + total_pairs = np.array(range(len(p1))) + batchsize = 100000 + sublists = [total_pairs[i: i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def main(args): + use_norm_score = True # if Ture, TestMode(N1) + use_detector_score = True # if Ture, TestMode(D1) + use_flip_test = True # if Ture, TestMode(F1) + assert args.target == 'IJBC' or args.target == 'IJBB' + + start = timeit.default_timer() + templates, medias = read_template_media_list( + os.path.join('%s/meta' % args.image_path, '%s_face_tid_mid.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % args.image_path, + '%s_template_pair_label.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + img_path = '%s/loose_crop' % args.image_path + img_list_path = '%s/meta/%s_name_5pts_score.txt' % (args.image_path, args.target.lower()) + img_list = open(img_list_path) + files = img_list.readlines() + dataset = AlignedDataSet(root=img_path, lines=files, align=True) + img_feats = extract(args.model_root, dataset) + + faceness_scores = [] + for each_line in files: + name_lmk_score = each_line.split() + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], img_feats.shape[1])) + start = timeit.default_timer() + + if use_flip_test: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:] + else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + + if use_norm_score: + img_input_feats = img_input_feats + else: + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + + if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] + else: + img_input_feats = img_input_feats + + template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + score = verification(template_norm_feats, unique_templates, p1, p2) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + save_path = os.path.join(args.result_dir, "{}_result".format(args.target)) + if not os.path.exists(save_path): + os.makedirs(save_path) + score_save_file = os.path.join(save_path, "{}.npy".format(args.model_root)) + np.save(score_save_file, score) + files = [score_save_file] + methods = [] + scores = [] + for file in files: + methods.append(os.path.basename(file)) + scores.append(np.load(file)) + methods = np.array(methods) + scores = dict(zip(methods, scores)) + x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] + tpr_fpr_table = prettytable.PrettyTable(['Methods'] + [str(x) for x in x_labels]) + for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, args.target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) + print(tpr_fpr_table) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do ijb test') + # general + parser.add_argument('--model-root', default='', help='path to load model.') + parser.add_argument('--image-path', default='', type=str, help='') + parser.add_argument('--result-dir', default='.', type=str, help='') + parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') + main(parser.parse_args()) diff --git a/pose_estimation/models/arcface_torch/partial_fc.py b/pose_estimation/models/arcface_torch/partial_fc.py new file mode 100755 index 0000000000000000000000000000000000000000..17e2d25715d10ba446c957e1d2528b0687ed71d5 --- /dev/null +++ b/pose_estimation/models/arcface_torch/partial_fc.py @@ -0,0 +1,222 @@ +import logging +import os + +import torch +import torch.distributed as dist +from torch.nn import Module +from torch.nn.functional import normalize, linear +from torch.nn.parameter import Parameter + + +class PartialFC(Module): + """ + Author: {Xiang An, Yang Xiao, XuHan Zhu} in DeepGlint, + Partial FC: Training 10 Million Identities on a Single Machine + See the original paper: + https://arxiv.org/abs/2010.05222 + """ + + @torch.no_grad() + def __init__(self, rank, local_rank, world_size, batch_size, resume, + margin_softmax, num_classes, sample_rate=1.0, embedding_size=512, prefix="./"): + """ + rank: int + Unique process(GPU) ID from 0 to world_size - 1. 
+ local_rank: int + Unique process(GPU) ID within the server from 0 to 7. + world_size: int + Number of GPU. + batch_size: int + Batch size on current rank(GPU). + resume: bool + Select whether to restore the weight of softmax. + margin_softmax: callable + A function of margin softmax, eg: cosface, arcface. + num_classes: int + The number of class center storage in current rank(CPU/GPU), usually is total_classes // world_size, + required. + sample_rate: float + The partial fc sampling rate, when the number of classes increases to more than 2 millions, Sampling + can greatly speed up training, and reduce a lot of GPU memory, default is 1.0. + embedding_size: int + The feature dimension, default is 512. + prefix: str + Path for save checkpoint, default is './'. + """ + super(PartialFC, self).__init__() + # + self.num_classes: int = num_classes + self.rank: int = rank + self.local_rank: int = local_rank + self.device: torch.device = torch.device("cuda:{}".format(self.local_rank)) + self.world_size: int = world_size + self.batch_size: int = batch_size + self.margin_softmax: callable = margin_softmax + self.sample_rate: float = sample_rate + self.embedding_size: int = embedding_size + self.prefix: str = prefix + self.num_local: int = num_classes // world_size + int(rank < num_classes % world_size) + self.class_start: int = num_classes // world_size * rank + min(rank, num_classes % world_size) + self.num_sample: int = int(self.sample_rate * self.num_local) + + self.weight_name = os.path.join(self.prefix, "rank_{}_softmax_weight.pt".format(self.rank)) + self.weight_mom_name = os.path.join(self.prefix, "rank_{}_softmax_weight_mom.pt".format(self.rank)) + + if resume: + try: + self.weight: torch.Tensor = torch.load(self.weight_name) + self.weight_mom: torch.Tensor = torch.load(self.weight_mom_name) + if self.weight.shape[0] != self.num_local or self.weight_mom.shape[0] != self.num_local: + raise IndexError + logging.info("softmax weight resume successfully!") + logging.info("softmax weight mom resume successfully!") + except (FileNotFoundError, KeyError, IndexError): + self.weight = torch.normal(0, 0.01, (self.num_local, self.embedding_size), device=self.device) + self.weight_mom: torch.Tensor = torch.zeros_like(self.weight) + logging.info("softmax weight init!") + logging.info("softmax weight mom init!") + else: + self.weight = torch.normal(0, 0.01, (self.num_local, self.embedding_size), device=self.device) + self.weight_mom: torch.Tensor = torch.zeros_like(self.weight) + logging.info("softmax weight init successfully!") + logging.info("softmax weight mom init successfully!") + self.stream: torch.cuda.Stream = torch.cuda.Stream(local_rank) + + self.index = None + if int(self.sample_rate) == 1: + self.update = lambda: 0 + self.sub_weight = Parameter(self.weight) + self.sub_weight_mom = self.weight_mom + else: + self.sub_weight = Parameter(torch.empty((0, 0)).cuda(local_rank)) + + def save_params(self): + """ Save softmax weight for each rank on prefix + """ + torch.save(self.weight.data, self.weight_name) + torch.save(self.weight_mom, self.weight_mom_name) + + @torch.no_grad() + def sample(self, total_label): + """ + Sample all positive class centers in each rank, and random select neg class centers to filling a fixed + `num_sample`. + + total_label: tensor + Label after all gather, which cross all GPUs. 
+ """ + index_positive = (self.class_start <= total_label) & (total_label < self.class_start + self.num_local) + total_label[~index_positive] = -1 + total_label[index_positive] -= self.class_start + if int(self.sample_rate) != 1: + positive = torch.unique(total_label[index_positive], sorted=True) + if self.num_sample - positive.size(0) >= 0: + perm = torch.rand(size=[self.num_local], device=self.device) + perm[positive] = 2.0 + index = torch.topk(perm, k=self.num_sample)[1] + index = index.sort()[0] + else: + index = positive + self.index = index + total_label[index_positive] = torch.searchsorted(index, total_label[index_positive]) + self.sub_weight = Parameter(self.weight[index]) + self.sub_weight_mom = self.weight_mom[index] + + def forward(self, total_features, norm_weight): + """ Partial fc forward, `logits = X * sample(W)` + """ + torch.cuda.current_stream().wait_stream(self.stream) + logits = linear(total_features, norm_weight) + return logits + + @torch.no_grad() + def update(self): + """ Set updated weight and weight_mom to memory bank. + """ + self.weight_mom[self.index] = self.sub_weight_mom + self.weight[self.index] = self.sub_weight + + def prepare(self, label, optimizer): + """ + get sampled class centers for cal softmax. + + label: tensor + Label tensor on each rank. + optimizer: opt + Optimizer for partial fc, which need to get weight mom. + """ + with torch.cuda.stream(self.stream): + total_label = torch.zeros( + size=[self.batch_size * self.world_size], device=self.device, dtype=torch.long) + dist.all_gather(list(total_label.chunk(self.world_size, dim=0)), label) + self.sample(total_label) + optimizer.state.pop(optimizer.param_groups[-1]['params'][0], None) + optimizer.param_groups[-1]['params'][0] = self.sub_weight + optimizer.state[self.sub_weight]['momentum_buffer'] = self.sub_weight_mom + norm_weight = normalize(self.sub_weight) + return total_label, norm_weight + + def forward_backward(self, label, features, optimizer): + """ + Partial fc forward and backward with model parallel + + label: tensor + Label tensor on each rank(GPU) + features: tensor + Features tensor on each rank(GPU) + optimizer: optimizer + Optimizer for partial fc + + Returns: + -------- + x_grad: tensor + The gradient of features. + loss_v: tensor + Loss value for cross entropy. 
+ """ + total_label, norm_weight = self.prepare(label, optimizer) + total_features = torch.zeros( + size=[self.batch_size * self.world_size, self.embedding_size], device=self.device) + dist.all_gather(list(total_features.chunk(self.world_size, dim=0)), features.data) + total_features.requires_grad = True + + logits = self.forward(total_features, norm_weight) + logits = self.margin_softmax(logits, total_label) + + with torch.no_grad(): + max_fc = torch.max(logits, dim=1, keepdim=True)[0] + dist.all_reduce(max_fc, dist.ReduceOp.MAX) + + # calculate exp(logits) and all-reduce + logits_exp = torch.exp(logits - max_fc) + logits_sum_exp = logits_exp.sum(dim=1, keepdims=True) + dist.all_reduce(logits_sum_exp, dist.ReduceOp.SUM) + + # calculate prob + logits_exp.div_(logits_sum_exp) + + # get one-hot + grad = logits_exp + index = torch.where(total_label != -1)[0] + one_hot = torch.zeros(size=[index.size()[0], grad.size()[1]], device=grad.device) + one_hot.scatter_(1, total_label[index, None], 1) + + # calculate loss + loss = torch.zeros(grad.size()[0], 1, device=grad.device) + loss[index] = grad[index].gather(1, total_label[index, None]) + dist.all_reduce(loss, dist.ReduceOp.SUM) + loss_v = loss.clamp_min_(1e-30).log_().mean() * (-1) + + # calculate grad + grad[index] -= one_hot + grad.div_(self.batch_size * self.world_size) + + logits.backward(grad) + if total_features.grad is not None: + total_features.grad.detach_() + x_grad: torch.Tensor = torch.zeros_like(features, requires_grad=True) + # feature gradient all-reduce + dist.reduce_scatter(x_grad, list(total_features.grad.chunk(self.world_size, dim=0))) + x_grad = x_grad * self.world_size + # backward backbone + return x_grad, loss_v diff --git a/pose_estimation/models/arcface_torch/requirement.txt b/pose_estimation/models/arcface_torch/requirement.txt new file mode 100755 index 0000000000000000000000000000000000000000..f72c1b3ba814ae1e0bc1c1f56402026978b9e870 --- /dev/null +++ b/pose_estimation/models/arcface_torch/requirement.txt @@ -0,0 +1,5 @@ +tensorboard +easydict +mxnet +onnx +sklearn diff --git a/pose_estimation/models/arcface_torch/run.sh b/pose_estimation/models/arcface_torch/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..61af4b4950eb11334e55362e3e3c5e2796979a01 --- /dev/null +++ b/pose_estimation/models/arcface_torch/run.sh @@ -0,0 +1,2 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r50 +ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh diff --git a/pose_estimation/models/arcface_torch/torch2onnx.py b/pose_estimation/models/arcface_torch/torch2onnx.py new file mode 100755 index 0000000000000000000000000000000000000000..fc26ab82e552331bc8d75b34e81000418f4d38ec --- /dev/null +++ b/pose_estimation/models/arcface_torch/torch2onnx.py @@ -0,0 +1,59 @@ +import numpy as np +import onnx +import torch + + +def convert_onnx(net, path_module, output, opset=11, simplify=False): + assert isinstance(net, torch.nn.Module) + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) + img = img.astype(np.float) + img = (img / 255. 
- 0.5) / 0.5 # torch style norm + img = img.transpose((2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + + weight = torch.load(path_module) + net.load_state_dict(weight) + net.eval() + torch.onnx.export(net, img, output, keep_initializers_as_inputs=False, verbose=False, opset_version=opset) + model = onnx.load(output) + graph = model.graph + graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + if simplify: + from onnxsim import simplify + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + onnx.save(model, output) + + +if __name__ == '__main__': + import os + import argparse + from backbones import get_model + + parser = argparse.ArgumentParser(description='ArcFace PyTorch to onnx') + parser.add_argument('input', type=str, help='input backbone.pth file or path') + parser.add_argument('--output', type=str, default=None, help='output onnx path') + parser.add_argument('--network', type=str, default=None, help='backbone network') + parser.add_argument('--simplify', type=bool, default=False, help='onnx simplify') + args = parser.parse_args() + input_file = args.input + if os.path.isdir(input_file): + input_file = os.path.join(input_file, "backbone.pth") + assert os.path.exists(input_file) + model_name = os.path.basename(os.path.dirname(input_file)).lower() + params = model_name.split("_") + if len(params) >= 3 and params[1] in ('arcface', 'cosface'): + if args.network is None: + args.network = params[2] + assert args.network is not None + print(args) + backbone_onnx = get_model(args.network, dropout=0) + + output_path = args.output + if output_path is None: + output_path = os.path.join(os.path.dirname(__file__), 'onnx') + if not os.path.exists(output_path): + os.makedirs(output_path) + assert os.path.isdir(output_path) + output_file = os.path.join(output_path, "%s.onnx" % model_name) + convert_onnx(backbone_onnx, input_file, output_file, simplify=args.simplify) diff --git a/pose_estimation/models/arcface_torch/train.py b/pose_estimation/models/arcface_torch/train.py new file mode 100755 index 0000000000000000000000000000000000000000..55eca2d0ad9463415970e09bccab8b722e496704 --- /dev/null +++ b/pose_estimation/models/arcface_torch/train.py @@ -0,0 +1,141 @@ +import argparse +import logging +import os + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch.utils.data.distributed +from torch.nn.utils import clip_grad_norm_ + +import losses +from backbones import get_model +from dataset import MXFaceDataset, SyntheticDataset, DataLoaderX +from partial_fc import PartialFC +from utils.utils_amp import MaxClipGradScaler +from utils.utils_callbacks import CallBackVerification, CallBackLogging, CallBackModelCheckpoint +from utils.utils_config import get_config +from utils.utils_logging import AverageMeter, init_logging + + +def main(args): + cfg = get_config(args.config) + try: + world_size = int(os.environ['WORLD_SIZE']) + rank = int(os.environ['RANK']) + dist.init_process_group('nccl') + except KeyError: + world_size = 1 + rank = 0 + dist.init_process_group(backend='nccl', init_method="tcp://127.0.0.1:12584", rank=rank, world_size=world_size) + + local_rank = args.local_rank + torch.cuda.set_device(local_rank) + os.makedirs(cfg.output, exist_ok=True) + init_logging(rank, cfg.output) + + if cfg.rec == "synthetic": + train_set = SyntheticDataset(local_rank=local_rank) + else: + train_set = MXFaceDataset(root_dir=cfg.rec, local_rank=local_rank) + + train_sampler = 
torch.utils.data.distributed.DistributedSampler(train_set, shuffle=True) + train_loader = DataLoaderX( + local_rank=local_rank, dataset=train_set, batch_size=cfg.batch_size, + sampler=train_sampler, num_workers=2, pin_memory=True, drop_last=True) + backbone = get_model(cfg.network, dropout=0.0, fp16=cfg.fp16, num_features=cfg.embedding_size).to(local_rank) + + if cfg.resume: + try: + backbone_pth = os.path.join(cfg.output, "backbone.pth") + backbone.load_state_dict(torch.load(backbone_pth, map_location=torch.device(local_rank))) + if rank == 0: + logging.info("backbone resume successfully!") + except (FileNotFoundError, KeyError, IndexError, RuntimeError): + if rank == 0: + logging.info("resume fail, backbone init successfully!") + + backbone = torch.nn.parallel.DistributedDataParallel( + module=backbone, broadcast_buffers=False, device_ids=[local_rank]) + backbone.train() + margin_softmax = losses.get_loss(cfg.loss) + module_partial_fc = PartialFC( + rank=rank, local_rank=local_rank, world_size=world_size, resume=cfg.resume, + batch_size=cfg.batch_size, margin_softmax=margin_softmax, num_classes=cfg.num_classes, + sample_rate=cfg.sample_rate, embedding_size=cfg.embedding_size, prefix=cfg.output) + + opt_backbone = torch.optim.SGD( + params=[{'params': backbone.parameters()}], + lr=cfg.lr / 512 * cfg.batch_size * world_size, + momentum=0.9, weight_decay=cfg.weight_decay) + opt_pfc = torch.optim.SGD( + params=[{'params': module_partial_fc.parameters()}], + lr=cfg.lr / 512 * cfg.batch_size * world_size, + momentum=0.9, weight_decay=cfg.weight_decay) + + num_image = len(train_set) + total_batch_size = cfg.batch_size * world_size + cfg.warmup_step = num_image // total_batch_size * cfg.warmup_epoch + cfg.total_step = num_image // total_batch_size * cfg.num_epoch + + def lr_step_func(current_step): + cfg.decay_step = [x * num_image // total_batch_size for x in cfg.decay_epoch] + if current_step < cfg.warmup_step: + return current_step / cfg.warmup_step + else: + return 0.1 ** len([m for m in cfg.decay_step if m <= current_step]) + + scheduler_backbone = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt_backbone, lr_lambda=lr_step_func) + scheduler_pfc = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt_pfc, lr_lambda=lr_step_func) + + for key, value in cfg.items(): + num_space = 25 - len(key) + logging.info(": " + key + " " * num_space + str(value)) + + val_target = cfg.val_targets + callback_verification = CallBackVerification(2000, rank, val_target, cfg.rec) + callback_logging = CallBackLogging(50, rank, cfg.total_step, cfg.batch_size, world_size, None) + callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output) + + loss = AverageMeter() + start_epoch = 0 + global_step = 0 + grad_amp = MaxClipGradScaler(cfg.batch_size, 128 * cfg.batch_size, growth_interval=100) if cfg.fp16 else None + for epoch in range(start_epoch, cfg.num_epoch): + train_sampler.set_epoch(epoch) + for step, (img, label) in enumerate(train_loader): + global_step += 1 + features = F.normalize(backbone(img)) + x_grad, loss_v = module_partial_fc.forward_backward(label, features, opt_pfc) + if cfg.fp16: + features.backward(grad_amp.scale(x_grad)) + grad_amp.unscale_(opt_backbone) + clip_grad_norm_(backbone.parameters(), max_norm=5, norm_type=2) + grad_amp.step(opt_backbone) + grad_amp.update() + else: + features.backward(x_grad) + clip_grad_norm_(backbone.parameters(), max_norm=5, norm_type=2) + opt_backbone.step() + + opt_pfc.step() + module_partial_fc.update() + opt_backbone.zero_grad() + opt_pfc.zero_grad() + 
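+            # Editor's note: loss_v is the distributed softmax cross-entropy computed
+            # inside PartialFC.forward_backward(); the backbone gradient arrives
+            # through features.backward(x_grad) above, so loss_v itself is only
+            # logged here and never back-propagated directly.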
loss.update(loss_v, 1) + callback_logging(global_step, loss, epoch, cfg.fp16, scheduler_backbone.get_last_lr()[0], grad_amp) + callback_verification(global_step, backbone) + scheduler_backbone.step() + scheduler_pfc.step() + callback_checkpoint(global_step, backbone, module_partial_fc) + dist.destroy_process_group() + + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + parser = argparse.ArgumentParser(description='PyTorch ArcFace Training') + parser.add_argument('config', type=str, help='py config file') + parser.add_argument('--local_rank', type=int, default=0, help='local_rank') + main(parser.parse_args()) diff --git a/pose_estimation/models/arcface_torch/utils/__init__.py b/pose_estimation/models/arcface_torch/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/arcface_torch/utils/plot.py b/pose_estimation/models/arcface_torch/utils/plot.py new file mode 100755 index 0000000000000000000000000000000000000000..ccc588e5c01ca550b69c385aeb3fd139c59fb88a --- /dev/null +++ b/pose_estimation/models/arcface_torch/utils/plot.py @@ -0,0 +1,72 @@ +# coding: utf-8 + +import os +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from sklearn.metrics import roc_curve, auc + +image_path = "/data/anxiang/IJB_release/IJBC" +files = [ + "./ms1mv3_arcface_r100/ms1mv3_arcface_r100/ijbc.npy" +] + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % image_path, + '%s_template_pair_label.txt' % 'ijbc')) + +methods = [] +scores = [] +for file in files: + methods.append(file.split('/')[-2]) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, "IJBC")) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10 ** -6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +print(tpr_fpr_table) diff --git a/pose_estimation/models/arcface_torch/utils/utils_amp.py b/pose_estimation/models/arcface_torch/utils/utils_amp.py new file mode 100755 index 0000000000000000000000000000000000000000..9ac2a03f4212faa129faed447a8f4519c0a00a8b --- 
/dev/null +++ b/pose_estimation/models/arcface_torch/utils/utils_amp.py @@ -0,0 +1,88 @@ +from typing import Dict, List + +import torch + +if torch.__version__ < '1.9': + Iterable = torch._six.container_abcs.Iterable +else: + import collections + + Iterable = collections.abc.Iterable +from torch.cuda.amp import GradScaler + + +class _MultiDeviceReplicator(object): + """ + Lazily serves copies of a tensor to requested devices. Copies are cached per-device. + """ + + def __init__(self, master_tensor: torch.Tensor) -> None: + assert master_tensor.is_cuda + self.master = master_tensor + self._per_device_tensors: Dict[torch.device, torch.Tensor] = {} + + def get(self, device) -> torch.Tensor: + retval = self._per_device_tensors.get(device, None) + if retval is None: + retval = self.master.to(device=device, non_blocking=True, copy=True) + self._per_device_tensors[device] = retval + return retval + + +class MaxClipGradScaler(GradScaler): + def __init__(self, init_scale, max_scale: float, growth_interval=100): + GradScaler.__init__(self, init_scale=init_scale, growth_interval=growth_interval) + self.max_scale = max_scale + + def scale_clip(self): + if self.get_scale() == self.max_scale: + self.set_growth_factor(1) + elif self.get_scale() < self.max_scale: + self.set_growth_factor(2) + elif self.get_scale() > self.max_scale: + self._scale.fill_(self.max_scale) + self.set_growth_factor(1) + + def scale(self, outputs): + """ + Multiplies ('scales') a tensor or list of tensors by the scale factor. + + Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned + unmodified. + + Arguments: + outputs (Tensor or iterable of Tensors): Outputs to scale. + """ + if not self._enabled: + return outputs + self.scale_clip() + # Short-circuit for the common case. + if isinstance(outputs, torch.Tensor): + assert outputs.is_cuda + if self._scale is None: + self._lazy_init_scale_growth_tracker(outputs.device) + assert self._scale is not None + return outputs * self._scale.to(device=outputs.device, non_blocking=True) + + # Invoke the more complex machinery only if we're treating multiple outputs. 
+ stash: List[_MultiDeviceReplicator] = [] # holds a reference that can be overwritten by apply_scale + + def apply_scale(val): + if isinstance(val, torch.Tensor): + assert val.is_cuda + if len(stash) == 0: + if self._scale is None: + self._lazy_init_scale_growth_tracker(val.device) + assert self._scale is not None + stash.append(_MultiDeviceReplicator(self._scale)) + return val * stash[0].get(val.device) + elif isinstance(val, Iterable): + iterable = map(apply_scale, val) + if isinstance(val, list) or isinstance(val, tuple): + return type(val)(iterable) + else: + return iterable + else: + raise ValueError("outputs must be a Tensor or an iterable of Tensors") + + return apply_scale(outputs) diff --git a/pose_estimation/models/arcface_torch/utils/utils_callbacks.py b/pose_estimation/models/arcface_torch/utils/utils_callbacks.py new file mode 100755 index 0000000000000000000000000000000000000000..bd2f56cba47c57de102710ff56eaac591e59f4da --- /dev/null +++ b/pose_estimation/models/arcface_torch/utils/utils_callbacks.py @@ -0,0 +1,117 @@ +import logging +import os +import time +from typing import List + +import torch + +from eval import verification +from utils.utils_logging import AverageMeter + + +class CallBackVerification(object): + def __init__(self, frequent, rank, val_targets, rec_prefix, image_size=(112, 112)): + self.frequent: int = frequent + self.rank: int = rank + self.highest_acc: float = 0.0 + self.highest_acc_list: List[float] = [0.0] * len(val_targets) + self.ver_list: List[object] = [] + self.ver_name_list: List[str] = [] + if self.rank is 0: + self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size) + + def ver_test(self, backbone: torch.nn.Module, global_step: int): + results = [] + for i in range(len(self.ver_list)): + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + self.ver_list[i], backbone, 10, 10) + logging.info('[%s][%d]XNorm: %f' % (self.ver_name_list[i], global_step, xnorm)) + logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) + if acc2 > self.highest_acc_list[i]: + self.highest_acc_list[i] = acc2 + logging.info( + '[%s][%d]Accuracy-Highest: %1.5f' % (self.ver_name_list[i], global_step, self.highest_acc_list[i])) + results.append(acc2) + + def init_dataset(self, val_targets, data_dir, image_size): + for name in val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin(path, image_size) + self.ver_list.append(data_set) + self.ver_name_list.append(name) + + def __call__(self, num_update, backbone: torch.nn.Module): + if self.rank is 0 and num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone, num_update) + backbone.train() + + +class CallBackLogging(object): + def __init__(self, frequent, rank, total_step, batch_size, world_size, writer=None): + self.frequent: int = frequent + self.rank: int = rank + self.time_start = time.time() + self.total_step: int = total_step + self.batch_size: int = batch_size + self.world_size: int = world_size + self.writer = writer + + self.init = False + self.tic = 0 + + def __call__(self, + global_step: int, + loss: AverageMeter, + epoch: int, + fp16: bool, + learning_rate: float, + grad_scaler: torch.cuda.amp.GradScaler): + if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0: + if self.init: + try: + speed: float = self.frequent * self.batch_size / (time.time() - self.tic) + speed_total = speed * self.world_size 
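MaxClipGradScaler above only adds a cap (max_scale) on the dynamic loss scale via scale_clip(); otherwise the training loop follows the standard scale → unscale_ → clip_grad_norm_ → step → update sequence. A minimal sketch with the stock torch.cuda.amp.GradScaler standing in for the subclass (GPU-only, toy model, illustrative only):

import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(8, 1).cuda()
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler(init_scale=128.0, growth_interval=100)
    x = torch.randn(4, 8, device='cuda')
    with torch.cuda.amp.autocast():
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()                # gradients are computed at the scaled magnitude
    scaler.unscale_(opt)                         # bring them back before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5, norm_type=2)
    scaler.step(opt)                             # skips the step if inf/nan gradients were found
    scaler.update()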
+ except ZeroDivisionError: + speed_total = float('inf') + + time_now = (time.time() - self.time_start) / 3600 + time_total = time_now / ((global_step + 1) / self.total_step) + time_for_end = time_total - time_now + if self.writer is not None: + self.writer.add_scalar('time_for_end', time_for_end, global_step) + self.writer.add_scalar('learning_rate', learning_rate, global_step) + self.writer.add_scalar('loss', loss.avg, global_step) + if fp16: + msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " \ + "Fp16 Grad Scale: %2.f Required: %1.f hours" % ( + speed_total, loss.avg, learning_rate, epoch, global_step, + grad_scaler.get_scale(), time_for_end + ) + else: + msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " \ + "Required: %1.f hours" % ( + speed_total, loss.avg, learning_rate, epoch, global_step, time_for_end + ) + logging.info(msg) + loss.reset() + self.tic = time.time() + else: + self.init = True + self.tic = time.time() + + +class CallBackModelCheckpoint(object): + def __init__(self, rank, output="./"): + self.rank: int = rank + self.output: str = output + + def __call__(self, global_step, backbone, partial_fc, ): + if global_step > 100 and self.rank == 0: + path_module = os.path.join(self.output, "backbone.pth") + torch.save(backbone.module.state_dict(), path_module) + logging.info("Pytorch Model Saved in '{}'".format(path_module)) + + if global_step > 100 and partial_fc is not None: + partial_fc.save_params() diff --git a/pose_estimation/models/arcface_torch/utils/utils_config.py b/pose_estimation/models/arcface_torch/utils/utils_config.py new file mode 100755 index 0000000000000000000000000000000000000000..0c02eaf70fc0140aca7925f621c29a496f491cae --- /dev/null +++ b/pose_estimation/models/arcface_torch/utils/utils_config.py @@ -0,0 +1,16 @@ +import importlib +import os.path as osp + + +def get_config(config_file): + assert config_file.startswith('configs/'), 'config file setting must start with configs/' + temp_config_name = osp.basename(config_file) + temp_module_name = osp.splitext(temp_config_name)[0] + config = importlib.import_module("configs.base") + cfg = config.config + config = importlib.import_module("configs.%s" % temp_module_name) + job_cfg = config.config + cfg.update(job_cfg) + if cfg.output is None: + cfg.output = osp.join('work_dirs', temp_module_name) + return cfg \ No newline at end of file diff --git a/pose_estimation/models/arcface_torch/utils/utils_logging.py b/pose_estimation/models/arcface_torch/utils/utils_logging.py new file mode 100755 index 0000000000000000000000000000000000000000..c787b6aae7cd037a4718df44d672b8ffa9e5c249 --- /dev/null +++ b/pose_estimation/models/arcface_torch/utils/utils_logging.py @@ -0,0 +1,41 @@ +import logging +import os +import sys + + +class AverageMeter(object): + """Computes and stores the average and current value + """ + + def __init__(self): + self.val = None + self.avg = None + self.sum = None + self.count = None + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def init_logging(rank, models_root): + if rank == 0: + log_root = logging.getLogger() + log_root.setLevel(logging.INFO) + formatter = logging.Formatter("Training: %(asctime)s-%(message)s") + handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) + handler_stream = logging.StreamHandler(sys.stdout) + 
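The "Required: N hours" figure logged above is a simple proportional ETA: projected total runtime is elapsed time divided by the fraction of steps completed. With made-up numbers:

elapsed_h = 2.0                                            # hours since time_start (assumed)
global_step, total_step = 10_000, 80_000                   # assumed progress
time_total = elapsed_h / ((global_step + 1) / total_step)  # projected total runtime
print(f"Required: {time_total - elapsed_h:.1f} hours")     # ~14.0 hours remaining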
handler_file.setFormatter(formatter) + handler_stream.setFormatter(formatter) + log_root.addHandler(handler_file) + log_root.addHandler(handler_stream) + log_root.info('rank_id: %d' % rank) diff --git a/pose_estimation/models/arcface_torch/utils/utils_os.py b/pose_estimation/models/arcface_torch/utils/utils_os.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/models/base_model.py b/pose_estimation/models/base_model.py new file mode 100755 index 0000000000000000000000000000000000000000..478d6d0d1795c180153ffd4ff7dcfbbaa328b4be --- /dev/null +++ b/pose_estimation/models/base_model.py @@ -0,0 +1,317 @@ +"""This script defines the base network model for Deep3DFaceRecon_pytorch +""" + +import os +import torch +from collections import OrderedDict +from abc import ABC, abstractmethod +from . import networks + + +class BaseModel(ABC): + """This class is an abstract base class (ABC) for models. + To create a subclass, you need to implement the following five functions: + -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). + -- : unpack data from dataset and apply preprocessing. + -- : produce intermediate results. + -- : calculate losses, gradients, and update network weights. + -- : (optionally) add model-specific options and set default options. + """ + + def __init__(self, opt): + """Initialize the BaseModel class. + + Parameters: + opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions + + When creating your custom class, you need to implement your own initialization. + In this fucntion, you should first call + Then, you need to define four lists: + -- self.loss_names (str list): specify the training losses that you want to plot and save. + -- self.model_names (str list): specify the images that you want to display and save. + -- self.visual_names (str list): define networks used in our training. + -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. + """ + self.opt = opt + self.isTrain = opt.isTrain + self.device = torch.device('cpu') + self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir + self.loss_names = [] + self.model_names = [] + self.visual_names = [] + self.parallel_names = [] + self.optimizers = [] + self.image_paths = [] + self.metric = 0 # used for learning rate policy 'plateau' + + @staticmethod + def dict_grad_hook_factory(add_func=lambda x: x): + saved_dict = dict() + + def hook_gen(name): + def grad_hook(grad): + saved_vals = add_func(grad) + saved_dict[name] = saved_vals + return grad_hook + return hook_gen, saved_dict + + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new model-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + return parser + + @abstractmethod + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input (dict): includes the data itself and its metadata information. 
+ """ + pass + + @abstractmethod + def forward(self): + """Run forward pass; called by both functions and .""" + pass + + @abstractmethod + def optimize_parameters(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + pass + + def setup(self, opt): + """Load and print networks; create schedulers + + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + if self.isTrain: + self.schedulers = [networks.get_scheduler(optimizer, opt) for optimizer in self.optimizers] + + if not self.isTrain or opt.continue_train: + load_suffix = opt.epoch + self.load_networks(load_suffix) + + + # self.print_networks(opt.verbose) + + def parallelize(self, convert_sync_batchnorm=True): + if not self.opt.use_ddp: + for name in self.parallel_names: + if 'renderer' in name: + continue + if isinstance(name, str): + module = getattr(self, name) + setattr(self, name, module.to(self.device)) + else: + for name in self.model_names: + if isinstance(name, str): + module = getattr(self, name) + if convert_sync_batchnorm: + module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module) + setattr(self, name, torch.nn.parallel.DistributedDataParallel(module.to(self.device), + device_ids=[self.device.index], + find_unused_parameters=True, broadcast_buffers=True)) + + # DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient. + for name in self.parallel_names: + if isinstance(name, str) and name not in self.model_names: + module = getattr(self, name) + setattr(self, name, module.to(self.device)) + + # put state_dict of optimizer to gpu device + if self.opt.phase != 'test': + if self.opt.continue_train: + for optim in self.optimizers: + for state in optim.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(self.device) + + def data_dependent_initialize(self, data): + pass + + def train(self): + """Make models train mode""" + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + net.train() + + def eval(self): + """Make models eval mode""" + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + net.eval() + + def test(self): + """Forward function used in test time. + + This function wraps function in no_grad() so we don't save intermediate steps for backprop + It also calls to produce additional visualization results + """ + with torch.no_grad(): + self.forward() + # self.compute_visuals() + + def compute_visuals(self): + """Calculate additional output images for visdom and HTML visualization""" + pass + + def get_image_paths(self, name='A'): + """ Return image paths that are used to load current data""" + return self.image_paths if name =='A' else self.image_paths_B + + def update_learning_rate(self): + """Update learning rates for all the networks; called at the end of every epoch""" + for scheduler in self.schedulers: + if self.opt.lr_policy == 'plateau': + scheduler.step(self.metric) + else: + scheduler.step() + + lr = self.optimizers[0].param_groups[0]['lr'] + print('learning rate = %.7f' % lr) + + def get_current_visuals(self): + """Return visualization images. train.py will display these images with visdom, and save the images to a HTML""" + visual_ret = OrderedDict() + for name in self.visual_names: + if isinstance(name, str): + visual_ret[name] = getattr(self, name)[:, :3, ...] 
+ return visual_ret + + def get_current_losses(self): + """Return traning losses / errors. train.py will print out these errors on console, and save them to a file""" + errors_ret = OrderedDict() + for name in self.loss_names: + if isinstance(name, str): + errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number + return errors_ret + + def save_networks(self, epoch): + """Save all the networks to the disk. + + Parameters: + epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) + """ + if not os.path.isdir(self.save_dir): + os.makedirs(self.save_dir) + + save_filename = 'epoch_%s.pth' % (epoch) + save_path = os.path.join(self.save_dir, save_filename) + + save_dict = {} + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + if isinstance(net, torch.nn.DataParallel) or isinstance(net, + torch.nn.parallel.DistributedDataParallel): + net = net.module + save_dict[name] = net.state_dict() + + + for i, optim in enumerate(self.optimizers): + save_dict['opt_%02d'%i] = optim.state_dict() + + for i, sched in enumerate(self.schedulers): + save_dict['sched_%02d'%i] = sched.state_dict() + + torch.save(save_dict, save_path) + + def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0): + """Fix InstanceNorm checkpoints incompatibility (prior to 0.4)""" + key = keys[i] + if i + 1 == len(keys): # at the end, pointing to a parameter/buffer + if module.__class__.__name__.startswith('InstanceNorm') and \ + (key == 'running_mean' or key == 'running_var'): + if getattr(module, key) is None: + state_dict.pop('.'.join(keys)) + if module.__class__.__name__.startswith('InstanceNorm') and \ + (key == 'num_batches_tracked'): + state_dict.pop('.'.join(keys)) + else: + self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1) + + def load_networks(self, epoch): + """Load all the networks from the disk. 
+ + Parameters: + epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) + """ + if self.opt.isTrain and self.opt.pretrained_name is not None: + load_dir = os.path.join(self.opt.checkpoints_dir, self.opt.pretrained_name) + else: + load_dir = self.save_dir + load_filename = 'epoch_%s.pth' % (epoch) + load_path = os.path.join(load_dir, load_filename) + state_dict = torch.load(load_path, map_location=self.device) + print('loading the model from %s' % load_path) + + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + if isinstance(net, torch.nn.DataParallel): + net = net.module + net.load_state_dict(state_dict[name]) + + if self.opt.phase != 'test': + if self.opt.continue_train: + print('loading the optim from %s' % load_path) + for i, optim in enumerate(self.optimizers): + optim.load_state_dict(state_dict['opt_%02d'%i]) + + try: + print('loading the sched from %s' % load_path) + for i, sched in enumerate(self.schedulers): + sched.load_state_dict(state_dict['sched_%02d'%i]) + except: + print('Failed to load schedulers, set schedulers according to epoch count manually') + for i, sched in enumerate(self.schedulers): + sched.last_epoch = self.opt.epoch_count - 1 + + + + + def print_networks(self, verbose): + """Print the total number of parameters in the network and (if verbose) network architecture + + Parameters: + verbose (bool) -- if verbose: print the network architecture + """ + print('---------- Networks initialized -------------') + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + num_params = 0 + for param in net.parameters(): + num_params += param.numel() + if verbose: + print(net) + print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6)) + print('-----------------------------------------------') + + def set_requires_grad(self, nets, requires_grad=False): + """Set requies_grad=Fasle for all the networks to avoid unnecessary computations + Parameters: + nets (network list) -- a list of networks + requires_grad (bool) -- whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + def generate_visuals_for_evaluation(self, data, mode): + return {} diff --git a/pose_estimation/models/bfm.py b/pose_estimation/models/bfm.py new file mode 100755 index 0000000000000000000000000000000000000000..f5aea7b3c6e2fa83a5035a7c36906fe3afa7e181 --- /dev/null +++ b/pose_estimation/models/bfm.py @@ -0,0 +1,299 @@ +"""This script defines the parametric 3d face model for Deep3DFaceRecon_pytorch +""" + +import numpy as np +import torch +import torch.nn.functional as F +from scipy.io import loadmat +from util.load_mats import transferBFM09 +import os + +def perspective_projection(focal, center): + # return p.T (N, 3) @ (3, 3) + return np.array([ + focal, 0, center, + 0, focal, center, + 0, 0, 1 + ]).reshape([3, 3]).astype(np.float32).transpose() + +class SH: + def __init__(self): + self.a = [np.pi, 2 * np.pi / np.sqrt(3.), 2 * np.pi / np.sqrt(8.)] + self.c = [1/np.sqrt(4 * np.pi), np.sqrt(3.) / np.sqrt(4 * np.pi), 3 * np.sqrt(5.) 
/ np.sqrt(12 * np.pi)] + + + +class ParametricFaceModel: + def __init__(self, + bfm_folder='./BFM', + recenter=True, + camera_distance=10., + init_lit=np.array([ + 0.8, 0, 0, 0, 0, 0, 0, 0, 0 + ]), + focal=1015., + center=112., + is_train=True, + default_name='BFM_model_front.mat'): + + if not os.path.isfile(os.path.join(bfm_folder, default_name)): + transferBFM09(bfm_folder) + model = loadmat(os.path.join(bfm_folder, default_name)) + # mean face shape. [3*N,1] + self.mean_shape = model['meanshape'].astype(np.float32) + # identity basis. [3*N,80] + self.id_base = model['idBase'].astype(np.float32) + # expression basis. [3*N,64] + self.exp_base = model['exBase'].astype(np.float32) + # mean face texture. [3*N,1] (0-255) + self.mean_tex = model['meantex'].astype(np.float32) + # texture basis. [3*N,80] + self.tex_base = model['texBase'].astype(np.float32) + # face indices for each vertex that lies in. starts from 0. [N,8] + self.point_buf = model['point_buf'].astype(np.int64) - 1 + # vertex indices for each face. starts from 0. [F,3] + self.face_buf = model['tri'].astype(np.int64) - 1 + # vertex indices for 68 landmarks. starts from 0. [68,1] + self.keypoints = np.squeeze(model['keypoints']).astype(np.int64) - 1 + + if is_train: + # vertex indices for small face region to compute photometric error. starts from 0. + self.front_mask = np.squeeze(model['frontmask2_idx']).astype(np.int64) - 1 + # vertex indices for each face from small face region. starts from 0. [f,3] + self.front_face_buf = model['tri_mask2'].astype(np.int64) - 1 + # vertex indices for pre-defined skin region to compute reflectance loss + self.skin_mask = np.squeeze(model['skinmask']) + + if recenter: + mean_shape = self.mean_shape.reshape([-1, 3]) + mean_shape = mean_shape - np.mean(mean_shape, axis=0, keepdims=True) + self.mean_shape = mean_shape.reshape([-1, 1]) + + self.persc_proj = perspective_projection(focal, center) + self.device = 'cpu' + self.camera_distance = camera_distance + self.SH = SH() + self.init_lit = init_lit.reshape([1, 1, -1]).astype(np.float32) + + + def to(self, device): + self.device = device + for key, value in self.__dict__.items(): + if type(value).__module__ == np.__name__: + setattr(self, key, torch.tensor(value).to(device)) + + + def compute_shape(self, id_coeff, exp_coeff): + """ + Return: + face_shape -- torch.tensor, size (B, N, 3) + + Parameters: + id_coeff -- torch.tensor, size (B, 80), identity coeffs + exp_coeff -- torch.tensor, size (B, 64), expression coeffs + """ + batch_size = id_coeff.shape[0] + id_part = torch.einsum('ij,aj->ai', self.id_base, id_coeff) + exp_part = torch.einsum('ij,aj->ai', self.exp_base, exp_coeff) + face_shape = id_part + exp_part + self.mean_shape.reshape([1, -1]) + return face_shape.reshape([batch_size, -1, 3]) + + + def compute_texture(self, tex_coeff, normalize=True): + """ + Return: + face_texture -- torch.tensor, size (B, N, 3), in RGB order, range (0, 1.) + + Parameters: + tex_coeff -- torch.tensor, size (B, 80) + """ + batch_size = tex_coeff.shape[0] + face_texture = torch.einsum('ij,aj->ai', self.tex_base, tex_coeff) + self.mean_tex + if normalize: + face_texture = face_texture / 255. 
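compute_shape above is the linear 3DMM step, shape = mean + id_basis·id_coeff + exp_basis·exp_coeff, evaluated with einsum. A toy-sized sketch of the same computation (random bases, 5 vertices instead of the full BFM mesh):

import torch

N = 5                                                      # toy vertex count
mean_shape = torch.zeros(3 * N, 1)
id_base, exp_base = torch.randn(3 * N, 80), torch.randn(3 * N, 64)
id_coeff, exp_coeff = torch.randn(2, 80), torch.randn(2, 64)   # batch of 2

id_part = torch.einsum('ij,aj->ai', id_base, id_coeff)
exp_part = torch.einsum('ij,aj->ai', exp_base, exp_coeff)
face_shape = (id_part + exp_part + mean_shape.reshape(1, -1)).reshape(2, -1, 3)
print(face_shape.shape)                                    # torch.Size([2, 5, 3])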
+ return face_texture.reshape([batch_size, -1, 3]) + + + def compute_norm(self, face_shape): + """ + Return: + vertex_norm -- torch.tensor, size (B, N, 3) + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + """ + + v1 = face_shape[:, self.face_buf[:, 0]] + v2 = face_shape[:, self.face_buf[:, 1]] + v3 = face_shape[:, self.face_buf[:, 2]] + e1 = v1 - v2 + e2 = v2 - v3 + face_norm = torch.cross(e1, e2, dim=-1) + face_norm = F.normalize(face_norm, dim=-1, p=2) + face_norm = torch.cat([face_norm, torch.zeros(face_norm.shape[0], 1, 3).to(self.device)], dim=1) + + vertex_norm = torch.sum(face_norm[:, self.point_buf], dim=2) + vertex_norm = F.normalize(vertex_norm, dim=-1, p=2) + return vertex_norm + + + def compute_color(self, face_texture, face_norm, gamma): + """ + Return: + face_color -- torch.tensor, size (B, N, 3), range (0, 1.) + + Parameters: + face_texture -- torch.tensor, size (B, N, 3), from texture model, range (0, 1.) + face_norm -- torch.tensor, size (B, N, 3), rotated face normal + gamma -- torch.tensor, size (B, 27), SH coeffs + """ + batch_size = gamma.shape[0] + v_num = face_texture.shape[1] + a, c = self.SH.a, self.SH.c + gamma = gamma.reshape([batch_size, 3, 9]) + gamma = gamma + self.init_lit + gamma = gamma.permute(0, 2, 1) + Y = torch.cat([ + a[0] * c[0] * torch.ones_like(face_norm[..., :1]).to(self.device), + -a[1] * c[1] * face_norm[..., 1:2], + a[1] * c[1] * face_norm[..., 2:], + -a[1] * c[1] * face_norm[..., :1], + a[2] * c[2] * face_norm[..., :1] * face_norm[..., 1:2], + -a[2] * c[2] * face_norm[..., 1:2] * face_norm[..., 2:], + 0.5 * a[2] * c[2] / np.sqrt(3.) * (3 * face_norm[..., 2:] ** 2 - 1), + -a[2] * c[2] * face_norm[..., :1] * face_norm[..., 2:], + 0.5 * a[2] * c[2] * (face_norm[..., :1] ** 2 - face_norm[..., 1:2] ** 2) + ], dim=-1) + r = Y @ gamma[..., :1] + g = Y @ gamma[..., 1:2] + b = Y @ gamma[..., 2:] + face_color = torch.cat([r, g, b], dim=-1) * face_texture + return face_color + + + def compute_rotation(self, angles): + """ + Return: + rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat + + Parameters: + angles -- torch.tensor, size (B, 3), radian + """ + + batch_size = angles.shape[0] + ones = torch.ones([batch_size, 1]).to(self.device) + zeros = torch.zeros([batch_size, 1]).to(self.device) + x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:], + + rot_x = torch.cat([ + ones, zeros, zeros, + zeros, torch.cos(x), -torch.sin(x), + zeros, torch.sin(x), torch.cos(x) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_y = torch.cat([ + torch.cos(y), zeros, torch.sin(y), + zeros, ones, zeros, + -torch.sin(y), zeros, torch.cos(y) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_z = torch.cat([ + torch.cos(z), -torch.sin(z), zeros, + torch.sin(z), torch.cos(z), zeros, + zeros, zeros, ones + ], dim=1).reshape([batch_size, 3, 3]) + + rot = rot_z @ rot_y @ rot_x + return rot.permute(0, 2, 1) + + + def to_camera(self, face_shape): + face_shape[..., -1] = self.camera_distance - face_shape[..., -1] + return face_shape + + def to_image(self, face_shape): + """ + Return: + face_proj -- torch.tensor, size (B, N, 2), y direction is opposite to v direction + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + """ + # to image_plane + face_proj = face_shape @ self.persc_proj + face_proj = face_proj[..., :2] / face_proj[..., 2:] + + return face_proj + + + def transform(self, face_shape, rot, trans): + """ + Return: + face_shape -- torch.tensor, size (B, N, 3) pts @ rot + trans + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + rot -- 
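compute_rotation above builds R = Rz @ Ry @ Rx from per-axis Euler angles and returns the transpose so that row vectors transform as pts @ R. A compact, self-contained version of the same construction (batch of one, illustrative only):

import torch

def euler_to_rot(angles):                                  # angles: (B, 3), radians
    x, y, z = angles[:, 0], angles[:, 1], angles[:, 2]
    B = angles.shape[0]
    one, zero = torch.ones(B), torch.zeros(B)
    c, s = torch.cos, torch.sin
    rot_x = torch.stack([one, zero, zero, zero, c(x), -s(x), zero, s(x), c(x)], 1).reshape(B, 3, 3)
    rot_y = torch.stack([c(y), zero, s(y), zero, one, zero, -s(y), zero, c(y)], 1).reshape(B, 3, 3)
    rot_z = torch.stack([c(z), -s(z), zero, s(z), c(z), zero, zero, zero, one], 1).reshape(B, 3, 3)
    return (rot_z @ rot_y @ rot_x).permute(0, 2, 1)

print(euler_to_rot(torch.tensor([[0.1, 0.0, 0.0]])))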
torch.tensor, size (B, 3, 3) + trans -- torch.tensor, size (B, 3) + """ + return face_shape @ rot + trans.unsqueeze(1) + + + def get_landmarks(self, face_proj): + """ + Return: + face_lms -- torch.tensor, size (B, 68, 2) + + Parameters: + face_proj -- torch.tensor, size (B, N, 2) + """ + return face_proj[:, self.keypoints] + + def split_coeff(self, coeffs): + """ + Return: + coeffs_dict -- a dict of torch.tensors + + Parameters: + coeffs -- torch.tensor, size (B, 256) + """ + id_coeffs = coeffs[:, :80] + exp_coeffs = coeffs[:, 80: 144] + tex_coeffs = coeffs[:, 144: 224] + angles = coeffs[:, 224: 227] + gammas = coeffs[:, 227: 254] + translations = coeffs[:, 254:] + return { + 'id': id_coeffs, + 'exp': exp_coeffs, + 'tex': tex_coeffs, + 'angle': angles, + 'gamma': gammas, + 'trans': translations + } + def compute_for_render(self, coeffs): + """ + Return: + face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate + face_color -- torch.tensor, size (B, N, 3), in RGB order + landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction + Parameters: + coeffs -- torch.tensor, size (B, 257) + """ + coef_dict = self.split_coeff(coeffs) + face_shape = self.compute_shape(coef_dict['id'], coef_dict['exp']) + rotation = self.compute_rotation(coef_dict['angle']) + + + face_shape_transformed = self.transform(face_shape, rotation, coef_dict['trans']) #face_shape + face_vertex = self.to_camera(face_shape_transformed) + + face_proj = self.to_image(face_vertex) + landmark = self.get_landmarks(face_proj) + + face_texture = self.compute_texture(coef_dict['tex']) + face_norm = self.compute_norm(face_shape) + face_norm_roted = face_norm @ rotation + face_color = self.compute_color(face_texture, face_norm_roted, coef_dict['gamma']) + face_color = (face_norm+1)*0.5 + return face_vertex, face_texture, face_color, landmark, face_shape diff --git a/pose_estimation/models/facerecon_model.py b/pose_estimation/models/facerecon_model.py new file mode 100755 index 0000000000000000000000000000000000000000..4eb91f54caa611152c6f580f0ec0250eeec977f4 --- /dev/null +++ b/pose_estimation/models/facerecon_model.py @@ -0,0 +1,236 @@ +"""This script defines the face reconstruction model for Deep3DFaceRecon_pytorch +""" + +import numpy as np +import torch +from .base_model import BaseModel +from . import networks +from .bfm import ParametricFaceModel +from .losses import perceptual_loss, photo_loss, reg_loss, reflectance_loss, landmark_loss +from util import util +#from util.nvdiffrast import MeshRenderer +from util.preprocess import estimate_norm_torch + +import trimesh + + +class FaceReconModel(BaseModel): + + @staticmethod + def modify_commandline_options(parser, is_train=True): + """ Configures options specific for CUT model + """ + # net structure and parameters + parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='network structure') + parser.add_argument('--init_path', type=str, default='checkpoints/init_model/resnet50-0676ba61.pth') + parser.add_argument('--use_last_fc', type=util.str2bool, nargs='?', const=True, default=False, help='zero initialize the last fc') + parser.add_argument('--bfm_folder', type=str, default='BFM') + parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model') + + # renderer parameters + parser.add_argument('--focal', type=float, default=1015.) + parser.add_argument('--center', type=float, default=112.) + parser.add_argument('--camera_d', type=float, default=10.) 
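split_coeff above assumes the regressor outputs a single 257-dim vector laid out as 80 identity + 64 expression + 80 texture + 3 angles + 27 SH gamma + 3 translation coefficients. A quick illustrative check of that layout (not part of the patch):

import torch

coeffs = torch.randn(1, 257)
layout = {'id': (0, 80), 'exp': (80, 144), 'tex': (144, 224),
          'angle': (224, 227), 'gamma': (227, 254), 'trans': (254, 257)}
parts = {k: coeffs[:, a:b] for k, (a, b) in layout.items()}
assert sum(v.shape[1] for v in parts.values()) == 257
print({k: tuple(v.shape) for k, v in parts.items()})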
+ parser.add_argument('--z_near', type=float, default=5.) + parser.add_argument('--z_far', type=float, default=15.) + + if is_train: + # training parameters + parser.add_argument('--net_recog', type=str, default='r50', choices=['r18', 'r43', 'r50'], help='face recog network structure') + parser.add_argument('--net_recog_path', type=str, default='checkpoints/recog_model/ms1mv3_arcface_r50_fp16/backbone.pth') + parser.add_argument('--use_crop_face', type=util.str2bool, nargs='?', const=True, default=False, help='use crop mask for photo loss') + parser.add_argument('--use_predef_M', type=util.str2bool, nargs='?', const=True, default=False, help='use predefined M for predicted face') + + + # augmentation parameters + parser.add_argument('--shift_pixs', type=float, default=10., help='shift pixels') + parser.add_argument('--scale_delta', type=float, default=0.1, help='delta scale factor') + parser.add_argument('--rot_angle', type=float, default=10., help='rot angles, degree') + + # loss weights + parser.add_argument('--w_feat', type=float, default=1.0, help='weight for feat loss') + parser.add_argument('--w_color', type=float, default=1.92, help='weight for loss loss') + parser.add_argument('--w_reg', type=float, default=3.0e-4, help='weight for reg loss') + parser.add_argument('--w_id', type=float, default=0.2, help='weight for id_reg loss') + parser.add_argument('--w_exp', type=float, default=0.8, help='weight for exp_reg loss') + parser.add_argument('--w_tex', type=float, default=1.7e-2, help='weight for tex_reg loss') + parser.add_argument('--w_gamma', type=float, default=10.0, help='weight for gamma loss') + parser.add_argument('--w_lm', type=float, default=1.6e-3, help='weight for lm loss') + parser.add_argument('--w_reflc', type=float, default=5.0, help='weight for reflc loss') + + + + opt, _ = parser.parse_known_args() + parser.set_defaults( + focal=1015., center=112., camera_d=10., use_last_fc=False, z_near=5., z_far=15. + ) + if is_train: + parser.set_defaults( + use_crop_face=True, use_predef_M=False + ) + return parser + + def __init__(self, opt): + """Initialize this model class. + + Parameters: + opt -- training/test options + + A few things can be done here. 
+ - (required) call the initialization function of BaseModel + - define loss function, visualization images, model names, and optimizers + """ + BaseModel.__init__(self, opt) # call the initialization method of BaseModel + + self.visual_names = ['output_vis'] + self.model_names = ['net_recon'] + self.parallel_names = self.model_names + ['renderer'] + + self.net_recon = networks.define_net_recon( + net_recon=opt.net_recon, use_last_fc=opt.use_last_fc, init_path=opt.init_path + ) + + self.facemodel = ParametricFaceModel( + bfm_folder=opt.bfm_folder, camera_distance=opt.camera_d, focal=opt.focal, center=opt.center, + is_train=self.isTrain, default_name=opt.bfm_model + ) + + print("FOCAL") + print(opt.focal) + fov = 2 * np.arctan(opt.center / opt.focal) * 180 / np.pi + #Renderer( + # rasterize_fov=fov, znear=opt.z_near, zfar=opt.z_far, rasterize_size=int(2 * opt.center) + # ) + + if self.isTrain: + self.loss_names = ['all', 'feat', 'color', 'lm', 'reg', 'gamma', 'reflc'] + + self.net_recog = networks.define_net_recog( + net_recog=opt.net_recog, pretrained_path=opt.net_recog_path + ) + # loss func name: (compute_%s_loss) % loss_name + self.compute_feat_loss = perceptual_loss + self.comupte_color_loss = photo_loss + self.compute_lm_loss = landmark_loss + self.compute_reg_loss = reg_loss + self.compute_reflc_loss = reflectance_loss + + self.optimizer = torch.optim.Adam(self.net_recon.parameters(), lr=opt.lr) + self.optimizers = [self.optimizer] + self.parallel_names += ['net_recog'] + # Our program will automatically call to define schedulers, load networks, and print networks + + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input: a dictionary that contains the data itself and its metadata information. 
+ """ + self.input_img = input['imgs'].to(self.device) + self.atten_mask = input['msks'].to(self.device) if 'msks' in input else None + self.gt_lm = input['lms'].to(self.device) if 'lms' in input else None + self.trans_m = input['M'].to(self.device) if 'M' in input else None + self.image_paths = input['im_paths'] if 'im_paths' in input else None + + def forward(self): + output_coeff = self.net_recon(self.input_img) + self.facemodel.to(self.device) + self.pred_vertex, self.pred_tex, self.pred_color, self.pred_lm, self.object_vertex = \ + self.facemodel.compute_for_render(output_coeff) + # self.pred_mask, _, self.pred_face = self.renderer( + # self.pred_vertex, self.facemodel.face_buf, feat=self.pred_color) + + self.pred_coeffs_dict = self.facemodel.split_coeff(output_coeff) + + + def compute_losses(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + + assert self.net_recog.training == False + trans_m = self.trans_m + if not self.opt.use_predef_M: + trans_m = estimate_norm_torch(self.pred_lm, self.input_img.shape[-2]) + + pred_feat = self.net_recog(self.pred_face, trans_m) + gt_feat = self.net_recog(self.input_img, self.trans_m) + self.loss_feat = self.opt.w_feat * self.compute_feat_loss(pred_feat, gt_feat) + + face_mask = self.pred_mask + if self.opt.use_crop_face: + face_mask, _, _ = self.renderer(self.pred_vertex, self.facemodel.front_face_buf) + + face_mask = face_mask.detach() + self.loss_color = self.opt.w_color * self.comupte_color_loss( + self.pred_face, self.input_img, self.atten_mask * face_mask) + + loss_reg, loss_gamma = self.compute_reg_loss(self.pred_coeffs_dict, self.opt) + self.loss_reg = self.opt.w_reg * loss_reg + self.loss_gamma = self.opt.w_gamma * loss_gamma + + self.loss_lm = self.opt.w_lm * self.compute_lm_loss(self.pred_lm, self.gt_lm) + + self.loss_reflc = self.opt.w_reflc * self.compute_reflc_loss(self.pred_tex, self.facemodel.skin_mask) + + self.loss_all = self.loss_feat + self.loss_color + self.loss_reg + self.loss_gamma \ + + self.loss_lm + self.loss_reflc + + + def optimize_parameters(self, isTrain=True): + self.forward() + self.compute_losses() + """Update network weights; it will be called in every training iteration.""" + if isTrain: + self.optimizer.zero_grad() + self.loss_all.backward() + self.optimizer.step() + + def compute_visuals(self): + with torch.no_grad(): + input_img_numpy = 255. * self.input_img.detach().cpu().permute(0, 2, 3, 1).numpy() + output_vis = self.pred_face * self.pred_mask + (1 - self.pred_mask) * self.input_img + output_vis_numpy_raw = 255. 
* output_vis.detach().cpu().permute(0, 2, 3, 1).numpy() + + if self.gt_lm is not None: + gt_lm_numpy = self.gt_lm.cpu().numpy() + pred_lm_numpy = self.pred_lm.detach().cpu().numpy() + output_vis_numpy = util.draw_landmarks(output_vis_numpy_raw, gt_lm_numpy, 'b') + output_vis_numpy = util.draw_landmarks(output_vis_numpy, pred_lm_numpy, 'r') + + output_vis_numpy = np.concatenate((input_img_numpy, + output_vis_numpy_raw, output_vis_numpy), axis=-2) + else: + output_vis_numpy = np.concatenate((input_img_numpy, + output_vis_numpy_raw), axis=-2) + + self.output_vis = torch.tensor( + output_vis_numpy / 255., dtype=torch.float32 + ).permute(0, 3, 1, 2).to(self.device) + + def save_mesh(self, name): + + recon_shape = self.pred_vertex # get reconstructed shape + recon_shape[..., -1] = 10 - recon_shape[..., -1] # from camera space to world space + # recon_shape = self.object_vertex # get reconstructed shape + recon_shape = recon_shape.cpu().numpy()[0] + recon_color = self.pred_color + recon_color = recon_color.cpu().numpy()[0] + tri = self.facemodel.face_buf.cpu().numpy() + #mesh = trimesh.Trimesh(vertices=recon_shape, faces=tri, vertex_colors=np.clip(255. * recon_color, 0, 255).astype(np.uint8)) + mesh = trimesh.Trimesh(vertices=recon_shape, faces=tri) + mesh.export(name) + + def save_coeff(self,name): + + pred_coeffs = {key:self.pred_coeffs_dict[key].cpu().numpy() for key in self.pred_coeffs_dict} + pred_lm = self.pred_lm.cpu().numpy() + pred_lm = np.stack([pred_lm[:,:,0],self.input_img.shape[2]-1-pred_lm[:,:,1]],axis=2) # transfer to image coordinate + pred_coeffs['lm68'] = pred_lm + # print(pred_coeffs['angle']) + # print(pred_coeffs['angle'].shape) + # print(pred_coeffs) + np.save(name.replace(".mat", ".npy"),pred_coeffs) + # https://www.programmersought.com/article/89115137158/ + #savemat(name,pred_coeffs) + + + diff --git a/pose_estimation/models/losses.py b/pose_estimation/models/losses.py new file mode 100755 index 0000000000000000000000000000000000000000..fbacb63b6110f3dbe7256eb4d5eb781a41e87b8f --- /dev/null +++ b/pose_estimation/models/losses.py @@ -0,0 +1,113 @@ +import numpy as np +import torch +import torch.nn as nn +from kornia.geometry import warp_affine +import torch.nn.functional as F + +def resize_n_crop(image, M, dsize=112): + # image: (b, c, h, w) + # M : (b, 2, 3) + return warp_affine(image, M, dsize=(dsize, dsize)) + +### perceptual level loss +class PerceptualLoss(nn.Module): + def __init__(self, recog_net, input_size=112): + super(PerceptualLoss, self).__init__() + self.recog_net = recog_net + self.preprocess = lambda x: 2 * x - 1 + self.input_size=input_size + def forward(imageA, imageB, M): + """ + 1 - cosine distance + Parameters: + imageA --torch.tensor (B, 3, H, W), range (0, 1) , RGB order + imageB --same as imageA + """ + + imageA = self.preprocess(resize_n_crop(imageA, M, self.input_size)) + imageB = self.preprocess(resize_n_crop(imageB, M, self.input_size)) + + # freeze bn + self.recog_net.eval() + + id_featureA = F.normalize(self.recog_net(imageA), dim=-1, p=2) + id_featureB = F.normalize(self.recog_net(imageB), dim=-1, p=2) + cosine_d = torch.sum(id_featureA * id_featureB, dim=-1) + # assert torch.sum((cosine_d > 1).float()) == 0 + return torch.sum(1 - cosine_d) / cosine_d.shape[0] + +def perceptual_loss(id_featureA, id_featureB): + cosine_d = torch.sum(id_featureA * id_featureB, dim=-1) + # assert torch.sum((cosine_d > 1).float()) == 0 + return torch.sum(1 - cosine_d) / cosine_d.shape[0] + +### image level loss +def photo_loss(imageA, imageB, mask, eps=1e-6): + 
""" + l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + Parameters: + imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order + imageB --same as imageA + """ + loss = torch.sqrt(eps + torch.sum((imageA - imageB) ** 2, dim=1, keepdims=True)) * mask + loss = torch.sum(loss) / torch.max(torch.sum(mask), torch.tensor(1.0).to(mask.device)) + return loss + +def landmark_loss(predict_lm, gt_lm, weight=None): + """ + weighted mse loss + Parameters: + predict_lm --torch.tensor (B, 68, 2) + gt_lm --torch.tensor (B, 68, 2) + weight --numpy.array (1, 68) + """ + if not weight: + weight = np.ones([68]) + weight[28:31] = 20 + weight[-8:] = 20 + weight = np.expand_dims(weight, 0) + weight = torch.tensor(weight).to(predict_lm.device) + loss = torch.sum((predict_lm - gt_lm)**2, dim=-1) * weight + loss = torch.sum(loss) / (predict_lm.shape[0] * predict_lm.shape[1]) + return loss + + +### regulization +def reg_loss(coeffs_dict, opt=None): + """ + l2 norm without the sqrt, from yu's implementation (mse) + tf.nn.l2_loss https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss + Parameters: + coeffs_dict -- a dict of torch.tensors , keys: id, exp, tex, angle, gamma, trans + + """ + # coefficient regularization to ensure plausible 3d faces + if opt: + w_id, w_exp, w_tex = opt.w_id, opt.w_exp, opt.w_tex + else: + w_id, w_exp, w_tex = 1, 1, 1, 1 + creg_loss = w_id * torch.sum(coeffs_dict['id'] ** 2) + \ + w_exp * torch.sum(coeffs_dict['exp'] ** 2) + \ + w_tex * torch.sum(coeffs_dict['tex'] ** 2) + creg_loss = creg_loss / coeffs_dict['id'].shape[0] + + # gamma regularization to ensure a nearly-monochromatic light + gamma = coeffs_dict['gamma'].reshape([-1, 3, 9]) + gamma_mean = torch.mean(gamma, dim=1, keepdims=True) + gamma_loss = torch.mean((gamma - gamma_mean) ** 2) + + return creg_loss, gamma_loss + +def reflectance_loss(texture, mask): + """ + minimize texture variance (mse), albedo regularization to ensure an uniform skin albedo + Parameters: + texture --torch.tensor, (B, N, 3) + mask --torch.tensor, (N), 1 or 0 + + """ + mask = mask.reshape([1, mask.shape[0], 1]) + texture_mean = torch.sum(mask * texture, dim=1, keepdims=True) / torch.sum(mask) + loss = torch.sum(((texture - texture_mean) * mask)**2) / (texture.shape[0] * torch.sum(mask)) + return loss + diff --git a/pose_estimation/models/networks.py b/pose_estimation/models/networks.py new file mode 100755 index 0000000000000000000000000000000000000000..40ce9f9974267da87505b9c0a5e929b12c644801 --- /dev/null +++ b/pose_estimation/models/networks.py @@ -0,0 +1,521 @@ +"""This script defines deep neural networks for Deep3DFaceRecon_pytorch +""" + +import os +import numpy as np +import torch.nn.functional as F +from torch.nn import init +import functools +from torch.optim import lr_scheduler +import torch +from torch import Tensor +import torch.nn as nn +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url +from typing import Type, Any, Callable, Union, List, Optional +from .arcface_torch.backbones import get_model +from kornia.geometry import warp_affine + +def resize_n_crop(image, M, dsize=112): + # image: (b, c, h, w) + # M : (b, 2, 3) + return warp_affine(image, M, dsize=(dsize, dsize)) + +def filter_state_dict(state_dict, remove_name='fc'): + new_state_dict = {} + for key in state_dict: + if remove_name in key: + continue + new_state_dict[key] = state_dict[key] + return new_state_dict + +def get_scheduler(optimizer, opt): 
+ """Return a learning rate scheduler + + Parameters: + optimizer -- the optimizer of the network + opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.  + opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine + + For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers. + See https://pytorch.org/docs/stable/optim.html for more details. + """ + if opt.lr_policy == 'linear': + def lambda_rule(epoch): + lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs + 1) + return lr_l + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) + elif opt.lr_policy == 'step': + scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_epochs, gamma=0.2) + elif opt.lr_policy == 'plateau': + scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5) + elif opt.lr_policy == 'cosine': + scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0) + else: + return NotImplementedError('learning rate policy [%s] is not implemented', opt.lr_policy) + return scheduler + + +def define_net_recon(net_recon, use_last_fc=False, init_path=None): + return ReconNetWrapper(net_recon, use_last_fc=use_last_fc, init_path=init_path) + +def define_net_recog(net_recog, pretrained_path=None): + net = RecogNetWrapper(net_recog=net_recog, pretrained_path=pretrained_path) + net.eval() + return net + +class ReconNetWrapper(nn.Module): + fc_dim=257 + def __init__(self, net_recon, use_last_fc=False, init_path=None): + super(ReconNetWrapper, self).__init__() + self.use_last_fc = use_last_fc + if net_recon not in func_dict: + return NotImplementedError('network [%s] is not implemented', net_recon) + func, last_dim = func_dict[net_recon] + backbone = func(use_last_fc=use_last_fc, num_classes=self.fc_dim) + if init_path and os.path.isfile(init_path): + state_dict = filter_state_dict(torch.load(init_path, map_location='cpu')) + backbone.load_state_dict(state_dict) + print("loading init net_recon %s from %s" %(net_recon, init_path)) + self.backbone = backbone + if not use_last_fc: + self.final_layers = nn.ModuleList([ + conv1x1(last_dim, 80, bias=True), # id layer + conv1x1(last_dim, 64, bias=True), # exp layer + conv1x1(last_dim, 80, bias=True), # tex layer + conv1x1(last_dim, 3, bias=True), # angle layer + conv1x1(last_dim, 27, bias=True), # gamma layer + conv1x1(last_dim, 2, bias=True), # tx, ty + conv1x1(last_dim, 1, bias=True) # tz + ]) + for m in self.final_layers: + nn.init.constant_(m.weight, 0.) + nn.init.constant_(m.bias, 0.) 
+ + def forward(self, x): + x = self.backbone(x) + if not self.use_last_fc: + output = [] + for layer in self.final_layers: + output.append(layer(x)) + x = torch.flatten(torch.cat(output, dim=1), 1) + return x + + +class RecogNetWrapper(nn.Module): + def __init__(self, net_recog, pretrained_path=None, input_size=112): + super(RecogNetWrapper, self).__init__() + net = get_model(name=net_recog, fp16=False) + if pretrained_path: + state_dict = torch.load(pretrained_path, map_location='cpu') + net.load_state_dict(state_dict) + print("loading pretrained net_recog %s from %s" %(net_recog, pretrained_path)) + for param in net.parameters(): + param.requires_grad = False + self.net = net + self.preprocess = lambda x: 2 * x - 1 + self.input_size=input_size + + def forward(self, image, M): + image = self.preprocess(resize_n_crop(image, M, self.input_size)) + id_feature = F.normalize(self.net(image), dim=-1, p=2) + return id_feature + + +# adapted from https://github.com/pytorch/vision/edit/master/torchvision/models/resnet.py +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', + 'wide_resnet50_2', 'wide_resnet101_2'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-f37072fd.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-b627a593.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-394f9c45.pth', + 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1, bias: bool = False) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias) + + +class BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = 
self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + num_classes: int = 1000, + zero_init_residual: bool = False, + use_last_fc: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.use_last_fc = use_last_fc + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = 
nn.AdaptiveAvgPool2d((1, 1)) + + if self.use_last_fc: + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] + + def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, + stride: int = 1, dilate: bool = False) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + if self.use_last_fc: + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _resnet( + arch: str, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + pretrained: bool, + progress: bool, + **kwargs: Any +) -> ResNet: + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], + progress=progress) + model.load_state_dict(state_dict) + return model + + +def resnet18(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, + **kwargs) + + +def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, + **kwargs) + + +def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +func_dict = { + 'resnet18': (resnet18, 512), + 'resnet50': (resnet50, 2048) +} diff --git a/pose_estimation/models/template_model.py b/pose_estimation/models/template_model.py new file mode 100755 index 0000000000000000000000000000000000000000..68cdaf6a9a2cb321ff2a01949b38adc6fa22e97c --- /dev/null +++ b/pose_estimation/models/template_model.py @@ -0,0 +1,99 @@ +"""Model class template + +This module provides a template for users to implement custom models. +You can specify '--model template' to use this model. +The class name should be consistent with both the filename and its model option. +The filename should be _dataset.py +The class name should be Dataset.py +It implements a simple image-to-image translation baseline based on regression loss. +Given input-output pairs (data_A, data_B), it learns a network netG that can minimize the following L1 loss: + min_ ||netG(data_A) - data_B||_1 +You need to implement the following functions: + : Add model-specific options and rewrite default values for existing options. + <__init__>: Initialize this model class. + : Unpack input data and perform data pre-processing. + : Run forward pass. This will be called by both and . + : Update network weights; it will be called in every training iteration. +""" +import torch +from .base_model import BaseModel +from . import networks + + +class TemplateModel(BaseModel): + @staticmethod + def modify_commandline_options(parser, is_train=True): + """Add new model-specific options and rewrite default values for existing options. + + Parameters: + parser -- the option parser + is_train -- if it is training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + parser.set_defaults(dataset_mode='aligned') # You can rewrite default values for this model. For example, this model usually uses aligned dataset as its dataset. + if is_train: + parser.add_argument('--lambda_regression', type=float, default=1.0, help='weight for the regression loss') # You can define new arguments for this model. + + return parser + + def __init__(self, opt): + """Initialize this model class. + + Parameters: + opt -- training/test options + + A few things can be done here. + - (required) call the initialization function of BaseModel + - define loss function, visualization images, model names, and optimizers + """ + BaseModel.__init__(self, opt) # call the initialization method of BaseModel + # specify the training losses you want to print out. The program will call base_model.get_current_losses to plot the losses to the console and save them to the disk. + self.loss_names = ['loss_G'] + # specify the images you want to save and display. The program will call base_model.get_current_visuals to save and display these images. + self.visual_names = ['data_A', 'data_B', 'output'] + # specify the models you want to save to the disk. The program will call base_model.save_networks and base_model.load_networks to save and load networks. + # you can use opt.isTrain to specify different behaviors for training and test. For example, some networks will not be used during test, and you don't need to load them. 
+ self.model_names = ['G'] + # define networks; you can use opt.isTrain to specify different behaviors for training and test. + self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, gpu_ids=self.gpu_ids) + if self.isTrain: # only defined during training time + # define your loss functions. You can use losses provided by torch.nn such as torch.nn.L1Loss. + # We also provide a GANLoss class "networks.GANLoss". self.criterionGAN = networks.GANLoss().to(self.device) + self.criterionLoss = torch.nn.L1Loss() + # define and initialize optimizers. You can define one optimizer for each network. + # If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. + self.optimizer = torch.optim.Adam(self.netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) + self.optimizers = [self.optimizer] + + # Our program will automatically call to define schedulers, load networks, and print networks + + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input: a dictionary that contains the data itself and its metadata information. + """ + AtoB = self.opt.direction == 'AtoB' # use to swap data_A and data_B + self.data_A = input['A' if AtoB else 'B'].to(self.device) # get image data A + self.data_B = input['B' if AtoB else 'A'].to(self.device) # get image data B + self.image_paths = input['A_paths' if AtoB else 'B_paths'] # get image paths + + def forward(self): + """Run forward pass. This will be called by both functions and .""" + self.output = self.netG(self.data_A) # generate output image given the input data_A + + def backward(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + # caculate the intermediate results if necessary; here self.output has been computed during function + # calculate loss given the input and intermediate results + self.loss_G = self.criterionLoss(self.output, self.data_B) * self.opt.lambda_regression + self.loss_G.backward() # calculate gradients of network G w.r.t. loss_G + + def optimize_parameters(self): + """Update network weights; it will be called in every training iteration.""" + self.forward() # first call forward to calculate intermediate results + self.optimizer.zero_grad() # clear network G's existing gradients + self.backward() # calculate gradients for network G + self.optimizer.step() # update gradients for network G diff --git a/pose_estimation/nvdiffrast/LICENSE.txt b/pose_estimation/nvdiffrast/LICENSE.txt new file mode 100755 index 0000000000000000000000000000000000000000..26a070a431ce5bb4e926e1289f508f003a4ec730 --- /dev/null +++ b/pose_estimation/nvdiffrast/LICENSE.txt @@ -0,0 +1,97 @@ +Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + + +Nvidia Source Code License (1-Way Commercial) + +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. 
copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. + +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by Nvidia + or its affiliates commercially or non-commercially. As used herein, + "non-commercially" means for research or evaluation purposes only + and not for any direct or indirect monetary gain. + + 3.4 Patent Claims. If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grant in Section 2.1) will terminate + immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor's or its affiliates' names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grant in Section 2.1) will + terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. 
+ +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. + +======================================================================= diff --git a/pose_estimation/nvdiffrast/README.md b/pose_estimation/nvdiffrast/README.md new file mode 100755 index 0000000000000000000000000000000000000000..2fece8422829e3c62a8206b665302171a609f0a6 --- /dev/null +++ b/pose_estimation/nvdiffrast/README.md @@ -0,0 +1,42 @@ +## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering + +![Teaser image](docs/img/teaser.png) + +**Modular Primitives for High-Performance Differentiable Rendering**
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277) + +Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. +Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information. + +## Licenses + +Copyright © 2020, NVIDIA Corporation. All rights reserved. + +This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt). + +For business inquiries, please contact [researchinquiries@nvidia.com](mailto:researchinquiries@nvidia.com) + +We do not currently accept outside code contributions in the form of pull requests. + +Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine +[sample material](https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap) +originally shared under +[MIT License](https://github.com/WaveEngine/Samples/blob/master/LICENSE.md). +Mesh and texture stored as part of `samples/data/earth.npz` are derived from +[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125) +model originally made available under +[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license). + +## Citation + +``` +@article{Laine2020diffrast, + title = {Modular Primitives for High-Performance Differentiable Rendering}, + author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila}, + journal = {ACM Transactions on Graphics}, + year = {2020}, + volume = {39}, + number = {6} +} +``` diff --git a/pose_estimation/nvdiffrast/__init__.py b/pose_estimation/nvdiffrast/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/build/__init__.py b/pose_estimation/nvdiffrast/build/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/build/lib/__init__.py b/pose_estimation/nvdiffrast/build/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/__init__.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..3678b790f5e025f8943eee49e9dafa2489dce867 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
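+#
+# Illustrative usage sketch (not part of the upstream package; pos, tri and attr are assumed
+# torch tensors in the shapes nvdiffrast expects). The typical nvdiffrast.torch pipeline is
+# rasterize -> interpolate -> antialias:
+#
+#   import nvdiffrast.torch as dr
+#   glctx = dr.RasterizeGLContext()                                  # OpenGL rasterization context
+#   rast, _ = dr.rasterize(glctx, pos, tri, resolution=[512, 512])   # pos: clip-space vertices, tri: int32 triangle indices
+#   color, _ = dr.interpolate(attr, rast, tri)                       # barycentric interpolation of per-vertex attributes
+#   color = dr.antialias(color, rast, pos, tri)                      # analytic silhouette antialiasing (see common/antialias.cu)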
+ +__version__ = '0.2.5' diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu new file mode 100755 index 0000000000000000000000000000000000000000..5411b0873c800f9e9a578383d4c42d226e31dc6c --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu @@ -0,0 +1,558 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "antialias.h" + +//------------------------------------------------------------------------ +// Helpers. + +#define F32_MAX (3.402823466e+38f) +static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } +static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } +static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2) +{ + bool g10 = rational_gt(n1, n0, d1, d0); + bool g20 = rational_gt(n2, n0, d2, d0); + bool g21 = rational_gt(n2, n1, d2, d1); + if (g20 && g21) return 2; + if (g10) return 1; + return 0; +} + +//------------------------------------------------------------------------ +// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4. + +struct AAWorkItem +{ + enum + { + EDGE_MASK = 3, // Edge index in lowest bits. + FLAG_DOWN_BIT = 2, // Down instead of right. + FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle. + }; + + int px, py; // Pixel x, y. + unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags. + float alpha; // Antialiasing alpha value. Zero if no AA. +}; + +//------------------------------------------------------------------------ +// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html + +#define JENKINS_MAGIC (0x9e3779b9u) +static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) +{ + a -= b; a -= c; a ^= (c>>13); + b -= c; b -= a; b ^= (a<<8); + c -= a; c -= b; c ^= (b>>13); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<16); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>3); + b -= c; b -= a; b ^= (a<<10); + c -= a; c -= b; c ^= (b>>15); +} + +// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip. 
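+// The table has p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE slots (both factors are powers of two),
+// so wrapping an index is a single AND with m_mask. Forcing the skip odd (m_skip |= 1) makes it coprime
+// with the power-of-two table size, so the probe sequence visits every slot before repeating, and
+// jenkins_mix() decorrelates the initial index and the skip taken from the two 32-bit halves of the key.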
+class HashIndex +{ +public: + __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) + { + m_mask = p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE - 1; + m_idx = (uint32_t)(key & 0xffffffffu); + m_skip = (uint32_t)(key >> 32); + uint32_t dummy = JENKINS_MAGIC; + jenkins_mix(m_idx, m_skip, dummy); + m_idx &= m_mask; + m_skip &= m_mask; + m_skip |= 1; + } + __device__ __forceinline__ int get(void) const { return m_idx; } + __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } +private: + uint32_t m_idx, m_skip, m_mask; +}; + +static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) +{ + HashIndex idx(p, key); + while(1) + { + uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); + if (prev == 0 || prev == key) + break; + idx.next(); + } + int* q = (int*)&p.evHash[idx.get()]; + int a = atomicCAS(q+2, 0, v); + if (a != 0 && a != v) + atomicCAS(q+3, 0, v); +} + +static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) +{ + HashIndex idx(p, key); + while(1) + { + uint4 entry = p.evHash[idx.get()]; + uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); + if (k == key || k == 0) + return make_int2((int)entry.z, (int)entry.w); + idx.next(); + } +} + +static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn) +{ + if (va == vb) + return; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + hash_insert(p, vk, vn + 1); +} + +static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr) +{ + if (va == vb) + return -1; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + int2 vn = hash_find(p, vk) - 1; + if (vn.x == vr) return vn.y; + if (vn.y == vr) return vn.x; + return -1; +} + +//------------------------------------------------------------------------ +// Mesh analysis kernel. + +__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= p.numTriangles) + return; + + int v0 = p.tri[idx * 3 + 0]; + int v1 = p.tri[idx * 3 + 1]; + int v2 = p.tri[idx * 3 + 2]; + + if (v0 < 0 || v0 >= p.numVertices || + v1 < 0 || v1 >= p.numVertices || + v2 < 0 || v2 >= p.numVertices) + return; + + if (v0 == v1 || v1 == v2 || v2 == v0) + return; + + evhash_insert_vertex(p, v1, v2, v0); + evhash_insert_vertex(p, v2, v0, v1); + evhash_insert_vertex(p, v0, v1, v2); +} + +//------------------------------------------------------------------------ +// Discontinuity finder kernel. + +__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x; + int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.n) + return; + + // Pointer to our TriIdx and fetch. + int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3; + float tri0 = p.rasterOut[pidx0]; + + // Look right, clamp at edge. + int pidx1 = pidx0; + if (px < p.width - 1) + pidx1 += 4; + float tri1 = p.rasterOut[pidx1]; + + // Look down, clamp at edge. 
+ int pidx2 = pidx0; + if (py < p.height - 1) + pidx2 += p.width << 2; + float tri2 = p.rasterOut[pidx2]; + + // Determine amount of work. + int count = 0; + if (tri1 != tri0) count = 1; + if (tri2 != tri0) count += 1; + if (!count) + return; // Exit warp. + + // Coalesce work counter update to once per CTA. + __shared__ int s_temp; + s_temp = 0; + __syncthreads(); + int idx = atomicAdd(&s_temp, count); + __syncthreads(); + if (idx == 0) + { + int base = atomicAdd(&p.workBuffer[0].x, s_temp); + s_temp = base + 1; // don't clobber the counters in first slot. + } + __syncthreads(); + idx += s_temp; + + // Write to memory. + if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0); + if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0); +} + +//------------------------------------------------------------------------ +// Forward analysis kernel. + +__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) +{ + __shared__ int s_base; + int workCount = p.workBuffer[0].x; + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + int4* pItem = p.workBuffer + thread_idx + 1; + int4 item = *pItem; + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; + float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1]; + int tri0 = (int)zt0.y - 1; + int tri1 = (int)zt1.y - 1; + + // Select triangle based on background / depth. + int tri = (tri0 >= 0) ? tri0 : tri1; + if (tri0 >= 0 && tri1 >= 0) + tri = (zt0.x < zt1.x) ? tri0 : tri1; + if (tri == tri1) + { + // Calculate with respect to neighbor pixel if chose that triangle. + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + if (tri < 0 || tri >= p.numTriangles) + continue; + + // Fetch vertex indices. + int vi0 = p.tri[tri * 3 + 0]; + int vi1 = p.tri[tri * 3 + 1]; + int vi2 = p.tri[tri * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + continue; + + // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists. + int op0 = evhash_find_vertex(p, vi2, vi1, vi0); + int op1 = evhash_find_vertex(p, vi0, vi2, vi1); + int op2 = evhash_find_vertex(p, vi1, vi0, vi2); + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + int vbase = pz * p.numVertices; + vi0 += vbase; + vi1 += vbase; + vi2 += vbase; + if (op0 >= 0) op0 += vbase; + if (op1 >= 0) op1 += vbase; + if (op2 >= 0) op2 += vbase; + } + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0]; + float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1]; + float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2]; + + // Project vertices to pixel space. 
+ float w0 = 1.f / p0.w; + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float ow0 = 1.f / o0.w; + float ow1 = 1.f / o1.w; + float ow2 = 1.f / o2.w; + float fx = (float)px + .5f - p.xh; + float fy = (float)py + .5f - p.yh; + float x0 = p0.x * w0 * p.xh - fx; + float y0 = p0.y * w0 * p.yh - fy; + float x1 = p1.x * w1 * p.xh - fx; + float y1 = p1.y * w1 * p.yh - fy; + float x2 = p2.x * w2 * p.xh - fx; + float y2 = p2.y * w2 * p.yh - fy; + float ox0 = o0.x * ow0 * p.xh - fx; + float oy0 = o0.y * ow0 * p.yh - fy; + float ox1 = o1.x * ow1 * p.xh - fx; + float oy1 = o1.y * ow1 * p.yh - fy; + float ox2 = o2.x * ow2 * p.xh - fx; + float oy2 = o2.y * ow2 * p.yh - fy; + + // Signs to kill non-silhouette edges. + float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself. + float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings. + float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1); + float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2); + + // If no matching signs anywhere, skip the rest. + if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb)) + { + // XY flip for horizontal edges. + if (d) + { + swap(x0, y0); + swap(x1, y1); + swap(x2, y2); + } + + float dx0 = x2 - x1; + float dx1 = x0 - x2; + float dx2 = x1 - x0; + float dy0 = y2 - y1; + float dy1 = y0 - y2; + float dy2 = y1 - y0; + + // Check if an edge crosses between us and the neighbor pixel. + float dc = -F32_MAX; + float ds = (tri == tri0) ? 1.f : -1.f; + float d0 = ds * (x1*dy0 - y1*dx0); + float d1 = ds * (x2*dy1 - y2*dx1); + float d2 = ds * (x0*dy2 - y0*dx2); + + if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; + if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f; + if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f; + + int di = max_idx3(d0, d1, d2, dy0, dy1, dy2); + if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0; + if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1; + if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2; + float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy. + + // Adjust output image if a suitable edge was found. + if (dc > -eps && dc < 1.f + eps) + { + dc = fminf(fmaxf(dc, 0.f), 1.f); + float alpha = ds * (.5f - dc); + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + for (int i=0; i < p.channels; i++) + atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i])); + + // Rewrite the work item's flags and alpha. Keep original px, py. + unsigned int flags = pz << 16; + flags |= di; + flags |= d << AAWorkItem::FLAG_DOWN_BIT; + flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT; + ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); + } + } + } +} + +//------------------------------------------------------------------------ +// Gradient kernel. + +__global__ void AntialiasGradKernel(const AntialiasKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __shared__ int s_base; // Work counter communication across entire CTA. + + int workCount = p.workBuffer[0].x; + + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + // Read work item filled out by forward kernel. 
+ int4 item = p.workBuffer[thread_idx + 1]; + unsigned int amask = __ballot_sync(0xffffffffu, item.w); + if (item.w == 0) + continue; // No effect. + + // Unpack work item and replicate setup from forward analysis kernel. + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + float alpha = __int_as_float(item.w); + int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1; + int di = item.z & AAWorkItem::EDGE_MASK; + float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + int tri = (int)p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3] - 1; + if (tri1) + { + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + bool triFail = (tri < 0 || tri >= p.numTriangles); + amask = __ballot_sync(amask, !triFail); + if (triFail) + continue; + + // Outgoing color gradients. + float* pGrad0 = p.gradColor + pixel0 * p.channels; + float* pGrad1 = p.gradColor + pixel1 * p.channels; + + // Incoming color gradients. + const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + + // Position gradient weight based on colors and incoming gradients. + float dd = 0.f; + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + + // Loop over channels and accumulate. + for (int i=0; i < p.channels; i++) + { + float dy = pDy[i]; + if (dy != 0.f) + { + // Update position gradient weight. + dd += dy * (pColor1[i] - pColor0[i]); + + // Update color gradients. No coalescing because all have different targets. + float v = alpha * dy; + atomicAdd(&pGrad0[i], -v); + atomicAdd(&pGrad1[i], v); + } + } + + // If position weight is zero, skip the rest. + bool noGrad = (dd == 0.f); + amask = __ballot_sync(amask, !noGrad); + if (noGrad) + continue; + + // Fetch vertex indices of the active edge and their positions. + int i1 = (di < 2) ? (di + 1) : 0; + int i2 = (i1 < 2) ? (i1 + 1) : 0; + int vi1 = p.tri[3 * tri + i1]; + int vi2 = p.tri[3 * tri + i2]; + + // Bail out if vertex indices are corrupt. + bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices); + amask = __ballot_sync(amask, !vtxFail); + if (vtxFail) + continue; + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Fetch vertex positions. + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Project vertices to pixel space. + float pxh = p.xh; + float pyh = p.yh; + float fx = (float)px + .5f - pxh; + float fy = (float)py + .5f - pyh; + + // XY flip for horizontal edges. + if (d) + { + swap(p1.x, p1.y); + swap(p2.x, p2.y); + swap(pxh, pyh); + swap(fx, fy); + } + + // Gradient calculation setup. + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float x1 = p1.x * w1 * pxh - fx; + float y1 = p1.y * w1 * pyh - fy; + float x2 = p2.x * w2 * pxh - fx; + float y2 = p2.y * w2 * pyh - fy; + float dx = x2 - x1; + float dy = y2 - y1; + float db = x1*dy - y1*dx; + + // Compute inverse delta-y with epsilon. + float ep = copysignf(1e-3f, dy); // ~1/1000 pixel. + float iy = 1.f / (dy + ep); + + // Compute position gradients. 
+ float dby = db * iy; + float iw1 = -w1 * iy * dd; + float iw2 = w2 * iy * dd; + float gp1x = iw1 * pxh * y2; + float gp2x = iw2 * pxh * y1; + float gp1y = iw1 * pyh * (dby - x2); + float gp2y = iw2 * pyh * (dby - x1); + float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1; + float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2; + + // XY flip the gradients. + if (d) + { + swap(gp1x, gp1y); + swap(gp2x, gp2y); + } + + // Kill position gradients if alpha was saturated. + if (fabsf(alpha) >= 0.5f) + { + gp1x = gp1y = gp1w = 0.f; + gp2x = gp2y = gp2w = 0.f; + } + + // Initialize coalesced atomics. Match both triangle ID and edge index. + // Also note that some threads may be inactive. + CA_SET_GROUP_MASK(tri ^ (di << 30), amask); + + // Accumulate gradients. + caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w); + } +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.h new file mode 100755 index 0000000000000000000000000000000000000000..a35737db38c3f70da9ca81729cba4f5515a201d2 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/antialias.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "common.h" + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32 +#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8 +#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256 +#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256 +#define AA_HASH_ELEMENTS_PER_TRIANGLE 8 // Minimum is 4 but 8 gives fewer collisions. Must be power of two. +#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct AntialiasKernelParams +{ + const float* color; // Incoming color buffer. + const float* rasterOut; // Incoming rasterizer output buffer. + const int* tri; // Incoming triangle buffer. + const float* pos; // Incoming position buffer. + float* output; // Output buffer of forward kernel. + const float* dy; // Incoming gradients. + float* gradColor; // Output buffer, color gradient. + float* gradPos; // Output buffer, position gradient. + int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters. + uint4* evHash; // Edge-vertex hash. + int allocTriangles; // Number of triangles accommodated by evHash. Always power of two. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Input width. + int height; // Input height. + int n; // Minibatch size. + int channels; // Channel count in color input. + float xh, yh; // Transfer to pixel space. + int instance_mode; // 0=normal, 1=instance mode. + int tri_const; // 1 if triangle array is known to be constant. 
+}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.cpp new file mode 100755 index 0000000000000000000000000000000000000000..e566c035bdef66e9b75265a58fb8602b0fa530ca --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include + +//------------------------------------------------------------------------ +// Block and grid size calculators for kernel launches. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height) +{ + int maxThreads = maxWidth * maxHeight; + if (maxThreads <= 1 || (width * height) <= 1) + return dim3(1, 1, 1); // Degenerate. + + // Start from max size. + int bw = maxWidth; + int bh = maxHeight; + + // Optimizations for weirdly sized buffers. + if (width < bw) + { + // Decrease block width to smallest power of two that covers the buffer width. + while ((bw >> 1) >= width) + bw >>= 1; + + // Maximize height. + bh = maxThreads / bw; + if (bh > height) + bh = height; + } + else if (height < bh) + { + // Halve height and double width until fits completely inside buffer vertically. + while (bh > height) + { + bh >>= 1; + if (bw < width) + bw <<= 1; + } + } + + // Done. + return dim3(bw, bh, 1); +} + +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth) +{ + dim3 gridSize; + gridSize.x = (width - 1) / blockSize.x + 1; + gridSize.y = (height - 1) / blockSize.y + 1; + gridSize.z = (depth - 1) / blockSize.z + 1; + return gridSize; +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.h new file mode 100755 index 0000000000000000000000000000000000000000..8df48ed73cd330c45250ee02a113e03357504055 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/common.h @@ -0,0 +1,253 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height); +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth); + +//------------------------------------------------------------------------ +// The rest is CUDA device code specific stuff. + +#ifdef __CUDACC__ + +//------------------------------------------------------------------------ +// Helpers for CUDA vector types. 
+ +static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); } +static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); } +static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); } +static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); } +static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); } +static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); } +static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ float3 operator* 
(float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { 
return make_int2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); } +static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); } +static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); } +static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); } +static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); } +static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); } +static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; 
a.w += b; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); } +static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); } +static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); } +static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); } +static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; 
a.z += b.z; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return 
make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); } + +template<class T> static __device__ __forceinline__ T zero_value(void); +template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; } +template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); } +template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); } +static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); } +static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); } +static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); } + +template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; } + +//------------------------------------------------------------------------ +// Coalesced atomics. These are all done via macros.
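+//
+// Usage sketch (illustrative, not part of the upstream file; assumes an 8x8 thread
+// block, matching the *_GRAD_MAX_KERNEL_BLOCK_WIDTH/HEIGHT constants, and would sit
+// after the macro definitions below): threads that hit the same output slot are
+// grouped within each warp, and only the group leader issues a global atomicAdd.
+//
+//     __global__ void ExampleGradKernel(float* grad, const int* triIdx, const float* val,
+//                                       int width, int height)
+//     {
+//         CA_DECLARE_TEMP(8 * 8);              // one shared float per thread in the block
+//         int px = blockIdx.x * blockDim.x + threadIdx.x;
+//         int py = blockIdx.y * blockDim.y + threadIdx.y;
+//         if (px >= width || py >= height)
+//             return;                          // early exit before grouping, as in interpolate.cu
+//         int pidx = px + width * py;
+//         int t = triIdx[pidx];                // output slot this thread contributes to
+//         CA_SET_GROUP(t);                     // group lanes that share the same slot
+//         caAtomicAdd(grad + t, val[pidx]);    // one global atomic per group, not per lane
+//     }
+//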
+ +#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher + +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float* CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) \ + __shared__ float CA_TEMP[(threads_per_block)] + +#define CA_SET_GROUP_MASK(group, thread_mask) \ + bool _ca_leader; \ + float* _ca_ptr; \ + do { \ + int tidx = threadIdx.x + blockDim.x * threadIdx.y; \ + int lane = tidx & 31; \ + int warp = tidx >> 5; \ + int tmask = __match_any_sync((thread_mask), (group)); \ + int leader = __ffs(tmask) - 1; \ + _ca_leader = (leader == lane); \ + _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \ + } while(0) + +#define CA_SET_GROUP(group) \ + CA_SET_GROUP_MASK((group), 0xffffffffu) + +#define caAtomicAdd(ptr, value) \ + do { \ + if (_ca_leader) \ + *_ca_ptr = 0.f; \ + atomicAdd(_ca_ptr, (value)); \ + if (_ca_leader) \ + atomicAdd((ptr), *_ca_ptr); \ + } while(0) + +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + caAtomicAdd((ptr), (x)); \ + caAtomicAdd((ptr)+1, (y)); \ + caAtomicAdd((ptr)+3, (w)); \ + } while(0) + +#define caAtomicAddTexture(ptr, level, idx, value) \ + do { \ + CA_SET_GROUP((idx) ^ ((level) << 27)); \ + caAtomicAdd((ptr)+(idx), (value)); \ + } while(0) + +//------------------------------------------------------------------------ +// Disable atomic coalescing for compute capability lower than 7.x + +#else // __CUDA_ARCH__ >= 700 +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM +#define CA_SET_GROUP_MASK(group, thread_mask) +#define CA_SET_GROUP(group) +#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value)) +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + atomicAdd((ptr), (x)); \ + atomicAdd((ptr)+1, (y)); \ + atomicAdd((ptr)+3, (w)); \ + } while(0) +#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value)) +#endif // __CUDA_ARCH__ >= 700 + +//------------------------------------------------------------------------ +#endif // __CUDACC__ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/framework.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/framework.h new file mode 100755 index 0000000000000000000000000000000000000000..12d803caaf3210c45808dee41217c4c6c6edfe6e --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/framework.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +// Framework-specific macros to enable code sharing. + +//------------------------------------------------------------------------ +// Tensorflow. 
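+// Note on the two framework branches below (Tensorflow here, PyTorch further down):
+// both define the same NVDR_* macro surface, so the shared C++/CUDA sources can
+// check conditions, CUDA calls and GL calls without knowing which framework they
+// are compiled into. Typical usage, mirroring rasterize.cpp in this patch:
+//
+//     NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
+//     NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+//     NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
+//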
+ +#ifdef NVDR_TENSORFLOW +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/platform/default/logging.h" +using namespace tensorflow; +using namespace tensorflow::shape_inference; +#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx +#define NVDR_CTX_PARAMS _nvdr_ctx +#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR)) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL) +#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL) +#endif + +//------------------------------------------------------------------------ +// PyTorch. + +#ifdef NVDR_TORCH +#ifndef __CUDACC__ +#include +#include +#include +#include +#include +#endif +#define NVDR_CTX_ARGS int _nvdr_ctx_dummy +#define NVDR_CTX_PARAMS 0 +#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0) +#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0) +#endif + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp new file mode 100755 index 0000000000000000000000000000000000000000..2af3e931b6808e2575d8a209d5485746499b3374 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp @@ -0,0 +1,403 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common. +//------------------------------------------------------------------------ + +#include "framework.h" +#include "glutil.h" +#include +#include + +// Create the function pointers. +#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0; +#include "glutil_extlist.h" +#undef GLUTIL_EXT + +// Track initialization status. +static volatile bool s_glExtInitialized = false; + +// Error strings. +const char* getGLErrorString(GLenum err) +{ + switch(err) + { + case GL_NO_ERROR: return "GL_NO_ERROR"; + case GL_INVALID_ENUM: return "GL_INVALID_ENUM"; + case GL_INVALID_VALUE: return "GL_INVALID_VALUE"; + case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION"; + case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW"; + case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW"; + case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION"; + case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE"; + case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST"; + } + return "Unknown error"; +} + +//------------------------------------------------------------------------ +// Windows. 
+//------------------------------------------------------------------------ + +#ifdef _WIN32 + +static CRITICAL_SECTION getInitializedCriticalSection(void) +{ + CRITICAL_SECTION cs; + InitializeCriticalSection(&cs); + return cs; +} + +static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection(); + +static void safeGetProcAddress(const char* name, PROC* pfn) +{ + PROC result = wglGetProcAddress(name); + if (!result) + { + LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + // Use critical section for thread safety. + EnterCriticalSection(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. + s_glExtInitialized = true; + } + + // Done. + LeaveCriticalSection(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "setGLContext() called with null gltcx"; + if (!wglMakeCurrent(glctx.hdc, glctx.hglrc)) + LOG(FATAL) << "wglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + if (!wglMakeCurrent(NULL, NULL)) + LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context"; +} + +extern "C" int set_gpu(const char*); // In setgpu.lib +GLContext createGLContext(int cudaDeviceIdx) +{ + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx)) + { + LOG(INFO) << "PCI bus id query failed"; + } + else + { + int res = set_gpu(pciBusId); + LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? 
"failed, expect crash or major slowdown" : "success"); + } + } + + HINSTANCE hInstance = GetModuleHandle(NULL); + WNDCLASS wc = {}; + wc.style = CS_OWNDC; + wc.lpfnWndProc = DefWindowProc; + wc.hInstance = hInstance; + wc.lpszClassName = "__DummyGLClassCPP"; + int res = RegisterClass(&wc); + + HWND hwnd = CreateWindow( + "__DummyGLClassCPP", // lpClassName + "__DummyGLWindowCPP", // lpWindowName + WS_OVERLAPPEDWINDOW, // dwStyle + CW_USEDEFAULT, // x + CW_USEDEFAULT, // y + 0, 0, // nWidth, nHeight + NULL, NULL, // hWndParent, hMenu + hInstance, // hInstance + NULL // lpParam + ); + + PIXELFORMATDESCRIPTOR pfd = {}; + pfd.dwFlags = PFD_SUPPORT_OPENGL; + pfd.iPixelType = PFD_TYPE_RGBA; + pfd.iLayerType = PFD_MAIN_PLANE; + pfd.cColorBits = 32; + pfd.cDepthBits = 24; + pfd.cStencilBits = 8; + + HDC hdc = GetDC(hwnd); + int pixelformat = ChoosePixelFormat(hdc, &pfd); + SetPixelFormat(hdc, pixelformat, &pfd); + + HGLRC hglrc = wglCreateContext(hdc); + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")"; + + GLContext glctx = {hdc, hglrc, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (wglGetCurrentContext() == glctx.hglrc) + releaseGLContext(); + + HWND hwnd = WindowFromDC(glctx.hdc); + if (!hwnd) + LOG(FATAL) << "WindowFromDC() failed"; + if (!ReleaseDC(hwnd, glctx.hdc)) + LOG(FATAL) << "ReleaseDC() failed"; + if (!wglDeleteContext(glctx.hglrc)) + LOG(FATAL) << "wglDeleteContext() failed"; + if (!DestroyWindow(hwnd)) + LOG(FATAL) << "DestroyWindow() failed"; + + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux. +//------------------------------------------------------------------------ + +#ifdef __linux__ + +static pthread_mutex_t s_getProcAddressMutex; + +typedef void (*PROCFN)(); + +static void safeGetProcAddress(const char* name, PROCFN* pfn) +{ + PROCFN result = eglGetProcAddress(name); + if (!result) + { + pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + pthread_mutex_lock(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. 
+ s_glExtInitialized = true; + } + + pthread_mutex_unlock(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "setGLContext() called with null gltcx"; + + if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context)) + LOG(ERROR) << "eglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + EGLDisplay display = eglGetCurrentDisplay(); + if (display == EGL_NO_DISPLAY) + LOG(WARNING) << "releaseGLContext() called with no active display"; + if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT)) + LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context"; +} + +static EGLDisplay getCudaDisplay(int cudaDeviceIdx) +{ + typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*); + typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*); + typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*); + + eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT"); + if (!eglQueryDevicesEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed"; + return 0; + } + + eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT"); + if (!eglQueryDeviceAttribEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed"; + return 0; + } + + eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT"); + if (!eglGetPlatformDisplayEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed"; + return 0; + } + + int num_devices = 0; + eglQueryDevicesEXT(0, 0, &num_devices); + if (!num_devices) + return 0; + + EGLDisplay display = 0; + EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*)); + eglQueryDevicesEXT(num_devices, devices, &num_devices); + for (int i=0; i < num_devices; i++) + { + EGLDeviceEXT device = devices[i]; + intptr_t value = -1; + if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx) + { + display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0); + break; + } + } + + free(devices); + return display; +} + +GLContext createGLContext(int cudaDeviceIdx) +{ + EGLDisplay display = 0; + + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + display = getCudaDisplay(cudaDeviceIdx); + if (!display) + LOG(INFO) << "Failed, falling back to default display"; + } + + if (!display) + { + display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + if (display == EGL_NO_DISPLAY) + LOG(FATAL) << "eglGetDisplay() failed"; + } + + EGLint major; + EGLint minor; + if (!eglInitialize(display, &major, &minor)) + LOG(FATAL) << "eglInitialize() failed"; + + // Choose configuration. + + const EGLint context_attribs[] = { + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_DEPTH_SIZE, 24, + EGL_STENCIL_SIZE, 8, + EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_NONE + }; + + EGLConfig config; + EGLint num_config; + if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config)) + LOG(FATAL) << "eglChooseConfig() failed"; + + // Create GL context. 
+ + if (!eglBindAPI(EGL_OPENGL_API)) + LOG(FATAL) << "eglBindAPI() failed"; + + EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL); + if (context == EGL_NO_CONTEXT) + LOG(FATAL) << "eglCreateContext() failed"; + + // Done. + + LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")"; + + GLContext glctx = {display, context, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (eglGetCurrentContext() == glctx.context) + releaseGLContext(); + + if (!eglDestroyContext(glctx.display, glctx.context)) + LOG(ERROR) << "eglDestroyContext() failed"; + + LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)glctx.display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +//------------------------------------------------------------------------ + +#endif // __linux__ + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.h new file mode 100755 index 0000000000000000000000000000000000000000..e9a3a7d95a5af4a808a25097cc055b699024409e --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil.h @@ -0,0 +1,113 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Windows-specific headers and types. +//------------------------------------------------------------------------ + +#ifdef _WIN32 +#define NOMINMAX +#include // Required by gl.h in Windows. +#define GLAPIENTRY APIENTRY + +struct GLContext +{ + HDC hdc; + HGLRC hglrc; + int extInitialized; +}; + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux-specific headers and types. +//------------------------------------------------------------------------ + +#ifdef __linux__ +#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it. +#define MESA_EGL_NO_X11_HEADERS +#include +#include +#define GLAPIENTRY + +struct GLContext +{ + EGLDisplay display; + EGLContext context; + int extInitialized; +}; + +#endif // __linux__ + +//------------------------------------------------------------------------ +// OpenGL, CUDA interop, GL extensions. +//------------------------------------------------------------------------ +#define GL_GLEXT_LEGACY +#include +#include + +// Constants. 
+#ifndef GL_VERSION_1_2 +#define GL_CLAMP_TO_EDGE 0x812F +#define GL_TEXTURE_3D 0x806F +#endif +#ifndef GL_VERSION_1_5 +#define GL_ARRAY_BUFFER 0x8892 +#define GL_DYNAMIC_DRAW 0x88E8 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 +#endif +#ifndef GL_VERSION_2_0 +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_INFO_LOG_LENGTH 0x8B84 +#define GL_LINK_STATUS 0x8B82 +#define GL_VERTEX_SHADER 0x8B31 +#endif +#ifndef GL_VERSION_3_0 +#define GL_MAJOR_VERSION 0x821B +#define GL_MINOR_VERSION 0x821C +#define GL_RGBA32F 0x8814 +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#endif +#ifndef GL_VERSION_3_2 +#define GL_GEOMETRY_SHADER 0x8DD9 +#endif +#ifndef GL_ARB_framebuffer_object +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_COLOR_ATTACHMENT1 0x8CE1 +#define GL_DEPTH_STENCIL 0x84F9 +#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A +#define GL_DEPTH24_STENCIL8 0x88F0 +#define GL_FRAMEBUFFER 0x8D40 +#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506 +#define GL_UNSIGNED_INT_24_8 0x84FA +#endif +#ifndef GL_ARB_imaging +#define GL_TABLE_TOO_LARGE 0x8031 +#endif +#ifndef GL_KHR_robustness +#define GL_CONTEXT_LOST 0x0507 +#endif + +// Declare function pointers to OpenGL extension functions. +#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + +//------------------------------------------------------------------------ +// Common functions. +//------------------------------------------------------------------------ + +void setGLContext (GLContext& glctx); +void releaseGLContext (void); +GLContext createGLContext (int cudaDeviceIdx); +void destroyGLContext (GLContext& glctx); +const char* getGLErrorString (GLenum err); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h new file mode 100755 index 0000000000000000000000000000000000000000..49061ab760e9dca5bf610f8ed71fbd3fe11023fc --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h @@ -0,0 +1,47 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
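+
+// How this list is consumed (summary of the X-macro pattern already used by glutil.h
+// and glutil.cpp in this patch): each consumer defines GLUTIL_EXT, includes this
+// header so every entry expands under that definition, then undefines it again.
+//
+//     // Declaration of the extension function pointers (glutil.h):
+//     #define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__);
+//     #include "glutil_extlist.h"
+//     #undef GLUTIL_EXT
+//
+//     // Definition of the pointers (top of glutil.cpp):
+//     #define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
+//     #include "glutil_extlist.h"
+//     #undef GLUTIL_EXT
+//
+//     // Runtime loading (Windows initializeGLExtensions; the Linux variant casts to PROCFN*):
+//     #define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name);
+//     #include "glutil_extlist.h"
+//     #undef GLUTIL_EXT
+//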
+ +#ifndef GL_VERSION_1_2 +GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels); +#endif +#ifndef GL_VERSION_1_5 +GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer); +GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage); +GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers); +#endif +#ifndef GL_VERSION_2_0 +GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader); +GLUTIL_EXT(void, glCompileShader, GLuint shader); +GLUTIL_EXT(GLuint, glCreateProgram, void); +GLUTIL_EXT(GLuint, glCreateShader, GLenum type); +GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs); +GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index); +GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog); +GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param); +GLUTIL_EXT(void, glLinkProgram, GLuint program); +GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length); +GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1); +GLUTIL_EXT(void, glUseProgram, GLuint program); +GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer); +#endif +#ifndef GL_VERSION_3_2 +GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level); +#endif +#ifndef GL_ARB_framebuffer_object +GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer); +GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers); +#endif +#ifndef GL_ARB_vertex_array_object +GLUTIL_EXT(void, glBindVertexArray, GLuint array); +GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays); +#endif +#ifndef GL_ARB_multi_draw_indirect +GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride); +#endif + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu new file mode 100755 index 0000000000000000000000000000000000000000..84f5fb761175dc7844e6137da75bb944cab5fd35 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu @@ -0,0 +1,276 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "interpolate.h" + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. 
+ int pidx = px + p.width * (py + p.height * pz); + + // Output ptrs. + float* out = p.out + pidx * p.numAttr; + float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0; + + // Fetch rasterizer output. + float4 r = ((float4*)p.rast)[pidx]; + int triIdx = (int)r.w - 1; + bool triValid = (triIdx >= 0 && triIdx < p.numTriangles); + + // If no geometry in entire warp, zero the output and exit. + // Otherwise force barys to zero and output with live threads. + if (__all_sync(0xffffffffu, !triValid)) + { + for (int i=0; i < p.numAttr; i++) + out[i] = 0.f; + if (ENABLE_DA) + for (int i=0; i < p.numDiffAttr; i++) + outDA[i] = make_float2(0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0; + int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0; + int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. + if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Pointers to attributes. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + + // Barys. If no triangle, force all to zero -> output is zero. + float b0 = triValid ? r.x : 0.f; + float b1 = triValid ? r.y : 0.f; + float b2 = triValid ? (1.f - r.x - r.y) : 0.f; + + // Interpolate and write attributes. + for (int i=0; i < p.numAttr; i++) + out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i]; + + // No diff attrs? Exit. + if (!ENABLE_DA) + return; + + // Read bary pixel differentials if we have a triangle. + float4 db = make_float4(0.f, 0.f, 0.f, 0.f); + if (triValid) + db = ((float4*)p.rastDB)[pidx]; + + // Unpack a bit. + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + // Calculate the pixel differentials of chosen attributes. + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Zero output if invalid index. + float dsdx = 0.f; + float dsdy = 0.f; + if (j >= 0 && j < p.numAttr) + { + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + float dsdu = s0 - s2; + float dsdv = s1 - s2; + dsdx = dudx*dsdu + dvdx*dsdv; + dsdy = dudy*dsdu + dvdy*dsdv; + } + + // Write. + outDA[i] = make_float2(dsdx, dsdy); + } +} + +// Template specializations. +__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } +__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Fetch triangle ID. 
If none, output zero bary/db gradients and exit. + float4 r = ((float4*)p.rast)[pidx]; + int triIdx = (int)r.w - 1; + if (triIdx < 0 || triIdx >= p.numTriangles) + { + ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (ENABLE_DA) + ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. + if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Pointers to inputs. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + const float* pdy = p.dy + pidx * p.numAttr; + + // Pointers to outputs. + float* ga0 = p.gradAttr + vi0 * p.numAttr; + float* ga1 = p.gradAttr + vi1 * p.numAttr; + float* ga2 = p.gradAttr + vi2 * p.numAttr; + + // Barys and bary gradient accumulators. + float b0 = r.x; + float b1 = r.y; + float b2 = 1.f - r.x - r.y; + float gb0 = 0.f; + float gb1 = 0.f; + + // Loop over attributes and accumulate attribute gradients. + for (int i=0; i < p.numAttr; i++) + { + float y = pdy[i]; + float s0 = a0[i]; + float s1 = a1[i]; + float s2 = a2[i]; + gb0 += y * (s0 - s2); + gb1 += y * (s1 - s2); + caAtomicAdd(ga0 + i, b0 * y); + caAtomicAdd(ga1 + i, b1 * y); + caAtomicAdd(ga2 + i, b2 * y); + } + + // Write the bary gradients. + ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f); + + // If pixel differentials disabled, we're done. + if (!ENABLE_DA) + return; + + // Calculate gradients based on attribute pixel differentials. + const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr; + float gdudx = 0.f; + float gdudy = 0.f; + float gdvdx = 0.f; + float gdvdy = 0.f; + + // Read bary pixel differentials. + float4 db = ((float4*)p.rastDB)[pidx]; + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Check that index is valid. + if (j >= 0 && j < p.numAttr) + { + float2 dsdxy = dda[i]; + float dsdx = dsdxy.x; + float dsdy = dsdxy.y; + + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + + // Gradients of db. + float dsdu = s0 - s2; + float dsdv = s1 - s2; + gdudx += dsdu * dsdx; + gdudy += dsdu * dsdy; + gdvdx += dsdv * dsdx; + gdvdy += dsdv * dsdy; + + // Gradients of attributes. + float du = dsdx*dudx + dsdy*dudy; + float dv = dsdx*dvdx + dsdy*dvdy; + caAtomicAdd(ga0 + j, du); + caAtomicAdd(ga1 + j, dv); + caAtomicAdd(ga2 + j, -du - dv); + } + } + + // Write. + ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy); +} + +// Template specializations. 
+__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } +__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h new file mode 100755 index 0000000000000000000000000000000000000000..d35d8388240e97c255c837446609d8ae00cd78d9 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_MAX_DIFF_ATTRS 32 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct InterpolateKernelParams +{ + const int* tri; // Incoming triangle buffer. + const float* attr; // Incoming attribute buffer. + const float* rast; // Incoming rasterizer output buffer. + const float* rastDB; // Incoming rasterizer output buffer for bary derivatives. + const float* dy; // Incoming attribute gradients. + const float* dda; // Incoming attr diff gradients. + float* out; // Outgoing interpolated attributes. + float* outDA; // Outgoing texcoord major axis lengths. + float* gradAttr; // Outgoing attribute gradients. + float* gradRaster; // Outgoing rasterizer gradients. + float* gradRasterDB; // Outgoing rasterizer bary diff gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int numAttr; // Number of total vertex attributes. + int numDiffAttr; // Number of attributes to differentiate. + int width; // Image width. + int height; // Image height. + int depth; // Minibatch size. + int attrBC; // 0=normal, 1=attr is broadcast. + int instance_mode; // 0=normal, 1=instance mode. + int diff_attrs_all; // 0=normal, 1=produce pixel differentials for all attributes. + int diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate. +}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cpp new file mode 100755 index 0000000000000000000000000000000000000000..73064d4620a0905d8732c3ec33abc825a8a71bc9 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cpp @@ -0,0 +1,560 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "rasterize.h" +#include "glutil.h" +#include +#define STRINGIFY_SHADER_SOURCE(x) #x + +//------------------------------------------------------------------------ +// Helpers. + +#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +static int ROUND_UP_BITS(uint32_t x, uint32_t y) +{ + // Round x up so that it has at most y bits of mantissa. + if (x < (1u << y)) + return x; + uint32_t m = 0; + while (x & ~m) + m = (m << 1) | 1u; + m >>= y; + if (!(x & m)) + return x; + return (x | m) + 1u; +} + +//------------------------------------------------------------------------ +// GL helpers. + +static void compileGLShader(NVDR_CTX_ARGS, GLuint* pShader, GLenum shaderType, const char* src) +{ + const char* srcPtr = src; + int srcLength = strlen(src); + *pShader = 0; + NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); + NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &srcPtr, &srcLength)); + NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); +} + +static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) +{ + *pProgram = 0; + + GLuint glProgram = 0; + NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); + NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); + + GLint linkStatus = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); + if (!linkStatus) + { + GLint infoLen = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); + if (infoLen) + { + const char* hdr = "glLinkProgram() failed:\n"; + std::vector info(strlen(hdr) + infoLen); + strcpy(&info[0], hdr); + NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); + NVDR_CHECK(0, &info[0]); + } + NVDR_CHECK(0, "glLinkProgram() failed"); + } + + *pProgram = glProgram; +} + +//------------------------------------------------------------------------ +// Shared C++ functions. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx) +{ + // Create GL context and set it current. + s.glctx = createGLContext(cudaDeviceIdx); + setGLContext(s.glctx); + + // Version check. + GLint vMajor = 0; + GLint vMinor = 0; + glGetIntegerv(GL_MAJOR_VERSION, &vMajor); + glGetIntegerv(GL_MINOR_VERSION, &vMinor); + glGetError(); // Clear possible GL_INVALID_ENUM error in version query. + LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor; + NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required"); + + // Number of output buffers. + int num_outputs = s.enableDB ? 2 : 1; + + // Set up vertex shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glVertexShader, GL_VERTEX_SHADER, + "#version 330\n" + "#extension GL_ARB_shader_draw_parameters : enable\n" + STRINGIFY_SHADER_SOURCE( + layout(location = 0) in vec4 in_pos; + out int v_layer; + out int v_offset; + void main() + { + int layer = gl_DrawIDARB; + gl_Position = in_pos; + v_layer = layer; + v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here. + } + ) + ); + + // Geometry and fragment shaders depend on if bary differential output is enabled or not. 
+ if (s.enableDB) + { + // Set up geometry shader. Calculation of per-pixel bary differentials is based on: + // u = (u/w) / (1/w) + // --> du/dX = d((u/w) / (1/w))/dX + // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w + // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle. + compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + layout(location = 0) uniform vec2 vp_scale; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + out vec4 var_db; + void main() + { + // Plane equations for bary differentials. + float w0 = gl_in[0].gl_Position.w; + float w1 = gl_in[1].gl_Position.w; + float w2 = gl_in[2].gl_Position.w; + vec2 p0 = gl_in[0].gl_Position.xy; + vec2 p1 = gl_in[1].gl_Position.xy; + vec2 p2 = gl_in[2].gl_Position.xy; + vec2 e0 = p0*w2 - p2*w0; + vec2 e1 = p1*w2 - p2*w1; + float a = e0.x*e1.y - e0.y*e1.x; + + // Clamp area to an epsilon to avoid arbitrarily high bary differentials. + float eps = 1e-6f; // ~1 pixel in 1k x 1k image. + float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign. + float ia = 1.f / ca; // Inverse area. + + vec2 ascl = ia * vp_scale; + float dudx = e1.y * ascl.x; + float dudy = -e1.x * ascl.y; + float dvdx = -e0.y * ascl.x; + float dvdy = e0.x * ascl.y; + + float duwdx = w2 * dudx; + float dvwdx = w2 * dvdx; + float duvdx = w0 * dudx + w1 * dvdx; + float duwdy = w2 * dudy; + float dvwdy = w2 * dvdy; + float duvdy = w0 * dudy + w1 * dvdy; + + vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy); + vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy); + vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy); + + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex(); + } + ) + ); + + // Set up fragment shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + in int gl_PrimitiveID; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + void main() + { + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + out_db = var_db * var_uvzw.w; + } + ) + ); + + // Set up fragment shader for depth peeling. 
+ compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + in int gl_Layer; + in int gl_PrimitiveID; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + void main() + { + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, float(gl_PrimitiveID + 1)); + out_db = var_db * var_uvzw.w; + } + ) + ); + } + else + { + // Geometry shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + void main() + { + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex(); + } + ) + ); + + // Fragment shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in int gl_PrimitiveID; + layout(location = 0) out vec4 out_raster; + void main() + { + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + } + ) + ); + + // Depth peeling variant of fragment shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in int gl_Layer; + in int gl_PrimitiveID; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + void main() + { + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + } + ) + ); + } + + // Finalize programs. + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader); + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP); + + // Construct main fbo and bind permanently. + NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO)); + NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO)); + + // Enable two color attachments. 
+ GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 }; + NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers)); + + // Construct vertex array object. + NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO)); + NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO)); + + // Construct position buffer, bind permanently, enable, set ptr. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0)); + NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0)); + + // Construct index buffer and bind permanently. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer)); + + // Set up depth test. + NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST)); + NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS)); + NVDR_CHECK_GL_ERROR(glClearDepth(1.0)); + + // Create and bind output buffers. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0)); + } + + // Create and bind depth/stencil buffer. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0)); + + // Create texture name for previous output buffer (depth peeling). + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer)); +} + +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth) +{ + // Resize vertex buffer? + if (posCount > s.posCount) + { + if (s.cudaPosBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64; + LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + } + + // Resize triangle buffer? + if (triCount > s.triCount) + { + if (s.cudaTriBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64; + LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + } + + // Resize framebuffer? + if (width > s.width || height > s.height || depth > s.depth) + { + int num_outputs = s.enableDB ? 2 : 1; + if (s.cudaColorBuffer[0]) + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } + + // New framebuffer size. + s.width = (width > s.width) ? 
width : s.width; + s.height = (height > s.height) ? height : s.height; + s.depth = (depth > s.depth) ? depth : s.depth; + s.width = ROUND_UP(s.width, 32); + s.height = ROUND_UP(s.height, 32); + LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")"; + + // Allocate color buffers. + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + } + + // Allocate depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0)); + + // (Re-)register all GL buffers into Cuda. + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Resize range arrays? + if ((unsigned int)depth > s.drawCmdBuffer.size()) + { + int newSize = (depth > 64) ? ROUND_UP_BITS(depth, 1) : 64; + LOG(INFO) << "Increasing range array size to " << newSize << " elements"; + s.drawCmdBuffer.resize(newSize); + } +} + +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx) +{ + // Only copy inputs if we are on first iteration of depth peeling or not doing it at all. + if (peeling_idx < 1) + { + if (triPtr) + { + // Copy both position and triangle buffers. + void* glPosPtr = NULL; + void* glTriPtr = NULL; + size_t posBytes = 0; + size_t triBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream)); + } + else + { + // Copy position buffer only. Triangles are already copied and known to be constant. 
+ void* glPosPtr = NULL; + size_t posBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream)); + } + } + + // Select program based on whether we have a depth peeling input or not. + if (peeling_idx < 1) + { + // Normal case: No peeling, or peeling disabled. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram)); + } + else + { + // If we don't have a third buffer yet, create one. + if (!s.cudaPrevOutBuffer) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Swap the GL buffers. + GLuint glTempBuffer = s.glPrevOutBuffer; + s.glPrevOutBuffer = s.glColorBuffer[0]; + s.glColorBuffer[0] = glTempBuffer; + + // Swap the Cuda buffers. + cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer; + s.cudaPrevOutBuffer = s.cudaColorBuffer[0]; + s.cudaColorBuffer[0] = cudaTempBuffer; + + // Bind the new output buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0)); + + // Bind old buffer as the input texture. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + + // Activate the correct program. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP)); + } + + // Set viewport, clear color buffer(s) and depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height)); + NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT)); + + // If outputting bary differentials, set resolution uniform + if (s.enableDB) + NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height)); + + // Render the meshes. + if (depth == 1 && !rangesPtr) + { + // Trivial case. + NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0)); + } + else + { + if (!rangesPtr) + { + // Fill in range array to instantiate the same triangles for each output layer. + // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0; i < depth; i++) + { + GLDrawCmd& cmd = s.drawCmdBuffer[i]; + cmd.firstIndex = 0; + cmd.count = triCount; + cmd.baseVertex = vtxPerInstance * i; + cmd.baseInstance = 0; + cmd.instanceCount = 1; + } + } + else + { + // Fill in the range array according to user-given ranges. 
Triangle IDs point + // to the input triangle array, NOT index within range, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0, j=0; i < depth; i++) + { + GLDrawCmd& cmd = s.drawCmdBuffer[i]; + int first = rangesPtr[j++]; + int count = rangesPtr[j++]; + NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values"); + NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer"); + cmd.firstIndex = first * 3; + cmd.count = count * 3; + cmd.baseVertex = 0; + cmd.baseInstance = first; + cmd.instanceCount = 1; + } + } + + // Draw! + NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &s.drawCmdBuffer[0], depth, sizeof(GLDrawCmd))); + } +} + +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth) +{ + // Copy color buffers to output tensors. + cudaArray_t array = 0; + cudaChannelFormatDesc arrayDesc = {}; // For error checking. + cudaExtent arrayExt = {}; // For error checking. + int num_outputs = s.enableDB ? 2 : 1; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0)); + NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array)); + NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch"); + NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch"); + NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch"); + cudaMemcpy3DParms p = {0}; + p.srcArray = array; + p.dstPtr.ptr = outputPtr[i]; + p.dstPtr.pitch = width * 4 * sizeof(float); + p.dstPtr.xsize = width; + p.dstPtr.ysize = height; + p.extent.width = width; + p.extent.height = height; + p.extent.depth = depth; + p.kind = cudaMemcpyDeviceToDevice; + NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream)); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu new file mode 100755 index 0000000000000000000000000000000000000000..fe9888e002fd131bbd3b270fcb8f7f6746878e9e --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu @@ -0,0 +1,175 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "rasterize.h" + +//------------------------------------------------------------------------ +// Gradient Cuda kernel. + +template +static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p) +{ + // Temporary space for coalesced atomics. 
+ CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Read triangle idx and dy. + float2 dy = ((float2*)p.dy)[pidx * 2]; + float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f); + int triIdx = (int)(((float*)p.out)[pidx * 4 + 3]) - 1; + + // Exit if nothing to do. + if (triIdx < 0 || triIdx >= p.numTriangles) + return; // No or corrupt triangle. + int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients. + int grad_all_ddb = 0; + if (ENABLE_DB) + grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w); + if (((grad_all_dy | grad_all_ddb) << 1) == 0) + return; // All incoming gradients are +0/-0. + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index. + if (p.instance_mode) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Evaluate edge functions. + float fx = p.xs * (float)px + p.xo; + float fy = p.ys * (float)py + p.yo; + float p0x = p0.x - fx * p0.w; + float p0y = p0.y - fy * p0.w; + float p1x = p1.x - fx * p1.w; + float p1y = p1.y - fy * p1.w; + float p2x = p2.x - fx * p2.w; + float p2y = p2.y - fy * p2.w; + float a0 = p1x*p2y - p1y*p2x; + float a1 = p2x*p0y - p2y*p0x; + float a2 = p0x*p1y - p0y*p1x; + + // Compute inverse area with epsilon. + float at = a0 + a1 + a2; + float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image. + float iw = 1.f / (at + ep); + + // Perspective correct, normalized barycentrics. + float b0 = a0 * iw; + float b1 = a1 * iw; + + // Position gradients. + float gb0 = dy.x * iw; + float gb1 = dy.y * iw; + float gbb = gb0 * b0 + gb1 * b1; + float gp0x = gbb * (p2y - p1y) - gb1 * p2y; + float gp1x = gbb * (p0y - p2y) + gb0 * p2y; + float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y; + float gp0y = gbb * (p1x - p2x) + gb1 * p2x; + float gp1y = gbb * (p2x - p0x) - gb0 * p2x; + float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x; + float gp0w = -fx * gp0x - fy * gp0y; + float gp1w = -fx * gp1x - fy * gp1y; + float gp2w = -fx * gp2x - fy * gp2y; + + // Bary differential gradients. 
+ if (ENABLE_DB && ((grad_all_ddb) << 1) != 0) + { + float dfxdX = p.xs * iw; + float dfydY = p.ys * iw; + ddb.x *= dfxdX; + ddb.y *= dfydY; + ddb.z *= dfxdX; + ddb.w *= dfydY; + + float da0dX = p1.y * p2.w - p2.y * p1.w; + float da1dX = p2.y * p0.w - p0.y * p2.w; + float da2dX = p0.y * p1.w - p1.y * p0.w; + float da0dY = p2.x * p1.w - p1.x * p2.w; + float da1dY = p0.x * p2.w - p2.x * p0.w; + float da2dY = p1.x * p0.w - p0.x * p1.w; + float datdX = da0dX + da1dX + da2dX; + float datdY = da0dY + da1dY + da2dY; + + float x01 = p0.x - p1.x; + float x12 = p1.x - p2.x; + float x20 = p2.x - p0.x; + float y01 = p0.y - p1.y; + float y12 = p1.y - p2.y; + float y20 = p2.y - p0.y; + float w01 = p0.w - p1.w; + float w12 = p1.w - p2.w; + float w20 = p2.w - p0.w; + + float a0p1 = fy * p2.x - fx * p2.y; + float a0p2 = fx * p1.y - fy * p1.x; + float a1p0 = fx * p2.y - fy * p2.x; + float a1p2 = fy * p0.x - fx * p0.y; + + float wdudX = 2.f * b0 * datdX - da0dX; + float wdudY = 2.f * b0 * datdY - da0dY; + float wdvdX = 2.f * b1 * datdX - da1dX; + float wdvdY = 2.f * b1 * datdY - da1dY; + + float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY); + float cx = c0 * fx - ddb.x * b0 - ddb.z * b1; + float cy = c0 * fy - ddb.y * b0 - ddb.w * b1; + float cxy = iw * (ddb.x * datdX + ddb.y * datdY); + float czw = iw * (ddb.z * datdX + ddb.w * datdY); + + gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w; + gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w; + gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w; + gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w; + gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w; + gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w; + gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x; + gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x; + gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x; + } + + // Accumulate using coalesced atomics. + caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w); + caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w); +} + +// Template specializations. +__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } +__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h new file mode 100755 index 0000000000000000000000000000000000000000..6905b98508ea540729a1eae1bfb71af0f4033520 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h @@ -0,0 +1,97 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. 
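The gradient kernel above backpropagates through perspective-correct barycentrics: each covered pixel evaluates three clip-space edge functions, normalizes them by the epsilon-stabilized signed total area, and the gp*x/gp*y/gp*w terms are the chain rule applied to that forward computation. A minimal host-side C++ sketch of the forward math it differentiates, using standalone types and illustrative names rather than nvdiffrast's own, might look like this:

```cpp
// Forward quantity differentiated by RasterizeGradKernelTemplate: perspective-correct
// barycentrics from clip-space edge functions. Illustrative sketch, not nvdiffrast API.
#include <cmath>
#include <cstdio>

struct Vec4 { float x, y, z, w; };

static void barycentricsAtPixel(Vec4 p0, Vec4 p1, Vec4 p2, float fx, float fy,
                                float& b0, float& b1)
{
    // Shift vertices so the pixel's viewing ray passes through the (x, y) origin.
    float p0x = p0.x - fx * p0.w, p0y = p0.y - fy * p0.w;
    float p1x = p1.x - fx * p1.w, p1y = p1.y - fy * p1.w;
    float p2x = p2.x - fx * p2.w, p2y = p2.y - fy * p2.w;

    // Edge functions (signed sub-areas) and stabilized inverse of their sum.
    float a0 = p1x * p2y - p1y * p2x;
    float a1 = p2x * p0y - p2y * p0x;
    float a2 = p0x * p1y - p0y * p1x;
    float at = a0 + a1 + a2;
    float iw = 1.f / (at + std::copysign(1e-6f, at));

    b0 = a0 * iw;   // First barycentric output channel.
    b1 = a1 * iw;   // Second channel; the third is implicitly 1 - b0 - b1.
}

int main()
{
    float b0, b1;
    barycentricsAtPixel({-0.5f, -0.5f, 0.f, 1.f}, {0.5f, -0.5f, 0.f, 1.f},
                        {0.f, 0.5f, 0.f, 1.f}, 0.f, 0.f, b0, b1);
    std::printf("b0=%f b1=%f\n", b0, b1);   // Expect 0.25, 0.25 for this triangle.
    return 0;
}
```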
+ +#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 + +//------------------------------------------------------------------------ +// Gradient CUDA kernel params. + +struct RasterizeGradParams +{ + const float* pos; // Incoming position buffer. + const int* tri; // Incoming triangle buffer. + const float* out; // Rasterizer output buffer. + const float* dy; // Incoming gradients of rasterizer output buffer. + const float* ddb; // Incoming gradients of bary diff output buffer. + float* grad; // Outgoing position gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Image width. + int height; // Image height. + int depth; // Size of minibatch. + int instance_mode; // 1 if in instance rendering mode. + float xs, xo, ys, yo; // Pixel position to clip-space x, y transform. +}; + +//------------------------------------------------------------------------ +// Do not try to include OpenGL stuff when compiling CUDA kernels for torch. + +#if !(defined(NVDR_TORCH) && defined(__CUDACC__)) +#include "framework.h" +#include "glutil.h" + +//------------------------------------------------------------------------ +// Draw command struct used by rasterizer. + +struct GLDrawCmd +{ + uint32_t count; + uint32_t instanceCount; + uint32_t firstIndex; + uint32_t baseVertex; + uint32_t baseInstance; +}; + +//------------------------------------------------------------------------ +// OpenGL-related persistent state for forward op. + +struct RasterizeGLState +{ + int width; // Allocated frame buffer width. + int height; // Allocated frame buffer height. + int depth; // Allocated frame buffer depth. + int posCount; // Allocated position buffer in floats. + int triCount; // Allocated triangle buffer in ints. + GLContext glctx; + GLuint glFBO; + GLuint glColorBuffer[2]; + GLuint glPrevOutBuffer; + GLuint glDepthStencilBuffer; + GLuint glVAO; + GLuint glTriBuffer; + GLuint glPosBuffer; + GLuint glProgram; + GLuint glProgramDP; + GLuint glVertexShader; + GLuint glGeometryShader; + GLuint glFragmentShader; + GLuint glFragmentShaderDP; + cudaGraphicsResource_t cudaColorBuffer[2]; + cudaGraphicsResource_t cudaPrevOutBuffer; + cudaGraphicsResource_t cudaPosBuffer; + cudaGraphicsResource_t cudaTriBuffer; + std::vector drawCmdBuffer; + int enableDB; +}; + +//------------------------------------------------------------------------ +// Shared C++ code prototypes. 
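The GLDrawCmd struct above matches the layout OpenGL expects for indirect indexed draws, and rasterizeRender fills one command per minibatch layer (instanced mode) or per user-given range before calling glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &s.drawCmdBuffer[0], depth, sizeof(GLDrawCmd)). A standalone sketch of that command construction, with hypothetical helper names and triCount meaning the number of indices (three per triangle, as in the glDrawElements fast path), might be:

```cpp
// Illustrative construction of the indirect draw commands used by rasterizeRender.
#include <cstdint>
#include <cstdio>
#include <vector>

struct GLDrawCmd
{
    uint32_t count;
    uint32_t instanceCount;
    uint32_t firstIndex;
    uint32_t baseVertex;
    uint32_t baseInstance;
};

// Instanced mode: every output layer draws the same triCount indices, but reads its
// vertices from its own slice of the position buffer via baseVertex.
std::vector<GLDrawCmd> buildInstancedCmds(int depth, int triCount, int vtxPerInstance)
{
    std::vector<GLDrawCmd> cmds(depth);
    for (int i = 0; i < depth; i++)
        cmds[i] = { (uint32_t)triCount, 1u, 0u, (uint32_t)(vtxPerInstance * i), 0u };
    return cmds;
}

// Range mode: each layer draws a user-given [first, first + count) slice of the shared
// triangle buffer; baseInstance carries 'first' so triangle IDs can be offset into it.
std::vector<GLDrawCmd> buildRangeCmds(const int32_t* ranges, int depth)
{
    std::vector<GLDrawCmd> cmds(depth);
    for (int i = 0; i < depth; i++)
    {
        uint32_t first = (uint32_t)ranges[2 * i + 0];
        uint32_t count = (uint32_t)ranges[2 * i + 1];
        cmds[i] = { count * 3u, 1u, first * 3u, 0u, first };
    }
    return cmds;
}

int main()
{
    int32_t ranges[4] = { 0, 10, 10, 5 };   // Two layers: 10 triangles, then 5.
    std::vector<GLDrawCmd> cmds = buildRangeCmds(ranges, 2);
    std::printf("layer 1: firstIndex=%u count=%u\n", cmds[1].firstIndex, cmds[1].count);
    return 0;
}
```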
+ +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx); +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth); +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx); +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth); + +//------------------------------------------------------------------------ +#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__)) diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp new file mode 100755 index 0000000000000000000000000000000000000000..51633e10120b4dc465e5283241a38c95db31f8dc --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "framework.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Mip stack construction and access helpers. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p) +{ + char buf[1024]; + int bufsz = 1024; + + std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using max_mip_level argument.\n"; + + int w = p.texWidth; + int h = p.texHeight; + bool ew = false; + bool eh = false; + + msg += "Attempted mip stack construction:\n"; + msg += "level width height\n"; + msg += "----- ----- ------\n"; + snprintf(buf, bufsz, "base %5d %5d\n", w, h); + msg += buf; + + int mipTotal = 0; + int level = 0; + while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size. + { + // Current level. + level += 1; + + // Determine if downsampling fails. + ew = ew || (w > 1 && (w & 1)); + eh = eh || (h > 1 && (h & 1)); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + // Append level size to error message. + snprintf(buf, bufsz, "mip %-2d ", level); + msg += buf; + if (ew) snprintf(buf, bufsz, " err "); + else snprintf(buf, bufsz, "%5d ", w); + msg += buf; + if (eh) snprintf(buf, bufsz, " err\n"); + else snprintf(buf, bufsz, "%5d\n", h); + msg += buf; + } + + NVDR_CHECK(0, msg); +} + +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets) +{ + // No levels at all? + if (p.mipLevelLimit == 0) + { + p.mipLevelMax = 0; + return 0; + } + + // Current level size. + int w = p.texWidth; + int h = p.texHeight; + + int mipTotal = 0; + int level = 0; + int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels; + mipOffsets[0] = 0; + while ((w|h) > 1) + { + // Current level. + level += 1; + + // Quit if cannot downsample. + if ((w > 1 && (w & 1)) || (h > 1 && (h & 1))) + raiseMipSizeError(NVDR_CTX_PARAMS, p); + + // Downsample. 
+ if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + mipOffsets[level] = mipTotal; // Store the mip offset (#floats). + mipTotal += w * h * p.texDepth * c; + + // Hit the level limit? + if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit) + break; + } + + p.mipLevelMax = level; + return mipTotal; +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cu new file mode 100755 index 0000000000000000000000000000000000000000..c5e2ad4abdd7c84a512e1dc3d62b3245f2261d0b --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.cu @@ -0,0 +1,1124 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Memory access and math helpers. + +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; } +static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; } +static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; } +static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; } +template static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); } +template static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); } + +//------------------------------------------------------------------------ +// Cube map wrapping for smooth filtering across edges and corners. At corners, +// one of the texture coordinates will be negative. For correct interpolation, +// the missing texel must take the average color of the other three. 
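A small standalone illustration of the corner rule just described, mirroring what fetchQuad() does further below (illustrative names only, not nvdiffrast API): when one of the four bilinear taps has an invalid address, its value is replaced by the average of the three valid ones before the usual bilinear blend.

```cpp
// Corner-aware bilinear tap: the missing texel takes the mean of the other three.
#include <cstdio>

static float bilerpCorner(const float* texels, const int tc[4], float u, float v)
{
    float a[4];
    float avg = 0.f;
    for (int i = 0; i < 4; i++)
        if (tc[i] >= 0) avg += (a[i] = texels[tc[i]]);
    avg /= 3.f;                     // At a cube corner exactly three taps are valid.
    for (int i = 0; i < 4; i++)
        if (tc[i] < 0) a[i] = avg;  // Fill the missing corner texel with the average.
    // Standard bilinear blend of (a00, a10, a01, a11).
    return (a[0] * (1.f - u) + a[1] * u) * (1.f - v)
         + (a[2] * (1.f - u) + a[3] * u) * v;
}

int main()
{
    float texels[3] = { 0.2f, 0.4f, 0.9f };
    int   tc[4]     = { 0, 1, 2, -1 };      // Fourth tap falls outside the corner.
    std::printf("%f\n", bilerpCorner(texels, tc, 0.5f, 0.5f));
    return 0;
}
```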
+ +static __constant__ uint32_t c_cubeWrapMask1[48] = +{ + 0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01, + 0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01, + 0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219, + 0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219, + 0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11, + 0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11, +}; + +static __constant__ uint8_t c_cubeWrapMask2[48] = +{ + 0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40, + 0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64, + 0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44, +}; + +static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w) +{ + // Calculate case number. + int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1; + int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3; + int c = cx + cy; + if (c >= 5) + c--; + c = (face << 3) + c; + + // Compute coordinates and faces. + unsigned int m = c_cubeWrapMask1[c]; + int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0; + int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0; + int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1; + int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1; + int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0; + int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0; + int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1; + int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1; + int f0 = ((m >> 16) & 15) - 1; + int f1 = ((m >> 20) & 15) - 1; + int f2 = ((m >> 24) & 15) - 1; + int f3 = ((m >> 28) ) - 1; + + // Flips. + unsigned int f = c_cubeWrapMask2[c]; + int w1 = w - 1; + if (f & 0x01) x0 = w1 - x0; + if (f & 0x02) x1 = w1 - x1; + if (f & 0x04) x2 = w1 - x2; + if (f & 0x08) x3 = w1 - x3; + if (f & 0x10) y0 = w1 - y0; + if (f & 0x20) y1 = w1 - y1; + if (f & 0x40) y2 = w1 - y2; + if (f & 0x80) y3 = w1 - y3; + + // Done. + int4 tcOut; + tcOut.x = x0 + (y0 + f0 * w) * w; + tcOut.y = x1 + (y1 + f1 * w) * w; + tcOut.z = x2 + (y2 + f2 * w) * w; + tcOut.w = x3 + (y3 + f3 * w) * w; + return tcOut; +} + +//------------------------------------------------------------------------ +// Cube map indexing and gradient functions. + +// Map a 3D lookup vector into an (s,t) face coordinates (returned in first . +// two parameters) and face index. +static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z) +{ + float ax = fabsf(x); + float ay = fabsf(y); + float az = fabsf(z); + int idx; + float c; + if (az > fmaxf(ax, ay)) { idx = 4; c = z; } + else if (ay > ax) { idx = 2; c = y; y = z; } + else { idx = 0; c = x; x = z; } + if (c < 0.f) idx += 1; + float m = __frcp_rz(fabsf(c)) * .5; + float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31)); + float m1 = (idx != 2) ? -m : m; + x = x * m0 + .5; + y = y * m1 + .5; + x = fminf(fmaxf(x, 0.f), 1.f); + y = fminf(fmaxf(y, 0.f), 1.f); + return idx; +} + +// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector. 
+static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c; + float c0 = gu; + float c1 = gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; } + else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl = (c0 + c1) * m; + float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu; + float gy = (idx & 0x0c) ? gl : -gv; + float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv; + gz = (idx & 0x09) ? -gz : gz; + return make_float3(gx, gy, gz) * (m * .5f); +} + +// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two +// indexCubeMapGrad() functions rolled together. +static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, c0, c1; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; } + else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl0 = (dw.x * c0 + dw.z * c1) * m; + float gl1 = (dw.y * c0 + dw.w * c1) * m; + float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x; + float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y; + float gy0 = (idx & 0x0c) ? gl0 : -dw.z; + float gy1 = (idx & 0x0c) ? gl1 : -dw.w; + float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z; + float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w; + if (idx & 0x09) + { + gz0 = -gz0; + gz1 = -gz1; + } + g0 = make_float3(gx0, gy0, gz0) * (m * .5f); + g1 = make_float3(gx1, gy1, gz1) * (m * .5f); +} + +// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector. +// Result is (ds/dX, ds/dY, dt/dX, dt/dY). +static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + float m = __frcp_rz(fabsf(c)); + float dm = m * .5f; + float mm = m * dm; + gu *= (idx & 0x34) ? -mm : mm; + gv *= (idx & 0x2e) ? -mm : mm; + + if (idx & 0x03) + { + return make_float4(gu * dvdX.x + dm * dvdX.z, + gu * dvdY.x + dm * dvdY.z, + gv * dvdX.x - dm * dvdX.y, + gv * dvdY.x - dm * dvdY.y); + } + else if (idx & 0x0c) + { + return make_float4(gu * dvdX.y + dm * dvdX.x, + gu * dvdY.y + dm * dvdY.x, + gv * dvdX.y + dm * dvdX.z, + gv * dvdY.y + dm * dvdY.z); + } + else // (idx & 0x30) + { + return make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x, + gu * dvdY.z + copysignf(dm, c) * dvdY.x, + gv * dvdX.z - dm * dvdX.y, + gv * dvdY.z - dm * dvdY.y); + } +} + +// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face +// coordinates change w.r.t. 
3D texture coordinate vector, returned as follows: +// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx | +// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy | +// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz | +static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + + float m = __frcp_rz(c); + float dm = -m * fabsf(m) * .5; + float mm = m * m * .5; + float mu = (idx & 0x34) ? -mm : mm; + float mv = (idx & 0x2e) ? -mm : mm; + gu *= -2.0 * m * mu; + gv *= -2.0 * m * mv; + + if (idx & 0x03) + { + dx.x = gu * dvdX.x + dm * dvdX.z; + dx.y = gu * dvdY.x + dm * dvdY.z; + dx.z = gv * dvdX.x - dm * dvdX.y; + dx.w = gv * dvdY.x - dm * dvdY.y; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.x; + dy.w = mv * dvdY.x; + dz.x = mu * dvdX.x; + dz.y = mu * dvdY.x; + dz.z = 0.f; + dz.w = 0.f; + } + else if (idx & 0x0c) + { + dx.x = mu * dvdX.y; + dx.y = mu * dvdY.y; + dx.z = 0.f; + dx.w = 0.f; + dy.x = gu * dvdX.y + dm * dvdX.x; + dy.y = gu * dvdY.y + dm * dvdY.x; + dy.z = gv * dvdX.y + dm * dvdX.z; + dy.w = gv * dvdY.y + dm * dvdY.z; + dz.x = 0.f; + dz.y = 0.f; + dz.z = mv * dvdX.y; + dz.w = mv * dvdY.y; + } + else // (idx & 0x30) + { + dx.x = mu * dvdX.z; + dx.y = mu * dvdY.z; + dx.z = 0.f; + dx.w = 0.f; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.z; + dy.w = mv * dvdY.z; + dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x; + dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x; + dz.z = gv * dvdX.z - dm * dvdX.y; + dz.w = gv * dvdY.z - dm * dvdY.y; + } +} + +//------------------------------------------------------------------------ +// General texture indexing. + +template +static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz) +{ + int w = p.texWidth; + int h = p.texHeight; + float u = uv.x; + float v = uv.y; + + // Cube map indexing. + if (CUBE_MODE) + { + // No wrap. Fold face index into tz right away. + tz = 6 * tz + indexCubeMap(u, v, uv.z); // Rewrites u, v. + } + else + { + // Handle boundary. + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + } + + u = u * (float)w; + v = v * (float)h; + + int iu = __float2int_rd(u); + int iv = __float2int_rd(v); + + // In zero boundary mode, return texture address -1. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + if (iu < 0 || iu >= w || iv < 0 || iv >= h) + return -1; + } + + // Otherwise clamp and calculate the coordinate properly. + iu = min(max(iu, 0), w-1); + iv = min(max(iv, 0), h-1); + return iu + w * (iv + tz * h); +} + +template +static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level) +{ + // Mip level size. + int2 sz = mipLevelSize(p, level); + int w = sz.x; + int h = sz.y; + + // Compute texture-space u, v. + float u = uv.x; + float v = uv.y; + bool clampU = false; + bool clampV = false; + + // Cube map indexing. + int face = 0; + if (CUBE_MODE) + { + // Neither clamp or wrap. + face = indexCubeMap(u, v, uv.z); // Rewrites u, v. 
+ u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + } + else + { + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + // Wrap. + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + + // Move to texel space. + u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + + if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP) + { + // Clamp to center of edge texels. + u = fminf(fmaxf(u, 0.f), w - 1.f); + v = fminf(fmaxf(v, 0.f), h - 1.f); + clampU = (u == 0.f || u == w - 1.f); + clampV = (v == 0.f || v == h - 1.f); + } + } + + // Compute texel coordinates and weights. + int iu0 = __float2int_rd(u); + int iv0 = __float2int_rd(v); + int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped. + int iv1 = iv0 + (clampV ? 0 : 1); + u -= (float)iu0; + v -= (float)iv0; + + // Cube map wrapping. + bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h); + if (cubeWrap) + { + tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w); + tcOut += 6 * tz * w * h; // Bring in tz. + return make_float2(u, v); // Done. + } + + // Fold cube map face into tz. + if (CUBE_MODE) + tz = 6 * tz + face; + + // Wrap overflowing texel indices. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + if (iu0 < 0) iu0 += w; + if (iv0 < 0) iv0 += h; + if (iu1 >= w) iu1 -= w; + if (iv1 >= h) iv1 -= h; + } + + // Coordinates + iu0 += tz * w * h; + iu1 += tz * w * h; + tcOut.x = iu0 + w * iv0; + tcOut.y = iu1 + w * iv0; + tcOut.z = iu0 + w * iv1; + tcOut.w = iu1 + w * iv1; + + // Invalidate texture addresses outside unit square if we are in zero mode. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + bool iu0_out = (iu0 < 0 || iu0 >= w); + bool iu1_out = (iu1 < 0 || iu1 >= w); + bool iv0_out = (iv0 < 0 || iv0 >= h); + bool iv1_out = (iv1 < 0 || iv1 >= h); + if (iu0_out || iv0_out) tcOut.x = -1; + if (iu1_out || iv0_out) tcOut.y = -1; + if (iu0_out || iv1_out) tcOut.z = -1; + if (iu1_out || iv1_out) tcOut.w = -1; + } + + // All done. + return make_float2(u, v); +} + +//------------------------------------------------------------------------ +// Mip level calculation. + +template +static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv) +{ + // Do nothing if mips not in use. + if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR) + return; + + // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero. + if (!BIAS_ONLY) + { + // Get pixel derivatives of texture coordinates. + float4 uvDA; + float3 dvdX, dvdY; // Gradients use these later. + if (CUBE_MODE) + { + // Fetch. + float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0]; + float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1]; + float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2]; + + // Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}. + dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX + dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY + uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y} + } + else + { + // Fetch. + uvDA = ((const float4*)p.uvDA)[pidx]; + } + + // Scaling factors. + float uscl = p.texWidth; + float vscl = p.texHeight; + + // d[s,t]/d[X,Y]. + float dsdx = uvDA.x * uscl; + float dsdy = uvDA.y * uscl; + float dtdx = uvDA.z * vscl; + float dtdy = uvDA.w * vscl; + + // Calculate footprint axis lengths. 
+ float A = dsdx*dsdx + dtdx*dtdx; + float B = dsdy*dsdy + dtdy*dtdy; + float C = dsdx*dsdy + dtdx*dtdy; + float l2b = 0.5 * (A + B); + float l2n = 0.25 * (A-B)*(A-B) + C*C; + float l2a = sqrt(l2n); + float lenMinorSqr = fmaxf(0.0, l2b - l2a); + float lenMajorSqr = l2b + l2a; + + // Footprint vs. mip level gradient. + if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2). + float AB = dw * .5f * (A - B); + float Cw = dw * C; + float l2aw = dw * l2a; + float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw); + float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw); + float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw); + float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw); + + *pdw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY); + + // In cube maps, there is also a texture coordinate vs. mip level gradient. + if (CUBE_MODE) + { + float4 dx, dy, dz; + indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz); + + float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x); + float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y); + float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z); + float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w); + + float3 d_f_dv = make_float3(0.f, 0.f, 0.f); + d_f_dv += d_dsdX_dv * d_f_ddsdX; + d_f_dv += d_dsdY_dv * d_f_ddsdY; + d_f_dv += d_dtdX_dv * d_f_ddtdX; + d_f_dv += d_dtdY_dv * d_f_ddtdY; + + *pdfdv = d_f_dv; + } + } + + // Finally, calculate mip level. + flevel = .5f * __log2f(lenMajorSqr); + } + + // Bias the mip level and clamp. + if (p.mipLevelBias) + flevel += p.mipLevelBias[pidx]; + flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax); + + // Calculate levels depending on filter mode. + level0 = __float2int_rd(flevel); + + // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode. + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f) + { + level1 = min(level0 + 1, p.mipLevelMax); + flevel -= level0; // Fractional part. Zero if clamped on last level. + } +} + +//------------------------------------------------------------------------ +// Texel fetch and accumulator helpers that understand cube map corners. + +template +static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner) +{ + if (corner) + { + T avg = zero_value(); + if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x])); + if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y])); + if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z])); + if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w])); + avg *= 0.33333333f; + if (tc.x < 0) a00 = avg; + if (tc.y < 0) a10 = avg; + if (tc.z < 0) a01 = avg; + if (tc.w < 0) a11 = avg; + } + else + { + a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value(); + a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value(); + a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value(); + a11 = (tc.w >= 0) ? 
*((const T*)&pIn[tc.w]) : zero_value(); + } +} + +static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM) +{ + if (corner) + { + float cb; + if (tc.x < 0) cb = c.x; + if (tc.y < 0) cb = c.y; + if (tc.z < 0) cb = c.z; + if (tc.w < 0) cb = c.w; + cb *= 0.33333333f; + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb); + } + else + { + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w); + } +} + +//------------------------------------------------------------------------ +// Mip builder kernel. + +template +static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p) +{ + // Sizes. + int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1); + int2 sz_out = mipLevelSize(p, p.mipLevelOut); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= sz_out.x || py >= sz_out.y) + return; + + // Pixel indices. + int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y)); + int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down. + int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz)); + + // Input and output pointers. + const float* pin = p.tex[p.mipLevelOut - 1]; + float* pout = (float*)p.tex[p.mipLevelOut]; + + // Special case: Input texture height or width is 1. + if (sz_in.x == 1 || sz_in.y == 1) + { + if (sz_in.y == 1) + pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right. + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in1 + i]); + T avg = .5f * (v0 + v1); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 1.41421356f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } + + return; + } + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]); + T v2 = *((const T*)&pin[pidx_in1 + i]); + T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]); + T avg = .25f * (v0 + v1 + v2 + v3); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 2.f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } +} + +// Template specializations. +__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Output ptr. 
+ float* pOut = p.out + pidx * p.channels; + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + tc *= p.channels; + const float* pIn = p.tex[0]; + + // Copy if valid tc, otherwise output zero. + for (int i=0; i < p.channels; i += C) + *((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value(); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, 0, 0); + + // Get texel indices and pointer for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Bilinear fetch. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + // Interpolate. + for (int i=0; i < p.channels; i += C, tc0 += C) + { + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + *((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0); + } + return; // Exit. + } + + // Get texel indices and pointer for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Trilinear fetch. + for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C) + { + // First level. + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + T a = bilerp(a00, a10, a01, a11, uv0); + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + T b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + T b = bilerp(b00, b10, b01, b11, uv1); + a = lerp(a, b, flevel); // Interpolate between levels. + } + + // Write. + *((T*)&pOut[i]) = a; + } +} + +// Template specializations. 
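The forward kernel above leans on calculateMipLevel(): the d(s,t)/d(X,Y) Jacobian, already scaled to texel units, defines an elliptical screen-space footprint, the mip level is log2 of its major axis length, and trilinear filtering blends the two adjacent levels by the fractional part. A simplified host-side sketch of that selection, which ignores the kernel's special cases for magnification, bias-only mode, and linear-mipmap-nearest, could be:

```cpp
// Mip level from UV pixel derivatives, mirroring the footprint math above.
#include <algorithm>
#include <cmath>
#include <cstdio>

static void mipLevelFromDerivatives(float dsdx, float dsdy, float dtdx, float dtdy,
                                    int mipLevelMax, int& level0, int& level1, float& frac)
{
    float A = dsdx * dsdx + dtdx * dtdx;
    float B = dsdy * dsdy + dtdy * dtdy;
    float C = dsdx * dsdy + dtdx * dtdy;
    float l2b = 0.5f * (A + B);
    float l2a = std::sqrt(0.25f * (A - B) * (A - B) + C * C);
    float lenMajorSqr = l2b + l2a;                 // Squared major axis length.

    float flevel = 0.5f * std::log2(lenMajorSqr);  // log2 of the major axis length.
    flevel = std::min(std::max(flevel, 0.f), (float)mipLevelMax);

    level0 = (int)std::floor(flevel);
    level1 = std::min(level0 + 1, mipLevelMax);
    frac   = flevel - (float)level0;               // Blend weight between the two levels.
}

int main()
{
    int l0, l1; float f;
    // A lookup whose footprint spans ~4 texels horizontally and ~1 vertically.
    mipLevelFromDerivatives(4.f, 0.f, 0.f, 1.f, 10, l0, l1, f);
    std::printf("levels %d/%d, frac %.2f\n", l0, l1, f);   // Expect level 2.
    return 0;
}
```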
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void 
TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient mip puller kernel. + +template +static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.texWidth || py >= p.texHeight) + return; + + // Number of wide elements. + int c = p.channels; + if (C == 2) c >>= 1; + if (C == 4) c >>= 2; + + // Dynamically allocated shared memory for holding a texel. + extern __shared__ float s_texelAccum[]; + int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x; + int sharedStride = blockDim.x * blockDim.y; +# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride)) + + // Clear the texel. + for (int i=0; i < p.channels; i++) + *TEXEL_ACCUM(i) = 0.f; + + // Track texel position and accumulation weight over the mip stack. + int x = px; + int y = py; + float w = 1.f; + + // Pull gradients from all levels. + int2 sz = mipLevelSize(p, 0); // Previous level size. + for (int level=1; level <= p.mipLevelMax; level++) + { + // Weight decay depends on previous level size. + if (sz.x > 1) w *= .5f; + if (sz.y > 1) w *= .5f; + + // Current level size and coordinates. + sz = mipLevelSize(p, level); + x >>= 1; + y >>= 1; + + T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w); + } + + // Add to main texture gradients. + T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride); +} + +// Template specializations. +__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Early exit if output gradients are zero. 
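The early-exit check that follows uses a bit trick shared with the rasterizer gradient kernel: OR together the raw bit patterns of every incoming gradient and discard the sign bit, so a pixel whose gradients are all +0.0 or -0.0 skips the atomic accumulation entirely. A host-side equivalent using std::memcpy in place of __float_as_uint (illustrative, not part of the library):

```cpp
// Detect an all-zero gradient vector without a float comparison per element.
#include <cstdint>
#include <cstring>
#include <cstdio>

static bool allGradientsZero(const float* dy, int n)
{
    uint32_t acc = 0u;
    for (int i = 0; i < n; i++)
    {
        uint32_t bits;
        std::memcpy(&bits, &dy[i], sizeof(bits));  // Reinterpret float as its bit pattern.
        acc |= bits;
    }
    // Shifting left by one discards the sign bit, so -0.0f still counts as zero.
    return (acc << 1) == 0u;
}

int main()
{
    float a[3] = { 0.f, -0.f, 0.f };
    float b[3] = { 0.f, 1e-30f, 0.f };
    std::printf("%d %d\n", allGradientsZero(a, 3), allGradientsZero(b, 3));  // 1 0
    return 0;
}
```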
+ const float* pDy = p.dy + pidx * p.channels; + unsigned int dmax = 0u; + if ((p.channels & 3) == 0) + { + for (int i=0; i < p.channels; i += 4) + { + uint4 dy = *((const uint4*)&pDy[i]); + dmax |= (dy.x | dy.y | dy.z | dy.w); + } + } + else + { + for (int i=0; i < p.channels; i++) + dmax |= __float_as_uint(pDy[i]); + } + + // Store zeros and exit. + if (__uint_as_float(dmax) == 0.f) + { + if (CUBE_MODE) + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + { + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f); + } + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + else + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + return; + } + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode - texture gradients only. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + if (tc < 0) + return; // Outside texture. + + tc *= p.channels; + float* pOut = p.gradTex[0]; + + // Accumulate texture gradients. + for (int i=0; i < p.channels; i++) + caAtomicAddTexture(pOut, 0, tc + i, pDy[i]); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float4 dw = make_float4(0.f, 0.f, 0.f, 0.f); + float3 dfdv = make_float3(0.f, 0.f, 0.f); + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, &dw, &dfdv); + + // UV gradient accumulators. + float gu = 0.f; + float gv = 0.f; + + // Get texel indices and pointers for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + float* pOut0 = p.gradTex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Texel weights. + float uv011 = uv0.x * uv0.y; + float uv010 = uv0.x - uv011; + float uv001 = uv0.y - uv011; + float uv000 = 1.f - uv0.x - uv001; + float4 tw0 = make_float4(uv000, uv010, uv001, uv011); + + // Attribute weights. + int2 sz0 = mipLevelSize(p, level0); + float sclu0 = (float)sz0.x; + float sclv0 = (float)sz0.y; + + // Bilinear mode - texture and uv gradients. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + for (int i=0; i < p.channels; i++, tc0 += 1) + { + float dy = pDy[i]; + accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP); + + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0; + } + + // Store UV gradients and exit. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + return; + } + + // Accumulate fractional mip level gradient. + float df = 0; // dL/df. 
+ + // Get texel indices and pointers for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + float* pOut1 = p.gradTex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Texel weights. + float uv111 = uv1.x * uv1.y; + float uv110 = uv1.x - uv111; + float uv101 = uv1.y - uv111; + float uv100 = 1.f - uv1.x - uv101; + float4 tw1 = make_float4(uv100, uv110, uv101, uv111); + + // Attribute weights. + int2 sz1 = mipLevelSize(p, level1); + float sclu1 = (float)sz1.x; + float sclv1 = (float)sz1.y; + + // Trilinear mode. + for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1) + { + float dy = pDy[i]; + float dy0 = (1.f - flevel) * dy; + accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP); + + // UV gradients for first level. + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0; + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + // Texture gradients for second level. + float dy1 = flevel * dy; + accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP); + + // UV gradients for second level. + float b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + float bd = (b11 + b00 - b10 - b01); + gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1; + gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1; + + // Mip level gradient. + float a = bilerp(a00, a10, a01, a11, uv0); + float b = bilerp(b00, b10, b01, b11, uv1); + df += (b-a) * dy; + } + } + + // Store UV gradients. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + // Store mip level bias gradient. + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = df; + + // Store UV pixel differential gradients. + if (!BIAS_ONLY) + { + // Final gradients. + dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df. + + // Store them. + if (CUBE_MODE) + { + // Remap from dL/(d{s,t}/s{X,Y}) to dL/(d{x,y,z}/d{X,Y}). + float3 g0, g1; + indexCubeMapGrad4(uv, dw, g0, g1); + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z); + } + else + ((float4*)p.gradUVDA)[pidx] = dw; + } +} + +// Template specializations. 
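The gu/gv accumulation in the gradient kernel above is the analytic derivative of bilinear interpolation, scaled by the level's texel extents: with ad = a00 + a11 - a10 - a01, d(bilerp)/du = (a10 - a00) + v*ad and d(bilerp)/dv = (a01 - a00) + u*ad. A standalone check against finite differences (illustrative code only):

```cpp
// Verify the closed-form bilinear derivatives used for the UV gradients.
#include <cstdio>

static float bilerp(float a00, float a10, float a01, float a11, float u, float v)
{
    return (a00 * (1.f - u) + a10 * u) * (1.f - v) + (a01 * (1.f - u) + a11 * u) * v;
}

int main()
{
    float a00 = 0.1f, a10 = 0.7f, a01 = 0.3f, a11 = 0.9f, u = 0.25f, v = 0.6f;
    float ad  = a00 + a11 - a10 - a01;

    float du_analytic = (a10 - a00) + v * ad;
    float dv_analytic = (a01 - a00) + u * ad;

    float eps = 1e-3f;
    float du_fd = (bilerp(a00, a10, a01, a11, u + eps, v) -
                   bilerp(a00, a10, a01, a11, u - eps, v)) / (2.f * eps);
    float dv_fd = (bilerp(a00, a10, a01, a11, u, v + eps) -
                   bilerp(a00, a10, a01, a11, u, v - eps)) / (2.f * eps);

    std::printf("du: %f vs %f, dv: %f vs %f\n", du_analytic, du_fd, dv_analytic, dv_fd);
    return 0;
}
```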
+__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.h new file mode 100755 index 0000000000000000000000000000000000000000..f79b600fff0256cdadd38e265b49366549434ef8 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/common/texture.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "framework.h" + +//------------------------------------------------------------------------ +// Constants. + +#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging +#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_MAX_MIP_LEVEL 16 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere. +#define TEX_MODE_NEAREST 0 // Nearest on base level. +#define TEX_MODE_LINEAR 1 // Bilinear on base level. +#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level. +#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear. +#define TEX_MODE_COUNT 4 +#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode. +#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v). +#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v). +#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros. +#define TEX_BOUNDARY_MODE_COUNT 4 + +//------------------------------------------------------------------------ +// CUDA kernel params. 
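The tex[] and gradTex[] arrays in the struct below hold one pointer per mip level, with the levels packed back to back, which is why calculateMipInfo() in texture.cpp only needs to record per-level float offsets. A simplified standalone sketch of that layout follows; it breaks out of the loop where the real code raises the mip-size error, and it omits the 6x channel factor used for cube maps.

```cpp
// Mip-stack bookkeeping: per-level offsets into one contiguous float buffer.
#include <cstdio>
#include <vector>

// Returns the total float count for levels 1..levelMax and fills 'offsets' so that
// offsets[level] is where that level starts in the mip buffer. Level 0 is the
// user-supplied texture and lives in its own tensor.
static int buildMipOffsets(int w, int h, int depth, int channels, int levelLimit,
                           std::vector<int>& offsets, int& levelMax)
{
    int total = 0;
    levelMax = 0;
    offsets.assign(1, 0);
    while ((w | h) > 1)
    {
        // Downsampling needs every extent larger than 1 to be even.
        if ((w > 1 && (w & 1)) || (h > 1 && (h & 1)))
            break;
        if (w > 1) w >>= 1;
        if (h > 1) h >>= 1;
        levelMax++;
        offsets.push_back(total);             // This level's offset in floats.
        total += w * h * depth * channels;    // Then reserve room for its texels.
        if (levelLimit >= 0 && levelMax == levelLimit)
            break;
    }
    return total;
}

int main()
{
    std::vector<int> offsets;
    int levelMax = 0;
    int floats = buildMipOffsets(256, 64, 1, 4, -1, offsets, levelMax);
    std::printf("levels beyond base: %d, mip buffer floats: %d\n", levelMax, floats);
    return 0;
}
```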
+ +struct TextureKernelParams +{ + const float* tex[TEX_MAX_MIP_LEVEL]; // Incoming texture buffer with mip levels. + const float* uv; // Incoming texcoord buffer. + const float* uvDA; // Incoming uv pixel diffs or NULL. + const float* mipLevelBias; // Incoming mip level bias or NULL. + const float* dy; // Incoming output gradient. + float* out; // Outgoing texture data. + float* gradTex[TEX_MAX_MIP_LEVEL]; // Outgoing texture gradients with mip levels. + float* gradUV; // Outgoing texcoord gradient. + float* gradUVDA; // Outgoing texcoord pixel differential gradient. + float* gradMipLevelBias; // Outgoing mip level bias gradient. + int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor. + int filterMode; // One of the TEX_MODE_ constants. + int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants. + int texConst; // If true, texture is known to be constant. + int mipLevelLimit; // Mip level limit coming from the op. + int channels; // Number of texture channels. + int imgWidth; // Image width. + int imgHeight; // Image height. + int texWidth; // Texture width. + int texHeight; // Texture height. + int texDepth; // Texture depth. + int n; // Minibatch size. + int mipLevelMax; // Maximum mip level index. Zero if mips disabled. + int mipLevelOut; // Mip level being calculated in builder kernel. +}; + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p); +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets); + +//------------------------------------------------------------------------ +// Macros. + +#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1) + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..cf62df8782d730f072ca5f4e4862a44dc8c3a086 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import rasterize, interpolate, texture, antialias +from .plugin_loader import set_cache_dir + +__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"] diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py new file mode 100755 index 0000000000000000000000000000000000000000..be51deef13e0ecfbd5bfe8bc376af24a18db7224 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import tensorflow as tf +import numpy as np +import os +from . import plugin_loader + +#---------------------------------------------------------------------------- +# Helpers. +#---------------------------------------------------------------------------- + +# OpenGL-related linker options depending on platform. +def _get_gl_opts(): + libs = { + 'posix': ['GL', 'EGL'], + 'nt': ['gdi32', 'opengl32', 'user32', 'setgpu'], + } + return ['-l' + x for x in libs[os.name]] + +# Load the cpp plugin. +def _get_plugin(): + fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu') + return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW']) + +# Convert parameter to a numpy array if possible. +def _get_constant(x, dtype): + try: + return np.asarray(x, dtype=dtype) + except (TypeError, ValueError): + return None + +# Tests for a construction-time constantness instead of tf.constant node because +# the latter can be overridden in Session.run() feed_dict at evaluation time. +def _is_constant(x, dtype): + if isinstance(x, np.ndarray): + return np.can_cast(x.dtype, dtype, 'unsafe') + else: + return _get_constant(x, dtype) is not None + +#---------------------------------------------------------------------------- +# Rasterize. +#---------------------------------------------------------------------------- + +def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True): + assert tri_const is True or tri_const is False + assert output_db is True or output_db is False + + # Known constant resolution? + resolution_c = _get_constant(resolution, np.int32) + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert all inputs to tensors / base types. + tri_const = 1 if tri_const else 0 + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + resolution = tf.convert_to_tensor(resolution, dtype=tf.int32) + if ranges is None: + ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor. + else: + ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor. + + # Infer as much about the output shape as possible. + out_shape = [None, None, None, 4] + if pos.shape.rank == 3: # Instanced mode. + out_shape[0] = pos.shape[0].value + elif pos.shape.rank == 2: # Range mode. + if ranges.shape.rank not in [None, 0]: + out_shape[0] = ranges.shape[0].value + if resolution_c is not None: + assert resolution_c.shape == (2,) + out_shape[1], out_shape[2] = resolution_c + + # Output pixel differentials. + @tf.custom_gradient + def func_db(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape) + def grad(dy, ddb): + if grad_db: + return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Do not output pixel differentials. + @tf.custom_gradient + def func(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db. 
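+        # Note: a zero-channel out_db is still created here so that both stubs
+        # return the same (out, out_db) pair; only its last dimension differs
+        # when output_db is False.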
+ def grad(dy, _): + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Choose stub. + if output_db: + return func_db(pos) + else: + return func(pos) + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = _get_constant(diff_attrs, np.int32) + assert (diff_attrs is not None) and len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + # Convert all inputs to tensors. + attr = tf.convert_to_tensor(attr, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + if diff_attrs: + rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if rast.shape.rank is not None: + out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None] + if attr.shape.rank in [2, 3]: + out_shape[3] = attr.shape[-1].value + + # Output pixel differentials for at least some attributes. + @tf.custom_gradient + def func_da(attr, rast, rast_db): + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + + # Infer number of channels in out_da. + if not diff_attrs_all: + da_channels = 2 * len(diff_attrs) + if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None): + da_channels = 2 * attr.shape[-1].value + else: + da_channels = None + + # Set output shapes. + out.set_shape(out_shape) + out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels]) + + def grad(dy, dda): + return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return (out, out_da), grad + + # No pixel differentials for any attribute. + @tf.custom_gradient + def func(attr, rast): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + out.set_shape(out_shape) + out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da. + def grad(dy, _): + return _get_plugin().interpolate_grad(attr, rast, tri, dy) + return (out, out_da), grad + + # Choose stub. + if diff_attrs: + return func_da(attr, rast, rast_db) + else: + return func(attr, rast) + +#---------------------------------------------------------------------------- +# Texture. +#---------------------------------------------------------------------------- + +def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None): + assert tex_const is True or tex_const is False + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear' + + # Known constant texture? + tex_const = tex_const or _is_constant(tex, np.float32) + + # Sanitize inputs. + tex_const = 1 if tex_const else 0 + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Convert inputs to tensors. + tex = tf.convert_to_tensor(tex, dtype=tf.float32) + uv = tf.convert_to_tensor(uv, dtype=tf.float32) + if 'mipmap' in filter_mode: + uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32) + + # Infer output shape. 
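+    # Illustrative example (not from the original source): for uv of shape
+    # [N, H, W, 2] and a 2D texture of shape [N, th, tw, C], the checks below
+    # produce out_shape = [N, H, W, C]; in 'cube' boundary mode the texture is
+    # expected with an extra cube-face dimension, i.e. rank 5.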
+ out_shape = [None, None, None, None] + if uv.shape.rank is not None: + assert uv.shape.rank == 4 + out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None] + if tex.shape.rank is not None: + assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4) + out_shape[-1] = tex.shape[-1].value + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Linear-mipmap-linear: Mipmaps enabled, all gradients active. + @tf.custom_gradient + def func_linear_mipmap_linear(tex, uv, uv_da): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da. + @tf.custom_gradient + def func_linear_mipmap_nearest(tex, uv): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear: Mipmaps disabled, no uv_da, no gradients to uv_da. + @tf.custom_gradient + def func_linear(tex, uv): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv. + @tf.custom_gradient + def func_nearest(tex): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Choose stub. + if filter_mode == 'linear-mipmap-linear': + return func_linear_mipmap_linear(tex, uv, uv_da) + elif filter_mode == 'linear-mipmap-nearest': + return func_linear_mipmap_nearest(tex, uv) + elif filter_mode == 'linear': + return func_linear(tex, uv) + elif filter_mode == 'nearest': + return func_nearest(tex) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0): + assert tri_const is True or tri_const is False + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert inputs to tensors. + color = tf.convert_to_tensor(color, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + + # Sanitize inputs. 
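+    # tri_const is folded into an int here because the underlying AntialiasFwd
+    # op declares it as an integer attribute rather than a bool.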
+ tri_const = 1 if tri_const else 0 + + @tf.custom_gradient + def func(color, pos): + color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const) + color_out.set_shape(color.shape) + def grad(dy): + grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + grad_pos = grad_pos * pos_gradient_boost + return grad_color, grad_pos + return color_out, grad + + return func(color, pos) + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py new file mode 100755 index 0000000000000000000000000000000000000000..d428c55de2194e42be331b1cad1b2162709a4cd4 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py @@ -0,0 +1,207 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import glob +import os +import re +import uuid +import hashlib +import tempfile +import shutil +import tensorflow as tf +from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module + +#---------------------------------------------------------------------------- +# Global options. + +_nvdiffrast_cache_dir = None + +def set_cache_dir(path: str) -> None: + '''Set CUDA kernel compilation temp dir. + + If `set_cache_dir` is not called, the cache directory will default to + one of the below: + + - Value of NVDIFFRAST_CACHE_DIR env var, if set + - $HOME/.cache/nvdiffrast if HOME env var is set + - $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set. + + Args: + path: Where to save CUDA kernel build temporaries + ''' + global _nvdiffrast_cache_dir + _nvdiffrast_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _nvdiffrast_cache_dir is not None: + return os.path.join(_nvdiffrast_cache_dir, *paths) + if 'NVDIFFRAST_CACHE_DIR' in os.environ: + return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths) + +cuda_cache_version_tag = 'v1' +do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe! +verbose = True # Print status messages to stdout. + +#---------------------------------------------------------------------------- +# Internal helper funcs. 
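+# The helpers below locate a host compiler (several MSVC layouts are probed on
+# Windows), query the first visible GPU's compute capability through
+# TensorFlow's device_lib, and wrap nvcc invocations. For example, a device
+# reporting "compute capability: 8.6" yields the arch string 'sm_86'.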
+ +def _find_compiler_bindir(): + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin' + if os.path.isdir(vc_bin_dir): + return vc_bin_dir + return None + +def _get_compute_cap(device): + caps_str = device.physical_device_desc + m = re.search('compute capability: (\\d+).(\\d+)', caps_str) + major = m.group(1) + minor = m.group(2) + return (major, minor) + +def _get_cuda_gpu_arch_string(): + gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU'] + if len(gpus) == 0: + raise RuntimeError('No GPU devices found') + (major, minor) = _get_compute_cap(gpus[0]) + return 'sm_%s%s' % (major, minor) + +def _run_cmd(cmd): + with os.popen(cmd) as pipe: + output = pipe.read() + status = pipe.close() + if status is not None: + raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output)) + +def _prepare_nvcc_cli(opts): + cmd = 'nvcc ' + opts.strip() + cmd += ' --disable-warnings' + cmd += ' --include-path "%s"' % tf.sysconfig.get_include() + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive') + + compiler_bindir = _find_compiler_bindir() + if compiler_bindir is None: + # Require that _find_compiler_bindir succeeds on Windows. Allow + # nvcc to use whatever is the default on Linux. + if os.name == 'nt': + raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__) + else: + cmd += ' --compiler-bindir "%s"' % compiler_bindir + cmd += ' 2>&1' + return cmd + +#---------------------------------------------------------------------------- +# Main entry point. + +_plugin_cache = dict() + +def get_plugin(cuda_file, extra_nvcc_options=[]): + cuda_file_base = os.path.basename(cuda_file) + cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base) + + # Already in cache? + if cuda_file in _plugin_cache: + return _plugin_cache[cuda_file] + + # Setup plugin. + if verbose: + print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True) + try: + # Hash CUDA source. + md5 = hashlib.md5() + with open(cuda_file, 'rb') as f: + md5.update(f.read()) + md5.update(b'\n') + + # Hash headers included by the CUDA code by running it through the preprocessor. + if not do_not_hash_included_headers: + if verbose: + print('Preprocessing... 
', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext) + _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))) + with open(tmp_file, 'rb') as f: + bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros + good_file_str = ('"' + cuda_file_base + '"').encode('utf-8') + for ln in f: + if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas + ln = ln.replace(bad_file_str, good_file_str) + md5.update(ln) + md5.update(b'\n') + + # Select compiler options. + compile_opts = '' + if os.name == 'nt': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib') + compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find libraries during compilation. + elif os.name == 'posix': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so') + compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\'' + else: + assert False # not Windows or Linux, w00t? + compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string() + compile_opts += ' --use_fast_math' + for opt in extra_nvcc_options: + compile_opts += ' ' + opt + nvcc_cmd = _prepare_nvcc_cli(compile_opts) + + # Hash build configuration. + md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n') + md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n') + md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n') + + # Compile if not already compiled. + bin_file_ext = '.dll' if os.name == 'nt' else '.so' + cuda_cache_path = make_cache_dir_path() + bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext) + if not os.path.isfile(bin_file): + if verbose: + print('Compiling... ', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext) + _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)) + os.makedirs(cuda_cache_path, exist_ok=True) + intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext) + shutil.copyfile(tmp_file, intermediate_file) + os.rename(intermediate_file, bin_file) # atomic + + # Load. + if verbose: + print('Loading... ', end='', flush=True) + plugin = tf.load_op_library(bin_file) + + # Add to cache. + _plugin_cache[cuda_file] = plugin + if verbose: + print('Done.', flush=True) + return plugin + + except: + if verbose: + print('Failed!', flush=True) + raise + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu new file mode 100755 index 0000000000000000000000000000000000000000..122cc02700c7b8eeda56736eb1a27f8f5104051b --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +// TF-specific helpers. + +#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0) +#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0) + +// Cuda kernels and CPP all together. What an absolute compilation unit. + +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#include "../common/framework.h" +#include "../common/glutil.cpp" + +#include "../common/common.h" +#include "../common/common.cpp" + +#include "../common/rasterize.h" +#include "../common/rasterize.cpp" +#include "../common/rasterize.cu" +#include "tf_rasterize.cu" + +#include "../common/interpolate.cu" +#include "tf_interpolate.cu" + +#include "../common/texture.cpp" +#include "../common/texture.cu" +#include "tf_texture.cu" + +#include "../common/antialias.cu" +#include "tf_antialias.cu" diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu new file mode 100755 index 0000000000000000000000000000000000000000..4e5c9c6d4afa05489d6ff8179c7d32f8d8e92025 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct AntialiasFwdOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. 
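+        // Shapes enforced below: color [N, H, W, C], raster_out [N, H, W, 4],
+        // tri [T, 3], and pos either [N, V, 4] (instance mode) or [V, 4]
+        // (range mode), with a consistent minibatch size N across inputs.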
+ OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out")); + } + + // Get input pointers. + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensor. + Tensor* outputTensor = NULL; + TensorShape outputShape; + outputShape.AddDim(p.n); + outputShape.AddDim(p.height); + outputShape.AddDim(p.width); + outputShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor)); + p.output = outputTensor->flat().data(); + + // Allocate work buffer. One extra int4 for storing counters. + Tensor* workTensor = NULL; + TensorShape workShape; + workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 int for a maximum of two work items per pixel. + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor)); + p.workBuffer = (int4*)(workTensor->flat().data()); + + // Clear the work counters. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Kernel parameters. + void* args[] = {&p}; + + // (Re-)calculate opposite vertex hash. + if (!p.evHash || !p.tri_const) + { + if (p.allocTriangles < p.numTriangles) + { + p.allocTriangles = max(p.allocTriangles, 64); + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // (Re-)allocate memory for the hash. 
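+                // For example, a mesh with 100 triangles grows allocTriangles
+                // from the 64-entry minimum to 128 before reallocating below.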
+ OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash)); + OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4))); + LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles"; + } + + // Clear the hash and launch the mesh kernel to populate it. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4), stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } + + // Copy input to output as a baseline. + OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + // Choose launch parameters for the discontinuity finder kernel and launch. + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel. + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + + // Launch analysis kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasFwd") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Output ("output: float") + .Output ("work_buffer: int32") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct AntialiasGradOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + const Tensor& dy = ctx->input(4); + const Tensor& workBuffer = ctx->input(5); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. 
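+        // Unlike the forward op, the gradient op also takes dy and the
+        // work_buffer produced by the forward pass as inputs, and checks dy
+        // against the color tensor's dimensions below.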
+ OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out")); + } + + // Get input pointers. + p.dy = dy.flat().data(); + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + p.workBuffer = (int4*)(workBuffer.flat().data()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate color gradient output tensor. + Tensor* gradColor = NULL; + TensorShape gradColorShape; + gradColorShape.AddDim(p.n); + gradColorShape.AddDim(p.height); + gradColorShape.AddDim(p.width); + gradColorShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor)); + p.gradColor = gradColor->flat().data(); + + // Allocate position gradient output tensor. + Tensor* gradPos = NULL; + TensorShape gradPosShape; + if (p.instance_mode) + gradPosShape.AddDim(p.n); + gradPosShape.AddDim(p.numVertices); + gradPosShape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos)); + p.gradPos = gradPos->flat().data(); + + // Initialize all the stuff. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter. 
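+        // dL/dcolor starts as a copy of dy and dL/dpos starts at zero (next two
+        // calls); the gradient kernel then accumulates its corrections in place.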
+ OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Launch the gradient kernel. + void* args[] = {&p}; + + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasGrad") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("work_buffer: int32") + .Output ("grad_color: float") + .Output ("grad_pos: float"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu new file mode 100755 index 0000000000000000000000000000000000000000..612ce1afc5ce41a25496523b193725c1edac64de --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu @@ -0,0 +1,301 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA) +{ + if (enableDA) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all)); + if (!p.diff_attrs_all) + { + std::vector diff_attrs_vec; + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec)); + OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list")); + OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)")); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } + } +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. 
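+// The same templated op implements both registered variants: InterpolateFwd
+// (no pixel differentials) and InterpolateFwdDa (barycentric pixel
+// differentials enabled via the ENABLE_DA template flag).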
+ +template +struct InterpolateFwdOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0; + + // Sanity checks. + OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + if (p.instance_mode) + OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db")); + OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. + p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0; + p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0; + + // Allocate main output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.depth); + out_shape.AddDim(p.height); + out_shape.AddDim(p.width); + out_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Allocate pixel differential output tensor. + Tensor* out_da_tensor = NULL; + out_shape.set_dim(3, p.numDiffAttr * 2); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor)); + p.outDA = ENABLE_DA ? out_da_tensor->flat().data() : 0; + + // Verify that buffers are aligned to allow float2/float4 operations. 
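+        // (ptr & 15) == 0 means 16-byte alignment (safe for float4 loads) and
+        // (ptr & 7) == 0 means 8-byte alignment (safe for float2), which is
+        // what the checks below assert.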
+ OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + if (ENABLE_DA) + OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateFwd") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Output ("out: float") + .Output ("out_da: float"); + +REGISTER_OP("InterpolateFwdDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("rast_db: float") + .Output ("out: float") + .Output ("out_da: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + +REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct InterpolateGradOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3); + const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1; + + // Sanity checks. 
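+        // dy is the incoming gradient of the interpolated output, so its last
+        // dimension must match the attribute count and its spatial size must
+        // match rast; dda (when ENABLE_DA) must likewise match spatially.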
+ OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]")); + OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr")); + OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]")); + OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. + p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.dy = dy.flat().data(); + p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0; + p.dda = ENABLE_DA ? dda.flat().data() : 0; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate attribute gradient output tensor. + Tensor* grad_attr_tensor = NULL; + TensorShape grad_attr_shape; + if (p.instance_mode) + grad_attr_shape.AddDim(attr_depth); + grad_attr_shape.AddDim(p.numVertices); + grad_attr_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor)); + p.gradAttr = grad_attr_tensor->flat().data(); + + // Allocate bary gradient output tensor. + Tensor* grad_rast_tensor = NULL; + TensorShape grad_rast_shape; + grad_rast_shape.AddDim(p.depth); + grad_rast_shape.AddDim(p.height); + grad_rast_shape.AddDim(p.width); + grad_rast_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor)); + p.gradRaster = grad_rast_tensor->flat().data(); + + // Allocate bary pixel diff gradient output tensor. + if (ENABLE_DA) + { + Tensor* grad_rast_db_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor)); + p.gradRasterDB = grad_rast_db_tensor->flat().data(); + } + + // Clear attribute gradients. + cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4")); + } + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateGrad") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + ; + +REGISTER_OP("InterpolateGradDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("rast_db: float") + .Input ("dda: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + .Output ("grad_rast_db: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + ; + +REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu new file mode 100755 index 0000000000000000000000000000000000000000..bc9d0714e5b9e5f172dc4985d3ead48c65117271 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu @@ -0,0 +1,241 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct RasterizeFwdOp : public OpKernel +{ + RasterizeGLState m_glState; // OpenGL-related persistent state. + int m_tri_const; // 1 if triangle array is known to be constant. + + RasterizeFwdOp(OpKernelConstruction* ctx): + OpKernel(ctx) + { + memset(&m_glState, 0, sizeof(RasterizeGLState)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + cudaStream_t stream = ctx->eigen_device().stream(); + + // Check that input shapes are correct. 
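+        // In instance mode pos is [N, V, 4] clip-space positions; in range mode
+        // pos is a flat [V, 4] buffer paired with an [R, 2] ranges tensor.
+        // resolution is a host-side int32 pair interpreted as [height, width].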
+ const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& resolution = ctx->input(2); + const Tensor& ranges = ctx->input(3); + + // Determine number of outputs + int num_outputs = m_glState.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.dims() > 2; + if (instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]")); + } + + // Get output shape. + const int32_t* res_in = resolution.flat().data(); // This is in CPU memory. + int height = res_in[0]; + int width = res_in[1]; + int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0); + OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]")); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1); + int triCount = 3 * tri.dim_size(0); + + // Init context and GL? + bool initCtx = !m_glState.glFBO; + if (initCtx) + { + const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info(); + int cudaDeviceIdx = g ? g->gpu_id : -1; + rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp + } + else + setGLContext(m_glState.glctx); // (Re-)Activate GL context. + + // Resize all buffers. + rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp + + // Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround. + if (initCtx) + { + // On first execution, do a bonus context swap. + releaseGLContext(); + setGLContext(m_glState.glctx); + } + + // Copy input data to GL and render. + const float* posPtr = pos.flat().data(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat().data(); // This is in CPU memory. + const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat().data() : NULL; // Copy triangles only if needed. + int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0; + rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1); + + // Allocate output tensors. + TensorShape output_shape; + output_shape.AddDim(depth); + output_shape.AddDim(height); + output_shape.AddDim(width); + output_shape.AddDim(4); + float* outputPtr[2]; + for (int i=0; i < 2; i++) + { + if (i >= num_outputs) + output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor. 
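+            // Note: both declared outputs are always allocated; when enable_db
+            // is off the second one is just given zero channels here.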
+ Tensor* output_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor)); + if (i < num_outputs) + outputPtr[i] = output_tensor->flat().data(); + } + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth); + + // Done. Release GL context. + releaseGLContext(); + } +}; + +REGISTER_OP("RasterizeFwd") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("resolution: int32") + .Input ("ranges: int32") + .Output ("out: float") + .Output ("out_db: float") + .Attr ("enable_db: int") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct RasterizeGradOp : public OpKernel +{ + RasterizeGradParams m_attribs; + + RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + RasterizeGradParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Input tensors. + const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& out = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3); + + // Determine instance mode. + p.instance_mode = (pos.dims() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4")); + p.depth = out.dim_size(0); + p.height = out.dim_size(1); + p.width = out.dim_size(2); + OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]")); + + // Check other shapes. + if (p.instance_mode) + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]")); + else + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]")); + if (ENABLE_DB) + OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]")); + + // Populate parameters. + p.numTriangles = tri.dim_size(0); + p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0); + p.pos = pos.flat().data(); + p.tri = tri.flat().data(); + p.out = out.flat().data(); + p.dy = dy.flat().data(); + p.ddb = ENABLE_DB ? ddb.flat().data() : 0; + + // Set up pixel position to clip space x, y transform. 
+ p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + Tensor* grad_tensor = NULL; + TensorShape grad_shape; + if (p.instance_mode) + grad_shape.AddDim(p.depth); + grad_shape.AddDim(p.numVertices); + grad_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor)); + p.grad = grad_tensor->flat().data(); + + // Clear the output buffers. + size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float); + cudaMemsetAsync(p.grad, 0, gradBytes, stream); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2")); + if (ENABLE_DB) + OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("RasterizeGrad") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Output ("grad: float"); + +REGISTER_OP("RasterizeGradDb") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Input ("ddb: float") + .Output ("grad: float"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp); +REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu new file mode 100755 index 0000000000000000000000000000000000000000..c5382fed28236da09d20a04c0524a937383daf5a --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu @@ -0,0 +1,525 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p) +{ + // Mip and filter modes. + OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode)); + OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported")); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. 
+ if (p.enableMip) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit)); + OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level")); + ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op. + } + + // Boundary mode. + OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode)); + OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported")); +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct TextureFwdOp : public OpKernel +{ + TextureKernelParams m_attribs; + PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled. + bool m_persistentMipTensorInitialized; + + TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + m_persistentMipTensorInitialized = false; + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. 
+ if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + + // Allocate output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.n); + out_shape.AddDim(p.imgHeight); + out_shape.AddDim(p.imgWidth); + out_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Mip output tensor. + Tensor* mip_tensor = NULL; + TensorShape mip_shape; + mip_shape.AddDim(mipTotal); + + // If texture is constant, calculate mip stack only once. + bool computeMip = true; + if (p.texConst) + { + // First execution? + if (!m_persistentMipTensorInitialized) + { + // Allocate a persistent mip tensor. + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor)); + m_persistentMipTensorInitialized = true; + } + else + { + // Reuse the persistent tensor, do not recompute mip levels. + mip_tensor = m_persistentMipTensor.AccessTensor(ctx); + computeMip = false; + } + + // Set as output tensor as well. + ctx->set_output(1, *mip_tensor); + } + else + { + // Allocate an output tensor as usual. 
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor)); + } + + pmip = mip_tensor->flat().data(); // Pointer to data. + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Build mip levels if needed. + if (computeMip) + { + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2")); + } + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + else + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, and datatype. + void* func_tbl[TEX_MODE_COUNT * 3 * 2] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + }; + + // Function index. 
+ int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + func_idx = func_idx * 3 + channel_div_idx; + + // Launch kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("TextureFwd") + .Input ("tex: float") + .Input ("uv: float") + .Output ("out: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureFwdMip") + .Input ("tex: float") + .Input ("uv: float") + .Input ("uv_da: float") + .Output ("out: float") + .Output ("mip: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("tex_const: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); +REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct TextureGradOp : public OpKernel +{ + TextureKernelParams m_attribs; + + TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& dy = ctx->input(2); + const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2); + const Tensor& mip = ctx->input(p.enableMip ? 4 : 2); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. 
+ if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.dy = dy.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + float* pmip = p.enableMip ? (float*)mip.flat().data() : 0; + + // Allocate output tensor for tex gradient. + Tensor* grad_tex_tensor = NULL; + TensorShape grad_tex_shape; + grad_tex_shape.AddDim(p.texDepth); + if (cube_mode) + grad_tex_shape.AddDim(6); + grad_tex_shape.AddDim(p.texHeight); + grad_tex_shape.AddDim(p.texWidth); + grad_tex_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor)); + p.gradTex[0] = grad_tex_tensor->flat().data(); + + // Allocate output tensor for uv gradient. + if (p.filterMode != TEX_MODE_NEAREST) + { + TensorShape grad_uv_shape; + Tensor* grad_uv_tensor = NULL; + grad_uv_shape.AddDim(p.n); + grad_uv_shape.AddDim(p.imgHeight); + grad_uv_shape.AddDim(p.imgWidth); + grad_uv_shape.AddDim(uv.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor)); + p.gradUV = grad_uv_tensor->flat().data(); + + // Allocate output tensor for uv_da gradient. + if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + Tensor* grad_uv_da_tensor = NULL; + grad_uv_shape.set_dim(3, uv_da.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor)); + p.gradUVDA = grad_uv_da_tensor->flat().data(); + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. 
+ else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + Tensor grad_mip_tensor; + float* pgradMip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Get space for temporary mip gradients. + TensorShape grad_mip_shape; + grad_mip_shape.AddDim(mipTotal); + ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor); + pgradMip = grad_mip_tensor.flat().data(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + + // Clear mip gradients. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream)); + } + + // Initialize texture gradients to zero. + int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float); + if (cube_mode) + texBytes *= 6; + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4")); + } + else + { + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2")); + } + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2")); + } + + // Choose launch parameters for main gradient kernel. 
+ void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + + // Launch main gradient kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. + if (p.enableMip) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + } +}; + +REGISTER_OP("TextureGradNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinearMipmapNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_OP("TextureGradLinearMipmapLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Output ("grad_uv_da: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2f8ae0f25516d8445f8f745230a3a383f5b05b52 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. +from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash +__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"] diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/ops.py b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/ops.py new file mode 100755 index 0000000000000000000000000000000000000000..be603675873437b01fd0976588a6020d7086fd9b --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/ops.py @@ -0,0 +1,640 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import logging +import numpy as np +import os +import sys +import torch +import torch.utils.cpp_extension + +#---------------------------------------------------------------------------- +# C++/Cuda plugin compiler/loader. + +_cached_plugin = None +def _get_plugin(): + # Return cached plugin if already loaded. + global _cached_plugin + if _cached_plugin is not None: + return _cached_plugin + + # Make sure we can find the necessary compiler and libary binaries. + if os.name == 'nt': + lib_dir = os.path.dirname(__file__) + r"\..\lib" + def find_cl_path(): + import glob + for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']: + paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True) + if paths: + return paths[0] + + # If cl.exe is not on path, try to find it. + if os.system("where cl.exe >nul 2>nul") != 0: + cl_path = find_cl_path() + if cl_path is None: + raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + os.environ['PATH'] += ';' + cl_path + + # Compiler options. + opts = ['-DNVDR_TORCH'] + + # Linker options. + if os.name == 'posix': + ldflags = ['-lGL', '-lEGL'] + elif os.name == 'nt': + libs = ['gdi32', 'opengl32', 'user32', 'setgpu'] + ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs] + + # List of source files. + source_files = [ + '../common/common.cpp', + '../common/glutil.cpp', + '../common/rasterize.cu', + '../common/rasterize.cpp', + '../common/interpolate.cu', + '../common/texture.cu', + '../common/texture.cpp', + '../common/antialias.cu', + 'torch_bindings.cpp', + 'torch_rasterize.cpp', + 'torch_interpolate.cpp', + 'torch_texture.cpp', + 'torch_antialias.cpp', + ] + + # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine. + os.environ['TORCH_CUDA_ARCH_LIST'] = '' + + # Try to detect if a stray lock file is left in cache directory and show a warning. 
This sometimes happens on Windows if the build is interrupted at just the right moment. + plugin_name = 'nvdiffrast_plugin' + try: + lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock') + if os.path.exists(lock_fn): + logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn) + except: + pass + + # Compile and load. + source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files] + torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False) + + # Import, cache, and return the compiled module. + import nvdiffrast_plugin + _cached_plugin = nvdiffrast_plugin + return _cached_plugin + +#---------------------------------------------------------------------------- +# Log level. +#---------------------------------------------------------------------------- + +def get_log_level(): + '''Get current log level. + + Returns: + Current log level in nvdiffrast. See `set_log_level()` for possible values. + ''' + return _get_plugin().get_log_level() + +def set_log_level(level): + '''Set log level. + + Log levels follow the convention on the C++ side of Torch: + 0 = Info, + 1 = Warning, + 2 = Error, + 3 = Fatal. + The default log level is 1. + + Args: + level: New log level as integer. Internal nvdiffrast messages of this + severity or higher will be printed, while messages of lower + severity will be silent. + ''' + _get_plugin().set_log_level(level) + +#---------------------------------------------------------------------------- +# GL State wrapper. +#---------------------------------------------------------------------------- + +class RasterizeGLContext: + def __init__(self, output_db=True, mode='automatic', device=None): + '''Create a new OpenGL rasterizer context. + + Creating an OpenGL context is a slow operation so you should reuse the same + context in all calls to `rasterize()` on the same CPU thread. The OpenGL context + is deleted when the object is destroyed. + + Args: + output_db (bool): Compute and output image-space derivates of barycentrics. + mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'. + device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created OpenGL rasterizer context. + ''' + assert output_db is True or output_db is False + assert mode in ['automatic', 'manual'] + self.output_db = output_db + self.mode = mode + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx) + self.active_depth_peeler = None # For error checking only + + def set_context(self): + '''Set (activate) OpenGL context in the current CPU thread. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.set_context() + + def release_context(self): + '''Release (deactivate) currently active OpenGL context. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.release_context() + +#---------------------------------------------------------------------------- +# Rasterize. 
+#----------------------------------------------------------------------------
+
+class _rasterize_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
+        out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
+        ctx.save_for_backward(pos, tri, out)
+        ctx.saved_grad_db = grad_db
+        return out, out_db
+
+    @staticmethod
+    def backward(ctx, dy, ddb):
+        pos, tri, out = ctx.saved_variables
+        if ctx.saved_grad_db:
+            g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
+        else:
+            g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy)
+        return None, g_pos, None, None, None, None, None
+
+# Op wrapper.
+def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
+    '''Rasterize triangles.
+
+    All input tensors must be contiguous and reside in GPU memory except for
+    the `ranges` tensor that, if specified, has to reside in CPU memory. The
+    output tensors will be contiguous and reside in GPU memory.
+
+    Args:
+        glctx: OpenGL context of type `RasterizeGLContext`.
+        pos: Vertex position tensor with dtype `torch.float32`. To enable range
+             mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
+             instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
+        tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`.
+        resolution: Output resolution as integer tuple (height, width).
+        ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype
+                `torch.int32`, specifying start indices and counts into `tri`.
+                Ignored in instanced mode.
+        grad_db: Propagate gradients of image-space derivatives of barycentrics
+                 into `pos` in backward pass. Ignored if OpenGL context was
+                 not configured to output image-space derivatives.
+
+    Returns:
+        A tuple of two tensors. The first output tensor has shape [minibatch_size,
+        height, width, 4] and contains the main rasterizer output in order (u, v, z/w,
+        triangle_id). If the OpenGL context was configured to output image-space
+        derivatives of barycentrics, the second output tensor will also have shape
+        [minibatch_size, height, width, 4] and contain said derivatives in order
+        (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
+        [minibatch_size, height, width, 0].
+    '''
+    assert isinstance(glctx, RasterizeGLContext)
+    assert grad_db is True or grad_db is False
+    grad_db = grad_db and glctx.output_db
+
+    # Sanitize inputs.
+    assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor)
+    resolution = tuple(resolution)
+    if ranges is None:
+        ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu')
+    else:
+        assert isinstance(ranges, torch.Tensor)
+
+    # Check that context is not currently reserved for depth peeling.
+    if glctx.active_depth_peeler is not None:
+        raise RuntimeError("Cannot call rasterize() during depth peeling operation, use rasterize_next_layer() instead")
+
+    # Instantiate the function.
+    return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db, -1)
+
+#----------------------------------------------------------------------------
+# Depth peeler context manager for rasterizing multiple depth layers.
+#----------------------------------------------------------------------------
+
+class DepthPeeler:
+    def __init__(self, glctx, pos, tri, resolution, ranges=None, grad_db=True):
+        '''Create a depth peeler object for rasterizing multiple depth layers.
+
+        Arguments are the same as in `rasterize()`.
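Editor's sketch (not part of the patch above): minimal use of the `rasterize()` wrapper, assuming the vendored package is importable as `nvdiffrast.torch` and a CUDA GPU with OpenGL support is available.

import torch
import nvdiffrast.torch as dr

glctx = dr.RasterizeGLContext(output_db=True)    # creating a context is slow; reuse it on this CPU thread
# One triangle in clip space, instanced mode: pos is [minibatch_size, num_vertices, 4].
pos = torch.tensor([[[-0.5, -0.5, 0.0, 1.0],
                     [ 0.5, -0.5, 0.0, 1.0],
                     [ 0.0,  0.5, 0.0, 1.0]]], dtype=torch.float32, device='cuda')
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda')
rast, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
# rast: [1, 256, 256, 4] holding (u, v, z/w, triangle_id); rast_db holds barycentric derivatives.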
+ + Returns: + The newly created depth peeler. + ''' + assert isinstance(glctx, RasterizeGLContext) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs as usual. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Store all the parameters. + self.glctx = glctx + self.pos = pos + self.tri = tri + self.resolution = resolution + self.ranges = ranges + self.grad_db = grad_db + self.peeling_idx = None + + def __enter__(self): + if self.glctx is None: + raise RuntimeError("Cannot re-enter a terminated depth peeling operation") + if self.glctx.active_depth_peeler is not None: + raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a RasterizeGLContext") + self.glctx.active_depth_peeler = self + self.peeling_idx = 0 + return self + + def __exit__(self, *args): + assert self.glctx.active_depth_peeler is self + self.glctx.active_depth_peeler = None + self.glctx = None # Remove all references to input tensor so they're not left dangling. + self.pos = None + self.tri = None + self.resolution = None + self.ranges = None + self.grad_db = None + self.peeling_idx = None + return None + + def rasterize_next_layer(self): + '''Rasterize next depth layer. + + Operation is equivalent to `rasterize()` except that previously reported + surface points are culled away. + + Returns: + A tuple of two tensors as in `rasterize()`. + ''' + assert self.glctx.active_depth_peeler is self + assert self.peeling_idx >= 0 + result = _rasterize_func.apply(self.glctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx) + self.peeling_idx += 1 + return result + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +# Output pixel differentials for at least some attributes. +class _interpolate_func_da(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list): + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + ctx.save_for_backward(attr, rast, tri, rast_db) + ctx.saved_misc = diff_attrs_all, diff_attrs_list + return out, out_da + + @staticmethod + def backward(ctx, dy, dda): + attr, rast, tri, rast_db = ctx.saved_variables + diff_attrs_all, diff_attrs_list = ctx.saved_misc + g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return g_attr, g_rast, None, g_rast_db, None, None + +# No pixel differential for any attribute. +class _interpolate_func(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + ctx.save_for_backward(attr, rast, tri) + return out, out_da + + @staticmethod + def backward(ctx, dy, _): + attr, rast, tri = ctx.saved_variables + g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy) + return g_attr, g_rast, None + +# Op wrapper. +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + """Interpolate vertex attributes. + + All input tensors must be contiguous and reside in GPU memory. The output tensors + will be contiguous and reside in GPU memory. 
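Editor's sketch (not part of the patch): the depth peeler defined above, reusing `glctx`, `pos`, and `tri` from the rasterize example.

layers = []
with dr.DepthPeeler(glctx, pos, tri, resolution=[256, 256]) as peeler:
    for _ in range(2):                            # extract the two nearest depth layers
        rast_layer, _ = peeler.rasterize_next_layer()
        layers.append(rast_layer)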
+ + Args: + attr: Attribute tensor with dtype `torch.float32`. + Shape is [num_vertices, num_attributes] in range mode, or + [minibatch_size, num_vertices, num_attributes] in instanced mode. + Broadcasting is supported along the minibatch axis. + rast: Main output tensor from `rasterize()`. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. + rast_db: (Optional) Tensor containing image-space derivatives of barycentrics, + i.e., the second output tensor from `rasterize()`. Enables computing + image-space derivatives of attributes. + diff_attrs: (Optional) List of attribute indices for which image-space + derivatives are to be computed. Special value 'all' is equivalent + to list [0, 1, ..., num_attributes - 1]. + + Returns: + A tuple of two tensors. The first output tensor contains interpolated + attributes and has shape [minibatch_size, height, width, num_attributes]. + If `rast_db` and `diff_attrs` were specified, the second output tensor contains + the image-space derivatives of the selected attributes and has shape + [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the + first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. + Otherwise, the second output tensor will be an empty tensor with shape + [minibatch_size, height, width, 0]. + """ + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = np.asarray(diff_attrs, np.int32) + assert len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri)) + if diff_attrs: + assert isinstance(rast_db, torch.Tensor) + + # Choose stub. + if diff_attrs: + return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + else: + return _interpolate_func.apply(attr, rast, tri) + +#---------------------------------------------------------------------------- +# Texture +#---------------------------------------------------------------------------- + +# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled. 
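Editor's sketch (not part of the patch): interpolating made-up per-vertex attributes using `rast` and `rast_db` from the rasterize example above.

# Per-vertex RGBA, instanced mode: [minibatch_size, num_vertices, num_attributes].
attr = torch.rand(1, 3, 4, dtype=torch.float32, device='cuda')
color, _ = dr.interpolate(attr, rast, tri)        # color: [1, 256, 256, 4]
# Also request image-space derivatives of attributes 0 and 1.
color2, attr_da = dr.interpolate(attr, rast, tri, rast_db=rast_db, diff_attrs=[0, 1])
# attr_da: [1, 256, 256, 4] with (dA0/dX, dA0/dY, dA1/dX, dA1/dY).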
+class _texture_func_mip(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack): + empty = torch.tensor([]) + if uv_da is None: + uv_da = empty + if mip_level_bias is None: + mip_level_bias = empty + if mip_wrapper is None: + mip_wrapper = _get_plugin().TextureMipWrapper() + out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv, uv_da, mip_level_bias, *mip_stack) + ctx.saved_misc = filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv, uv_da, mip_level_bias, *mip_stack = ctx.saved_variables + filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear-mipmap-linear': + g_tex, g_uv, g_uv_da, g_mip_level_bias, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None) + tuple(g_mip_stack) + else: # linear-mipmap-nearest + g_tex, g_uv, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, None, None, None, None, None) + tuple(g_mip_stack) + +# Linear and nearest: Mipmaps disabled. +class _texture_func(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv) + ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv = ctx.saved_variables + filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear': + g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, g_uv, None, None + else: # nearest + g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, None, None, None + +# Op wrapper. +def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None): + """Perform texture sampling. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape + [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, + must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where + tex_width and tex_height are equal. Note that `boundary_mode` must also be set + to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis. + uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture, + must have shape [minibatch_size, height, width, 2]. When sampling a cube map + texture, must have shape [minibatch_size, height, width, 3]. + uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates. + Must have same shape as `uv` except for the last dimension that is to be twice + as long. 
+ mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted, + determines mip level directly. Must have shape [minibatch_size, height, width]. + mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list + of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, + the tensors in the list must follow the same format as `tex` except for width and + height that must follow the usual rules for mipmap sizes. The base level texture + is still supplied in `tex` and must not be included in the list. Gradients of a + custom mipmap stack are not automatically propagated to base texture but the mipmap + tensors will receive gradients of their own. If a mipmap stack is not specified + but the chosen filter mode requires it, the mipmap stack is constructed internally + and discarded afterwards. + filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest', + 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' + selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and + 'linear-mipmap-linear' when at least one of them is specified, these being + the highest-quality modes possible depending on the availability of the + image-space derivatives of the texture coordinates or direct mip level information. + boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a + cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional + part of texture coordinates. Mode 'clamp' clamps texture coordinates to the + centers of the boundary texels. Mode 'zero' virtually extends the texture with + all-zero values in all directions. + max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based + filter modes. + + Returns: + A tensor containing the results of the texture sampling with shape + [minibatch_size, height, width, tex_channels]. + """ + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear' + + # Sanitize inputs. + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Check inputs. + assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor) + if 'mipmap' in filter_mode: + assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor) + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Construct a mipmap if necessary. + if 'mipmap' in filter_mode: + mip_wrapper, mip_stack = None, [] + if mip is not None: + assert isinstance(mip, (_get_plugin().TextureMipWrapper, list)) + if isinstance(mip, list): + assert all(isinstance(x, torch.Tensor) for x in mip) + mip_stack = mip + else: + mip_wrapper = mip + else: + mip_wrapper = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube') + + # Choose stub. 
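Editor's sketch (not part of the patch): texture sampling with and without mipmapping. `texc` ([1, 256, 256, 2]) and `texd` ([1, 256, 256, 4]) are assumed to come from an `interpolate(uv, rast, tri, rast_db=rast_db, diff_attrs='all')` call on a 2-channel uv attribute.

tex = torch.rand(1, 64, 64, 3, dtype=torch.float32, device='cuda')   # [minibatch_size, tex_height, tex_width, channels]
plain  = dr.texture(tex, texc, filter_mode='linear')                  # bilinear lookup, no mipmaps
mipped = dr.texture(tex, texc, uv_da=texd, filter_mode='linear-mipmap-linear', max_mip_level=4)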
+ if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest': + return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack) + else: + return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum) + +# Mipmap precalculation for cases where the texture stays constant. +def texture_construct_mip(tex, max_mip_level=None, cube_mode=False): + """Construct a mipmap stack for a texture. + + This function can be used for constructing a mipmap stack for a texture that is known to remain + constant. This avoids reconstructing it every time `texture()` is called. + + Args: + tex: Texture tensor with the same constraints as in `texture()`. + max_mip_level: If specified, limits the number of mipmaps constructed. + cube_mode: Must be set to True if `tex` specifies a cube map texture. + + Returns: + An opaque object containing the mipmap stack. This can be supplied in a call to `texture()` + in the `mip` argument. + """ + + assert isinstance(tex, torch.Tensor) + assert cube_mode is True or cube_mode is False + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +class _antialias_func(torch.autograd.Function): + @staticmethod + def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost): + out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash) + ctx.save_for_backward(color, rast, pos, tri) + ctx.saved_misc = pos_gradient_boost, work_buffer + return out + + @staticmethod + def backward(ctx, dy): + color, rast, pos, tri = ctx.saved_variables + pos_gradient_boost, work_buffer = ctx.saved_misc + g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + g_pos = g_pos * pos_gradient_boost + return g_color, None, g_pos, None, None, None + +# Op wrapper. +def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + """Perform antialiasing. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + color: Input image to antialias with shape [minibatch_size, height, width, num_channels]. + rast: Main output tensor from `rasterize()`. + pos: Vertex position tensor used in the rasterization operation. + tri: Triangle tensor used in the rasterization operation. + topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. If not + specified, the topology hash is constructed internally and discarded afterwards. + pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`. + + Returns: + A tensor containing the antialiased image with the same shape as `color` input tensor. + """ + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri)) + + # Construct topology hash unless provided by user. + if topology_hash is not None: + assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper) + else: + topology_hash = _get_plugin().antialias_construct_topology_hash(tri) + + # Instantiate the function. 
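Editor's sketch (not part of the patch): combining the precomputation helpers with `antialias()`, reusing `tex`, `texc`, `texd`, `rast`, `pos`, and `tri` from the examples above; `antialias_construct_topology_hash()` is the helper defined just below.

mip  = dr.texture_construct_mip(tex, max_mip_level=4)          # compute once while tex stays constant
topo = dr.antialias_construct_topology_hash(tri)               # compute once while tri stays constant
shaded = dr.texture(tex, texc, uv_da=texd, mip=mip, filter_mode='linear-mipmap-linear')
aa = dr.antialias(shaded, rast, pos, tri, topology_hash=topo)  # same shape as shaded: [1, 256, 256, 3]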
+ return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost) + +# Topology hash precalculation for cases where the triangle array stays constant. +def antialias_construct_topology_hash(tri): + """Construct a topology hash for a triangle tensor. + + This function can be used for constructing a topology hash for a triangle tensor that is + known to remain constant. This avoids reconstructing it every time `antialias()` is called. + + Args: + tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in + GPU memory. + + Returns: + An opaque object containing the topology hash. This can be supplied in a call to + `antialias()` in the `topology_hash` argument. + """ + assert isinstance(tri, torch.Tensor) + return _get_plugin().antialias_construct_topology_hash(tri) + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a926adc7dc68eb30811de6a3571a0a545c7b2a20 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp @@ -0,0 +1,239 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/antialias.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void AntialiasFwdMeshKernel (const AntialiasKernelParams p); +void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p); +void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p); +void AntialiasGradKernel (const AntialiasKernelParams p); + +//------------------------------------------------------------------------ +// Topology hash construction. + +TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tri)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + + // Check inputs. + NVDR_CHECK_DEVICE(tri); + NVDR_CHECK_CONTIGUOUS(tri); + NVDR_CHECK_I32(tri); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Fill in kernel parameters. + p.numTriangles = tri.size(0); + p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check. + p.tri = tri.data_ptr(); + + // Kernel parameters. + p.allocTriangles = p.allocTriangles < 64 ? 64 : p.allocTriangles; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Construct the hash tensor and get pointer. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + torch::Tensor ev_hash = torch::zeros({p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * 4}, opts); + p.evHash = (uint4*)(ev_hash.data_ptr()); + + // Check alignment. 
+ NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4"); + + // Populate the hash. + void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return. + TopologyHashWrapper hash_wrap; + hash_wrap.ev_hash = ev_hash; + return hash_wrap; +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap. + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash); + NVDR_CHECK_F32(color, rast, pos); + NVDR_CHECK_I32(tri, topology_hash); + + // Sanity checks. + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.evHash = (uint4*)(topology_hash.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE); + + // Allocate output tensors. + torch::Tensor out = color.detach().clone(); // Use color as base. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel. + p.output = out.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Clear the work counters. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4"); + + // Choose launch parameters for the discontinuity finder kernel and launch. + void* args[] = {&p}; + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel and launch. + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. + return std::tuple(out, work_buffer); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer); + NVDR_CHECK_F32(color, rast, pos, dy, work_buffer); + NVDR_CHECK_I32(tri); + + // Sanity checks. 
+ NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions"); + NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) ==color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Ensure dy is contiguous. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.dy = dy_.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensors. + torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base. + torch::Tensor grad_pos = torch::zeros_like(pos); + p.gradColor = grad_color.data_ptr(); + p.gradPos = grad_pos.data_ptr(); + + // Clear gradient kernel work counter. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + + // Determine optimum block size for the gradient kernel and launch. 
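+    // (Persistent launch: the occupancy query below gives the number of resident blocks per SM,
+    //  and numCTA * numSM blocks are launched so the whole GPU stays busy consuming the work
+    //  items recorded by the forward pass.)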
+    void* args[] = {&p};
+    int device = 0;
+    int numCTA = 0;
+    int numSM = 0;
+    NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
+    NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
+    NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
+    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+
+    // Return results.
+    return std::tuple<torch::Tensor, torch::Tensor>(grad_color, grad_pos);
+}
+
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..ed0ae0645a5ed82e4a0760c3e3a5f92aea8f85e6
--- /dev/null
+++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "torch_common.inl"
+#include "torch_types.h"
+#include <tuple>
+
+//------------------------------------------------------------------------
+// Op prototypes. Return type macros for readability.
+
+#define OP_RETURN_T     torch::Tensor
+#define OP_RETURN_TT    std::tuple<torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTT   std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTTT  std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTV   std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
+#define OP_RETURN_TTTTV std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
+
+OP_RETURN_TT        rasterize_fwd                      (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int depth_idx);
+OP_RETURN_T         rasterize_grad                     (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
+OP_RETURN_T         rasterize_grad_db                  (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
+OP_RETURN_TT        interpolate_fwd                    (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
+OP_RETURN_TT        interpolate_fwd_da                 (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
+OP_RETURN_TT        interpolate_grad                   (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy);
+OP_RETURN_TTT       interpolate_grad_da                (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
+TextureMipWrapper   texture_construct_mip              (torch::Tensor tex, int max_mip_level, bool cube_mode);
+OP_RETURN_T         texture_fwd                        (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
+OP_RETURN_T         texture_fwd_mip                    (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+OP_RETURN_T         texture_grad_nearest               (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
+OP_RETURN_TT        texture_grad_linear                (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
+OP_RETURN_TTV       texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+OP_RETURN_TTTTV     texture_grad_linear_mipmap_linear  (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+TopologyHashWrapper antialias_construct_topology_hash  (torch::Tensor tri);
+OP_RETURN_TT        antialias_fwd                      (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
+OP_RETURN_TT        antialias_grad                     (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // State classes.
+    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
+        .def("set_context",     &RasterizeGLStateWrapper::setContext)
+        .def("release_context", &RasterizeGLStateWrapper::releaseContext);
+    pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper").def(pybind11::init<>());
+    pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
+
+    // Plumbing to torch/c10 logging system.
+    m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level");
+    m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");
+
+    // Ops.
+    m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
+    m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
+    m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
+    m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
+    m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op without attribute derivatives");
+    m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
+    m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
+    m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
+    m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
+    m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
+    m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
+    m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
+    m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
+    m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode");
+    m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction");
+    m.def("antialias_fwd", &antialias_fwd, "antialias forward op");
+    m.def("antialias_grad", &antialias_grad, "antialias gradient op");
+}
+
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl
new file mode 100755
index 0000000000000000000000000000000000000000..74dea41528822294878d9ee5d36d1230d1df7ae6
--- /dev/null
+++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl
@@ -0,0 +1,29 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "../common/framework.h"
+
+//------------------------------------------------------------------------
+// Input check helpers.
+//------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#define __func__ __FUNCTION__
+#endif
+
+#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
+#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
+#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
+#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
+#define NVDR_CHECK_I32(...) do { nvdr_check_i32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be int32 tensors"); } while(0)
+inline void nvdr_check_cpu(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.device().type() == c10::DeviceType::CPU, func, err_msg); }
+inline void nvdr_check_contiguous(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.is_contiguous(), func, err_msg); }
+inline void nvdr_check_f32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kFloat32, func, err_msg); }
+inline void nvdr_check_i32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kInt32, func, err_msg); }
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..b2c99fccfe0b11b71018e2c0ddcf637a337522b8
--- /dev/null
+++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp
@@ -0,0 +1,250 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "torch_common.inl"
+#include "../common/common.h"
+#include "../common/interpolate.h"
+
+//------------------------------------------------------------------------
+// Kernel prototypes.
+ +void InterpolateFwdKernel (const InterpolateKernelParams p); +void InterpolateFwdKernelDa (const InterpolateKernelParams p); +void InterpolateGradKernel (const InterpolateKernelParams p); +void InterpolateGradKernelDa(const InterpolateKernelParams p); + +//------------------------------------------------------------------------ +// Helper + +static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + if (diff_attrs_all) + { + p.numDiffAttr = p.numAttr; + p.diff_attrs_all = 1; + } + else + { + NVDR_CHECK(diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, "too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, rast_db); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, rast_db); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast); + NVDR_CHECK_I32(tri); + } + + // Sanity checks. + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK( tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + if (p.instance_mode) + NVDR_CHECK(attr.size(0) == rast.size(0) || attr.size(0) == 1, "minibatch size mismatch between inputs rast, attr"); + if (enable_da) + { + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0; + + // Allocate output tensors. 
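+    // (out_da below holds two values per differentiable attribute, the image-space derivatives
+    //  dA/dX and dA/dY, hence the `p.numDiffAttr * 2` channel count.)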
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts); + torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts); + + p.out = out.data_ptr(); + p.outDA = enable_da ? out_da.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.outDA & 7), "out_da output tensor not aligned to float2"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(out, out_da); +} + +// Version without derivatives. +std::tuple interpolate_fwd(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + return interpolate_fwd_da(attr, rast, tri, empty_tensor, false, empty_vec); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy, rast_db, dda); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, dy, rast_db, dda); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast, dy); + NVDR_CHECK_I32(tri); + } + + // Depth of attributes. + int attr_depth = p.instance_mode ? (attr.sizes().size() > 1 ? attr.size(0) : 0) : 1; + + // Sanity checks. 
+ NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) == rast.size(1) && dy.size(2) == rast.size(2) && dy.size(3) > 0, "dy must have shape [>0, height, width, >0]"); + NVDR_CHECK(dy.size(3) == attr.size(attr.sizes().size() - 1), "argument count mismatch between inputs dy, attr"); + NVDR_CHECK((attr_depth == rast.size(0) || attr_depth == 1) && dy.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, dy, attr"); + if (enable_da) + { + NVDR_CHECK(dda.sizes().size() == 4 && dda.size(0) > 0 && dda.size(1) == rast.size(1) && dda.size(2) == rast.size(2), "dda must have shape [>0, height, width, ?]"); + NVDR_CHECK(dda.size(0) == rast.size(0), "minibatch size mismatch between rast, dda"); + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor dda_; + if (enable_da) + dda_ = dda.contiguous(); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.dy = dy_.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.dda = enable_da ? dda_.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor gradAttr = torch::zeros_like(attr); + torch::Tensor gradRaster = torch::empty_like(rast); + torch::Tensor gradRasterDB; + if (enable_da) + gradRasterDB = torch::empty_like(rast_db); + + p.gradAttr = gradAttr.data_ptr(); + p.gradRaster = gradRaster.data_ptr(); + p.gradRasterDB = enable_da ? gradRasterDB.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dda & 7), "dda input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradRaster & 15), "grad_rast output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradRasterDB & 15), "grad_rast_db output tensor not aligned to float4"); + + // Choose launch parameters. 
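+    // (The gradient kernel accumulates per-vertex attribute gradients with atomic adds, since
+    //  multiple pixels touch the same vertex; this is why gradAttr above is zero-initialized
+    //  while gradRaster is written densely per pixel and may start uninitialized.)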
+ dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(gradAttr, gradRaster, gradRasterDB); +} + +// Version without derivatives. +std::tuple interpolate_grad(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + std::tuple result = interpolate_grad_da(attr, rast, tri, dy, empty_tensor, empty_tensor, false, empty_vec); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a52960347e04cfb84e1762b2d401106ce25ee609 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp @@ -0,0 +1,223 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void RasterizeGradKernel(const RasterizeGradParams p); +void RasterizeGradKernelDb(const RasterizeGradParams p); + +//------------------------------------------------------------------------ +// Python GL state wrapper methods. + +RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_) +{ + pState = new RasterizeGLState(); + automatic = automatic_; + cudaDeviceIdx = cudaDeviceIdx_; + memset(pState, 0, sizeof(RasterizeGLState)); + pState->enableDB = enableDB ? 1 : 0; + rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_); + releaseGLContext(); +} + +RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void) +{ + destroyGLContext(pState->glctx); + delete pState; +} + +void RasterizeGLStateWrapper::setContext(void) +{ + setGLContext(pState->glctx); +} + +void RasterizeGLStateWrapper::releaseContext(void) +{ + releaseGLContext(); +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGLState& s = *stateWrapper.pState; + + // Check inputs. 
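+    // (`ranges` is deliberately checked as a CPU tensor: in range mode its contents are read on
+    //  the host when setting up the draw ranges, while pos and tri must reside on the GPU.)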
+ NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that GL context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors"); + + // Determine number of outputs + int num_outputs = s.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height = std::get<0>(resolution); + int width = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); + NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]"); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1); + int triCount = 3 * tri.size(0); + + // Set the GL context unless manual context. + if (stateWrapper.automatic) + setGLContext(s.glctx); + + // Resize all buffers. + rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth); + + // Copy input data to GL and render. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. + const int32_t* triPtr = tri.data_ptr(); + int vtxPerInstance = instance_mode ? pos.size(1) : 0; + rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx); + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height, width, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts); + float* outputPtr[2]; + outputPtr[0] = out.data_ptr(); + outputPtr[1] = s.enableDB ? out_db.data_ptr() : NULL; + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth); + + // Done. Release GL context and return. + if (stateWrapper.automatic) + releaseGLContext(); + + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ +// Gradient op. + +torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGradParams p; + bool enable_db = ddb.defined(); + + // Check inputs. 
+ if (enable_db) + { + NVDR_CHECK_DEVICE(pos, tri, out, dy, ddb); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy, ddb); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(pos, tri, out, dy); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy); + NVDR_CHECK_I32(tri); + } + + // Determine instance mode. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + NVDR_CHECK(out.sizes().size() == 4, "tensor out must be rank-4"); + p.depth = out.size(0); + p.height = out.size(1); + p.width = out.size(2); + NVDR_CHECK(p.depth > 0 && p.height > 0 && p.width > 0, "resolution must be [>0, >0, >0]"); + + // Check other shapes. + if (p.instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) == p.depth && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [depth, >0, 4]"); + else + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(out.sizes().size() == 4 && out.size(0) == p.depth && out.size(1) == p.height && out.size(2) == p.width && out.size(3) == 4, "out must have shape [depth, height, width, 4]"); + NVDR_CHECK( dy.sizes().size() == 4 && dy.size(0) == p.depth && dy.size(1) == p.height && dy.size(2) == p.width && dy.size(3) == 4, "dy must have shape [depth, height, width, 4]"); + if (enable_db) + NVDR_CHECK(ddb.sizes().size() == 4 && ddb.size(0) == p.depth && ddb.size(1) == p.height && ddb.size(2) == p.width && ddb.size(3) == 4, "ddb must have shape [depth, height, width, 4]"); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor ddb_; + if (enable_db) + ddb_ = ddb.contiguous(); + + // Populate parameters. + p.numTriangles = tri.size(0); + p.numVertices = p.instance_mode ? pos.size(1) : pos.size(0); + p.pos = pos.data_ptr(); + p.tri = tri.data_ptr(); + p.out = out.data_ptr(); + p.dy = dy_.data_ptr(); + p.ddb = enable_db ? ddb_.data_ptr() : NULL; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + torch::Tensor grad = torch::zeros_like(pos); + p.grad = grad.data_ptr(); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.ddb & 15), "ddb input tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_db ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return the gradients. + return grad; +} + +// Version without derivatives. 
+torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy) +{ + torch::Tensor empty_tensor; + return rasterize_grad_db(pos, tri, out, dy, empty_tensor); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp new file mode 100755 index 0000000000000000000000000000000000000000..2257f566623495c7044ea3f532ef00e327477dc7 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp @@ -0,0 +1,718 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/texture.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void MipBuildKernel1 (const TextureKernelParams p); +void MipBuildKernel2 (const TextureKernelParams p); +void MipBuildKernel4 (const TextureKernelParams p); +void TextureFwdKernelNearest1 (const TextureKernelParams p); +void TextureFwdKernelNearest2 (const TextureKernelParams p); +void TextureFwdKernelNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p); +void 
TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p); +void MipGradKernel1 (const TextureKernelParams p); +void MipGradKernel2 (const TextureKernelParams p); +void MipGradKernel4 (const TextureKernelParams p); +void TextureGradKernelNearest (const TextureKernelParams p); +void TextureGradKernelLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelCubeNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinear (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p); + +//------------------------------------------------------------------------ +// Modeselektor. + +static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode, int max_mip_level) +{ + // Mip and filter modes. + p.filterMode = filter_mode; + NVDR_CHECK(p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, "filter_mode unsupported"); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + p.mipLevelLimit = max_mip_level; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + } + + // Boundary mode. + p.boundaryMode = boundary_mode; + NVDR_CHECK(p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, "boundary_mode unsupported"); +} + +//------------------------------------------------------------------------ +// Mipmap construction. + +TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + p.mipLevelLimit = max_mip_level; + p.boundaryMode = cube_mode ? TEX_BOUNDARY_MODE_CUBE : TEX_BOUNDARY_MODE_WRAP; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + + // Check inputs. + NVDR_CHECK_DEVICE(tex); + NVDR_CHECK_CONTIGUOUS(tex); + NVDR_CHECK_F32(tex); + + // Populate parameters and sanity check tex shape. 
+ if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + } + p.texDepth = tex.size(0); + p.texHeight = tex.size(cube_mode ? 2 : 1); + p.texWidth = tex.size(cube_mode ? 3 : 2); + p.channels = tex.size(cube_mode ? 4 : 3); + + // Set texture pointer. + p.tex[0] = tex.data_ptr(); + + // Generate mip offsets and calculate total size. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + + // Allocate and set mip tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor mip = torch::empty({mipTotal}, opts); + float* pmip = mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Build mip levels. + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + + // Return the mip tensor in a wrapper. + TextureMipWrapper mip_wrapper; + mip_wrapper.mip = mip; + mip_wrapper.max_mip_level = max_mip_level; + mip_wrapper.texture_size = tex.sizes().vec(); + mip_wrapper.cube_mode = cube_mode; + return mip_wrapper; +} + +//------------------------------------------------------------------------ +// Forward op. + +torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. 
+ NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.n, p.imgHeight, p.imgWidth, p.channels}, opts); + p.out = out.data_ptr(); + + // Choose kernel variants based on channel count. 
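+    // (Each kernel exists in 1-, 2- and 4-wide variants: channel counts divisible by 4 take the
+    //  float4 path, counts divisible by 2 the float2 path, and everything else the scalar path.)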
+ void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match and assign. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in custom mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + p.tex[i] = t.data_ptr(); + } + } + else + { + // Generate mip offsets, check mipmap size, and set mip data pointer. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "wrapped mip tensor size mismatch"); + pmip = mip_w.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + } + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + else + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype. 
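+    // (Table layout: {default, cube, bias-only, cube + bias-only} blocks of TEX_MODE_COUNT filter
+    //  modes, each expanded into 1/2/4-channel variants. The NULL slots are the nearest/linear
+    //  rows of the bias-only blocks, which are never selected because bias-only lookups require a
+    //  mipmapped filter mode.)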
+ void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelLinearMipmapNearestBO1, + (void*)TextureFwdKernelLinearMipmapNearestBO2, + (void*)TextureFwdKernelLinearMipmapNearestBO4, + (void*)TextureFwdKernelLinearMipmapLinearBO1, + (void*)TextureFwdKernelLinearMipmapLinearBO2, + (void*)TextureFwdKernelLinearMipmapLinearBO4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO1, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO2, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO4, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO1, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO2, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO4, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. + func_idx = func_idx * 3 + channel_div_idx; // Choose vector size. + + // Launch kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Return output tensor. + return out; +} + +// Version without mipmaps. +torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple > texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. 
+ bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. + NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have 
shape [minibatch_size, height, width, channels]"); + + // Get contiguous version of dy. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.dy = dy_.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor for tex gradient. + torch::Tensor grad_tex = torch::zeros_like(tex); + p.gradTex[0] = grad_tex.data_ptr(); + + // Allocate output tensor for uv gradient. + torch::Tensor grad_uv; + torch::Tensor grad_uv_da; + torch::Tensor grad_mip_level_bias; + if (p.filterMode != TEX_MODE_NEAREST) + { + grad_uv = torch::empty_like(uv); + p.gradUV = grad_uv.data_ptr(); + + // Gradients for things affecting mip level. + if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + // Allocate output tensor for uv_da gradient. + if (has_uv_da) + { + grad_uv_da = torch::empty_like(uv_da); + p.gradUVDA = grad_uv_da.data_ptr(); + } + + // Allocate output tensor for mip_level_bias gradient. + if (has_mip_level_bias) + { + grad_mip_level_bias = torch::empty_like(mip_level_bias); + p.gradMipLevelBias = grad_mip_level_bias.data_ptr(); + } + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + torch::Tensor grad_mip; + std::vector grad_mip_stack; + float* pmip = 0; + float* pgradMip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match, assign, construct gradient tensors. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + + torch::Tensor g = torch::zeros_like(t); + grad_mip_stack.push_back(g); + + p.tex[i] = t.data_ptr(); + p.gradTex[i] = g.data_ptr(); + } + } + else + { + // Generate mip offsets and get space for temporary mip gradients. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "mip tensor size mismatch"); + grad_mip = torch::zeros_like(mip_w); + pmip = (float*)mip_w.data_ptr(); + pgradMip = grad_mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. 
+ if (!cube_mode) + { + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUV & 7), "grad_uv output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 15), "grad_uv_da output tensor not aligned to float4"); + } + else + { + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 7), "grad_uv_da output tensor not aligned to float2"); + } + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 15), "grad_tex output tensor not aligned to float4"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 15), "dy input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pgradMip & 15), "internal mip gradient tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 7), "grad_tex output tensor not aligned to float2"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pgradMip & 7), "internal mip gradient tensor not aligned to float2"); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2 * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + NULL, + NULL, + (void*)TextureGradKernelLinearMipmapNearestBO, + (void*)TextureGradKernelLinearMipmapLinearBO, + NULL, + NULL, + (void*)TextureGradKernelCubeLinearMipmapNearestBO, + (void*)TextureGradKernelCubeLinearMipmapLinearBO, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. + + // Launch main gradient kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. Don't do this if mip stack was supplied - individual level gradients are already there. + if (p.enableMip && !has_mip_stack) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 
6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + + // Return output tensors. + return std::tuple >(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias, grad_mip_stack); +} + +// Version for nearest filter mode. +torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::get<0>(result); +} + +// Version for linear filter mode. +std::tuple texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +// Version for linear-mipmap-nearest mode. +std::tuple > texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode, boundary_mode); + return std::tuple >(std::get<0>(result), std::get<1>(result), std::get<4>(result)); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h new file mode 100755 index 0000000000000000000000000000000000000000..d047cc67d4c901f26ab59bb8eb93c7a209368fc4 --- /dev/null +++ b/pose_estimation/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" + +//------------------------------------------------------------------------ +// Python GL state wrapper. + +class RasterizeGLState; +class RasterizeGLStateWrapper +{ +public: + RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx); + ~RasterizeGLStateWrapper (void); + + void setContext (void); + void releaseContext (void); + + RasterizeGLState* pState; + bool automatic; + int cudaDeviceIdx; +}; + +//------------------------------------------------------------------------ +// Mipmap wrapper to prevent intrusion from Python side. + +class TextureMipWrapper +{ +public: + torch::Tensor mip; + int max_mip_level; + std::vector texture_size; // For error checking. + bool cube_mode; // For error checking. 
+}; + + +//------------------------------------------------------------------------ +// Antialias topology hash wrapper to prevent intrusion from Python side. + +class TopologyHashWrapper +{ +public: + torch::Tensor ev_hash; +}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/docker/10_nvidia.json b/pose_estimation/nvdiffrast/docker/10_nvidia.json new file mode 100755 index 0000000000000000000000000000000000000000..2bfcca059e24759472b3dbe7350fa2e1aff088f0 --- /dev/null +++ b/pose_estimation/nvdiffrast/docker/10_nvidia.json @@ -0,0 +1,6 @@ +{ + "file_format_version" : "1.0.0", + "ICD" : { + "library_path" : "libEGL_nvidia.so.0" + } +} diff --git a/pose_estimation/nvdiffrast/docker/Dockerfile b/pose_estimation/nvdiffrast/docker/Dockerfile new file mode 100755 index 0000000000000000000000000000000000000000..163b7f4b2020228e4cf252b561c944528a816b50 --- /dev/null +++ b/pose_estimation/nvdiffrast/docker/Dockerfile @@ -0,0 +1,51 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# Note: Should also work with NVIDIA's Docker image builds such as +# +# nvcr.io/nvidia/pytorch:20.09-py3 +# +# This file defaults to pytorch/pytorch as it works on slightly older +# driver versions. +ARG BASE_IMAGE=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel +FROM $BASE_IMAGE + +RUN apt-get update && apt-get install -y --no-install-recommends \ + pkg-config \ + libglvnd0 \ + libgl1 \ + libglx0 \ + libegl1 \ + libgles2 \ + libglvnd-dev \ + libgl1-mesa-dev \ + libegl1-mesa-dev \ + libgles2-mesa-dev \ + cmake \ + curl + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# for GLEW +ENV LD_LIBRARY_PATH /usr/lib64:$LD_LIBRARY_PATH + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics + +# Default pyopengl to EGL for good headless rendering support +ENV PYOPENGL_PLATFORM egl + +COPY docker/10_nvidia.json /usr/share/glvnd/egl_vendor.d/10_nvidia.json + +RUN pip install imageio imageio-ffmpeg + +COPY nvdiffrast /tmp/pip/nvdiffrast/ +COPY README.md setup.py /tmp/pip/ +RUN cd /tmp/pip && pip install . 
diff --git a/pose_estimation/nvdiffrast/docs/img/cube.png b/pose_estimation/nvdiffrast/docs/img/cube.png new file mode 100755 index 0000000000000000000000000000000000000000..92b63e611d95b2b4b898117e789e55bc280ec0b1 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/cube.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/earth.png b/pose_estimation/nvdiffrast/docs/img/earth.png new file mode 100755 index 0000000000000000000000000000000000000000..d30989a6f2eb544cc7449c6847494cbc6135bea2 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/earth.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/envphong.png b/pose_estimation/nvdiffrast/docs/img/envphong.png new file mode 100755 index 0000000000000000000000000000000000000000..2c6f3902b850a52ca0006207c9b824fd0cdea538 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/envphong.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/logo.png b/pose_estimation/nvdiffrast/docs/img/logo.png new file mode 100755 index 0000000000000000000000000000000000000000..827d907fef3dc5455f42d32f52dd65b37578a42a Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/logo.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/pipe_cube.png b/pose_estimation/nvdiffrast/docs/img/pipe_cube.png new file mode 100755 index 0000000000000000000000000000000000000000..6410c7207e12533ab9d9991175ffc193c8a77a4d Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/pipe_cube.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/pipe_earth.png b/pose_estimation/nvdiffrast/docs/img/pipe_earth.png new file mode 100755 index 0000000000000000000000000000000000000000..c46ab68ef358bbe2a798907a8ac42ebda61e328b Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/pipe_earth.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/pipe_envphong.png b/pose_estimation/nvdiffrast/docs/img/pipe_envphong.png new file mode 100755 index 0000000000000000000000000000000000000000..524c5c4e34ec11f0be49e1b7ca5a79c1d43e918b Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/pipe_envphong.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/pose.png b/pose_estimation/nvdiffrast/docs/img/pose.png new file mode 100755 index 0000000000000000000000000000000000000000..908c0978b0f6da697af87bfe73eb7c18aa3a3a95 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/pose.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_aa.png b/pose_estimation/nvdiffrast/docs/img/spot_aa.png new file mode 100755 index 0000000000000000000000000000000000000000..c957e3bae63a622b70b4f69f9157afc793354af0 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_aa.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_crop1.png b/pose_estimation/nvdiffrast/docs/img/spot_crop1.png new file mode 100755 index 0000000000000000000000000000000000000000..c43c699863e12c2529a0ad50bd6f479c5a96cddd Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_crop1.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_crop2.png b/pose_estimation/nvdiffrast/docs/img/spot_crop2.png new file mode 100755 index 0000000000000000000000000000000000000000..e2c5a04674634b820f4ff9c99e332590f11ab39b Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_crop2.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_diff1.png b/pose_estimation/nvdiffrast/docs/img/spot_diff1.png new file mode 100755 index 
0000000000000000000000000000000000000000..ebc65a274b3df5a3f1fa14f756b7ceb1ded9539d Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_diff1.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_diff2.png b/pose_estimation/nvdiffrast/docs/img/spot_diff2.png new file mode 100755 index 0000000000000000000000000000000000000000..14a7b6dd58c66cc74bbfb342d0d962751e957a95 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_diff2.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_peel1.png b/pose_estimation/nvdiffrast/docs/img/spot_peel1.png new file mode 100755 index 0000000000000000000000000000000000000000..80970c5b23437a43232164cbe5daee007759bb69 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_peel1.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_peel2.png b/pose_estimation/nvdiffrast/docs/img/spot_peel2.png new file mode 100755 index 0000000000000000000000000000000000000000..269fa4b02a838685360c08f99f52122049914df2 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_peel2.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_st.png b/pose_estimation/nvdiffrast/docs/img/spot_st.png new file mode 100755 index 0000000000000000000000000000000000000000..669470ff96b98ce8a060565df4a4ab194bd2b00b Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_st.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_tex.png b/pose_estimation/nvdiffrast/docs/img/spot_tex.png new file mode 100755 index 0000000000000000000000000000000000000000..8308898719f608df81b03e23d4aaee46627fbd19 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_tex.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_texture.png b/pose_estimation/nvdiffrast/docs/img/spot_texture.png new file mode 100755 index 0000000000000000000000000000000000000000..630944877af378f0816781a41f0a8f1020c123aa Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_texture.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_texw.png b/pose_estimation/nvdiffrast/docs/img/spot_texw.png new file mode 100755 index 0000000000000000000000000000000000000000..6191c79b45b8805320b85217af053728cc919704 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_texw.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_tri.png b/pose_estimation/nvdiffrast/docs/img/spot_tri.png new file mode 100755 index 0000000000000000000000000000000000000000..814227914beba49dd5b96dbe2567301dac6f0552 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_tri.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/spot_uv.png b/pose_estimation/nvdiffrast/docs/img/spot_uv.png new file mode 100755 index 0000000000000000000000000000000000000000..da2f7447d73bdd9b795a6c28a298acdb27225078 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/spot_uv.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser.png b/pose_estimation/nvdiffrast/docs/img/teaser.png new file mode 100755 index 0000000000000000000000000000000000000000..cca878e31faa75d03a56e0fd2c90ad6a667a9df2 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/teaser.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser1.png b/pose_estimation/nvdiffrast/docs/img/teaser1.png new file mode 100755 index 0000000000000000000000000000000000000000..defdaf8821913ec852d4d8397519485cd59c85d7 Binary files /dev/null and 
b/pose_estimation/nvdiffrast/docs/img/teaser1.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser2.png b/pose_estimation/nvdiffrast/docs/img/teaser2.png new file mode 100755 index 0000000000000000000000000000000000000000..a950a66395bb69c8fd521f5719c66af12d88895d Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/teaser2.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser3.png b/pose_estimation/nvdiffrast/docs/img/teaser3.png new file mode 100755 index 0000000000000000000000000000000000000000..13450160d4f32e0bcb040e2281aa192aec5c427d Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/teaser3.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser4.png b/pose_estimation/nvdiffrast/docs/img/teaser4.png new file mode 100755 index 0000000000000000000000000000000000000000..a0dceb8fa5d979c785735483d6fd449c57a28d4a Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/teaser4.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/teaser5.png b/pose_estimation/nvdiffrast/docs/img/teaser5.png new file mode 100755 index 0000000000000000000000000000000000000000..439de8a4f6d1614ad7e95ea72c21d6f2d97e3990 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/teaser5.png differ diff --git a/pose_estimation/nvdiffrast/docs/img/thumb.jpg b/pose_estimation/nvdiffrast/docs/img/thumb.jpg new file mode 100755 index 0000000000000000000000000000000000000000..aab9d25a4853df29220ac1f1c289e9f3f348fad2 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/thumb.jpg differ diff --git a/pose_estimation/nvdiffrast/docs/img/tri.png b/pose_estimation/nvdiffrast/docs/img/tri.png new file mode 100755 index 0000000000000000000000000000000000000000..45b17356321b56df5c482fa49a47b95d61d6d878 Binary files /dev/null and b/pose_estimation/nvdiffrast/docs/img/tri.png differ diff --git a/pose_estimation/nvdiffrast/docs/index.html b/pose_estimation/nvdiffrast/docs/index.html new file mode 100755 index 0000000000000000000000000000000000000000..467d67c1db4386e3a970a76cb392d45216c53314 --- /dev/null +++ b/pose_estimation/nvdiffrast/docs/index.html @@ -0,0 +1,1014 @@ + + + + + nvdiffrast + + + + + + + + + + +
+
+ +

nvdiffrast

+
+
Modular Primitives for High-Performance Differentiable Rendering
+ +
+ +

Table of contents

+ + +

Overview

+

Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. It is a lower-level library compared to previous ones such as redner, SoftRas, or PyTorch3D — nvdiffrast has no built-in camera models, lighting/material models, etc. Instead, the provided operations encapsulate only the most graphics-centric steps in the modern hardware graphics pipeline: rasterization, interpolation, texturing, and antialiasing. All of these operations (and their gradients) are GPU-accelerated, either via CUDA or via the hardware graphics pipeline.

+This documentation is intended to serve as a user's guide to nvdiffrast. For detailed discussion on the design principles, implementation details, and benchmarks, please see our paper: +
+Modular Primitives for High-Performance Differentiable Rendering
Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
ACM Transactions on Graphics 39(6) (proc. SIGGRAPH Asia 2020) +
+

Paper: http://arxiv.org/abs/2011.03277
GitHub: https://github.com/NVlabs/nvdiffrast

+[Figure: examples of things we've done with nvdiffrast]

Installation

+

Requirements:

+
  • Linux or Windows operating system.
  • 64-bit Python 3.6 or 3.7. We recommend Anaconda3 with numpy 1.14.3 or newer.
  • PyTorch 1.6 (recommended) or TensorFlow 1.14. TensorFlow 2.x is currently not supported.
  • A high-end NVIDIA GPU, NVIDIA drivers, CUDA 10.2 toolkit, and cuDNN 7.6.

To download nvdiffrast, either download the repository at https://github.com/NVlabs/nvdiffrast as a .zip file, or clone the repository using git:

+ +
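    git clone https://github.com/NVlabs/nvdiffrast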

Linux

+

We recommend running nvdiffrast in Docker. To build a Docker image with nvdiffrast and PyTorch 1.6 installed, run:

+ +
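One way to do this is to build directly from the Dockerfile included under docker/ in the repository (the image tag below is an arbitrary choice; the repository may also provide a helper script for this):

    docker build -f docker/Dockerfile -t nvdiffrast:torch .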

To try out some of the provided code examples, run:

+ +

Alternatively, if you have all the dependencies taken care of (consult the included Dockerfile for reference), you can install nvdiffrast in your local Python site-packages by running

+ +
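    pip install .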

at the root of the repository. You can also just add the repository root directory to your PYTHONPATH.

+

Windows

+

On Windows, nvdiffrast requires an external compiler for compiling the CUDA kernels. The development was done using Microsoft Visual Studio 2017 Professional Edition, and this version works with both PyTorch and TensorFlow versions of nvdiffrast. VS 2019 Professional Edition has also been confirmed to work with the PyTorch version of nvdiffrast. Other VS editions besides Professional Edition, including the Community Edition, should work but have not been tested.

+

If the compiler binary (cl.exe) cannot be found in PATH, nvdiffrast will search for it heuristically. If this fails, you may need to set up the build environment manually by running

+
"C:\Program Files (x86)\Microsoft Visual Studio\...\...\VC\Auxiliary\Build\vcvars64.bat"
+

where the exact path depends on the version and edition of VS you have installed.

+

To install nvdiffrast in your local site-packages, run:

+ +
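    pip install .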

Instead of pip install . you can also just add the repository root directory to your PYTHONPATH.

+

Primitive operations

+

Nvdiffrast offers four differentiable rendering primitives: rasterization, interpolation, texturing, and antialiasing. The operation of the primitives is described here in a platform-agnostic way. Platform-specific documentation can be found in the API reference section.

+

In this section we ignore the minibatch axis for clarity and assume a minibatch size of one. However, all operations support minibatches as detailed later.

+

Rasterization

+

The rasterization operation takes as inputs a tensor of vertex positions and a tensor of vertex index triplets that specify the triangles. Vertex positions are specified in NDC (Normalized Device Coordinate) space, i.e., after modelview and projection transformations. Performing these transformations is left as the user's responsibility. In NDC, the view frustum is a cube in homogeneous coordinates where x/w, y/w, z/w are all between -1 and +1.

+

The output of the rasterization operation is a 4-channel float32 image with tuple (u, v, z/w, triangle_id) in each pixel. Values u and v are the barycentric coordinates within a triangle: the first vertex in the vertex index triplet obtains (u, v) = (1, 0), the second vertex (u, v) = (0, 1) and the third vertex (u, v) = (0, 0). NDC-space depth value z/w is used later by the antialiasing operation to infer occlusion relations between triangles, and it does not propagate gradients to the vertex position input. Field triangle_id is the triangle index, offset by one. Pixels where no triangle was rasterized will receive a zero in all channels.

+
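As a concrete illustration, a minimal rasterization call using the PyTorch API (nvdiffrast.torch) might look as follows. This is only a sketch: the single hand-made triangle and the 256×256 resolution are arbitrary, and OpenGL context handling is discussed in more detail later in this document.

    import torch
    import nvdiffrast.torch as dr

    glctx = dr.RasterizeGLContext()  # create the OpenGL context once and reuse it

    # One triangle in instanced mode: positions are [minibatch_size, num_vertices, 4] homogeneous coordinates.
    pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
                         [ 0.8, -0.8, 0.0, 1.0],
                         [ 0.0,  0.8, 0.0, 1.0]]], dtype=torch.float32, device='cuda')
    tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda')

    rast_out, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
    # rast_out[..., 0:2] = barycentrics (u, v), rast_out[..., 2] = z/w, rast_out[..., 3] = triangle_id (0 = empty).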

Rasterization is point-sampled, i.e., the geometry is not smoothed, blurred, or made partially transparent in any way, in contrast to some previous differentiable rasterizers. The contents of a pixel always represent a single surface point that is on the closest surface visible along the ray through the pixel center.

+

Point-sampled coverage does not produce vertex position gradients related to occlusion and visibility effects. This is because the motion of vertices does not change the coverage in a continuous way — a triangle is either rasterized into a pixel or not. In nvdiffrast, the occlusion/visibility related gradients are generated in the antialiasing operation that typically occurs towards the end of the rendering pipeline.

+
+[Figure: rasterizer output. Left: [..., 0:2] = barycentrics (u, v). Right: [..., 3] = triangle_id.]

The images above illustrate the output of the rasterizer. The left image shows the contents of channels 0 and 1, i.e., the barycentric coordinates, rendered as red and green, respectively. The right image shows channel 3, i.e., the triangle ID, using a random color per triangle. Spot model was created and released into public domain by Keenan Crane.

+

Interpolation

+

Depending on the shading and lighting models, a mesh typically specifies a number of attributes at its vertices. These can include, e.g., texture coordinates, vertex normals, reflection vectors, and material parameters. The purpose of the interpolation operation is to transfer these attributes specified at vertices to image space. In the hardware graphics pipeline, this happens automatically between vertex and pixel shaders. The interpolation operation in nvdiffrast supports an arbitrary number of attributes.

+

Concretely, the interpolation operation takes as inputs the buffer produced by the rasterizer and a buffer specifying the vertex attributes. The output is an image-size buffer with as many channels as there are attributes. Pixels where no triangle was rendered will contain all zeros in the output.

+
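Continuing the sketch from the rasterization section, interpolating per-vertex texture coordinates could look like this. The attribute values and the diff_attrs setting are illustrative; the image-space derivative output (attr_db) is explained later in this document.

    # Per-vertex texture coordinates, [minibatch_size, num_vertices, num_attributes] in instanced mode.
    uv = torch.tensor([[[0.0, 0.0], [1.0, 0.0], [0.5, 1.0]]], dtype=torch.float32, device='cuda')

    attr_out, attr_db = dr.interpolate(uv, rast_out, tri, rast_db=rast_db, diff_attrs='all')
    # attr_out has one channel per attribute and contains zeros where no triangle was rendered.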
+[Figure: interpolated texture coordinates (s, t)]

Above is an example of interpolated texture coordinates visualized in red and green channels. This image was created using the output of the rasterizer from the previous step, and an attribute buffer containing the texture coordinates.

+

Texturing

+

Texture sampling is a fundamental operation in hardware graphics pipelines, and the same is true in nvdiffrast. The basic principle is simple: given a per-pixel texture coordinate vector, fetch a value from a texture and place it in the output. In nvdiffrast, the textures may have an arbitrary number of channels, which is useful in case you want to learn, say, an abstract field that acts as an input to a neural network further down the pipeline.

+

When sampling a texture, it is typically desirable to use some form of filtering. Most previous differentiable rasterizers support at most bilinear filtering, where sampling at a texture coordinate between texel centers will interpolate the value linearly from the four nearest texels. While this works fine when viewing the texture up close, it yields badly aliased results when the texture is viewed from a distance. To avoid this, the texture needs to be prefiltered prior to sampling it, removing the frequencies that are too high compared to how densely it is being sampled.

+

Nvdiffrast supports prefiltered texture sampling based on mipmapping. The required mipmap levels can be generated internally in the texturing operation, so that the user only needs to specify the highest-resolution (base level) texture. Currently the highest-quality filtering mode is isotropic trilinear filtering. The lack of anisotropic filtering means that a texture viewed at a steep angle will not alias in any direction, but it may appear blurry across the non-squished direction.

+

In addition to standard 2D textures, the texture sampling operation also supports cube maps. Cube maps are addressed using 3D texture coordinates, and the transitions between cube map faces are properly filtered so there will be no visible seams. Cube maps support trilinear filtering similar to 2D textures. There is no explicit support for 1D textures but they can be simulated efficiently with 1×n textures. All the filtering, mipmapping etc. work with such textures just as they would with true 1D textures. For now there is no support for 3D volume textures.

+
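Continuing the sketch, a prefiltered texture lookup using the interpolated coordinates and their image-space derivatives might be written as below; the random texture and the filter mode are illustrative.

    tex = torch.rand(1, 256, 256, 3, device='cuda')  # [minibatch_size, height, width, channels]
    color = dr.texture(tex, attr_out, uv_da=attr_db, filter_mode='linear-mipmap-linear')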
+[Figure: texture of Spot (left); output of the texture sampling operation (middle); background replaced with white (right)]

The middle image above shows the result of texture sampling using the interpolated texture coordinates from the previous step. Why is the background pink? The texture coordinates (s, t) read as zero at those pixels, but that is a perfectly valid point to sample the texture. It happens that Spot's texture (left) has pink color at its (0, 0) corner, and therefore all pixels in the background obtain that color as a result of the texture sampling operation. On the right, we have replaced the color of the empty pixels with a white color. Here's one way to do this in PyTorch:

+ +
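A minimal sketch of that test, with color standing in for the textured image from the previous step:

    img = torch.where(rast_out[..., 3:] > 0, color, torch.ones_like(color))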

where rast_out is the output of the rasterization operation. We simply test if the triangle_id field, i.e., channel 3 of the rasterizer output, is greater than zero, indicating that a triangle was rendered in that pixel. If so, we take the color from the textured image, and otherwise we take constant 1.0.

+

Antialiasing

+

The last of the four primitive operations in nvdiffrast is antialiasing. Based on the geometry input (vertex positions and triangles), it will smooth out discontinuities at silhouette edges in a given image. The smoothing is based on a local approximation of coverage — an approximate integral over a pixel is calculated based on the exact location of relevant edges and the point-sampled colors at pixel centers.

+
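Continuing the sketch, the antialiasing call takes the shaded image together with the same geometry that was rasterized:

    color_aa = dr.antialias(color, rast_out, pos, tri)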

In this context, a silhouette is any edge that connects to just one triangle, or connects two triangles so that one folds behind the other. Specifically, this includes both silhouettes against the background and silhouettes against another surface, unlike some previous methods (DIB-R) that only support the former kind.

+

It is worth discussing why we might want to go through this trouble to improve the image a tiny bit. If we're attempting to, say, match a real-world photograph, a slightly smoother edge probably won't match the captured image much better than a jagged one. However, that is not the point of the antialiasing operation — the real goal is to obtain gradients w.r.t. vertex positions related to occlusion, visibility, and coverage.

+

Remember that everything up to this point in the rendering pipeline is point-sampled. In particular, the coverage, i.e., which triangle is rasterized to which pixel, changes discontinuously in the rasterization operation.

+

This is the reason why previous differentiable rasterizers apply a nonstandard image synthesis model with blur and transparency: something has to make coverage continuous w.r.t. vertex positions if we wish to optimize vertex positions, camera position, etc., based on an image-space loss. In nvdiffrast, we do everything point-sampled so that we know that every pixel corresponds to a single, well-defined surface point. This lets us perform arbitrary shading computations without worrying about things like accidentally blurring texture coordinates across silhouettes, or having attributes mysteriously tend towards the background color when getting close to the edge of the object. Only towards the end of the pipeline does the antialiasing operation ensure that the motion of vertex positions results in continuous change on silhouettes.

+

The antialiasing operation supports any number of channels in the image to be antialiased. Thus, if your rendering pipeline produces an abstract representation that is fed to a neural network for further processing, that is not a problem.

+
+[Figure: antialiased image (left); closeup, before AA (middle); closeup, after AA (right)]

The left image above shows the result image from the last step, after performing antialiasing. The effect is quite small — some boundary pixels become less jagged, as shown in the closeups.

+

Notably, not all boundary pixels are antialiased as revealed by the left-side image below. This is because the accuracy of the antialiasing operation in nvdiffrast depends on the rendered size of triangles: Because we store knowledge of just one surface point per pixel, antialiasing is possible only when the triangle that contains the actual geometric silhouette edge is visible in the image. The example image is rendered in very low resolution and the triangles are tiny compared to pixels. Thus, triangles get easily lost between the pixels.

+

This results in incomplete-looking antialiasing, and the gradients provided by antialiasing become noisier when edge triangles are missed. Therefore it is advisable to render images in resolutions where the triangles are large enough to show up in the image at least most of the time.

+
+[Figure: pixels touched by antialiasing, original resolution (left); rendered in 4×4 higher resolution and downsampled (right)]

The left image above shows which pixels were modified by the antialiasing operation in this example. On the right, we performed the rendering in 4×4 higher resolution and downsampled the final images back to the original size. This yields more accurate position gradients related to the silhouettes, so if you suspect your position gradients are too noisy, you may want to try simply increasing the resolution in which rasterization and antialiasing is done.

+

For purposes of shape optimization, the sparse-looking situation on the left would probably be perfectly fine. The gradients are still going to point in the right direction even if they are somewhat sparse, and you will need to use some sort of shape regularization anyway, which will greatly increase tolerance to noisy shape gradients.

+

Beyond the basics

+

Rendering images is easy with nvdiffrast, but there are a few practical things that you will need to take into account. The topics in this section explain the operation and usage of nvdiffrast in more detail, and hopefully help you avoid any potential misunderstandings and pitfalls.

+

Coordinate systems

+

Nvdiffrast follows OpenGL's coordinate systems and other conventions. This is partially because we use OpenGL to accelerate the rasterization operation, but mostly so that there is a single standard to follow.

+
  • The NDC coordinate system, used for specifying vertex positions in rasterization, maps to the screen so that x increases towards the right side of the screen, y increases towards the top of the screen, and z increases towards the viewer.
  • The memory order of image data in OpenGL, and consequently in nvdiffrast, is bottom-up. This means that row 0 of a tensor containing an image is the bottom row of the texture/image, which is the opposite of the more common scanline order. If you want to keep your image data in the conventional top-down order in your code, but have it logically the right way up inside nvdiffrast, you will need to flip the images vertically when crossing the boundary.
  • For 2D textures, the coordinate origin (s, t) = (0, 0) is at the bottom-left corner, with s increasing to the right and t increasing to the top. When specifying the faces of a cube map texture, the orientation varies between the faces, but nvdiffrast follows the OpenGL convention here as well.

As a word of advice, it is best to stay on top of coordinate systems and orientations used in your program. When something appears to be the wrong way around, it is much better to identify and fix the root cause than to randomly flip coordinates, images, buffers, and matrices until the immediate problem goes away.

+

Geometry and minibatches: Range mode vs Instanced mode

+

As mentioned earlier, all operations in nvdiffrast support the minibatch axis efficiently. Related to this, we support two ways for representing the geometry: range mode and instanced mode. If you want to render a different mesh in each minibatch index, you need to use the range mode. However, if you are rendering the same mesh, but with potentially different viewpoints, vertex positions, attributes, textures, etc., in each minibatch index, the instanced mode will be much more convenient.

+

In range mode, you specify triangle index triplets as a 2D tensor of shape [num_triangles, 3], and vertex positions as a 2D tensor of shape [num_vertices, 4]. In addition to these, the rasterization operation requires an additional 2D range tensor of shape [minibatch_size, 2] where each row specifies a start index and count into the triangle tensor. As a result, the rasterizer will render the triangles in the specified ranges into each minibatch index of the output tensor. If you have multiple meshes, you should place all of them into the vertex and triangle tensors, and then choose which mesh to rasterize into each minibatch index via the contents of the range tensor. The attribute tensor in interpolation operation is handled in the same way as positions, and it has to be of shape [num_vertices, num_attributes] in range mode.

+

In instanced mode, the topology of the mesh will be shared for each minibatch index. The triangle tensor is still a 2D tensor with shape [num_triangles, 3], but the vertex positions are specified using a 3D tensor of shape [minibatch_size, num_vertices, 4]. With a 3D vertex position tensor, the rasterizer will not require the range tensor input, but will take the minibatch size from the first dimension of the vertex position tensor. The same triangles are rendered to each minibatch index, but with vertex positions taken from the corresponding slice of the vertex position tensor. In this mode, the attribute tensor in interpolation has to be a 3D tensor similar to position tensor, i.e., of shape [minibatch_size, num_vertices, num_attributes]. However, you can provide an attribute tensor with minibatch size of 1, and it will be broadcast across the minibatch.

+
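As an illustration of range mode, here is a minimal sketch with two single-triangle "meshes" packed into shared vertex and triangle tensors; all values are made up, and each row of the range tensor is [start index, triangle count] into the triangle tensor.

    pos = torch.tensor([[-0.8, -0.8, 0.0, 1.0], [ 0.8, -0.8, 0.0, 1.0], [ 0.0,  0.8, 0.0, 1.0],
                        [-0.5, -0.5, 0.0, 1.0], [ 0.5, -0.5, 0.0, 1.0], [ 0.0,  0.5, 0.0, 1.0]],
                       dtype=torch.float32, device='cuda')                         # [num_vertices, 4]
    tri = torch.tensor([[0, 1, 2], [3, 4, 5]], dtype=torch.int32, device='cuda')   # [num_triangles, 3]
    ranges = torch.tensor([[0, 1], [1, 1]], dtype=torch.int32)                     # [minibatch_size, 2], int32, on the CPU
    rast_out, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256], ranges=ranges)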

Image-space derivatives

+

We skirted around a pretty fundamental question in the description of the texturing operation above. In order to determine the proper amount of prefiltering for sampling a texture, we need to know how densely it is being sampled. But how can we know the sampling density when each pixel knows of just a single surface point?

+

The solution is to track the image-space derivatives of all things leading up to the texture sampling operation. These are not the same thing as the gradients used in the backward pass, even though they both involve differentiation! Consider the barycentrics (u, v) produced by the rasterization operation. They change by some amount when moving horizontally or vertically in the image plane. If we denote the image-space coordinates as (X, Y), the image-space derivatives of the barycentrics would be ∂u/∂X, ∂u/∂Y, ∂v/∂X, and ∂v/∂Y. We can organize these into a 2×2 Jacobian matrix that describes the local relationship between (u, v) and (X, Y). This matrix is generally different at every pixel. For the purpose of image-space derivatives, the units of X and Y are pixels. Hence, ∂u/∂X is the local approximation of how much u changes when moving a distance of one pixel in the horizontal direction, and so on.

+

Once we know how the barycentrics change w.r.t. pixel position, the interpolation operation can use this to determine how the attributes change w.r.t. pixel position. When attributes are used as texture coordinates, we can therefore tell how the texture sampling position (in texture space) changes when moving around within the pixel (up to a local, linear approximation, that is). This texture footprint tells us the scale on which the texture should be prefiltered. In more practical terms, it tells us which mipmap level(s) to use when sampling the texture.

+

In nvdiffrast, the rasterization operation can be configured to output the image-space derivatives of the barycentrics in an auxiliary 4-channel output tensor, ordered (∂u/∂X, ∂u/∂Y, ∂v/∂X, ∂v/∂Y) from channel 0 to 3. The interpolation operation can take this auxiliary tensor as input and compute image-space derivatives of any set of attributes being interpolated. Finally, the texture sampling operation can use the image-space derivatives of the texture coordinates to determine the amount of prefiltering.

+

There is nothing magic about these image-space derivatives. They are tensors just like, e.g., the texture coordinates themselves, they propagate gradients backwards, and so on. For example, if you want to artificially blur or sharpen the texture when sampling it, you can simply multiply the tensor carrying the image-space derivatives of the texture coordinates ∂{s, t}/∂{X, Y} by a scalar value before feeding it into the texture sampling operation. This scales the texture footprints and thus adjusts the amount of prefiltering. If your loss function prefers a different level of sharpness, this multiplier will receive a nonzero gradient. Update: Since version 0.2.1, the texture sampling operation also supports a separate mip level bias input that would be better suited for this particular task, but the gist is the same nonetheless.

+
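For example, the blur/sharpen trick described above could be sketched as follows, continuing the earlier example; the scale factor is arbitrary.

    blur = torch.tensor(2.0, device='cuda', requires_grad=True)   # >1 blurs, <1 sharpens; receives a gradient
    color = dr.texture(tex, attr_out, uv_da=attr_db * blur, filter_mode='linear-mipmap-linear')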

One might wonder if it would have been easier to determine the texture footprints simply from the texture coordinates in adjacent pixels, and skip all this derivative rubbish? In easy cases the answer is yes, but silhouettes, occlusions, and discontinuous texture parameterizations would make this approach rather unreliable in practice. Computing the image-space derivatives analytically keeps everything point-like, local, and well-behaved.

+

It should be noted that computing gradients related to image-space derivatives is somewhat involved and requires additional computation. At the same time, they are often not crucial for the convergence of the training/optimization. Because of this, the primitive operations in nvdiffrast offer options to disable the calculation of these gradients. We're talking about things like Loss/∂(∂{u, v}/∂{X, Y}) that may look second-order-ish, but they're not.

+

Mipmaps and texture dimensions

+

Prefiltered texture sampling modes require mipmaps, i.e., downsampled versions, of the texture. The texture sampling operation can construct these internally, or you can provide your own mipmap stack, but there are limits to texture dimensions that need to be considered.

+

When mipmaps are constructed internally, each mipmap level is constructed by averaging 2×2 pixel patches of the preceding level (or of the texture itself for the first mipmap level). The size of the buffer to be averaged therefore has to be divisible by 2 in both directions. There is one exception: side length of 1 is valid, and it will remain as 1 in the downsampling operation.

+

For example, a 32×32 texture will produce the following mipmap stack:

+
+32×32 (base texture) → 16×16 (mip level 1) → 8×8 (mip level 2) → 4×4 (mip level 3) → 2×2 (mip level 4) → 1×1 (mip level 5)
+

And a 32×8 texture, with both sides powers of two but not equal, will result in:

+
+32×8 (base texture) → 16×4 (mip level 1) → 8×2 (mip level 2) → 4×1 (mip level 3) → 2×1 (mip level 4) → 1×1 (mip level 5)
+

For texture sizes like this, everything will work automatically and mipmaps are constructed down to 1×1 pixel size. Therefore, if you wish to use prefiltered texture sampling, you should scale your textures to power-of-two dimensions that do not, however, need to be equal.

+

How about texture atlases? You may have an object whose texture is composed of multiple individual patches, or a collection of textured meshes with a unique texture for each. Say we have a texture atlas composed of five 32×32 sub-images, i.e., a total size of 160×32 pixels. Now we cannot compute mipmap levels all the way down to 1×1 size, because there is a 5×1 mipmap in the way that cannot be downsampled (because 5 is not even):

+
+160×32 (base texture) → 80×16 (mip level 1) → 40×8 (mip level 2) → 20×4 (mip level 3) → 10×2 (mip level 4) → 5×1 (mip level 5) → Error!
+

Scaling the atlas to, say, 256×32 pixels would feel silly because the dimensions of the sub-images are perfectly fine, and downsampling the different sub-images together — which would happen after the 5×1 resolution — would not make sense anyway. For this reason, the texture sampling operation allows the user to specify the maximum number of mipmap levels to be constructed and used. In this case, setting max_mip_level=5 would stop at the 5×1 mipmap and prevent the error.

+
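For the atlas example above, the call might be sketched as follows. The atlas tensor is illustrative; note the [minibatch, height, width, channels] layout, so a 160×32 atlas has shape [1, 32, 160, 3].

    atlas = torch.rand(1, 32, 160, 3, device='cuda')
    color = dr.texture(atlas, attr_out, uv_da=attr_db, filter_mode='linear-mipmap-linear', max_mip_level=5)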

It is a deliberate design choice that nvdiffrast doesn't just stop automatically at a mipmap size it cannot downsample, but requires the user to specify a limit when the texture dimensions are not powers of two. The goal is to avoid bugs where prefiltered texture sampling mysteriously doesn't work due to an oddly sized texture. It would be confusing if a 256×256 texture gave beautifully prefiltered texture samples, a 255×255 texture suddenly had no prefiltering at all, and a 254×254 texture did just a bit of prefiltering (one level) but not more.

+

If you compute your own mipmaps, their sizes must follow the scheme described above. There is no need to specify mipmaps all the way to 1×1 resolution, but the stack can end at any point and it will work equivalently to an internally constructed mipmap stack with a max_mip_level limit. Importantly, the gradients of user-provided mipmaps are not propagated automatically to the base texture — naturally so, because nvdiffrast knows nothing about the relation between them. Instead, the tensors that specify the mip levels in a user-provided mipmap stack will receive gradients of their own.

+
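A sketch of building such a custom mipmap stack by 2×2 averaging and passing it to the texture sampling operation. This assumes power-of-two (or 1) side lengths as described above, and that the PyTorch texture operation accepts a list of per-level tensors as its mip argument, as suggested by the mip stack handling in the C++ code earlier in this diff.

    import torch.nn.functional as nnf

    mip_stack = []
    t = tex.permute(0, 3, 1, 2)                                 # NHWC -> NCHW for pooling
    while t.shape[2] > 1 or t.shape[3] > 1:
        kh = 2 if t.shape[2] > 1 else 1                         # a side of length 1 stays at 1
        kw = 2 if t.shape[3] > 1 else 1
        t = nnf.avg_pool2d(t, kernel_size=(kh, kw))
        mip_stack.append(t.permute(0, 2, 3, 1).contiguous())    # back to NHWC

    color = dr.texture(tex, attr_out, uv_da=attr_db, mip=mip_stack, filter_mode='linear-mipmap-linear')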

Running on multiple GPUs

+

Nvdiffrast supports computation on multiple GPUs in both PyTorch and TensorFlow. As is the convention in PyTorch, the operations are always executed on the device on which the input tensors reside. All GPU input tensors must reside on the same device, and the output tensors will unsurprisingly end up on that same device. In addition, the rasterization operation requires that its OpenGL context was created for the correct device. In TensorFlow, the OpenGL context is automatically created on the device of the rasterization operation when it is executed for the first time.

+

On Windows, nvdiffrast implements OpenGL device selection in a way that can be done only once per process — after one context is created, all future ones will end up on the same GPU. Hence you cannot expect to run the rasterization operation on multiple GPUs within the same process. Trying to do so will either cause a crash or incur a significant performance penalty. However, with PyTorch it is common to distribute computation across GPUs by launching a separate process for each GPU, so this is not a huge concern. Note that any OpenGL context created within the same process, even for something like a GUI window, will prevent changing the device later. Therefore, if you want to run the rasterization operation on other than the default GPU, be sure to create its OpenGL context before initializing any other OpenGL-powered libraries.

+

On Linux everything just works, and you can create rasterizer OpenGL contexts on multiple devices within the same process.

+

Rendering multiple depth layers

+

Sometimes there is a need to render scenes with partially transparent surfaces. In this case, it is not sufficient to find only the surfaces that are closest to the camera, as you may also need to know what lies behind them. For this purpose, nvdiffrast supports depth peeling that lets you extract multiple closest surfaces for each pixel.

+

With depth peeling, we start by rasterizing the closest surfaces as usual. We then perform a second rasterization pass with the same geometry, but this time we cull all previously rendered surface points at each pixel, effectively extracting the second-closest depth layer. This can be repeated as many times as desired, so that we can extract as many depth layers as we like. See the images below for example results of depth peeling with each depth layer shaded and antialiased.

+[Figure: first depth layer (left); second depth layer (middle); third depth layer (right)]

The API for depth peeling is based on DepthPeeler object that acts as a context manager, and its rasterize_next_layer method. The first call to rasterize_next_layer is equivalent to calling the traditional rasterize function, and subsequent calls report further depth layers. The arguments for rasterization are specified when instantiating the DepthPeeler object. Concretely, your code might look something like this:

+ +
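A sketch of the pattern; the constructor arguments mirror those of rasterize, and num_layers is just a placeholder for however many layers you want to extract.

    num_layers = 2
    with dr.DepthPeeler(glctx, pos, tri, resolution=[256, 256]) as peeler:
        for i in range(num_layers):
            rast_out, rast_db = peeler.rasterize_next_layer()
            # ... shade, antialias, or store this layer's results ...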

There is no performance penalty compared to the basic rasterization op if you end up extracting only the first depth layer. In other words, the code above with num_layers=1 runs exactly as fast as calling rasterize once.

+

Depth peeling is only supported in the PyTorch version of nvdiffrast. For implementation reasons, depth peeling reserves the OpenGL context so that other rasterization operations cannot be performed while the peeling is ongoing, i.e., inside the with block. Hence you cannot start a nested depth peeling operation or call rasterize inside the with block, unless you use a different OpenGL context.

+

For the sake of completeness, let us note the following small caveat: Depth peeling relies on depth values to distinguish surface points from each other. Therefore, culling "previously rendered surface points" actually means culling all surface points at the same or closer depth as those rendered into the pixel in previous passes. This matters only if you have multiple layers of geometry at matching depths — if your geometry consists of, say, nothing but two exactly overlapping triangles, you will see one of them in the first pass but never see the other one in subsequent passes, as it's at the exact depth that is already considered done.

+

Differences between PyTorch and TensorFlow

+

Nvdiffrast can be used from PyTorch and from TensorFlow 1.x; the latter may change to TensorFlow 2.x if there is demand. These frameworks operate somewhat differently and that is reflected in the respective APIs. Simplifying a bit, in TensorFlow 1.x you construct a persistent graph out of persistent nodes, and run many batches of data through it. In PyTorch, there is no persistent graph or nodes, but a new, ephemeral graph is constructed for each batch of data and destroyed immediately afterwards. Therefore, there is also no persistent state for the operations. There is the torch.nn.Module abstraction for festooning operations with persistent state, but we do not use it.

+

As a consequence, things that would be part of persistent state of an nvdiffrast operation in TensorFlow must be stored by the user in PyTorch, and supplied to the operations as needed. In practice, this is a very small difference and amounts to just a couple of lines of code in most cases.

+

As an example, consider the OpenGL context used by the rasterization operation. In order to use hardware-accelerated rendering, an OpenGL context must be created and switched into before issuing OpenGL commands internally. Creating the context is an expensive operation, so we don't want to create and destroy one at every call of the rasterization operation. In TensorFlow, the rasterization operation creates a context when it is executed for the first time, and stashes it away in its persistent state to be reused later. In PyTorch, the user has to create the context using a separate function call, and supply it as a parameter to the rasterization operation.
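As a sketch of the PyTorch calling convention (the tensors below are purely illustrative):

```python
import torch
import nvdiffrast.torch as dr

# Create the OpenGL context once, outside the training loop, and reuse it.
glctx = dr.RasterizeGLContext()

# Illustrative inputs: clip-space vertex positions and triangle indices.
pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
                     [ 0.8, -0.8, 0.0, 1.0],
                     [ 0.0,  0.8, 0.0, 1.0]]], device='cuda')
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda')

# The context is supplied explicitly on every call.
rast, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
```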

+

Similarly, if you have a constant texture and want to use prefiltered texture sampling modes, the mipmap stack only needs to be computed once. In TensorFlow, you can specify that the texture is constant, in which case the texture sampling operation only computes the mipmap stack on the first execution and stores it internally. In PyTorch, you can compute the mipmap stack once using a separate function call, and supply it to the texture sampling operation every time. If you don't do that, the operation will compute the mipmap stack internally and discard it afterwards. This is exactly what you want if your texture changes at every iteration, and it's not wrong even if the texture is constant, just a bit inefficient.
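For a constant texture, the pattern might look like this (a sketch; tex is assumed to be your texture tensor, and uv, uv_da are assumed to come from interpolate()):

```python
import nvdiffrast.torch as dr

# Build the mipmap stack once for a texture that never changes...
mip = dr.texture_construct_mip(tex)

# ...and reuse it at every iteration instead of letting texture() rebuild it.
color = dr.texture(tex, uv, uv_da, mip=mip, filter_mode='linear-mipmap-linear')
```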

+

Finally, the same holds for a thing called the topology hash that the antialiasing operation uses for identifying potential silhouette edges. Its contents depend only on the triangle tensor, not the vertex positions, so if the topology is constant, this auxiliary structure needs to be constructed only once. As before, in TensorFlow this is handled internally, whereas in PyTorch a separate function is provided for off-line construction.
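A corresponding sketch, assuming a constant triangle tensor tri and the usual rasterizer outputs rast and pos:

```python
import nvdiffrast.torch as dr

# Construct the topology hash once for a triangle tensor that never changes...
topology_hash = dr.antialias_construct_topology_hash(tri)

# ...and pass it to antialias() at every iteration.
color_aa = dr.antialias(color, rast, pos, tri, topology_hash=topology_hash)
```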

+

Manual OpenGL contexts in PyTorch

+

First, please note that handling OpenGL contexts manually is a very small optimization. It almost certainly won't be relevant unless you've already profiled and optimized your code with gusto, and you're on a mission to extract every last bit of performance possible.

+

In TensorFlow, the only option is to let nvdiffrast handle the OpenGL context management internally. This is because TensorFlow utilizes multiple CPU threads under the hood, and the active OpenGL context is a thread-local resource.

+

PyTorch isn't as unpredictable, and stays in the same CPU thread by default (although things like torch.utils.data.DataLoader do invoke additional CPU threads). As such, nvdiffrast lets the user choose between handling OpenGL context switching in automatic or manual mode. The default is automatic mode where the rasterization operation always sets/releases the context at the beginning/end of each execution, like we do in TensorFlow. This ensures that the rasterizer will always use the context that you supply, and the context won't remain active so nobody else can mess with it.

+

In manual mode, the user assumes the responsibility of setting and releasing the OpenGL context. Most of the time, if you don't have any other libraries that would be using OpenGL, you can just set the context once after having created it and keep it set until the program exits. However, keep in mind that the active OpenGL context is a thread-local resource, so it needs to be set in the same CPU thread as it will be used, and it cannot be set simultaneously in multiple CPU threads.
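In the simple single-threaded case described above, manual mode might be used like this (a sketch; the training loop itself is elided):

```python
import nvdiffrast.torch as dr

# Create the context in manual mode and activate it once in this CPU thread.
glctx = dr.RasterizeGLContext(mode='manual')
glctx.set_context()

# ... run the training loop, calling dr.rasterize(glctx, ...) as usual ...

# Release the context only if something else needs OpenGL in this thread.
glctx.release_context()
```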

+

Samples

+

Nvdiffrast comes with a set of samples that were crafted to support the research paper. Each sample is available in both PyTorch and TensorFlow versions. Details such as command-line parameters, logging format, etc., may not be identical between the versions, and generally the PyTorch versions should be considered definitive. The command-line examples below are for the PyTorch versions.

+

triangle.py

+

This is a minimal sample that renders a triangle and saves the resulting image into a file (tri.png) in the current directory. Running this should be the first step to verify that you have everything set up correctly. Rendering is done using the rasterization and interpolation operations, so getting the correct output image means that both OpenGL and CUDA are working as intended under the hood.

+

Example command line:

+
python triangle.py
+
[Image: The expected output image]

cube.py

+

In this sample, we optimize the vertex positions and colors of a cube mesh, starting from a semi-randomly initialized state. The optimization is based on image-space loss in extremely low resolutions such as 4×4, 8×8, or 16×16 pixels. The goal of this sample is to examine the rate of geometrical convergence when the triangles are only a few pixels in size. It serves to illustrate that the antialiasing operation, despite being approximative, yields good enough position gradients even in 4×4 resolution to guide the optimization to the goal.

+

Example command line:

+
python cube.py --resolution 16 --display-interval 10
+
[Images: Interactive view of cube.py, Rendering pipeline]

The image above shows a live view of the sample. Top row shows the low-resolution rendered image and reference image that the image-space loss is calculated from. Bottom row shows the current mesh (and colors) and reference mesh in high resolution so that convergence can be seen more easily visually.

+

In the pipeline diagram, green boxes indicate nvdiffrast operations, whereas blue boxes are other computation. Red boxes are the learned tensors and gray are non-learned tensors or other data.

+

earth.py

+

The goal of this sample is to compare texture convergence with and without prefiltered texture sampling. The texture is learned based on image-space loss against high-quality reference renderings in random orientations and at random distances. When prefiltering is disabled, the texture is not learned properly because of spotty gradient updates caused by aliasing. This shows as a much worse PSNR for the texture, compared to learning with prefiltering enabled. See the paper for further discussion.

+

Example command lines:

python earth.py --display-interval 10          No prefiltering, bilinear interpolation.
python earth.py --display-interval 10 --mip    Prefiltering enabled, trilinear interpolation.

[Images: Interactive view of earth.py, prefiltering disabled; Rendering pipeline]

The interactive view shows the current texture mapped onto the mesh, with or without prefiltered texture sampling as specified via the command-line parameter. In this sample, no antialiasing is performed because we are not learning vertex positions and hence need no gradients related to them.

+

envphong.py

+

In this sample, a more complex shading model is used compared to the vertex colors or plain texture in the previous ones. Here, we learn a reflected environment map and parameters of a Phong BRDF model given a known mesh. The optimization is based on image-space loss against reference renderings in random orientations. The shading model of mirror reflection plus a Phong BRDF is not physically sensible, but it works as a reasonably simple strawman that would not be possible to implement with previous differentiable rasterizers that bundle rasterization, shading, lighting, and texturing together. The sample also illustrates the use of cube mapping for representing a learned texture in a spherical domain.

+

Example command line:

+
python envphong.py --display-interval 10
+
[Images: Interactive view of envphong.py, Rendering pipeline]

In the interactive view, we see the rendering with the current environment map and Phong BRDF parameters, both gradually improving during the optimization.

+

pose.py

+

Pose fitting based on an image-space loss is a classical task in differentiable rendering. In this sample, we solve a pose optimization problem with a simple cube with differently colored sides. We detail the optimization method in the paper, but in brief, it combines gradient-free greedy optimization in an initialization phase and gradient-based optimization in a fine-tuning phase.

+

Example command line:

+
python pose.py --display-interval 10
+
[Image: Interactive view of pose.py]

The interactive view shows, from left to right: target pose, best found pose, and current pose. When viewed live, the two stages of optimization are clearly visible. In the first phase, the best pose updates intermittently when a better initialization is found. In the second phase, the solution converges smoothly to the target via gradient-based optimization.

+

PyTorch API reference

+
+

nvdiffrast.torch.RasterizeGLContext(output_db=True, mode='automatic', device=None) Class

+

Create a new OpenGL rasterizer context.

Creating an OpenGL context is a slow operation so you should reuse the same context in all calls to rasterize() on the same CPU thread. The OpenGL context is deleted when the object is destroyed.

Arguments:
output_db: Compute and output image-space derivatives of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
device: Cuda device on which the context is created. Type can be torch.device, string (e.g., 'cuda:1'), or int. If not specified, the context will be created on the currently active Cuda device.
Methods, only available if context was created in manual mode:
set_context(): Set (activate) OpenGL context in the current CPU thread.
release_context(): Release (deactivate) currently active OpenGL context.
Returns:
The newly created OpenGL rasterizer context.
+

nvdiffrast.torch.rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True) Function

+

Rasterize triangles.

All input tensors must be contiguous and reside in GPU memory except for the ranges tensor that, if specified, has to reside in CPU memory. The output tensors will be contiguous and reside in GPU memory.

Arguments:
glctx: OpenGL context of type RasterizeGLContext.
pos: Vertex position tensor with dtype torch.float32. To enable range mode, this tensor should have a 2D shape [num_vertices, 4]. To enable instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
tri: Triangle tensor with shape [num_triangles, 3] and dtype torch.int32.
resolution: Output resolution as integer tuple (height, width).
ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype torch.int32, specifying start indices and counts into tri. Ignored in instanced mode.
grad_db: Propagate gradients of image-space derivatives of barycentrics into pos in backward pass. Ignored if OpenGL context was not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size, height, width, 4] and contains the main rasterizer output in order (u, v, z/w, triangle_id). If the OpenGL context was configured to output image-space derivatives of barycentrics, the second output tensor will also have shape [minibatch_size, height, width, 4] and contain said derivatives in order (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape [minibatch_size, height, width, 0].
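For illustration, the two input layouts might be used as follows (a sketch; glctx, pos, pos_flat, tri, and ranges are assumed to be set up by the caller):

```python
import nvdiffrast.torch as dr

# Instanced mode: pos has shape [minibatch_size, num_vertices, 4] and the same
# topology is used for every minibatch item.
rast, rast_db = dr.rasterize(glctx, pos, tri, resolution=[512, 512])

# Range mode: pos_flat has shape [num_vertices, 4] and ranges is a CPU tensor of
# shape [minibatch_size, 2] with (start, count) entries into tri.
rast, rast_db = dr.rasterize(glctx, pos_flat, tri, resolution=[512, 512], ranges=ranges)
```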
+

nvdiffrast.torch.DepthPeeler(...) Class

+

Create a depth peeler object for rasterizing multiple depth layers.

Arguments are the same as in rasterize().

Returns:
The newly created depth peeler.
+

nvdiffrast.torch.DepthPeeler.rasterize_next_layer() Method

+

Rasterize next depth layer.

Operation is equivalent to rasterize() except that previously reported surface points are culled away.

Returns:
A tuple of two tensors as in rasterize().
+

nvdiffrast.torch.interpolate(attr, rast, tri, rast_db=None, diff_attrs=None) Function

+

Interpolate vertex attributes.

All input tensors must be contiguous and reside in GPU memory. The output tensors will be contiguous and reside in GPU memory.

Arguments:
attr: Attribute tensor with dtype torch.float32. Shape is [num_vertices, num_attributes] in range mode, or [minibatch_size, num_vertices, num_attributes] in instanced mode. Broadcasting is supported along the minibatch axis.
rast: Main output tensor from rasterize().
tri: Triangle tensor with shape [num_triangles, 3] and dtype torch.int32.
rast_db: (Optional) Tensor containing image-space derivatives of barycentrics, i.e., the second output tensor from rasterize(). Enables computing image-space derivatives of attributes.
diff_attrs: (Optional) List of attribute indices for which image-space derivatives are to be computed. Special value 'all' is equivalent to list [0, 1, ..., num_attributes - 1].
Returns:
A tuple of two tensors. The first output tensor contains interpolated attributes and has shape [minibatch_size, height, width, num_attributes]. If rast_db and diff_attrs were specified, the second output tensor contains the image-space derivatives of the selected attributes and has shape [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. Otherwise, the second output tensor will be an empty tensor with shape [minibatch_size, height, width, 0].
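For example, a sketch that also requests image-space derivatives of the first two attributes (e.g., texture coordinates), assuming attr, rast, rast_db, and tri come from the caller's setup and from rasterize():

```python
import nvdiffrast.torch as dr

# Interpolate all attributes; additionally return (dA/dX, dA/dY) for
# attributes 0 and 1 so they can drive prefiltered texture sampling.
attr_out, attr_da = dr.interpolate(attr, rast, tri, rast_db=rast_db, diff_attrs=[0, 1])
```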
+

nvdiffrast.torch.texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None) Function

+

Perform texture sampling.

All input tensors must be contiguous and reside in GPU memory. The output tensor will be contiguous and reside in GPU memory.

Arguments:
tex: Texture tensor with dtype torch.float32. For 2D textures, must have shape [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where tex_width and tex_height are equal. Note that boundary_mode must also be set to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture, must have shape [minibatch_size, height, width, 2]. When sampling a cube map texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates. Must have same shape as uv except for the last dimension that is to be twice as long.
mip_level_bias: (Optional) Per-pixel bias for mip level selection. If uv_da is omitted, determines mip level directly. Must have shape [minibatch_size, height, width].
mip: (Optional) Preconstructed mipmap stack from a texture_construct_mip() call, or a list of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, the tensors in the list must follow the same format as tex except for width and height that must follow the usual rules for mipmap sizes. The base level texture is still supplied in tex and must not be included in the list. Gradients of a custom mipmap stack are not automatically propagated to the base texture, but the mipmap tensors will receive gradients of their own. If a mipmap stack is not specified but the chosen filter mode requires it, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest', 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' selects 'linear' if neither uv_da nor mip_level_bias is specified, and 'linear-mipmap-linear' when at least one of them is specified, these being the highest-quality modes possible depending on the availability of the image-space derivatives of the texture coordinates or direct mip level information.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If tex defines a cube map, this must be set to 'cube'. The default mode 'wrap' takes the fractional part of the texture coordinates. Mode 'clamp' clamps texture coordinates to the centers of the boundary texels. Mode 'zero' virtually extends the texture with all-zero values in all directions.
max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based filter modes.
Returns:
A tensor containing the results of the texture sampling with shape [minibatch_size, height, width, tex_channels].
+

nvdiffrast.torch.texture_construct_mip(tex, max_mip_level=None, cube_mode=False) Function

+

Construct a mipmap stack for a texture.

This function can be used for constructing a mipmap stack for a texture that is known to remain constant. This avoids reconstructing it every time texture() is called.

Arguments:
tex: Texture tensor with the same constraints as in texture().
max_mip_level: If specified, limits the number of mipmaps constructed.
cube_mode: Must be set to True if tex specifies a cube map texture.
Returns:
An opaque object containing the mipmap stack. This can be supplied in a call to texture() in the mip argument.
+

nvdiffrast.torch.antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0) Function

+

Perform antialiasing.

All input tensors must be contiguous and reside in GPU memory. The output tensor will be contiguous and reside in GPU memory.

Arguments:
color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
rast: Main output tensor from rasterize().
pos: Vertex position tensor used in the rasterization operation.
tri: Triangle tensor used in the rasterization operation.
topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. If not specified, the topology hash is constructed internally and discarded afterwards.
pos_gradient_boost: (Optional) Multiplier for gradients propagated to pos.
Returns:
A tensor containing the antialiased image with the same shape as the color input tensor.
+

nvdiffrast.torch.antialias_construct_topology_hash(tri) Function

+

Construct a topology hash for a triangle tensor.

This function can be used for constructing a topology hash for a triangle tensor that is known to remain constant. This avoids reconstructing it every time antialias() is called.

Arguments:
tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in GPU memory.
Returns:
An opaque object containing the topology hash. This can be supplied in a call to antialias() in the topology_hash argument.
+

nvdiffrast.torch.get_log_level() Function

+

Get current log level.

Returns:
Current log level in nvdiffrast. See set_log_level() for possible values.
+

nvdiffrast.torch.set_log_level(level) Function

+

Set log level.

Log levels follow the convention on the C++ side of Torch: 0 = Info, 1 = Warning, 2 = Error, 3 = Fatal. The default log level is 1.

Arguments:
level: New log level as integer. Internal nvdiffrast messages of this severity or higher will be printed, while messages of lower severity will be silent.
+
+

Licenses

+

Copyright © 2020, NVIDIA Corporation. All rights reserved.

+

This work is made available under the Nvidia Source Code License.

+

For business inquiries, please contact researchinquiries@nvidia.com

+

We do not currently accept outside contributions in the form of pull requests.

+

Environment map stored as part of samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under MIT License. Mesh and texture stored as part of samples/data/earth.npz are derived from 3D Earth Photorealistic 2K model originally made available under TurboSquid 3D Model License.

+

Citation

+
@article{Laine2020diffrast,
+  title   = {Modular Primitives for High-Performance Differentiable Rendering},
+  author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+  journal = {ACM Transactions on Graphics},
+  year    = {2020},
+  volume  = {39},
+  number  = {6}
+}
+

Acknowledgements

+

We thank David Luebke, Simon Yuen, Jaewoo Seo, Tero Kuosmanen, Sanja Fidler, Wenzheng Chen, Jacob Munkberg, Jon Hasselgren, and Onni Kosomaa for discussions, test data, support with compute infrastructure, testing, reviewing, and suggestions for features and improvements.

+
+  +
+ + + diff --git a/pose_estimation/nvdiffrast/nvdiffrast.egg-info/PKG-INFO b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/PKG-INFO new file mode 100755 index 0000000000000000000000000000000000000000..8fa5dfbad9365c5854d1261b636fc43d255e0b20 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/PKG-INFO @@ -0,0 +1,59 @@ +Metadata-Version: 2.1 +Name: nvdiffrast +Version: 0.2.5 +Summary: nvdiffrast - modular primitives for high-performance differentiable rendering +Home-page: https://github.com/NVlabs/nvdiffrast +Author: Samuli Laine +Author-email: slaine@nvidia.com +License: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/markdown +License-File: LICENSE.txt + +## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering + +![Teaser image](./docs/img/teaser.png) + +**Modular Primitives for High-Performance Differentiable Rendering**
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277) + +Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. +Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information. + +## Licenses + +Copyright © 2020, NVIDIA Corporation. All rights reserved. + +This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt). + +For business inquiries, please contact [researchinquiries@nvidia.com](mailto:researchinquiries@nvidia.com) + +We do not currently accept outside code contributions in the form of pull requests. + +Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine +[sample material](https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap) +originally shared under +[MIT License](https://github.com/WaveEngine/Samples/blob/master/LICENSE.md). +Mesh and texture stored as part of `samples/data/earth.npz` are derived from +[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125) +model originally made available under +[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license). + +## Citation + +``` +@article{Laine2020diffrast, + title = {Modular Primitives for High-Performance Differentiable Rendering}, + author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila}, + journal = {ACM Transactions on Graphics}, + year = {2020}, + volume = {39}, + number = {6} +} +``` + + diff --git a/pose_estimation/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt new file mode 100755 index 0000000000000000000000000000000000000000..0066dc8e72788c314cd0c10e7e5b5df1c8d22520 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt @@ -0,0 +1,14 @@ +LICENSE.txt +README.md +setup.py +nvdiffrast/__init__.py +nvdiffrast.egg-info/PKG-INFO +nvdiffrast.egg-info/SOURCES.txt +nvdiffrast.egg-info/dependency_links.txt +nvdiffrast.egg-info/requires.txt +nvdiffrast.egg-info/top_level.txt +nvdiffrast/tensorflow/__init__.py +nvdiffrast/tensorflow/ops.py +nvdiffrast/tensorflow/plugin_loader.py +nvdiffrast/torch/__init__.py +nvdiffrast/torch/ops.py \ No newline at end of file diff --git a/pose_estimation/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt new file mode 100755 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/pose_estimation/nvdiffrast/nvdiffrast.egg-info/requires.txt b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/requires.txt new file mode 100755 index 0000000000000000000000000000000000000000..24ce15ab7ead32f98c7ac3edcd34bb2010ff4326 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/requires.txt @@ -0,0 +1 @@ +numpy diff --git a/pose_estimation/nvdiffrast/nvdiffrast.egg-info/top_level.txt b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/top_level.txt new file mode 100755 index 0000000000000000000000000000000000000000..1f7ac63ea3ddf40303d5b342835040fb9b354c9e --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast.egg-info/top_level.txt @@ -0,0 +1 @@ 
+nvdiffrast diff --git a/pose_estimation/nvdiffrast/nvdiffrast/__init__.py b/pose_estimation/nvdiffrast/nvdiffrast/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..3678b790f5e025f8943eee49e9dafa2489dce867 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +__version__ = '0.2.5' diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.cu b/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.cu new file mode 100755 index 0000000000000000000000000000000000000000..5411b0873c800f9e9a578383d4c42d226e31dc6c --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.cu @@ -0,0 +1,558 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "antialias.h" + +//------------------------------------------------------------------------ +// Helpers. + +#define F32_MAX (3.402823466e+38f) +static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } +static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } +static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2) +{ + bool g10 = rational_gt(n1, n0, d1, d0); + bool g20 = rational_gt(n2, n0, d2, d0); + bool g21 = rational_gt(n2, n1, d2, d1); + if (g20 && g21) return 2; + if (g10) return 1; + return 0; +} + +//------------------------------------------------------------------------ +// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4. + +struct AAWorkItem +{ + enum + { + EDGE_MASK = 3, // Edge index in lowest bits. + FLAG_DOWN_BIT = 2, // Down instead of right. + FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle. + }; + + int px, py; // Pixel x, y. + unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags. + float alpha; // Antialiasing alpha value. Zero if no AA. +}; + +//------------------------------------------------------------------------ +// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html + +#define JENKINS_MAGIC (0x9e3779b9u) +static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) +{ + a -= b; a -= c; a ^= (c>>13); + b -= c; b -= a; b ^= (a<<8); + c -= a; c -= b; c ^= (b>>13); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<16); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>3); + b -= c; b -= a; b ^= (a<<10); + c -= a; c -= b; c ^= (b>>15); +} + +// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip. 
+class HashIndex +{ +public: + __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) + { + m_mask = p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE - 1; + m_idx = (uint32_t)(key & 0xffffffffu); + m_skip = (uint32_t)(key >> 32); + uint32_t dummy = JENKINS_MAGIC; + jenkins_mix(m_idx, m_skip, dummy); + m_idx &= m_mask; + m_skip &= m_mask; + m_skip |= 1; + } + __device__ __forceinline__ int get(void) const { return m_idx; } + __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } +private: + uint32_t m_idx, m_skip, m_mask; +}; + +static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) +{ + HashIndex idx(p, key); + while(1) + { + uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); + if (prev == 0 || prev == key) + break; + idx.next(); + } + int* q = (int*)&p.evHash[idx.get()]; + int a = atomicCAS(q+2, 0, v); + if (a != 0 && a != v) + atomicCAS(q+3, 0, v); +} + +static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) +{ + HashIndex idx(p, key); + while(1) + { + uint4 entry = p.evHash[idx.get()]; + uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); + if (k == key || k == 0) + return make_int2((int)entry.z, (int)entry.w); + idx.next(); + } +} + +static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn) +{ + if (va == vb) + return; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + hash_insert(p, vk, vn + 1); +} + +static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr) +{ + if (va == vb) + return -1; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + int2 vn = hash_find(p, vk) - 1; + if (vn.x == vr) return vn.y; + if (vn.y == vr) return vn.x; + return -1; +} + +//------------------------------------------------------------------------ +// Mesh analysis kernel. + +__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= p.numTriangles) + return; + + int v0 = p.tri[idx * 3 + 0]; + int v1 = p.tri[idx * 3 + 1]; + int v2 = p.tri[idx * 3 + 2]; + + if (v0 < 0 || v0 >= p.numVertices || + v1 < 0 || v1 >= p.numVertices || + v2 < 0 || v2 >= p.numVertices) + return; + + if (v0 == v1 || v1 == v2 || v2 == v0) + return; + + evhash_insert_vertex(p, v1, v2, v0); + evhash_insert_vertex(p, v2, v0, v1); + evhash_insert_vertex(p, v0, v1, v2); +} + +//------------------------------------------------------------------------ +// Discontinuity finder kernel. + +__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x; + int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.n) + return; + + // Pointer to our TriIdx and fetch. + int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3; + float tri0 = p.rasterOut[pidx0]; + + // Look right, clamp at edge. + int pidx1 = pidx0; + if (px < p.width - 1) + pidx1 += 4; + float tri1 = p.rasterOut[pidx1]; + + // Look down, clamp at edge. 
+ int pidx2 = pidx0; + if (py < p.height - 1) + pidx2 += p.width << 2; + float tri2 = p.rasterOut[pidx2]; + + // Determine amount of work. + int count = 0; + if (tri1 != tri0) count = 1; + if (tri2 != tri0) count += 1; + if (!count) + return; // Exit warp. + + // Coalesce work counter update to once per CTA. + __shared__ int s_temp; + s_temp = 0; + __syncthreads(); + int idx = atomicAdd(&s_temp, count); + __syncthreads(); + if (idx == 0) + { + int base = atomicAdd(&p.workBuffer[0].x, s_temp); + s_temp = base + 1; // don't clobber the counters in first slot. + } + __syncthreads(); + idx += s_temp; + + // Write to memory. + if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0); + if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0); +} + +//------------------------------------------------------------------------ +// Forward analysis kernel. + +__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) +{ + __shared__ int s_base; + int workCount = p.workBuffer[0].x; + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + int4* pItem = p.workBuffer + thread_idx + 1; + int4 item = *pItem; + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; + float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1]; + int tri0 = (int)zt0.y - 1; + int tri1 = (int)zt1.y - 1; + + // Select triangle based on background / depth. + int tri = (tri0 >= 0) ? tri0 : tri1; + if (tri0 >= 0 && tri1 >= 0) + tri = (zt0.x < zt1.x) ? tri0 : tri1; + if (tri == tri1) + { + // Calculate with respect to neighbor pixel if chose that triangle. + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + if (tri < 0 || tri >= p.numTriangles) + continue; + + // Fetch vertex indices. + int vi0 = p.tri[tri * 3 + 0]; + int vi1 = p.tri[tri * 3 + 1]; + int vi2 = p.tri[tri * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + continue; + + // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists. + int op0 = evhash_find_vertex(p, vi2, vi1, vi0); + int op1 = evhash_find_vertex(p, vi0, vi2, vi1); + int op2 = evhash_find_vertex(p, vi1, vi0, vi2); + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + int vbase = pz * p.numVertices; + vi0 += vbase; + vi1 += vbase; + vi2 += vbase; + if (op0 >= 0) op0 += vbase; + if (op1 >= 0) op1 += vbase; + if (op2 >= 0) op2 += vbase; + } + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0]; + float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1]; + float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2]; + + // Project vertices to pixel space. 
+ float w0 = 1.f / p0.w; + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float ow0 = 1.f / o0.w; + float ow1 = 1.f / o1.w; + float ow2 = 1.f / o2.w; + float fx = (float)px + .5f - p.xh; + float fy = (float)py + .5f - p.yh; + float x0 = p0.x * w0 * p.xh - fx; + float y0 = p0.y * w0 * p.yh - fy; + float x1 = p1.x * w1 * p.xh - fx; + float y1 = p1.y * w1 * p.yh - fy; + float x2 = p2.x * w2 * p.xh - fx; + float y2 = p2.y * w2 * p.yh - fy; + float ox0 = o0.x * ow0 * p.xh - fx; + float oy0 = o0.y * ow0 * p.yh - fy; + float ox1 = o1.x * ow1 * p.xh - fx; + float oy1 = o1.y * ow1 * p.yh - fy; + float ox2 = o2.x * ow2 * p.xh - fx; + float oy2 = o2.y * ow2 * p.yh - fy; + + // Signs to kill non-silhouette edges. + float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself. + float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings. + float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1); + float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2); + + // If no matching signs anywhere, skip the rest. + if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb)) + { + // XY flip for horizontal edges. + if (d) + { + swap(x0, y0); + swap(x1, y1); + swap(x2, y2); + } + + float dx0 = x2 - x1; + float dx1 = x0 - x2; + float dx2 = x1 - x0; + float dy0 = y2 - y1; + float dy1 = y0 - y2; + float dy2 = y1 - y0; + + // Check if an edge crosses between us and the neighbor pixel. + float dc = -F32_MAX; + float ds = (tri == tri0) ? 1.f : -1.f; + float d0 = ds * (x1*dy0 - y1*dx0); + float d1 = ds * (x2*dy1 - y2*dx1); + float d2 = ds * (x0*dy2 - y0*dx2); + + if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; + if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f; + if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f; + + int di = max_idx3(d0, d1, d2, dy0, dy1, dy2); + if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0; + if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1; + if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2; + float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy. + + // Adjust output image if a suitable edge was found. + if (dc > -eps && dc < 1.f + eps) + { + dc = fminf(fmaxf(dc, 0.f), 1.f); + float alpha = ds * (.5f - dc); + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + for (int i=0; i < p.channels; i++) + atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i])); + + // Rewrite the work item's flags and alpha. Keep original px, py. + unsigned int flags = pz << 16; + flags |= di; + flags |= d << AAWorkItem::FLAG_DOWN_BIT; + flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT; + ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); + } + } + } +} + +//------------------------------------------------------------------------ +// Gradient kernel. + +__global__ void AntialiasGradKernel(const AntialiasKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __shared__ int s_base; // Work counter communication across entire CTA. + + int workCount = p.workBuffer[0].x; + + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + // Read work item filled out by forward kernel. 
+ int4 item = p.workBuffer[thread_idx + 1]; + unsigned int amask = __ballot_sync(0xffffffffu, item.w); + if (item.w == 0) + continue; // No effect. + + // Unpack work item and replicate setup from forward analysis kernel. + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + float alpha = __int_as_float(item.w); + int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1; + int di = item.z & AAWorkItem::EDGE_MASK; + float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + int tri = (int)p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3] - 1; + if (tri1) + { + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + bool triFail = (tri < 0 || tri >= p.numTriangles); + amask = __ballot_sync(amask, !triFail); + if (triFail) + continue; + + // Outgoing color gradients. + float* pGrad0 = p.gradColor + pixel0 * p.channels; + float* pGrad1 = p.gradColor + pixel1 * p.channels; + + // Incoming color gradients. + const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + + // Position gradient weight based on colors and incoming gradients. + float dd = 0.f; + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + + // Loop over channels and accumulate. + for (int i=0; i < p.channels; i++) + { + float dy = pDy[i]; + if (dy != 0.f) + { + // Update position gradient weight. + dd += dy * (pColor1[i] - pColor0[i]); + + // Update color gradients. No coalescing because all have different targets. + float v = alpha * dy; + atomicAdd(&pGrad0[i], -v); + atomicAdd(&pGrad1[i], v); + } + } + + // If position weight is zero, skip the rest. + bool noGrad = (dd == 0.f); + amask = __ballot_sync(amask, !noGrad); + if (noGrad) + continue; + + // Fetch vertex indices of the active edge and their positions. + int i1 = (di < 2) ? (di + 1) : 0; + int i2 = (i1 < 2) ? (i1 + 1) : 0; + int vi1 = p.tri[3 * tri + i1]; + int vi2 = p.tri[3 * tri + i2]; + + // Bail out if vertex indices are corrupt. + bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices); + amask = __ballot_sync(amask, !vtxFail); + if (vtxFail) + continue; + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Fetch vertex positions. + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Project vertices to pixel space. + float pxh = p.xh; + float pyh = p.yh; + float fx = (float)px + .5f - pxh; + float fy = (float)py + .5f - pyh; + + // XY flip for horizontal edges. + if (d) + { + swap(p1.x, p1.y); + swap(p2.x, p2.y); + swap(pxh, pyh); + swap(fx, fy); + } + + // Gradient calculation setup. + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float x1 = p1.x * w1 * pxh - fx; + float y1 = p1.y * w1 * pyh - fy; + float x2 = p2.x * w2 * pxh - fx; + float y2 = p2.y * w2 * pyh - fy; + float dx = x2 - x1; + float dy = y2 - y1; + float db = x1*dy - y1*dx; + + // Compute inverse delta-y with epsilon. + float ep = copysignf(1e-3f, dy); // ~1/1000 pixel. + float iy = 1.f / (dy + ep); + + // Compute position gradients. 
+ float dby = db * iy; + float iw1 = -w1 * iy * dd; + float iw2 = w2 * iy * dd; + float gp1x = iw1 * pxh * y2; + float gp2x = iw2 * pxh * y1; + float gp1y = iw1 * pyh * (dby - x2); + float gp2y = iw2 * pyh * (dby - x1); + float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1; + float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2; + + // XY flip the gradients. + if (d) + { + swap(gp1x, gp1y); + swap(gp2x, gp2y); + } + + // Kill position gradients if alpha was saturated. + if (fabsf(alpha) >= 0.5f) + { + gp1x = gp1y = gp1w = 0.f; + gp2x = gp2y = gp2w = 0.f; + } + + // Initialize coalesced atomics. Match both triangle ID and edge index. + // Also note that some threads may be inactive. + CA_SET_GROUP_MASK(tri ^ (di << 30), amask); + + // Accumulate gradients. + caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w); + } +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.h b/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.h new file mode 100755 index 0000000000000000000000000000000000000000..a35737db38c3f70da9ca81729cba4f5515a201d2 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/antialias.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "common.h" + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32 +#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8 +#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256 +#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256 +#define AA_HASH_ELEMENTS_PER_TRIANGLE 8 // Minimum is 4 but 8 gives fewer collisions. Must be power of two. +#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct AntialiasKernelParams +{ + const float* color; // Incoming color buffer. + const float* rasterOut; // Incoming rasterizer output buffer. + const int* tri; // Incoming triangle buffer. + const float* pos; // Incoming position buffer. + float* output; // Output buffer of forward kernel. + const float* dy; // Incoming gradients. + float* gradColor; // Output buffer, color gradient. + float* gradPos; // Output buffer, position gradient. + int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters. + uint4* evHash; // Edge-vertex hash. + int allocTriangles; // Number of triangles accommodated by evHash. Always power of two. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Input width. + int height; // Input height. + int n; // Minibatch size. + int channels; // Channel count in color input. + float xh, yh; // Transfer to pixel space. + int instance_mode; // 0=normal, 1=instance mode. + int tri_const; // 1 if triangle array is known to be constant. 
+}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/common.cpp b/pose_estimation/nvdiffrast/nvdiffrast/common/common.cpp new file mode 100755 index 0000000000000000000000000000000000000000..e566c035bdef66e9b75265a58fb8602b0fa530ca --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/common.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include + +//------------------------------------------------------------------------ +// Block and grid size calculators for kernel launches. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height) +{ + int maxThreads = maxWidth * maxHeight; + if (maxThreads <= 1 || (width * height) <= 1) + return dim3(1, 1, 1); // Degenerate. + + // Start from max size. + int bw = maxWidth; + int bh = maxHeight; + + // Optimizations for weirdly sized buffers. + if (width < bw) + { + // Decrease block width to smallest power of two that covers the buffer width. + while ((bw >> 1) >= width) + bw >>= 1; + + // Maximize height. + bh = maxThreads / bw; + if (bh > height) + bh = height; + } + else if (height < bh) + { + // Halve height and double width until fits completely inside buffer vertically. + while (bh > height) + { + bh >>= 1; + if (bw < width) + bw <<= 1; + } + } + + // Done. + return dim3(bw, bh, 1); +} + +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth) +{ + dim3 gridSize; + gridSize.x = (width - 1) / blockSize.x + 1; + gridSize.y = (height - 1) / blockSize.y + 1; + gridSize.z = (depth - 1) / blockSize.z + 1; + return gridSize; +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/common.h b/pose_estimation/nvdiffrast/nvdiffrast/common/common.h new file mode 100755 index 0000000000000000000000000000000000000000..8df48ed73cd330c45250ee02a113e03357504055 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/common.h @@ -0,0 +1,253 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height); +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth); + +//------------------------------------------------------------------------ +// The rest is CUDA device code specific stuff. + +#ifdef __CUDACC__ + +//------------------------------------------------------------------------ +// Helpers for CUDA vector types. 
+ +static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); } +static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); } +static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); } +static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); } +static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); } +static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); } +static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ float3 operator* 
(float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { 
return make_int2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); } +static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); } +static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); } +static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); } +static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); } +static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); } +static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; 
a.w += b; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); } +static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); } +static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); } +static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); } +static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; 
a.z += b.z; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return 
make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); } + +template<class T> static __device__ __forceinline__ T zero_value(void); +template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; } +template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); } +template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); } +static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); } +static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); } +static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); } + +template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; } + +//------------------------------------------------------------------------ +// Coalesced atomics. These are all done via macros.
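// (Editorial illustration; a hedged sketch, not part of the original nvdiffrast
// sources, kept inside #if 0 so it is illustration only.) The macros in this
// section let every lane of a warp that targets the same address first combine
// its contribution in a per-group shared-memory slot, selected with
// __match_any_sync(), so only the group leader issues a global atomicAdd().
// The names exampleScatterGrad, grad, triIdPerPixel, dy and numPixels below
// are hypothetical and exist only for this sketch; the usage pattern mirrors
// the gradient kernels that consume these macros.
#if 0
__global__ void exampleScatterGrad(float* grad, const int* triIdPerPixel, const float* dy, int numPixels)
{
    CA_DECLARE_TEMP(256);                              // shared scratch: one float per thread of a 256-thread block
    int pidx = blockIdx.x * blockDim.x + threadIdx.x;
    bool valid = (pidx < numPixels);                   // keep the whole warp alive for __match_any_sync()
    int triId = valid ? triIdPerPixel[pidx] : -1;      // invalid lanes form their own (idle) group
    CA_SET_GROUP(triId);                               // group lanes of this warp that share a triangle id
    if (valid)
        caAtomicAdd(grad + triId, dy[pidx]);           // one global atomic per group instead of one per lane
}
#endif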
+ +#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher + +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float* CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) \ + __shared__ float CA_TEMP[(threads_per_block)] + +#define CA_SET_GROUP_MASK(group, thread_mask) \ + bool _ca_leader; \ + float* _ca_ptr; \ + do { \ + int tidx = threadIdx.x + blockDim.x * threadIdx.y; \ + int lane = tidx & 31; \ + int warp = tidx >> 5; \ + int tmask = __match_any_sync((thread_mask), (group)); \ + int leader = __ffs(tmask) - 1; \ + _ca_leader = (leader == lane); \ + _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \ + } while(0) + +#define CA_SET_GROUP(group) \ + CA_SET_GROUP_MASK((group), 0xffffffffu) + +#define caAtomicAdd(ptr, value) \ + do { \ + if (_ca_leader) \ + *_ca_ptr = 0.f; \ + atomicAdd(_ca_ptr, (value)); \ + if (_ca_leader) \ + atomicAdd((ptr), *_ca_ptr); \ + } while(0) + +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + caAtomicAdd((ptr), (x)); \ + caAtomicAdd((ptr)+1, (y)); \ + caAtomicAdd((ptr)+3, (w)); \ + } while(0) + +#define caAtomicAddTexture(ptr, level, idx, value) \ + do { \ + CA_SET_GROUP((idx) ^ ((level) << 27)); \ + caAtomicAdd((ptr)+(idx), (value)); \ + } while(0) + +//------------------------------------------------------------------------ +// Disable atomic coalescing for compute capability lower than 7.x + +#else // __CUDA_ARCH__ >= 700 +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM +#define CA_SET_GROUP_MASK(group, thread_mask) +#define CA_SET_GROUP(group) +#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value)) +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + atomicAdd((ptr), (x)); \ + atomicAdd((ptr)+1, (y)); \ + atomicAdd((ptr)+3, (w)); \ + } while(0) +#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value)) +#endif // __CUDA_ARCH__ >= 700 + +//------------------------------------------------------------------------ +#endif // __CUDACC__ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/framework.h b/pose_estimation/nvdiffrast/nvdiffrast/common/framework.h new file mode 100755 index 0000000000000000000000000000000000000000..12d803caaf3210c45808dee41217c4c6c6edfe6e --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/framework.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +// Framework-specific macros to enable code sharing. + +//------------------------------------------------------------------------ +// Tensorflow. 
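// (Editorial illustration; a hedged sketch, not part of the original header,
// kept inside #if 0.) Both framework blocks in this file define the same macro
// surface, so code shared between the bindings is written once against it.
// exampleSharedOp below is hypothetical and exists only for this sketch.
#if 0
void exampleSharedOp(NVDR_CTX_ARGS, int n)
{
    NVDR_CHECK(n > 0, "n must be positive");           // framework-agnostic assertion
    NVDR_CHECK_CUDA_ERROR(cudaDeviceSynchronize());    // framework-agnostic CUDA error handling
}
// A binding forwards its own context object: exampleSharedOp(NVDR_CTX_PARAMS, 16);
#endif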
+ +#ifdef NVDR_TENSORFLOW +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/platform/default/logging.h" +using namespace tensorflow; +using namespace tensorflow::shape_inference; +#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx +#define NVDR_CTX_PARAMS _nvdr_ctx +#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR)) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL) +#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL) +#endif + +//------------------------------------------------------------------------ +// PyTorch. + +#ifdef NVDR_TORCH +#ifndef __CUDACC__ +#include +#include +#include +#include +#include +#endif +#define NVDR_CTX_ARGS int _nvdr_ctx_dummy +#define NVDR_CTX_PARAMS 0 +#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0) +#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0) +#endif + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.cpp b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.cpp new file mode 100755 index 0000000000000000000000000000000000000000..2af3e931b6808e2575d8a209d5485746499b3374 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.cpp @@ -0,0 +1,403 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common. +//------------------------------------------------------------------------ + +#include "framework.h" +#include "glutil.h" +#include +#include + +// Create the function pointers. +#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0; +#include "glutil_extlist.h" +#undef GLUTIL_EXT + +// Track initialization status. +static volatile bool s_glExtInitialized = false; + +// Error strings. +const char* getGLErrorString(GLenum err) +{ + switch(err) + { + case GL_NO_ERROR: return "GL_NO_ERROR"; + case GL_INVALID_ENUM: return "GL_INVALID_ENUM"; + case GL_INVALID_VALUE: return "GL_INVALID_VALUE"; + case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION"; + case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW"; + case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW"; + case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION"; + case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE"; + case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST"; + } + return "Unknown error"; +} + +//------------------------------------------------------------------------ +// Windows. 
+//------------------------------------------------------------------------ + +#ifdef _WIN32 + +static CRITICAL_SECTION getInitializedCriticalSection(void) +{ + CRITICAL_SECTION cs; + InitializeCriticalSection(&cs); + return cs; +} + +static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection(); + +static void safeGetProcAddress(const char* name, PROC* pfn) +{ + PROC result = wglGetProcAddress(name); + if (!result) + { + LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + // Use critical section for thread safety. + EnterCriticalSection(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. + s_glExtInitialized = true; + } + + // Done. + LeaveCriticalSection(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "setGLContext() called with null gltcx"; + if (!wglMakeCurrent(glctx.hdc, glctx.hglrc)) + LOG(FATAL) << "wglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + if (!wglMakeCurrent(NULL, NULL)) + LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context"; +} + +extern "C" int set_gpu(const char*); // In setgpu.lib +GLContext createGLContext(int cudaDeviceIdx) +{ + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx)) + { + LOG(INFO) << "PCI bus id query failed"; + } + else + { + int res = set_gpu(pciBusId); + LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? 
"failed, expect crash or major slowdown" : "success"); + } + } + + HINSTANCE hInstance = GetModuleHandle(NULL); + WNDCLASS wc = {}; + wc.style = CS_OWNDC; + wc.lpfnWndProc = DefWindowProc; + wc.hInstance = hInstance; + wc.lpszClassName = "__DummyGLClassCPP"; + int res = RegisterClass(&wc); + + HWND hwnd = CreateWindow( + "__DummyGLClassCPP", // lpClassName + "__DummyGLWindowCPP", // lpWindowName + WS_OVERLAPPEDWINDOW, // dwStyle + CW_USEDEFAULT, // x + CW_USEDEFAULT, // y + 0, 0, // nWidth, nHeight + NULL, NULL, // hWndParent, hMenu + hInstance, // hInstance + NULL // lpParam + ); + + PIXELFORMATDESCRIPTOR pfd = {}; + pfd.dwFlags = PFD_SUPPORT_OPENGL; + pfd.iPixelType = PFD_TYPE_RGBA; + pfd.iLayerType = PFD_MAIN_PLANE; + pfd.cColorBits = 32; + pfd.cDepthBits = 24; + pfd.cStencilBits = 8; + + HDC hdc = GetDC(hwnd); + int pixelformat = ChoosePixelFormat(hdc, &pfd); + SetPixelFormat(hdc, pixelformat, &pfd); + + HGLRC hglrc = wglCreateContext(hdc); + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")"; + + GLContext glctx = {hdc, hglrc, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (wglGetCurrentContext() == glctx.hglrc) + releaseGLContext(); + + HWND hwnd = WindowFromDC(glctx.hdc); + if (!hwnd) + LOG(FATAL) << "WindowFromDC() failed"; + if (!ReleaseDC(hwnd, glctx.hdc)) + LOG(FATAL) << "ReleaseDC() failed"; + if (!wglDeleteContext(glctx.hglrc)) + LOG(FATAL) << "wglDeleteContext() failed"; + if (!DestroyWindow(hwnd)) + LOG(FATAL) << "DestroyWindow() failed"; + + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux. +//------------------------------------------------------------------------ + +#ifdef __linux__ + +static pthread_mutex_t s_getProcAddressMutex; + +typedef void (*PROCFN)(); + +static void safeGetProcAddress(const char* name, PROCFN* pfn) +{ + PROCFN result = eglGetProcAddress(name); + if (!result) + { + pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + pthread_mutex_lock(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. 
+ s_glExtInitialized = true; + } + + pthread_mutex_unlock(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "setGLContext() called with null gltcx"; + + if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context)) + LOG(ERROR) << "eglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + EGLDisplay display = eglGetCurrentDisplay(); + if (display == EGL_NO_DISPLAY) + LOG(WARNING) << "releaseGLContext() called with no active display"; + if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT)) + LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context"; +} + +static EGLDisplay getCudaDisplay(int cudaDeviceIdx) +{ + typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*); + typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*); + typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*); + + eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT"); + if (!eglQueryDevicesEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed"; + return 0; + } + + eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT"); + if (!eglQueryDeviceAttribEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed"; + return 0; + } + + eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT"); + if (!eglGetPlatformDisplayEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed"; + return 0; + } + + int num_devices = 0; + eglQueryDevicesEXT(0, 0, &num_devices); + if (!num_devices) + return 0; + + EGLDisplay display = 0; + EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*)); + eglQueryDevicesEXT(num_devices, devices, &num_devices); + for (int i=0; i < num_devices; i++) + { + EGLDeviceEXT device = devices[i]; + intptr_t value = -1; + if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx) + { + display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0); + break; + } + } + + free(devices); + return display; +} + +GLContext createGLContext(int cudaDeviceIdx) +{ + EGLDisplay display = 0; + + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + display = getCudaDisplay(cudaDeviceIdx); + if (!display) + LOG(INFO) << "Failed, falling back to default display"; + } + + if (!display) + { + display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + if (display == EGL_NO_DISPLAY) + LOG(FATAL) << "eglGetDisplay() failed"; + } + + EGLint major; + EGLint minor; + if (!eglInitialize(display, &major, &minor)) + LOG(FATAL) << "eglInitialize() failed"; + + // Choose configuration. + + const EGLint context_attribs[] = { + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_DEPTH_SIZE, 24, + EGL_STENCIL_SIZE, 8, + EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_NONE + }; + + EGLConfig config; + EGLint num_config; + if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config)) + LOG(FATAL) << "eglChooseConfig() failed"; + + // Create GL context. 
+ + if (!eglBindAPI(EGL_OPENGL_API)) + LOG(FATAL) << "eglBindAPI() failed"; + + EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL); + if (context == EGL_NO_CONTEXT) + LOG(FATAL) << "eglCreateContext() failed"; + + // Done. + + LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")"; + + GLContext glctx = {display, context, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (eglGetCurrentContext() == glctx.context) + releaseGLContext(); + + if (!eglDestroyContext(glctx.display, glctx.context)) + LOG(ERROR) << "eglDestroyContext() failed"; + + LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)glctx.display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +//------------------------------------------------------------------------ + +#endif // __linux__ + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.h b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.h new file mode 100755 index 0000000000000000000000000000000000000000..e9a3a7d95a5af4a808a25097cc055b699024409e --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil.h @@ -0,0 +1,113 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Windows-specific headers and types. +//------------------------------------------------------------------------ + +#ifdef _WIN32 +#define NOMINMAX +#include // Required by gl.h in Windows. +#define GLAPIENTRY APIENTRY + +struct GLContext +{ + HDC hdc; + HGLRC hglrc; + int extInitialized; +}; + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux-specific headers and types. +//------------------------------------------------------------------------ + +#ifdef __linux__ +#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it. +#define MESA_EGL_NO_X11_HEADERS +#include +#include +#define GLAPIENTRY + +struct GLContext +{ + EGLDisplay display; + EGLContext context; + int extInitialized; +}; + +#endif // __linux__ + +//------------------------------------------------------------------------ +// OpenGL, CUDA interop, GL extensions. +//------------------------------------------------------------------------ +#define GL_GLEXT_LEGACY +#include +#include + +// Constants. 
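// (Editorial note) The #ifndef blocks below are fallbacks: each token is defined
// here only when the system GL headers are too old to provide the corresponding
// GL_VERSION_x_y / extension macro, so the rest of this module can use these
// enums regardless of header vintage.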
+#ifndef GL_VERSION_1_2 +#define GL_CLAMP_TO_EDGE 0x812F +#define GL_TEXTURE_3D 0x806F +#endif +#ifndef GL_VERSION_1_5 +#define GL_ARRAY_BUFFER 0x8892 +#define GL_DYNAMIC_DRAW 0x88E8 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 +#endif +#ifndef GL_VERSION_2_0 +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_INFO_LOG_LENGTH 0x8B84 +#define GL_LINK_STATUS 0x8B82 +#define GL_VERTEX_SHADER 0x8B31 +#endif +#ifndef GL_VERSION_3_0 +#define GL_MAJOR_VERSION 0x821B +#define GL_MINOR_VERSION 0x821C +#define GL_RGBA32F 0x8814 +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#endif +#ifndef GL_VERSION_3_2 +#define GL_GEOMETRY_SHADER 0x8DD9 +#endif +#ifndef GL_ARB_framebuffer_object +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_COLOR_ATTACHMENT1 0x8CE1 +#define GL_DEPTH_STENCIL 0x84F9 +#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A +#define GL_DEPTH24_STENCIL8 0x88F0 +#define GL_FRAMEBUFFER 0x8D40 +#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506 +#define GL_UNSIGNED_INT_24_8 0x84FA +#endif +#ifndef GL_ARB_imaging +#define GL_TABLE_TOO_LARGE 0x8031 +#endif +#ifndef GL_KHR_robustness +#define GL_CONTEXT_LOST 0x0507 +#endif + +// Declare function pointers to OpenGL extension functions. +#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + +//------------------------------------------------------------------------ +// Common functions. +//------------------------------------------------------------------------ + +void setGLContext (GLContext& glctx); +void releaseGLContext (void); +GLContext createGLContext (int cudaDeviceIdx); +void destroyGLContext (GLContext& glctx); +const char* getGLErrorString (GLenum err); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/glutil_extlist.h b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil_extlist.h new file mode 100755 index 0000000000000000000000000000000000000000..49061ab760e9dca5bf610f8ed71fbd3fe11023fc --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/glutil_extlist.h @@ -0,0 +1,47 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
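// (Editorial illustration; not part of the original header, kept inside #if 0.)
// This file is an X-macro list: each includer defines GLUTIL_EXT before
// including it, so a single entry such as glBindBuffer expands three different
// ways:
#if 0
extern void (GLAPIENTRY* glBindBuffer)(GLenum target, GLuint buffer);   // glutil.h: declare the function pointer
void (GLAPIENTRY* glBindBuffer)(GLenum target, GLuint buffer) = 0;      // glutil.cpp: define it, initially null
safeGetProcAddress("glBindBuffer", (PROCFN*)&glBindBuffer);             // initializeGLExtensions(): resolve it at runtime
#endif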
+ +#ifndef GL_VERSION_1_2 +GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels); +#endif +#ifndef GL_VERSION_1_5 +GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer); +GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage); +GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers); +#endif +#ifndef GL_VERSION_2_0 +GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader); +GLUTIL_EXT(void, glCompileShader, GLuint shader); +GLUTIL_EXT(GLuint, glCreateProgram, void); +GLUTIL_EXT(GLuint, glCreateShader, GLenum type); +GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs); +GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index); +GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog); +GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param); +GLUTIL_EXT(void, glLinkProgram, GLuint program); +GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length); +GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1); +GLUTIL_EXT(void, glUseProgram, GLuint program); +GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer); +#endif +#ifndef GL_VERSION_3_2 +GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level); +#endif +#ifndef GL_ARB_framebuffer_object +GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer); +GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers); +#endif +#ifndef GL_ARB_vertex_array_object +GLUTIL_EXT(void, glBindVertexArray, GLuint array); +GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays); +#endif +#ifndef GL_ARB_multi_draw_indirect +GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride); +#endif + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.cu b/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.cu new file mode 100755 index 0000000000000000000000000000000000000000..84f5fb761175dc7844e6137da75bb944cab5fd35 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.cu @@ -0,0 +1,276 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "interpolate.h" + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Output ptrs. 
+ float* out = p.out + pidx * p.numAttr; + float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0; + + // Fetch rasterizer output. + float4 r = ((float4*)p.rast)[pidx]; + int triIdx = (int)r.w - 1; + bool triValid = (triIdx >= 0 && triIdx < p.numTriangles); + + // If no geometry in entire warp, zero the output and exit. + // Otherwise force barys to zero and output with live threads. + if (__all_sync(0xffffffffu, !triValid)) + { + for (int i=0; i < p.numAttr; i++) + out[i] = 0.f; + if (ENABLE_DA) + for (int i=0; i < p.numDiffAttr; i++) + outDA[i] = make_float2(0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0; + int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0; + int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. + if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Pointers to attributes. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + + // Barys. If no triangle, force all to zero -> output is zero. + float b0 = triValid ? r.x : 0.f; + float b1 = triValid ? r.y : 0.f; + float b2 = triValid ? (1.f - r.x - r.y) : 0.f; + + // Interpolate and write attributes. + for (int i=0; i < p.numAttr; i++) + out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i]; + + // No diff attrs? Exit. + if (!ENABLE_DA) + return; + + // Read bary pixel differentials if we have a triangle. + float4 db = make_float4(0.f, 0.f, 0.f, 0.f); + if (triValid) + db = ((float4*)p.rastDB)[pidx]; + + // Unpack a bit. + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + // Calculate the pixel differentials of chosen attributes. + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Zero output if invalid index. + float dsdx = 0.f; + float dsdy = 0.f; + if (j >= 0 && j < p.numAttr) + { + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + float dsdu = s0 - s2; + float dsdv = s1 - s2; + dsdx = dudx*dsdu + dvdx*dsdv; + dsdy = dudy*dsdu + dvdy*dsdv; + } + + // Write. + outDA[i] = make_float2(dsdx, dsdy); + } +} + +// Template specializations. +__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } +__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Fetch triangle ID. If none, output zero bary/db gradients and exit. 
+ float4 r = ((float4*)p.rast)[pidx]; + int triIdx = (int)r.w - 1; + if (triIdx < 0 || triIdx >= p.numTriangles) + { + ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (ENABLE_DA) + ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. + if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Pointers to inputs. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + const float* pdy = p.dy + pidx * p.numAttr; + + // Pointers to outputs. + float* ga0 = p.gradAttr + vi0 * p.numAttr; + float* ga1 = p.gradAttr + vi1 * p.numAttr; + float* ga2 = p.gradAttr + vi2 * p.numAttr; + + // Barys and bary gradient accumulators. + float b0 = r.x; + float b1 = r.y; + float b2 = 1.f - r.x - r.y; + float gb0 = 0.f; + float gb1 = 0.f; + + // Loop over attributes and accumulate attribute gradients. + for (int i=0; i < p.numAttr; i++) + { + float y = pdy[i]; + float s0 = a0[i]; + float s1 = a1[i]; + float s2 = a2[i]; + gb0 += y * (s0 - s2); + gb1 += y * (s1 - s2); + caAtomicAdd(ga0 + i, b0 * y); + caAtomicAdd(ga1 + i, b1 * y); + caAtomicAdd(ga2 + i, b2 * y); + } + + // Write the bary gradients. + ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f); + + // If pixel differentials disabled, we're done. + if (!ENABLE_DA) + return; + + // Calculate gradients based on attribute pixel differentials. + const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr; + float gdudx = 0.f; + float gdudy = 0.f; + float gdvdx = 0.f; + float gdvdy = 0.f; + + // Read bary pixel differentials. + float4 db = ((float4*)p.rastDB)[pidx]; + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Check that index is valid. + if (j >= 0 && j < p.numAttr) + { + float2 dsdxy = dda[i]; + float dsdx = dsdxy.x; + float dsdy = dsdxy.y; + + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + + // Gradients of db. + float dsdu = s0 - s2; + float dsdv = s1 - s2; + gdudx += dsdu * dsdx; + gdudy += dsdu * dsdy; + gdvdx += dsdv * dsdx; + gdvdy += dsdv * dsdy; + + // Gradients of attributes. + float du = dsdx*dudx + dsdy*dudy; + float dv = dsdx*dvdx + dsdy*dvdy; + caAtomicAdd(ga0 + j, du); + caAtomicAdd(ga1 + j, dv); + caAtomicAdd(ga2 + j, -du - dv); + } + } + + // Write. + ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy); +} + +// Template specializations. 
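// (Editorial note) The Da-suffixed wrappers below instantiate the kernel
// template with its attribute-differential path enabled (ENABLE_DA = true),
// while the plain wrappers compile that path out (ENABLE_DA = false), so the
// choice is made at compile time rather than per pixel.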
+__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } +__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.h b/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.h new file mode 100755 index 0000000000000000000000000000000000000000..d35d8388240e97c255c837446609d8ae00cd78d9 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/interpolate.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_MAX_DIFF_ATTRS 32 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct InterpolateKernelParams +{ + const int* tri; // Incoming triangle buffer. + const float* attr; // Incoming attribute buffer. + const float* rast; // Incoming rasterizer output buffer. + const float* rastDB; // Incoming rasterizer output buffer for bary derivatives. + const float* dy; // Incoming attribute gradients. + const float* dda; // Incoming attr diff gradients. + float* out; // Outgoing interpolated attributes. + float* outDA; // Outgoing texcoord major axis lengths. + float* gradAttr; // Outgoing attribute gradients. + float* gradRaster; // Outgoing rasterizer gradients. + float* gradRasterDB; // Outgoing rasterizer bary diff gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int numAttr; // Number of total vertex attributes. + int numDiffAttr; // Number of attributes to differentiate. + int width; // Image width. + int height; // Image height. + int depth; // Minibatch size. + int attrBC; // 0=normal, 1=attr is broadcast. + int instance_mode; // 0=normal, 1=instance mode. + int diff_attrs_all; // 0=normal, 1=produce pixel differentials for all attributes. + int diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate. +}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cpp b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cpp new file mode 100755 index 0000000000000000000000000000000000000000..73064d4620a0905d8732c3ec33abc825a8a71bc9 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cpp @@ -0,0 +1,560 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "rasterize.h" +#include "glutil.h" +#include +#define STRINGIFY_SHADER_SOURCE(x) #x + +//------------------------------------------------------------------------ +// Helpers. + +#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +static int ROUND_UP_BITS(uint32_t x, uint32_t y) +{ + // Round x up so that it has at most y bits of mantissa. + if (x < (1u << y)) + return x; + uint32_t m = 0; + while (x & ~m) + m = (m << 1) | 1u; + m >>= y; + if (!(x & m)) + return x; + return (x | m) + 1u; +} + +//------------------------------------------------------------------------ +// GL helpers. + +static void compileGLShader(NVDR_CTX_ARGS, GLuint* pShader, GLenum shaderType, const char* src) +{ + const char* srcPtr = src; + int srcLength = strlen(src); + *pShader = 0; + NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); + NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &srcPtr, &srcLength)); + NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); +} + +static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) +{ + *pProgram = 0; + + GLuint glProgram = 0; + NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); + NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); + + GLint linkStatus = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); + if (!linkStatus) + { + GLint infoLen = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); + if (infoLen) + { + const char* hdr = "glLinkProgram() failed:\n"; + std::vector info(strlen(hdr) + infoLen); + strcpy(&info[0], hdr); + NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); + NVDR_CHECK(0, &info[0]); + } + NVDR_CHECK(0, "glLinkProgram() failed"); + } + + *pProgram = glProgram; +} + +//------------------------------------------------------------------------ +// Shared C++ functions. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx) +{ + // Create GL context and set it current. + s.glctx = createGLContext(cudaDeviceIdx); + setGLContext(s.glctx); + + // Version check. + GLint vMajor = 0; + GLint vMinor = 0; + glGetIntegerv(GL_MAJOR_VERSION, &vMajor); + glGetIntegerv(GL_MINOR_VERSION, &vMinor); + glGetError(); // Clear possible GL_INVALID_ENUM error in version query. + LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor; + NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required"); + + // Number of output buffers. + int num_outputs = s.enableDB ? 2 : 1; + + // Set up vertex shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glVertexShader, GL_VERTEX_SHADER, + "#version 330\n" + "#extension GL_ARB_shader_draw_parameters : enable\n" + STRINGIFY_SHADER_SOURCE( + layout(location = 0) in vec4 in_pos; + out int v_layer; + out int v_offset; + void main() + { + int layer = gl_DrawIDARB; + gl_Position = in_pos; + v_layer = layer; + v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here. + } + ) + ); + + // Geometry and fragment shaders depend on if bary differential output is enabled or not. 
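// (Editorial elaboration of the derivation quoted in the comment below.)
// The rasterizer interpolates u/w and 1/w linearly in screen space X, so the
// perspective-correct barycentric is u = (u/w) / (1/w). Writing A = u/w and
// B = 1/w, both affine in X and therefore with constant dA/dX and dB/dX over
// the triangle, the quotient rule gives
//     du/dX = (dA/dX * B - A * dB/dX) / B^2
//           = (dA/dX - (A/B) * dB/dX) / B
//           = [d(u/w)/dX - u * d(1/w)/dX] * w,
// which is the constant-per-triangle expression the geometry shader evaluates
// from the edge vectors e0, e1 and the inverse triangle area.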
+ if (s.enableDB) + { + // Set up geometry shader. Calculation of per-pixel bary differentials is based on: + // u = (u/w) / (1/w) + // --> du/dX = d((u/w) / (1/w))/dX + // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w + // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle. + compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + layout(location = 0) uniform vec2 vp_scale; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + out vec4 var_db; + void main() + { + // Plane equations for bary differentials. + float w0 = gl_in[0].gl_Position.w; + float w1 = gl_in[1].gl_Position.w; + float w2 = gl_in[2].gl_Position.w; + vec2 p0 = gl_in[0].gl_Position.xy; + vec2 p1 = gl_in[1].gl_Position.xy; + vec2 p2 = gl_in[2].gl_Position.xy; + vec2 e0 = p0*w2 - p2*w0; + vec2 e1 = p1*w2 - p2*w1; + float a = e0.x*e1.y - e0.y*e1.x; + + // Clamp area to an epsilon to avoid arbitrarily high bary differentials. + float eps = 1e-6f; // ~1 pixel in 1k x 1k image. + float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign. + float ia = 1.f / ca; // Inverse area. + + vec2 ascl = ia * vp_scale; + float dudx = e1.y * ascl.x; + float dudy = -e1.x * ascl.y; + float dvdx = -e0.y * ascl.x; + float dvdy = e0.x * ascl.y; + + float duwdx = w2 * dudx; + float dvwdx = w2 * dvdx; + float duvdx = w0 * dudx + w1 * dvdx; + float duwdy = w2 * dudy; + float dvwdy = w2 * dvdy; + float duvdy = w0 * dudy + w1 * dvdy; + + vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy); + vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy); + vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy); + + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex(); + } + ) + ); + + // Set up fragment shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + in int gl_PrimitiveID; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + void main() + { + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + out_db = var_db * var_uvzw.w; + } + ) + ); + + // Set up fragment shader for depth peeling. 
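// (Editorial note, with an illustrative sketch kept inside #if 0; peelNextLayer
// and its arguments are hypothetical.) Depth peeling re-renders the scene once
// per layer: on pass k the shader below samples the depth written on pass k-1
// (out_prev) and discards any fragment that is not strictly behind it, so the
// ordinary GL_LESS depth test then keeps exactly the next-nearest surface per
// pixel. The per-pixel effect, written out on a plain array of fragment depths:
#if 0
static float peelNextLayer(const float* depths, int n, float prevDepth)
{
    float best = 1e30f;                        // sentinel: no surface strictly behind prevDepth
    for (int i = 0; i < n; i++)
        if (depths[i] > prevDepth && depths[i] < best)
            best = depths[i];                  // nearest depth strictly behind the previous layer
    return best;
}
#endif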
+ compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + in int gl_Layer; + in int gl_PrimitiveID; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + void main() + { + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, float(gl_PrimitiveID + 1)); + out_db = var_db * var_uvzw.w; + } + ) + ); + } + else + { + // Geometry shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + void main() + { + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex(); + } + ) + ); + + // Fragment shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in int gl_PrimitiveID; + layout(location = 0) out vec4 out_raster; + void main() + { + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + } + ) + ); + + // Depth peeling variant of fragment shader. + compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in int gl_Layer; + in int gl_PrimitiveID; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + void main() + { + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1)); + } + ) + ); + } + + // Finalize programs. + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader); + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP); + + // Construct main fbo and bind permanently. + NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO)); + NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO)); + + // Enable two color attachments. 
+ GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 }; + NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers)); + + // Construct vertex array object. + NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO)); + NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO)); + + // Construct position buffer, bind permanently, enable, set ptr. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0)); + NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0)); + + // Construct index buffer and bind permanently. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer)); + + // Set up depth test. + NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST)); + NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS)); + NVDR_CHECK_GL_ERROR(glClearDepth(1.0)); + + // Create and bind output buffers. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0)); + } + + // Create and bind depth/stencil buffer. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0)); + + // Create texture name for previous output buffer (depth peeling). + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer)); +} + +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth) +{ + // Resize vertex buffer? + if (posCount > s.posCount) + { + if (s.cudaPosBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64; + LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + } + + // Resize triangle buffer? + if (triCount > s.triCount) + { + if (s.cudaTriBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64; + LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + } + + // Resize framebuffer? + if (width > s.width || height > s.height || depth > s.depth) + { + int num_outputs = s.enableDB ? 2 : 1; + if (s.cudaColorBuffer[0]) + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } + + // New framebuffer size. + s.width = (width > s.width) ? 
width : s.width; + s.height = (height > s.height) ? height : s.height; + s.depth = (depth > s.depth) ? depth : s.depth; + s.width = ROUND_UP(s.width, 32); + s.height = ROUND_UP(s.height, 32); + LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")"; + + // Allocate color buffers. + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + } + + // Allocate depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0)); + + // (Re-)register all GL buffers into Cuda. + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Resize range arrays? + if ((unsigned int)depth > s.drawCmdBuffer.size()) + { + int newSize = (depth > 64) ? ROUND_UP_BITS(depth, 1) : 64; + LOG(INFO) << "Increasing range array size to " << newSize << " elements"; + s.drawCmdBuffer.resize(newSize); + } +} + +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx) +{ + // Only copy inputs if we are on first iteration of depth peeling or not doing it at all. + if (peeling_idx < 1) + { + if (triPtr) + { + // Copy both position and triangle buffers. + void* glPosPtr = NULL; + void* glTriPtr = NULL; + size_t posBytes = 0; + size_t triBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream)); + } + else + { + // Copy position buffer only. Triangles are already copied and known to be constant. 
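+            // Same map / async-memcpy / unmap sequence as above, restricted to the position
+            // buffer; the triangle buffer already registered with CUDA is left untouched.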
+ void* glPosPtr = NULL; + size_t posBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream)); + } + } + + // Select program based on whether we have a depth peeling input or not. + if (peeling_idx < 1) + { + // Normal case: No peeling, or peeling disabled. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram)); + } + else + { + // If we don't have a third buffer yet, create one. + if (!s.cudaPrevOutBuffer) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Swap the GL buffers. + GLuint glTempBuffer = s.glPrevOutBuffer; + s.glPrevOutBuffer = s.glColorBuffer[0]; + s.glColorBuffer[0] = glTempBuffer; + + // Swap the Cuda buffers. + cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer; + s.cudaPrevOutBuffer = s.cudaColorBuffer[0]; + s.cudaColorBuffer[0] = cudaTempBuffer; + + // Bind the new output buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0)); + + // Bind old buffer as the input texture. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + + // Activate the correct program. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP)); + } + + // Set viewport, clear color buffer(s) and depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height)); + NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT)); + + // If outputting bary differentials, set resolution uniform + if (s.enableDB) + NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height)); + + // Render the meshes. + if (depth == 1 && !rangesPtr) + { + // Trivial case. + NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0)); + } + else + { + if (!rangesPtr) + { + // Fill in range array to instantiate the same triangles for each output layer. + // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0; i < depth; i++) + { + GLDrawCmd& cmd = s.drawCmdBuffer[i]; + cmd.firstIndex = 0; + cmd.count = triCount; + cmd.baseVertex = vtxPerInstance * i; + cmd.baseInstance = 0; + cmd.instanceCount = 1; + } + } + else + { + // Fill in the range array according to user-given ranges. 
Triangle IDs point + // to the input triangle array, NOT index within range, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0, j=0; i < depth; i++) + { + GLDrawCmd& cmd = s.drawCmdBuffer[i]; + int first = rangesPtr[j++]; + int count = rangesPtr[j++]; + NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values"); + NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer"); + cmd.firstIndex = first * 3; + cmd.count = count * 3; + cmd.baseVertex = 0; + cmd.baseInstance = first; + cmd.instanceCount = 1; + } + } + + // Draw! + NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &s.drawCmdBuffer[0], depth, sizeof(GLDrawCmd))); + } +} + +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth) +{ + // Copy color buffers to output tensors. + cudaArray_t array = 0; + cudaChannelFormatDesc arrayDesc = {}; // For error checking. + cudaExtent arrayExt = {}; // For error checking. + int num_outputs = s.enableDB ? 2 : 1; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0)); + NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array)); + NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch"); + NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch"); + NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch"); + cudaMemcpy3DParms p = {0}; + p.srcArray = array; + p.dstPtr.ptr = outputPtr[i]; + p.dstPtr.pitch = width * 4 * sizeof(float); + p.dstPtr.xsize = width; + p.dstPtr.ysize = height; + p.extent.width = width; + p.extent.height = height; + p.extent.depth = depth; + p.kind = cudaMemcpyDeviceToDevice; + NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream)); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cu b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cu new file mode 100755 index 0000000000000000000000000000000000000000..fe9888e002fd131bbd3b270fcb8f7f6746878e9e --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.cu @@ -0,0 +1,175 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "rasterize.h" + +//------------------------------------------------------------------------ +// Gradient Cuda kernel. + +template +static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. 
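+    // One thread per output pixel; blockIdx.z indexes the minibatch dimension.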
+ int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Read triangle idx and dy. + float2 dy = ((float2*)p.dy)[pidx * 2]; + float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f); + int triIdx = (int)(((float*)p.out)[pidx * 4 + 3]) - 1; + + // Exit if nothing to do. + if (triIdx < 0 || triIdx >= p.numTriangles) + return; // No or corrupt triangle. + int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients. + int grad_all_ddb = 0; + if (ENABLE_DB) + grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w); + if (((grad_all_dy | grad_all_ddb) << 1) == 0) + return; // All incoming gradients are +0/-0. + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index. + if (p.instance_mode) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Evaluate edge functions. + float fx = p.xs * (float)px + p.xo; + float fy = p.ys * (float)py + p.yo; + float p0x = p0.x - fx * p0.w; + float p0y = p0.y - fy * p0.w; + float p1x = p1.x - fx * p1.w; + float p1y = p1.y - fy * p1.w; + float p2x = p2.x - fx * p2.w; + float p2y = p2.y - fy * p2.w; + float a0 = p1x*p2y - p1y*p2x; + float a1 = p2x*p0y - p2y*p0x; + float a2 = p0x*p1y - p0y*p1x; + + // Compute inverse area with epsilon. + float at = a0 + a1 + a2; + float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image. + float iw = 1.f / (at + ep); + + // Perspective correct, normalized barycentrics. + float b0 = a0 * iw; + float b1 = a1 * iw; + + // Position gradients. + float gb0 = dy.x * iw; + float gb1 = dy.y * iw; + float gbb = gb0 * b0 + gb1 * b1; + float gp0x = gbb * (p2y - p1y) - gb1 * p2y; + float gp1x = gbb * (p0y - p2y) + gb0 * p2y; + float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y; + float gp0y = gbb * (p1x - p2x) + gb1 * p2x; + float gp1y = gbb * (p2x - p0x) - gb0 * p2x; + float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x; + float gp0w = -fx * gp0x - fy * gp0y; + float gp1w = -fx * gp1x - fy * gp1y; + float gp2w = -fx * gp2x - fy * gp2y; + + // Bary differential gradients. 
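+    // Only taken in the ENABLE_DB instantiation, and only if at least one incoming ddb
+    // component is non-zero (the shift in the condition discards the sign bit, so +0/-0 count as zero).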
+ if (ENABLE_DB && ((grad_all_ddb) << 1) != 0) + { + float dfxdX = p.xs * iw; + float dfydY = p.ys * iw; + ddb.x *= dfxdX; + ddb.y *= dfydY; + ddb.z *= dfxdX; + ddb.w *= dfydY; + + float da0dX = p1.y * p2.w - p2.y * p1.w; + float da1dX = p2.y * p0.w - p0.y * p2.w; + float da2dX = p0.y * p1.w - p1.y * p0.w; + float da0dY = p2.x * p1.w - p1.x * p2.w; + float da1dY = p0.x * p2.w - p2.x * p0.w; + float da2dY = p1.x * p0.w - p0.x * p1.w; + float datdX = da0dX + da1dX + da2dX; + float datdY = da0dY + da1dY + da2dY; + + float x01 = p0.x - p1.x; + float x12 = p1.x - p2.x; + float x20 = p2.x - p0.x; + float y01 = p0.y - p1.y; + float y12 = p1.y - p2.y; + float y20 = p2.y - p0.y; + float w01 = p0.w - p1.w; + float w12 = p1.w - p2.w; + float w20 = p2.w - p0.w; + + float a0p1 = fy * p2.x - fx * p2.y; + float a0p2 = fx * p1.y - fy * p1.x; + float a1p0 = fx * p2.y - fy * p2.x; + float a1p2 = fy * p0.x - fx * p0.y; + + float wdudX = 2.f * b0 * datdX - da0dX; + float wdudY = 2.f * b0 * datdY - da0dY; + float wdvdX = 2.f * b1 * datdX - da1dX; + float wdvdY = 2.f * b1 * datdY - da1dY; + + float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY); + float cx = c0 * fx - ddb.x * b0 - ddb.z * b1; + float cy = c0 * fy - ddb.y * b0 - ddb.w * b1; + float cxy = iw * (ddb.x * datdX + ddb.y * datdY); + float czw = iw * (ddb.z * datdX + ddb.w * datdY); + + gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w; + gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w; + gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w; + gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w; + gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w; + gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w; + gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x; + gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x; + gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x; + } + + // Accumulate using coalesced atomics. + caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w); + caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w); +} + +// Template specializations. +__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } +__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.h b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.h new file mode 100755 index 0000000000000000000000000000000000000000..6905b98508ea540729a1eae1bfb71af0f4033520 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/rasterize.h @@ -0,0 +1,97 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. 
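+// Maximum thread-block extents for the gradient kernel; RasterizeGradKernelTemplate sizes its
+// coalesced-atomics scratch space (CA_DECLARE_TEMP) from their product.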
+ +#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 + +//------------------------------------------------------------------------ +// Gradient CUDA kernel params. + +struct RasterizeGradParams +{ + const float* pos; // Incoming position buffer. + const int* tri; // Incoming triangle buffer. + const float* out; // Rasterizer output buffer. + const float* dy; // Incoming gradients of rasterizer output buffer. + const float* ddb; // Incoming gradients of bary diff output buffer. + float* grad; // Outgoing position gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Image width. + int height; // Image height. + int depth; // Size of minibatch. + int instance_mode; // 1 if in instance rendering mode. + float xs, xo, ys, yo; // Pixel position to clip-space x, y transform. +}; + +//------------------------------------------------------------------------ +// Do not try to include OpenGL stuff when compiling CUDA kernels for torch. + +#if !(defined(NVDR_TORCH) && defined(__CUDACC__)) +#include "framework.h" +#include "glutil.h" + +//------------------------------------------------------------------------ +// Draw command struct used by rasterizer. + +struct GLDrawCmd +{ + uint32_t count; + uint32_t instanceCount; + uint32_t firstIndex; + uint32_t baseVertex; + uint32_t baseInstance; +}; + +//------------------------------------------------------------------------ +// OpenGL-related persistent state for forward op. + +struct RasterizeGLState +{ + int width; // Allocated frame buffer width. + int height; // Allocated frame buffer height. + int depth; // Allocated frame buffer depth. + int posCount; // Allocated position buffer in floats. + int triCount; // Allocated triangle buffer in ints. + GLContext glctx; + GLuint glFBO; + GLuint glColorBuffer[2]; + GLuint glPrevOutBuffer; + GLuint glDepthStencilBuffer; + GLuint glVAO; + GLuint glTriBuffer; + GLuint glPosBuffer; + GLuint glProgram; + GLuint glProgramDP; + GLuint glVertexShader; + GLuint glGeometryShader; + GLuint glFragmentShader; + GLuint glFragmentShaderDP; + cudaGraphicsResource_t cudaColorBuffer[2]; + cudaGraphicsResource_t cudaPrevOutBuffer; + cudaGraphicsResource_t cudaPosBuffer; + cudaGraphicsResource_t cudaTriBuffer; + std::vector drawCmdBuffer; + int enableDB; +}; + +//------------------------------------------------------------------------ +// Shared C++ code prototypes. 
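+// Forward rasterization proceeds in four stages: one-time GL context setup, lazy buffer
+// (re)allocation, the GL draw itself, and a CUDA-side copy of the color attachments into the
+// output tensors.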
+ +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx); +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth); +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx); +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth); + +//------------------------------------------------------------------------ +#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__)) diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cpp b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cpp new file mode 100755 index 0000000000000000000000000000000000000000..51633e10120b4dc465e5283241a38c95db31f8dc --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "framework.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Mip stack construction and access helpers. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p) +{ + char buf[1024]; + int bufsz = 1024; + + std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using max_mip_level argument.\n"; + + int w = p.texWidth; + int h = p.texHeight; + bool ew = false; + bool eh = false; + + msg += "Attempted mip stack construction:\n"; + msg += "level width height\n"; + msg += "----- ----- ------\n"; + snprintf(buf, bufsz, "base %5d %5d\n", w, h); + msg += buf; + + int mipTotal = 0; + int level = 0; + while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size. + { + // Current level. + level += 1; + + // Determine if downsampling fails. + ew = ew || (w > 1 && (w & 1)); + eh = eh || (h > 1 && (h & 1)); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + // Append level size to error message. + snprintf(buf, bufsz, "mip %-2d ", level); + msg += buf; + if (ew) snprintf(buf, bufsz, " err "); + else snprintf(buf, bufsz, "%5d ", w); + msg += buf; + if (eh) snprintf(buf, bufsz, " err\n"); + else snprintf(buf, bufsz, "%5d\n", h); + msg += buf; + } + + NVDR_CHECK(0, msg); +} + +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets) +{ + // No levels at all? + if (p.mipLevelLimit == 0) + { + p.mipLevelMax = 0; + return 0; + } + + // Current level size. + int w = p.texWidth; + int h = p.texHeight; + + int mipTotal = 0; + int level = 0; + int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels; + mipOffsets[0] = 0; + while ((w|h) > 1) + { + // Current level. + level += 1; + + // Quit if cannot downsample. + if ((w > 1 && (w & 1)) || (h > 1 && (h & 1))) + raiseMipSizeError(NVDR_CTX_PARAMS, p); + + // Downsample. 
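+        // Halve whichever extent is still larger than one texel.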
+ if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + mipOffsets[level] = mipTotal; // Store the mip offset (#floats). + mipTotal += w * h * p.texDepth * c; + + // Hit the level limit? + if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit) + break; + } + + p.mipLevelMax = level; + return mipTotal; +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cu b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cu new file mode 100755 index 0000000000000000000000000000000000000000..c5e2ad4abdd7c84a512e1dc3d62b3245f2261d0b --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.cu @@ -0,0 +1,1124 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Memory access and math helpers. + +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; } +static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; } +static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; } +static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; } +template static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); } +template static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); } + +//------------------------------------------------------------------------ +// Cube map wrapping for smooth filtering across edges and corners. At corners, +// one of the texture coordinates will be negative. For correct interpolation, +// the missing texel must take the average color of the other three. 
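+// The two constant tables below are indexed by (face * 8 + boundary case). c_cubeWrapMask1
+// packs, for each of the four bilinear taps, which incoming coordinate (or zero) supplies its
+// x/y and which face it lands on; c_cubeWrapMask2 packs the per-tap coordinate flips.
+// wrapCubeMap() decodes them into the four texel addresses.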
+ +static __constant__ uint32_t c_cubeWrapMask1[48] = +{ + 0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01, + 0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01, + 0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219, + 0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219, + 0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11, + 0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11, +}; + +static __constant__ uint8_t c_cubeWrapMask2[48] = +{ + 0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40, + 0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64, + 0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44, +}; + +static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w) +{ + // Calculate case number. + int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1; + int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3; + int c = cx + cy; + if (c >= 5) + c--; + c = (face << 3) + c; + + // Compute coordinates and faces. + unsigned int m = c_cubeWrapMask1[c]; + int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0; + int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0; + int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1; + int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1; + int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0; + int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0; + int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1; + int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1; + int f0 = ((m >> 16) & 15) - 1; + int f1 = ((m >> 20) & 15) - 1; + int f2 = ((m >> 24) & 15) - 1; + int f3 = ((m >> 28) ) - 1; + + // Flips. + unsigned int f = c_cubeWrapMask2[c]; + int w1 = w - 1; + if (f & 0x01) x0 = w1 - x0; + if (f & 0x02) x1 = w1 - x1; + if (f & 0x04) x2 = w1 - x2; + if (f & 0x08) x3 = w1 - x3; + if (f & 0x10) y0 = w1 - y0; + if (f & 0x20) y1 = w1 - y1; + if (f & 0x40) y2 = w1 - y2; + if (f & 0x80) y3 = w1 - y3; + + // Done. + int4 tcOut; + tcOut.x = x0 + (y0 + f0 * w) * w; + tcOut.y = x1 + (y1 + f1 * w) * w; + tcOut.z = x2 + (y2 + f2 * w) * w; + tcOut.w = x3 + (y3 + f3 * w) * w; + return tcOut; +} + +//------------------------------------------------------------------------ +// Cube map indexing and gradient functions. + +// Map a 3D lookup vector into an (s,t) face coordinates (returned in first . +// two parameters) and face index. +static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z) +{ + float ax = fabsf(x); + float ay = fabsf(y); + float az = fabsf(z); + int idx; + float c; + if (az > fmaxf(ax, ay)) { idx = 4; c = z; } + else if (ay > ax) { idx = 2; c = y; y = z; } + else { idx = 0; c = x; x = z; } + if (c < 0.f) idx += 1; + float m = __frcp_rz(fabsf(c)) * .5; + float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31)); + float m1 = (idx != 2) ? -m : m; + x = x * m0 + .5; + y = y * m1 + .5; + x = fminf(fmaxf(x, 0.f), 1.f); + y = fminf(fmaxf(y, 0.f), 1.f); + return idx; +} + +// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector. 
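+// Here (gu, gv) are the incoming dA/ds and dA/dt for an arbitrary scalar A; the returned
+// vector is the corresponding gradient with respect to the 3D lookup vector.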
+static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c; + float c0 = gu; + float c1 = gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; } + else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl = (c0 + c1) * m; + float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu; + float gy = (idx & 0x0c) ? gl : -gv; + float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv; + gz = (idx & 0x09) ? -gz : gz; + return make_float3(gx, gy, gz) * (m * .5f); +} + +// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two +// indexCubeMapGrad() functions rolled together. +static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, c0, c1; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; } + else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl0 = (dw.x * c0 + dw.z * c1) * m; + float gl1 = (dw.y * c0 + dw.w * c1) * m; + float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x; + float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y; + float gy0 = (idx & 0x0c) ? gl0 : -dw.z; + float gy1 = (idx & 0x0c) ? gl1 : -dw.w; + float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z; + float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w; + if (idx & 0x09) + { + gz0 = -gz0; + gz1 = -gz1; + } + g0 = make_float3(gx0, gy0, gz0) * (m * .5f); + g1 = make_float3(gx1, gy1, gz1) * (m * .5f); +} + +// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector. +// Result is (ds/dX, ds/dY, dt/dX, dt/dY). +static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + float m = __frcp_rz(fabsf(c)); + float dm = m * .5f; + float mm = m * dm; + gu *= (idx & 0x34) ? -mm : mm; + gv *= (idx & 0x2e) ? -mm : mm; + + if (idx & 0x03) + { + return make_float4(gu * dvdX.x + dm * dvdX.z, + gu * dvdY.x + dm * dvdY.z, + gv * dvdX.x - dm * dvdX.y, + gv * dvdY.x - dm * dvdY.y); + } + else if (idx & 0x0c) + { + return make_float4(gu * dvdX.y + dm * dvdX.x, + gu * dvdY.y + dm * dvdY.x, + gv * dvdX.y + dm * dvdX.z, + gv * dvdY.y + dm * dvdY.z); + } + else // (idx & 0x30) + { + return make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x, + gu * dvdY.z + copysignf(dm, c) * dvdY.x, + gv * dvdX.z - dm * dvdX.y, + gv * dvdY.z - dm * dvdY.y); + } +} + +// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face +// coordinates change w.r.t. 
3D texture coordinate vector, returned as follows: +// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx | +// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy | +// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz | +static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + + float m = __frcp_rz(c); + float dm = -m * fabsf(m) * .5; + float mm = m * m * .5; + float mu = (idx & 0x34) ? -mm : mm; + float mv = (idx & 0x2e) ? -mm : mm; + gu *= -2.0 * m * mu; + gv *= -2.0 * m * mv; + + if (idx & 0x03) + { + dx.x = gu * dvdX.x + dm * dvdX.z; + dx.y = gu * dvdY.x + dm * dvdY.z; + dx.z = gv * dvdX.x - dm * dvdX.y; + dx.w = gv * dvdY.x - dm * dvdY.y; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.x; + dy.w = mv * dvdY.x; + dz.x = mu * dvdX.x; + dz.y = mu * dvdY.x; + dz.z = 0.f; + dz.w = 0.f; + } + else if (idx & 0x0c) + { + dx.x = mu * dvdX.y; + dx.y = mu * dvdY.y; + dx.z = 0.f; + dx.w = 0.f; + dy.x = gu * dvdX.y + dm * dvdX.x; + dy.y = gu * dvdY.y + dm * dvdY.x; + dy.z = gv * dvdX.y + dm * dvdX.z; + dy.w = gv * dvdY.y + dm * dvdY.z; + dz.x = 0.f; + dz.y = 0.f; + dz.z = mv * dvdX.y; + dz.w = mv * dvdY.y; + } + else // (idx & 0x30) + { + dx.x = mu * dvdX.z; + dx.y = mu * dvdY.z; + dx.z = 0.f; + dx.w = 0.f; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.z; + dy.w = mv * dvdY.z; + dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x; + dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x; + dz.z = gv * dvdX.z - dm * dvdX.y; + dz.w = gv * dvdY.z - dm * dvdY.y; + } +} + +//------------------------------------------------------------------------ +// General texture indexing. + +template +static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz) +{ + int w = p.texWidth; + int h = p.texHeight; + float u = uv.x; + float v = uv.y; + + // Cube map indexing. + if (CUBE_MODE) + { + // No wrap. Fold face index into tz right away. + tz = 6 * tz + indexCubeMap(u, v, uv.z); // Rewrites u, v. + } + else + { + // Handle boundary. + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + } + + u = u * (float)w; + v = v * (float)h; + + int iu = __float2int_rd(u); + int iv = __float2int_rd(v); + + // In zero boundary mode, return texture address -1. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + if (iu < 0 || iu >= w || iv < 0 || iv >= h) + return -1; + } + + // Otherwise clamp and calculate the coordinate properly. + iu = min(max(iu, 0), w-1); + iv = min(max(iv, 0), h-1); + return iu + w * (iv + tz * h); +} + +template +static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level) +{ + // Mip level size. + int2 sz = mipLevelSize(p, level); + int w = sz.x; + int h = sz.y; + + // Compute texture-space u, v. + float u = uv.x; + float v = uv.y; + bool clampU = false; + bool clampV = false; + + // Cube map indexing. + int face = 0; + if (CUBE_MODE) + { + // Neither clamp or wrap. + face = indexCubeMap(u, v, uv.z); // Rewrites u, v. 
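+        // Scale to texel space; the half-texel offset places integer coordinates on texel
+        // centers for the bilinear weights computed below.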
+ u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + } + else + { + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + // Wrap. + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + + // Move to texel space. + u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + + if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP) + { + // Clamp to center of edge texels. + u = fminf(fmaxf(u, 0.f), w - 1.f); + v = fminf(fmaxf(v, 0.f), h - 1.f); + clampU = (u == 0.f || u == w - 1.f); + clampV = (v == 0.f || v == h - 1.f); + } + } + + // Compute texel coordinates and weights. + int iu0 = __float2int_rd(u); + int iv0 = __float2int_rd(v); + int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped. + int iv1 = iv0 + (clampV ? 0 : 1); + u -= (float)iu0; + v -= (float)iv0; + + // Cube map wrapping. + bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h); + if (cubeWrap) + { + tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w); + tcOut += 6 * tz * w * h; // Bring in tz. + return make_float2(u, v); // Done. + } + + // Fold cube map face into tz. + if (CUBE_MODE) + tz = 6 * tz + face; + + // Wrap overflowing texel indices. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + if (iu0 < 0) iu0 += w; + if (iv0 < 0) iv0 += h; + if (iu1 >= w) iu1 -= w; + if (iv1 >= h) iv1 -= h; + } + + // Coordinates + iu0 += tz * w * h; + iu1 += tz * w * h; + tcOut.x = iu0 + w * iv0; + tcOut.y = iu1 + w * iv0; + tcOut.z = iu0 + w * iv1; + tcOut.w = iu1 + w * iv1; + + // Invalidate texture addresses outside unit square if we are in zero mode. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + bool iu0_out = (iu0 < 0 || iu0 >= w); + bool iu1_out = (iu1 < 0 || iu1 >= w); + bool iv0_out = (iv0 < 0 || iv0 >= h); + bool iv1_out = (iv1 < 0 || iv1 >= h); + if (iu0_out || iv0_out) tcOut.x = -1; + if (iu1_out || iv0_out) tcOut.y = -1; + if (iu0_out || iv1_out) tcOut.z = -1; + if (iu1_out || iv1_out) tcOut.w = -1; + } + + // All done. + return make_float2(u, v); +} + +//------------------------------------------------------------------------ +// Mip level calculation. + +template +static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv) +{ + // Do nothing if mips not in use. + if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR) + return; + + // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero. + if (!BIAS_ONLY) + { + // Get pixel derivatives of texture coordinates. + float4 uvDA; + float3 dvdX, dvdY; // Gradients use these later. + if (CUBE_MODE) + { + // Fetch. + float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0]; + float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1]; + float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2]; + + // Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}. + dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX + dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY + uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y} + } + else + { + // Fetch. + uvDA = ((const float4*)p.uvDA)[pidx]; + } + + // Scaling factors. + float uscl = p.texWidth; + float vscl = p.texHeight; + + // d[s,t]/d[X,Y]. + float dsdx = uvDA.x * uscl; + float dsdy = uvDA.y * uscl; + float dtdx = uvDA.z * vscl; + float dtdy = uvDA.w * vscl; + + // Calculate footprint axis lengths. 
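+        // A, B, C form the Gram matrix J^T J of J = d(s,t)/d(X,Y); its eigenvalues
+        // 0.5*(A+B) +/- sqrt(0.25*(A-B)^2 + C^2) are the squared footprint axis lengths,
+        // and the mip level below is 0.5 * log2 of the major one.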
+ float A = dsdx*dsdx + dtdx*dtdx; + float B = dsdy*dsdy + dtdy*dtdy; + float C = dsdx*dsdy + dtdx*dtdy; + float l2b = 0.5 * (A + B); + float l2n = 0.25 * (A-B)*(A-B) + C*C; + float l2a = sqrt(l2n); + float lenMinorSqr = fmaxf(0.0, l2b - l2a); + float lenMajorSqr = l2b + l2a; + + // Footprint vs. mip level gradient. + if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2). + float AB = dw * .5f * (A - B); + float Cw = dw * C; + float l2aw = dw * l2a; + float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw); + float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw); + float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw); + float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw); + + *pdw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY); + + // In cube maps, there is also a texture coordinate vs. mip level gradient. + if (CUBE_MODE) + { + float4 dx, dy, dz; + indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz); + + float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x); + float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y); + float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z); + float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w); + + float3 d_f_dv = make_float3(0.f, 0.f, 0.f); + d_f_dv += d_dsdX_dv * d_f_ddsdX; + d_f_dv += d_dsdY_dv * d_f_ddsdY; + d_f_dv += d_dtdX_dv * d_f_ddtdX; + d_f_dv += d_dtdY_dv * d_f_ddtdY; + + *pdfdv = d_f_dv; + } + } + + // Finally, calculate mip level. + flevel = .5f * __log2f(lenMajorSqr); + } + + // Bias the mip level and clamp. + if (p.mipLevelBias) + flevel += p.mipLevelBias[pidx]; + flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax); + + // Calculate levels depending on filter mode. + level0 = __float2int_rd(flevel); + + // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode. + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f) + { + level1 = min(level0 + 1, p.mipLevelMax); + flevel -= level0; // Fractional part. Zero if clamped on last level. + } +} + +//------------------------------------------------------------------------ +// Texel fetch and accumulator helpers that understand cube map corners. + +template +static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner) +{ + if (corner) + { + T avg = zero_value(); + if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x])); + if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y])); + if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z])); + if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w])); + avg *= 0.33333333f; + if (tc.x < 0) a00 = avg; + if (tc.y < 0) a10 = avg; + if (tc.z < 0) a01 = avg; + if (tc.w < 0) a11 = avg; + } + else + { + a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value(); + a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value(); + a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value(); + a11 = (tc.w >= 0) ? 
*((const T*)&pIn[tc.w]) : zero_value(); + } +} + +static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM) +{ + if (corner) + { + float cb; + if (tc.x < 0) cb = c.x; + if (tc.y < 0) cb = c.y; + if (tc.z < 0) cb = c.z; + if (tc.w < 0) cb = c.w; + cb *= 0.33333333f; + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb); + } + else + { + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w); + } +} + +//------------------------------------------------------------------------ +// Mip builder kernel. + +template +static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p) +{ + // Sizes. + int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1); + int2 sz_out = mipLevelSize(p, p.mipLevelOut); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= sz_out.x || py >= sz_out.y) + return; + + // Pixel indices. + int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y)); + int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down. + int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz)); + + // Input and output pointers. + const float* pin = p.tex[p.mipLevelOut - 1]; + float* pout = (float*)p.tex[p.mipLevelOut]; + + // Special case: Input texture height or width is 1. + if (sz_in.x == 1 || sz_in.y == 1) + { + if (sz_in.y == 1) + pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right. + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in1 + i]); + T avg = .5f * (v0 + v1); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 1.41421356f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } + + return; + } + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]); + T v2 = *((const T*)&pin[pidx_in1 + i]); + T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]); + T avg = .25f * (v0 + v1 + v2 + v3); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 2.f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } +} + +// Template specializations. +__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Output ptr. 
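+    // Outputs are stored channels-last, so each pixel owns p.channels consecutive floats.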
+ float* pOut = p.out + pidx * p.channels; + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + tc *= p.channels; + const float* pIn = p.tex[0]; + + // Copy if valid tc, otherwise output zero. + for (int i=0; i < p.channels; i += C) + *((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value(); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, 0, 0); + + // Get texel indices and pointer for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Bilinear fetch. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + // Interpolate. + for (int i=0; i < p.channels; i += C, tc0 += C) + { + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + *((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0); + } + return; // Exit. + } + + // Get texel indices and pointer for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Trilinear fetch. + for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C) + { + // First level. + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + T a = bilerp(a00, a10, a01, a11, uv0); + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + T b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + T b = bilerp(b00, b10, b01, b11, uv1); + a = lerp(a, b, flevel); // Interpolate between levels. + } + + // Write. + *((T*)&pOut[i]) = a; + } +} + +// Template specializations. 
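+// Naming scheme below: the trailing digit is the per-fetch vector width in channels
+// (float/float2/float4, cf. the mip builder kernels above), "Cube" selects cube-map indexing,
+// and "BO" marks the bias-only mip variants driven by p.mipLevelBias without UV derivatives.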
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void 
TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient mip puller kernel. + +template +static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.texWidth || py >= p.texHeight) + return; + + // Number of wide elements. + int c = p.channels; + if (C == 2) c >>= 1; + if (C == 4) c >>= 2; + + // Dynamically allocated shared memory for holding a texel. + extern __shared__ float s_texelAccum[]; + int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x; + int sharedStride = blockDim.x * blockDim.y; +# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride)) + + // Clear the texel. + for (int i=0; i < p.channels; i++) + *TEXEL_ACCUM(i) = 0.f; + + // Track texel position and accumulation weight over the mip stack. + int x = px; + int y = py; + float w = 1.f; + + // Pull gradients from all levels. + int2 sz = mipLevelSize(p, 0); // Previous level size. + for (int level=1; level <= p.mipLevelMax; level++) + { + // Weight decay depends on previous level size. + if (sz.x > 1) w *= .5f; + if (sz.y > 1) w *= .5f; + + // Current level size and coordinates. + sz = mipLevelSize(p, level); + x >>= 1; + y >>= 1; + + T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w); + } + + // Add to main texture gradients. + T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride); +} + +// Template specializations. +__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Early exit if output gradients are zero. 
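+    // OR together the raw bit patterns of all incoming dL/dy values; the result reinterpreted
+    // as a float is zero only when every component is +/-0, in which case the kernel stores
+    // zero UV/mip gradients and returns without touching the texture gradients.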
+ const float* pDy = p.dy + pidx * p.channels; + unsigned int dmax = 0u; + if ((p.channels & 3) == 0) + { + for (int i=0; i < p.channels; i += 4) + { + uint4 dy = *((const uint4*)&pDy[i]); + dmax |= (dy.x | dy.y | dy.z | dy.w); + } + } + else + { + for (int i=0; i < p.channels; i++) + dmax |= __float_as_uint(pDy[i]); + } + + // Store zeros and exit. + if (__uint_as_float(dmax) == 0.f) + { + if (CUBE_MODE) + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + { + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f); + } + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + else + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + return; + } + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode - texture gradients only. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + if (tc < 0) + return; // Outside texture. + + tc *= p.channels; + float* pOut = p.gradTex[0]; + + // Accumulate texture gradients. + for (int i=0; i < p.channels; i++) + caAtomicAddTexture(pOut, 0, tc + i, pDy[i]); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float4 dw = make_float4(0.f, 0.f, 0.f, 0.f); + float3 dfdv = make_float3(0.f, 0.f, 0.f); + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, &dw, &dfdv); + + // UV gradient accumulators. + float gu = 0.f; + float gv = 0.f; + + // Get texel indices and pointers for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + float* pOut0 = p.gradTex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Texel weights. + float uv011 = uv0.x * uv0.y; + float uv010 = uv0.x - uv011; + float uv001 = uv0.y - uv011; + float uv000 = 1.f - uv0.x - uv001; + float4 tw0 = make_float4(uv000, uv010, uv001, uv011); + + // Attribute weights. + int2 sz0 = mipLevelSize(p, level0); + float sclu0 = (float)sz0.x; + float sclv0 = (float)sz0.y; + + // Bilinear mode - texture and uv gradients. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + for (int i=0; i < p.channels; i++, tc0 += 1) + { + float dy = pDy[i]; + accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP); + + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0; + } + + // Store UV gradients and exit. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + return; + } + + // Accumulate fractional mip level gradient. + float df = 0; // dL/df. 
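+    // df accumulates d(loss)/d(fractional mip level); it later scales the UV-derivative
+    // gradients (dw) and is written out as the mip-level-bias gradient.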
+ + // Get texel indices and pointers for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + float* pOut1 = p.gradTex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Texel weights. + float uv111 = uv1.x * uv1.y; + float uv110 = uv1.x - uv111; + float uv101 = uv1.y - uv111; + float uv100 = 1.f - uv1.x - uv101; + float4 tw1 = make_float4(uv100, uv110, uv101, uv111); + + // Attribute weights. + int2 sz1 = mipLevelSize(p, level1); + float sclu1 = (float)sz1.x; + float sclv1 = (float)sz1.y; + + // Trilinear mode. + for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1) + { + float dy = pDy[i]; + float dy0 = (1.f - flevel) * dy; + accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP); + + // UV gradients for first level. + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0; + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + // Texture gradients for second level. + float dy1 = flevel * dy; + accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP); + + // UV gradients for second level. + float b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + float bd = (b11 + b00 - b10 - b01); + gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1; + gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1; + + // Mip level gradient. + float a = bilerp(a00, a10, a01, a11, uv0); + float b = bilerp(b00, b10, b01, b11, uv1); + df += (b-a) * dy; + } + } + + // Store UV gradients. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + // Store mip level bias gradient. + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = df; + + // Store UV pixel differential gradients. + if (!BIAS_ONLY) + { + // Final gradients. + dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df. + + // Store them. + if (CUBE_MODE) + { + // Remap from dL/(d{s,t}/s{X,Y}) to dL/(d{x,y,z}/d{X,Y}). + float3 g0, g1; + indexCubeMapGrad4(uv, dw, g0, g1); + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z); + } + else + ((float4*)p.gradUVDA)[pidx] = dw; + } +} + +// Template specializations. 
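+// Each specialization below fixes the cube-map flag, the filter mode and the bias-only flag
+// at compile time. The "BO" variants are used when mip selection is driven purely by a mip
+// level bias, so they skip the gradients w.r.t. the per-pixel UV derivatives (uv_da).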
+__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/common/texture.h b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.h new file mode 100755 index 0000000000000000000000000000000000000000..f79b600fff0256cdadd38e265b49366549434ef8 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/common/texture.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "framework.h" + +//------------------------------------------------------------------------ +// Constants. + +#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging +#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_MAX_MIP_LEVEL 16 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere. +#define TEX_MODE_NEAREST 0 // Nearest on base level. +#define TEX_MODE_LINEAR 1 // Bilinear on base level. +#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level. +#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear. +#define TEX_MODE_COUNT 4 +#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode. +#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v). +#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v). +#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros. +#define TEX_BOUNDARY_MODE_COUNT 4 + +//------------------------------------------------------------------------ +// CUDA kernel params. 
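+// A single parameter block is shared by the forward sampling kernels, the mip stack builder
+// and the gradient kernels; which of the buffer pointers below are relevant depends on the
+// kernel being launched (e.g. dy and the gradTex stack only matter for the backward pass).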
+ +struct TextureKernelParams +{ + const float* tex[TEX_MAX_MIP_LEVEL]; // Incoming texture buffer with mip levels. + const float* uv; // Incoming texcoord buffer. + const float* uvDA; // Incoming uv pixel diffs or NULL. + const float* mipLevelBias; // Incoming mip level bias or NULL. + const float* dy; // Incoming output gradient. + float* out; // Outgoing texture data. + float* gradTex[TEX_MAX_MIP_LEVEL]; // Outgoing texture gradients with mip levels. + float* gradUV; // Outgoing texcoord gradient. + float* gradUVDA; // Outgoing texcoord pixel differential gradient. + float* gradMipLevelBias; // Outgoing mip level bias gradient. + int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor. + int filterMode; // One of the TEX_MODE_ constants. + int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants. + int texConst; // If true, texture is known to be constant. + int mipLevelLimit; // Mip level limit coming from the op. + int channels; // Number of texture channels. + int imgWidth; // Image width. + int imgHeight; // Image height. + int texWidth; // Texture width. + int texHeight; // Texture height. + int texDepth; // Texture depth. + int n; // Minibatch size. + int mipLevelMax; // Maximum mip level index. Zero if mips disabled. + int mipLevelOut; // Mip level being calculated in builder kernel. +}; + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p); +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets); + +//------------------------------------------------------------------------ +// Macros. + +#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1) + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/lib/setgpu.lib b/pose_estimation/nvdiffrast/nvdiffrast/lib/setgpu.lib new file mode 100755 index 0000000000000000000000000000000000000000..3735639b54066d28236ceb1fd426704bc25026a5 Binary files /dev/null and b/pose_estimation/nvdiffrast/nvdiffrast/lib/setgpu.lib differ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/__init__.py b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..cf62df8782d730f072ca5f4e4862a44dc8c3a086 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +from .ops import rasterize, interpolate, texture, antialias +from .plugin_loader import set_cache_dir + +__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"] diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/ops.py b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/ops.py new file mode 100755 index 0000000000000000000000000000000000000000..be51deef13e0ecfbd5bfe8bc376af24a18db7224 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/ops.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import tensorflow as tf +import numpy as np +import os +from . import plugin_loader + +#---------------------------------------------------------------------------- +# Helpers. +#---------------------------------------------------------------------------- + +# OpenGL-related linker options depending on platform. +def _get_gl_opts(): + libs = { + 'posix': ['GL', 'EGL'], + 'nt': ['gdi32', 'opengl32', 'user32', 'setgpu'], + } + return ['-l' + x for x in libs[os.name]] + +# Load the cpp plugin. +def _get_plugin(): + fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu') + return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW']) + +# Convert parameter to a numpy array if possible. +def _get_constant(x, dtype): + try: + return np.asarray(x, dtype=dtype) + except (TypeError, ValueError): + return None + +# Tests for a construction-time constantness instead of tf.constant node because +# the latter can be overridden in Session.run() feed_dict at evaluation time. +def _is_constant(x, dtype): + if isinstance(x, np.ndarray): + return np.can_cast(x.dtype, dtype, 'unsafe') + else: + return _get_constant(x, dtype) is not None + +#---------------------------------------------------------------------------- +# Rasterize. +#---------------------------------------------------------------------------- + +def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True): + assert tri_const is True or tri_const is False + assert output_db is True or output_db is False + + # Known constant resolution? + resolution_c = _get_constant(resolution, np.int32) + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert all inputs to tensors / base types. + tri_const = 1 if tri_const else 0 + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + resolution = tf.convert_to_tensor(resolution, dtype=tf.int32) + if ranges is None: + ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor. + else: + ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor. + + # Infer as much about the output shape as possible. + out_shape = [None, None, None, 4] + if pos.shape.rank == 3: # Instanced mode. + out_shape[0] = pos.shape[0].value + elif pos.shape.rank == 2: # Range mode. 
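+        # Range mode: pos is a flat [num_vertices, 4] buffer and each row of 'ranges' picks a
+        # span of triangles to render into one minibatch frame, e.g. (illustrative values)
+        # ranges = [[0, 100], [100, 50]] renders triangles 0-99 into frame 0 and 100-149 into
+        # frame 1, so the minibatch size is taken from ranges.shape[0] below.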
+ if ranges.shape.rank not in [None, 0]: + out_shape[0] = ranges.shape[0].value + if resolution_c is not None: + assert resolution_c.shape == (2,) + out_shape[1], out_shape[2] = resolution_c + + # Output pixel differentials. + @tf.custom_gradient + def func_db(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape) + def grad(dy, ddb): + if grad_db: + return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Do not output pixel differentials. + @tf.custom_gradient + def func(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db. + def grad(dy, _): + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Choose stub. + if output_db: + return func_db(pos) + else: + return func(pos) + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = _get_constant(diff_attrs, np.int32) + assert (diff_attrs is not None) and len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + # Convert all inputs to tensors. + attr = tf.convert_to_tensor(attr, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + if diff_attrs: + rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if rast.shape.rank is not None: + out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None] + if attr.shape.rank in [2, 3]: + out_shape[3] = attr.shape[-1].value + + # Output pixel differentials for at least some attributes. + @tf.custom_gradient + def func_da(attr, rast, rast_db): + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + + # Infer number of channels in out_da. + if not diff_attrs_all: + da_channels = 2 * len(diff_attrs) + if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None): + da_channels = 2 * attr.shape[-1].value + else: + da_channels = None + + # Set output shapes. + out.set_shape(out_shape) + out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels]) + + def grad(dy, dda): + return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return (out, out_da), grad + + # No pixel differentials for any attribute. + @tf.custom_gradient + def func(attr, rast): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + out.set_shape(out_shape) + out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da. + def grad(dy, _): + return _get_plugin().interpolate_grad(attr, rast, tri, dy) + return (out, out_da), grad + + # Choose stub. + if diff_attrs: + return func_da(attr, rast, rast_db) + else: + return func(attr, rast) + +#---------------------------------------------------------------------------- +# Texture. 
+#---------------------------------------------------------------------------- + +def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None): + assert tex_const is True or tex_const is False + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear' + + # Known constant texture? + tex_const = tex_const or _is_constant(tex, np.float32) + + # Sanitize inputs. + tex_const = 1 if tex_const else 0 + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Convert inputs to tensors. + tex = tf.convert_to_tensor(tex, dtype=tf.float32) + uv = tf.convert_to_tensor(uv, dtype=tf.float32) + if 'mipmap' in filter_mode: + uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if uv.shape.rank is not None: + assert uv.shape.rank == 4 + out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None] + if tex.shape.rank is not None: + assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4) + out_shape[-1] = tex.shape[-1].value + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Linear-mipmap-linear: Mipmaps enabled, all gradients active. + @tf.custom_gradient + def func_linear_mipmap_linear(tex, uv, uv_da): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da. + @tf.custom_gradient + def func_linear_mipmap_nearest(tex, uv): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear: Mipmaps disabled, no uv_da, no gradients to uv_da. + @tf.custom_gradient + def func_linear(tex, uv): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv. + @tf.custom_gradient + def func_nearest(tex): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Choose stub. 
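+    # Each filter mode has its own @tf.custom_gradient stub above so that the closure only
+    # takes the tensors that should receive gradients: 'nearest' routes gradients to tex
+    # alone, 'linear' adds uv, and 'linear-mipmap-linear' also covers uv_da. For example,
+    # calling texture(tex, uv, uv_da) with the default filter_mode='auto' dispatches to the
+    # linear-mipmap-linear stub because uv_da is provided.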
+ if filter_mode == 'linear-mipmap-linear': + return func_linear_mipmap_linear(tex, uv, uv_da) + elif filter_mode == 'linear-mipmap-nearest': + return func_linear_mipmap_nearest(tex, uv) + elif filter_mode == 'linear': + return func_linear(tex, uv) + elif filter_mode == 'nearest': + return func_nearest(tex) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0): + assert tri_const is True or tri_const is False + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert inputs to tensors. + color = tf.convert_to_tensor(color, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + + # Sanitize inputs. + tri_const = 1 if tri_const else 0 + + @tf.custom_gradient + def func(color, pos): + color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const) + color_out.set_shape(color.shape) + def grad(dy): + grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + grad_pos = grad_pos * pos_gradient_boost + return grad_color, grad_pos + return color_out, grad + + return func(color, pos) + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py new file mode 100755 index 0000000000000000000000000000000000000000..d428c55de2194e42be331b1cad1b2162709a4cd4 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py @@ -0,0 +1,207 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import glob +import os +import re +import uuid +import hashlib +import tempfile +import shutil +import tensorflow as tf +from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module + +#---------------------------------------------------------------------------- +# Global options. + +_nvdiffrast_cache_dir = None + +def set_cache_dir(path: str) -> None: + '''Set CUDA kernel compilation temp dir. + + If `set_cache_dir` is not called, the cache directory will default to + one of the below: + + - Value of NVDIFFRAST_CACHE_DIR env var, if set + - $HOME/.cache/nvdiffrast if HOME env var is set + - $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set. 
+ + Args: + path: Where to save CUDA kernel build temporaries + ''' + global _nvdiffrast_cache_dir + _nvdiffrast_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _nvdiffrast_cache_dir is not None: + return os.path.join(_nvdiffrast_cache_dir, *paths) + if 'NVDIFFRAST_CACHE_DIR' in os.environ: + return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths) + +cuda_cache_version_tag = 'v1' +do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe! +verbose = True # Print status messages to stdout. + +#---------------------------------------------------------------------------- +# Internal helper funcs. + +def _find_compiler_bindir(): + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin' + if os.path.isdir(vc_bin_dir): + return vc_bin_dir + return None + +def _get_compute_cap(device): + caps_str = device.physical_device_desc + m = re.search('compute capability: (\\d+).(\\d+)', caps_str) + major = m.group(1) + minor = m.group(2) + return (major, minor) + +def _get_cuda_gpu_arch_string(): + gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU'] + if len(gpus) == 0: + raise RuntimeError('No GPU devices found') + (major, minor) = _get_compute_cap(gpus[0]) + return 'sm_%s%s' % (major, minor) + +def _run_cmd(cmd): + with os.popen(cmd) as pipe: + output = pipe.read() + status = pipe.close() + if status is not None: + raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output)) + +def _prepare_nvcc_cli(opts): + cmd = 'nvcc ' + opts.strip() + cmd += ' --disable-warnings' + cmd += ' --include-path "%s"' % tf.sysconfig.get_include() + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive') + + compiler_bindir = _find_compiler_bindir() + if compiler_bindir is None: + # Require that _find_compiler_bindir succeeds on Windows. Allow + # nvcc to use whatever is the default on Linux. + if os.name == 'nt': + raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' 
% __file__) + else: + cmd += ' --compiler-bindir "%s"' % compiler_bindir + cmd += ' 2>&1' + return cmd + +#---------------------------------------------------------------------------- +# Main entry point. + +_plugin_cache = dict() + +def get_plugin(cuda_file, extra_nvcc_options=[]): + cuda_file_base = os.path.basename(cuda_file) + cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base) + + # Already in cache? + if cuda_file in _plugin_cache: + return _plugin_cache[cuda_file] + + # Setup plugin. + if verbose: + print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True) + try: + # Hash CUDA source. + md5 = hashlib.md5() + with open(cuda_file, 'rb') as f: + md5.update(f.read()) + md5.update(b'\n') + + # Hash headers included by the CUDA code by running it through the preprocessor. + if not do_not_hash_included_headers: + if verbose: + print('Preprocessing... ', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext) + _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))) + with open(tmp_file, 'rb') as f: + bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros + good_file_str = ('"' + cuda_file_base + '"').encode('utf-8') + for ln in f: + if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas + ln = ln.replace(bad_file_str, good_file_str) + md5.update(ln) + md5.update(b'\n') + + # Select compiler options. + compile_opts = '' + if os.name == 'nt': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib') + compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find libraries during compilation. + elif os.name == 'posix': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so') + compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\'' + else: + assert False # not Windows or Linux, w00t? + compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string() + compile_opts += ' --use_fast_math' + for opt in extra_nvcc_options: + compile_opts += ' ' + opt + nvcc_cmd = _prepare_nvcc_cli(compile_opts) + + # Hash build configuration. + md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n') + md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n') + md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n') + + # Compile if not already compiled. + bin_file_ext = '.dll' if os.name == 'nt' else '.so' + cuda_cache_path = make_cache_dir_path() + bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext) + if not os.path.isfile(bin_file): + if verbose: + print('Compiling... ', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext) + _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)) + os.makedirs(cuda_cache_path, exist_ok=True) + intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext) + shutil.copyfile(tmp_file, intermediate_file) + os.rename(intermediate_file, bin_file) # atomic + + # Load. + if verbose: + print('Loading... ', end='', flush=True) + plugin = tf.load_op_library(bin_file) + + # Add to cache. 
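+        # Cache the loaded library per source path so later op constructions in the same
+        # process reuse it instead of re-hashing and re-compiling the CUDA file.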
+ _plugin_cache[cuda_file] = plugin + if verbose: + print('Done.', flush=True) + return plugin + + except: + if verbose: + print('Failed!', flush=True) + raise + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu new file mode 100755 index 0000000000000000000000000000000000000000..122cc02700c7b8eeda56736eb1a27f8f5104051b --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +// TF-specific helpers. + +#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0) +#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0) + +// Cuda kernels and CPP all together. What an absolute compilation unit. + +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#include "../common/framework.h" +#include "../common/glutil.cpp" + +#include "../common/common.h" +#include "../common/common.cpp" + +#include "../common/rasterize.h" +#include "../common/rasterize.cpp" +#include "../common/rasterize.cu" +#include "tf_rasterize.cu" + +#include "../common/interpolate.cu" +#include "tf_interpolate.cu" + +#include "../common/texture.cpp" +#include "../common/texture.cu" +#include "tf_texture.cu" + +#include "../common/antialias.cu" +#include "tf_antialias.cu" diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu new file mode 100755 index 0000000000000000000000000000000000000000..4e5c9c6d4afa05489d6ff8179c7d32f8d8e92025 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct AntialiasFwdOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. 
+ const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. + OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out")); + } + + // Get input pointers. + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensor. + Tensor* outputTensor = NULL; + TensorShape outputShape; + outputShape.AddDim(p.n); + outputShape.AddDim(p.height); + outputShape.AddDim(p.width); + outputShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor)); + p.output = outputTensor->flat().data(); + + // Allocate work buffer. One extra int4 for storing counters. + Tensor* workTensor = NULL; + TensorShape workShape; + workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 int for a maximum of two work items per pixel. + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor)); + p.workBuffer = (int4*)(workTensor->flat().data()); + + // Clear the work counters. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. 
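+        // The kernels read pos as float4, raster_out as float2 and the work buffer as int4,
+        // so the base pointers returned by TensorFlow must have the matching alignment.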
+ OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Kernel parameters. + void* args[] = {&p}; + + // (Re-)calculate opposite vertex hash. + if (!p.evHash || !p.tri_const) + { + if (p.allocTriangles < p.numTriangles) + { + p.allocTriangles = max(p.allocTriangles, 64); + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // (Re-)allocate memory for the hash. + OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash)); + OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4))); + LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles"; + } + + // Clear the hash and launch the mesh kernel to populate it. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4), stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } + + // Copy input to output as a baseline. + OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + // Choose launch parameters for the discontinuity finder kernel and launch. + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel. + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + + // Launch analysis kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasFwd") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Output ("output: float") + .Output ("work_buffer: int32") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct AntialiasGradOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. 
+ const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + const Tensor& dy = ctx->input(4); + const Tensor& workBuffer = ctx->input(5); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out")); + } + + // Get input pointers. + p.dy = dy.flat().data(); + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + p.workBuffer = (int4*)(workBuffer.flat().data()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate color gradient output tensor. 
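+        // grad_color is seeded with a copy of dy (see the memcpy below); the gradient kernel
+        // then only adjusts the entries that the forward pass blended across silhouette edges,
+        // so gradients for untouched pixels pass through unchanged.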
+ Tensor* gradColor = NULL; + TensorShape gradColorShape; + gradColorShape.AddDim(p.n); + gradColorShape.AddDim(p.height); + gradColorShape.AddDim(p.width); + gradColorShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor)); + p.gradColor = gradColor->flat().data(); + + // Allocate position gradient output tensor. + Tensor* gradPos = NULL; + TensorShape gradPosShape; + if (p.instance_mode) + gradPosShape.AddDim(p.n); + gradPosShape.AddDim(p.numVertices); + gradPosShape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos)); + p.gradPos = gradPos->flat().data(); + + // Initialize all the stuff. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter. + OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Launch the gradient kernel. + void* args[] = {&p}; + + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasGrad") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("work_buffer: int32") + .Output ("grad_color: float") + .Output ("grad_pos: float"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu new file mode 100755 index 0000000000000000000000000000000000000000..612ce1afc5ce41a25496523b193725c1edac64de --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu @@ -0,0 +1,301 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. 
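+// Shared by the forward and gradient op variants: the differentiable-attribute settings
+// (diff_attrs_all / diff_attrs) are parsed only when the instantiation computes attribute
+// pixel differentials (enableDA), since the plain ops do not declare those attributes.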
+ +static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA) +{ + if (enableDA) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all)); + if (!p.diff_attrs_all) + { + std::vector diff_attrs_vec; + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec)); + OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list")); + OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)")); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } + } +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +template +struct InterpolateFwdOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0; + + // Sanity checks. + OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + if (p.instance_mode) + OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db")); + OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. 
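+        // attrBC flags attribute broadcasting: in instanced mode a single attribute set of
+        // shape [1, numVertices, numAttr] may be shared by every frame in the minibatch.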
+ p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0; + p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0; + + // Allocate main output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.depth); + out_shape.AddDim(p.height); + out_shape.AddDim(p.width); + out_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Allocate pixel differential output tensor. + Tensor* out_da_tensor = NULL; + out_shape.set_dim(3, p.numDiffAttr * 2); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor)); + p.outDA = ENABLE_DA ? out_da_tensor->flat().data() : 0; + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + if (ENABLE_DA) + OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateFwd") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Output ("out: float") + .Output ("out_da: float"); + +REGISTER_OP("InterpolateFwdDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("rast_db: float") + .Output ("out: float") + .Output ("out_da: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + +REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct InterpolateGradOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3); + const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.depth = (rast.dims() > 0) ? 
rast.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1; + + // Sanity checks. + OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]")); + OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr")); + OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]")); + OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. + p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.dy = dy.flat().data(); + p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0; + p.dda = ENABLE_DA ? dda.flat().data() : 0; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate attribute gradient output tensor. + Tensor* grad_attr_tensor = NULL; + TensorShape grad_attr_shape; + if (p.instance_mode) + grad_attr_shape.AddDim(attr_depth); + grad_attr_shape.AddDim(p.numVertices); + grad_attr_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor)); + p.gradAttr = grad_attr_tensor->flat().data(); + + // Allocate bary gradient output tensor. + Tensor* grad_rast_tensor = NULL; + TensorShape grad_rast_shape; + grad_rast_shape.AddDim(p.depth); + grad_rast_shape.AddDim(p.height); + grad_rast_shape.AddDim(p.width); + grad_rast_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor)); + p.gradRaster = grad_rast_tensor->flat().data(); + + // Allocate bary pixel diff gradient output tensor. + if (ENABLE_DA) + { + Tensor* grad_rast_db_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor)); + p.gradRasterDB = grad_rast_db_tensor->flat().data(); + } + + // Clear attribute gradients. + cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4")); + } + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateGrad") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + ; + +REGISTER_OP("InterpolateGradDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("rast_db: float") + .Input ("dda: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + .Output ("grad_rast_db: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + ; + +REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu new file mode 100755 index 0000000000000000000000000000000000000000..bc9d0714e5b9e5f172dc4985d3ead48c65117271 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu @@ -0,0 +1,241 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct RasterizeFwdOp : public OpKernel +{ + RasterizeGLState m_glState; // OpenGL-related persistent state. + int m_tri_const; // 1 if triangle array is known to be constant. + + RasterizeFwdOp(OpKernelConstruction* ctx): + OpKernel(ctx) + { + memset(&m_glState, 0, sizeof(RasterizeGLState)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + cudaStream_t stream = ctx->eigen_device().stream(); + + // Check that input shapes are correct. 
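+        // Two calling conventions are supported: instanced mode (pos has rank 3, one vertex
+        // set per minibatch frame) and range mode (pos has rank 2 and 'ranges' assigns a
+        // triangle span to each frame). The checks below enforce the matching shapes.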
+ const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& resolution = ctx->input(2); + const Tensor& ranges = ctx->input(3); + + // Determine number of outputs + int num_outputs = m_glState.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.dims() > 2; + if (instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]")); + } + + // Get output shape. + const int32_t* res_in = resolution.flat().data(); // This is in CPU memory. + int height = res_in[0]; + int width = res_in[1]; + int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0); + OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]")); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1); + int triCount = 3 * tri.dim_size(0); + + // Init context and GL? + bool initCtx = !m_glState.glFBO; + if (initCtx) + { + const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info(); + int cudaDeviceIdx = g ? g->gpu_id : -1; + rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp + } + else + setGLContext(m_glState.glctx); // (Re-)Activate GL context. + + // Resize all buffers. + rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp + + // Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround. + if (initCtx) + { + // On first execution, do a bonus context swap. + releaseGLContext(); + setGLContext(m_glState.glctx); + } + + // Copy input data to GL and render. + const float* posPtr = pos.flat().data(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat().data(); // This is in CPU memory. + const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat().data() : NULL; // Copy triangles only if needed. + int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0; + rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1); + + // Allocate output tensors. + TensorShape output_shape; + output_shape.AddDim(depth); + output_shape.AddDim(height); + output_shape.AddDim(width); + output_shape.AddDim(4); + float* outputPtr[2]; + for (int i=0; i < 2; i++) + { + if (i >= num_outputs) + output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor. 
+ Tensor* output_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor)); + if (i < num_outputs) + outputPtr[i] = output_tensor->flat().data(); + } + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth); + + // Done. Release GL context. + releaseGLContext(); + } +}; + +REGISTER_OP("RasterizeFwd") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("resolution: int32") + .Input ("ranges: int32") + .Output ("out: float") + .Output ("out_db: float") + .Attr ("enable_db: int") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct RasterizeGradOp : public OpKernel +{ + RasterizeGradParams m_attribs; + + RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + RasterizeGradParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Input tensors. + const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& out = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3); + + // Determine instance mode. + p.instance_mode = (pos.dims() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4")); + p.depth = out.dim_size(0); + p.height = out.dim_size(1); + p.width = out.dim_size(2); + OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]")); + + // Check other shapes. + if (p.instance_mode) + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]")); + else + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]")); + if (ENABLE_DB) + OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]")); + + // Populate parameters. + p.numTriangles = tri.dim_size(0); + p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0); + p.pos = pos.flat().data(); + p.tri = tri.flat().data(); + p.out = out.flat().data(); + p.dy = dy.flat().data(); + p.ddb = ENABLE_DB ? ddb.flat().data() : 0; + + // Set up pixel position to clip space x, y transform. 
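+        // With xs = 2/width and xo = 1/width - 1, a pixel index px maps to the clip-space
+        // x coordinate of its center: xs*px + xo = (px + 0.5) * (2/width) - 1; y likewise.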
+ p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + Tensor* grad_tensor = NULL; + TensorShape grad_shape; + if (p.instance_mode) + grad_shape.AddDim(p.depth); + grad_shape.AddDim(p.numVertices); + grad_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor)); + p.grad = grad_tensor->flat().data(); + + // Clear the output buffers. + size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float); + cudaMemsetAsync(p.grad, 0, gradBytes, stream); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2")); + if (ENABLE_DB) + OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("RasterizeGrad") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Output ("grad: float"); + +REGISTER_OP("RasterizeGradDb") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Input ("ddb: float") + .Output ("grad: float"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp); +REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu new file mode 100755 index 0000000000000000000000000000000000000000..c5382fed28236da09d20a04c0524a937383daf5a --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu @@ -0,0 +1,525 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p) +{ + // Mip and filter modes. + OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode)); + OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported")); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. 
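+    // A max_mip_level of -1 leaves the number of mip levels unlimited. tex_const exists
+    // only on the forward op, hence the GetAttr call below is not wrapped in OP_REQUIRES_OK.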
+ if (p.enableMip) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit)); + OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level")); + ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op. + } + + // Boundary mode. + OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode)); + OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported")); +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct TextureFwdOp : public OpKernel +{ + TextureKernelParams m_attribs; + PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled. + bool m_persistentMipTensorInitialized; + + TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + m_persistentMipTensorInitialized = false; + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. 
+ if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + + // Allocate output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.n); + out_shape.AddDim(p.imgHeight); + out_shape.AddDim(p.imgWidth); + out_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Mip output tensor. + Tensor* mip_tensor = NULL; + TensorShape mip_shape; + mip_shape.AddDim(mipTotal); + + // If texture is constant, calculate mip stack only once. + bool computeMip = true; + if (p.texConst) + { + // First execution? + if (!m_persistentMipTensorInitialized) + { + // Allocate a persistent mip tensor. + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor)); + m_persistentMipTensorInitialized = true; + } + else + { + // Reuse the persistent tensor, do not recompute mip levels. + mip_tensor = m_persistentMipTensor.AccessTensor(ctx); + computeMip = false; + } + + // Set as output tensor as well. + ctx->set_output(1, *mip_tensor); + } + else + { + // Allocate an output tensor as usual. 
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor)); + } + + pmip = mip_tensor->flat().data(); // Pointer to data. + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Build mip levels if needed. + if (computeMip) + { + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2")); + } + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + else + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, and datatype. + void* func_tbl[TEX_MODE_COUNT * 3 * 2] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + }; + + // Function index. 
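+        // The forward kernel is picked from a flat table ordered as
+        // [cube mode][filter mode][channel variant]: cube-map kernels occupy the second
+        // half, and each mode comes in 1-, 2- and 4-channel vectorized flavors.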
+ int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + func_idx = func_idx * 3 + channel_div_idx; + + // Launch kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("TextureFwd") + .Input ("tex: float") + .Input ("uv: float") + .Output ("out: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureFwdMip") + .Input ("tex: float") + .Input ("uv: float") + .Input ("uv_da: float") + .Output ("out: float") + .Output ("mip: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("tex_const: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); +REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct TextureGradOp : public OpKernel +{ + TextureKernelParams m_attribs; + + TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& dy = ctx->input(2); + const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2); + const Tensor& mip = ctx->input(p.enableMip ? 4 : 2); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. 
+ if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.dy = dy.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + float* pmip = p.enableMip ? (float*)mip.flat().data() : 0; + + // Allocate output tensor for tex gradient. + Tensor* grad_tex_tensor = NULL; + TensorShape grad_tex_shape; + grad_tex_shape.AddDim(p.texDepth); + if (cube_mode) + grad_tex_shape.AddDim(6); + grad_tex_shape.AddDim(p.texHeight); + grad_tex_shape.AddDim(p.texWidth); + grad_tex_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor)); + p.gradTex[0] = grad_tex_tensor->flat().data(); + + // Allocate output tensor for uv gradient. + if (p.filterMode != TEX_MODE_NEAREST) + { + TensorShape grad_uv_shape; + Tensor* grad_uv_tensor = NULL; + grad_uv_shape.AddDim(p.n); + grad_uv_shape.AddDim(p.imgHeight); + grad_uv_shape.AddDim(p.imgWidth); + grad_uv_shape.AddDim(uv.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor)); + p.gradUV = grad_uv_tensor->flat().data(); + + // Allocate output tensor for uv_da gradient. + if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + Tensor* grad_uv_da_tensor = NULL; + grad_uv_shape.set_dim(3, uv_da.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor)); + p.gradUVDA = grad_uv_da_tensor->flat().data(); + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. 
+ else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + Tensor grad_mip_tensor; + float* pgradMip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Get space for temporary mip gradients. + TensorShape grad_mip_shape; + grad_mip_shape.AddDim(mipTotal); + ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor); + pgradMip = grad_mip_tensor.flat().data(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + + // Clear mip gradients. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream)); + } + + // Initialize texture gradients to zero. + int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float); + if (cube_mode) + texBytes *= 6; + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4")); + } + else + { + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2")); + } + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2")); + } + + // Choose launch parameters for main gradient kernel. 
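+        // One thread per output pixel: thread blocks tile the image plane and the
+        // minibatch index is carried in the grid's depth dimension.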
+ void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + + // Launch main gradient kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. + if (p.enableMip) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + } +}; + +REGISTER_OP("TextureGradNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinearMipmapNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_OP("TextureGradLinearMipmapLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Output ("grad_uv_da: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp); + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/__init__.py b/pose_estimation/nvdiffrast/nvdiffrast/torch/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..9043ba774987338b238a723b0ad06639b9b2a98c --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash +__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"] diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/ops.py b/pose_estimation/nvdiffrast/nvdiffrast/torch/ops.py new file mode 100755 index 0000000000000000000000000000000000000000..be603675873437b01fd0976588a6020d7086fd9b --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/ops.py @@ -0,0 +1,640 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import logging +import numpy as np +import os +import sys +import torch +import torch.utils.cpp_extension + +#---------------------------------------------------------------------------- +# C++/Cuda plugin compiler/loader. + +_cached_plugin = None +def _get_plugin(): + # Return cached plugin if already loaded. + global _cached_plugin + if _cached_plugin is not None: + return _cached_plugin + + # Make sure we can find the necessary compiler and libary binaries. + if os.name == 'nt': + lib_dir = os.path.dirname(__file__) + r"\..\lib" + def find_cl_path(): + import glob + for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']: + paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True) + if paths: + return paths[0] + + # If cl.exe is not on path, try to find it. + if os.system("where cl.exe >nul 2>nul") != 0: + cl_path = find_cl_path() + if cl_path is None: + raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + os.environ['PATH'] += ';' + cl_path + + # Compiler options. + opts = ['-DNVDR_TORCH'] + + # Linker options. + if os.name == 'posix': + ldflags = ['-lGL', '-lEGL'] + elif os.name == 'nt': + libs = ['gdi32', 'opengl32', 'user32', 'setgpu'] + ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs] + + # List of source files. + source_files = [ + '../common/common.cpp', + '../common/glutil.cpp', + '../common/rasterize.cu', + '../common/rasterize.cpp', + '../common/interpolate.cu', + '../common/texture.cu', + '../common/texture.cpp', + '../common/antialias.cu', + 'torch_bindings.cpp', + 'torch_rasterize.cpp', + 'torch_interpolate.cpp', + 'torch_texture.cpp', + 'torch_antialias.cpp', + ] + + # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine. + os.environ['TORCH_CUDA_ARCH_LIST'] = '' + + # Try to detect if a stray lock file is left in cache directory and show a warning. 
This sometimes happens on Windows if the build is interrupted at just the right moment. + plugin_name = 'nvdiffrast_plugin' + try: + lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock') + if os.path.exists(lock_fn): + logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn) + except: + pass + + # Compile and load. + source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files] + torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False) + + # Import, cache, and return the compiled module. + import nvdiffrast_plugin + _cached_plugin = nvdiffrast_plugin + return _cached_plugin + +#---------------------------------------------------------------------------- +# Log level. +#---------------------------------------------------------------------------- + +def get_log_level(): + '''Get current log level. + + Returns: + Current log level in nvdiffrast. See `set_log_level()` for possible values. + ''' + return _get_plugin().get_log_level() + +def set_log_level(level): + '''Set log level. + + Log levels follow the convention on the C++ side of Torch: + 0 = Info, + 1 = Warning, + 2 = Error, + 3 = Fatal. + The default log level is 1. + + Args: + level: New log level as integer. Internal nvdiffrast messages of this + severity or higher will be printed, while messages of lower + severity will be silent. + ''' + _get_plugin().set_log_level(level) + +#---------------------------------------------------------------------------- +# GL State wrapper. +#---------------------------------------------------------------------------- + +class RasterizeGLContext: + def __init__(self, output_db=True, mode='automatic', device=None): + '''Create a new OpenGL rasterizer context. + + Creating an OpenGL context is a slow operation so you should reuse the same + context in all calls to `rasterize()` on the same CPU thread. The OpenGL context + is deleted when the object is destroyed. + + Args: + output_db (bool): Compute and output image-space derivates of barycentrics. + mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'. + device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created OpenGL rasterizer context. + ''' + assert output_db is True or output_db is False + assert mode in ['automatic', 'manual'] + self.output_db = output_db + self.mode = mode + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx) + self.active_depth_peeler = None # For error checking only + + def set_context(self): + '''Set (activate) OpenGL context in the current CPU thread. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.set_context() + + def release_context(self): + '''Release (deactivate) currently active OpenGL context. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.release_context() + +#---------------------------------------------------------------------------- +# Rasterize. 
+#---------------------------------------------------------------------------- + +class _rasterize_func(torch.autograd.Function): + @staticmethod + def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db, peeling_idx): + out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx) + ctx.save_for_backward(pos, tri, out) + ctx.saved_grad_db = grad_db + return out, out_db + + @staticmethod + def backward(ctx, dy, ddb): + pos, tri, out = ctx.saved_variables + if ctx.saved_grad_db: + g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy) + return None, g_pos, None, None, None, None, None + +# Op wrapper. +def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Rasterize triangles. + + All input tensors must be contiguous and reside in GPU memory except for + the `ranges` tensor that, if specified, has to reside in CPU memory. The + output tensors will be contiguous and reside in GPU memory. + + Args: + glctx: OpenGL context of type `RasterizeGLContext`. + pos: Vertex position tensor with dtype `torch.float32`. To enable range + mode, this tensor should have a 2D shape [num_vertices, 4]. To enable + instanced mode, use a 3D shape [minibatch_size, num_vertices, 4]. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. + resolution: Output resolution as integer tuple (height, width). + ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype + `torch.int32`, specifying start indices and counts into `tri`. + Ignored in instanced mode. + grad_db: Propagate gradients of image-space derivatives of barycentrics + into `pos` in backward pass. Ignored if OpenGL context was + not configured to output image-space derivatives. + + Returns: + A tuple of two tensors. The first output tensor has shape [minibatch_size, + height, width, 4] and contains the main rasterizer output in order (u, v, z/w, + triangle_id). If the OpenGL context was configured to output image-space + derivatives of barycentrics, the second output tensor will also have shape + [minibatch_size, height, width, 4] and contain said derivatives in order + (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape + [minibatch_size, height, width, 0]. + ''' + assert isinstance(glctx, RasterizeGLContext) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Check that context is not currently reserved for depth peeling. + if glctx.active_depth_peeler is not None: + return RuntimeError("Cannot call rasterize() during depth peeling operation, use rasterize_next_layer() instead") + + # Instantiate the function. + return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db, -1) + +#---------------------------------------------------------------------------- +# Depth peeler context manager for rasterizing multiple depth layers. +#---------------------------------------------------------------------------- + +class DepthPeeler: + def __init__(self, glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Create a depth peeler object for rasterizing multiple depth layers. + + Arguments are the same as in `rasterize()`. 
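+
+        A minimal usage sketch (illustrative only; `glctx`, `pos`, `tri` and `resolution`
+        are set up exactly as for `rasterize()`, and `num_layers` is chosen by the caller):
+
+            with DepthPeeler(glctx, pos, tri, resolution) as peeler:
+                for _ in range(num_layers):
+                    rast, rast_db = peeler.rasterize_next_layer()
+                    # ... shade or composite this depth layer here ...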
+ + Returns: + The newly created depth peeler. + ''' + assert isinstance(glctx, RasterizeGLContext) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs as usual. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Store all the parameters. + self.glctx = glctx + self.pos = pos + self.tri = tri + self.resolution = resolution + self.ranges = ranges + self.grad_db = grad_db + self.peeling_idx = None + + def __enter__(self): + if self.glctx is None: + raise RuntimeError("Cannot re-enter a terminated depth peeling operation") + if self.glctx.active_depth_peeler is not None: + raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a RasterizeGLContext") + self.glctx.active_depth_peeler = self + self.peeling_idx = 0 + return self + + def __exit__(self, *args): + assert self.glctx.active_depth_peeler is self + self.glctx.active_depth_peeler = None + self.glctx = None # Remove all references to input tensor so they're not left dangling. + self.pos = None + self.tri = None + self.resolution = None + self.ranges = None + self.grad_db = None + self.peeling_idx = None + return None + + def rasterize_next_layer(self): + '''Rasterize next depth layer. + + Operation is equivalent to `rasterize()` except that previously reported + surface points are culled away. + + Returns: + A tuple of two tensors as in `rasterize()`. + ''' + assert self.glctx.active_depth_peeler is self + assert self.peeling_idx >= 0 + result = _rasterize_func.apply(self.glctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx) + self.peeling_idx += 1 + return result + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +# Output pixel differentials for at least some attributes. +class _interpolate_func_da(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list): + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + ctx.save_for_backward(attr, rast, tri, rast_db) + ctx.saved_misc = diff_attrs_all, diff_attrs_list + return out, out_da + + @staticmethod + def backward(ctx, dy, dda): + attr, rast, tri, rast_db = ctx.saved_variables + diff_attrs_all, diff_attrs_list = ctx.saved_misc + g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return g_attr, g_rast, None, g_rast_db, None, None + +# No pixel differential for any attribute. +class _interpolate_func(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + ctx.save_for_backward(attr, rast, tri) + return out, out_da + + @staticmethod + def backward(ctx, dy, _): + attr, rast, tri = ctx.saved_variables + g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy) + return g_attr, g_rast, None + +# Op wrapper. +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + """Interpolate vertex attributes. + + All input tensors must be contiguous and reside in GPU memory. The output tensors + will be contiguous and reside in GPU memory. 
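+
+    A minimal usage sketch (illustrative; `attr`, `tri` and the rasterizer outputs
+    `rast_out`, `rast_out_db` come from the caller and a preceding `rasterize()` call):
+
+        pix_attr, _ = interpolate(attr, rast_out, tri)
+        pix_attr, pix_attr_da = interpolate(attr, rast_out, tri, rast_db=rast_out_db, diff_attrs='all')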
+ + Args: + attr: Attribute tensor with dtype `torch.float32`. + Shape is [num_vertices, num_attributes] in range mode, or + [minibatch_size, num_vertices, num_attributes] in instanced mode. + Broadcasting is supported along the minibatch axis. + rast: Main output tensor from `rasterize()`. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. + rast_db: (Optional) Tensor containing image-space derivatives of barycentrics, + i.e., the second output tensor from `rasterize()`. Enables computing + image-space derivatives of attributes. + diff_attrs: (Optional) List of attribute indices for which image-space + derivatives are to be computed. Special value 'all' is equivalent + to list [0, 1, ..., num_attributes - 1]. + + Returns: + A tuple of two tensors. The first output tensor contains interpolated + attributes and has shape [minibatch_size, height, width, num_attributes]. + If `rast_db` and `diff_attrs` were specified, the second output tensor contains + the image-space derivatives of the selected attributes and has shape + [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the + first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. + Otherwise, the second output tensor will be an empty tensor with shape + [minibatch_size, height, width, 0]. + """ + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = np.asarray(diff_attrs, np.int32) + assert len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri)) + if diff_attrs: + assert isinstance(rast_db, torch.Tensor) + + # Choose stub. + if diff_attrs: + return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + else: + return _interpolate_func.apply(attr, rast, tri) + +#---------------------------------------------------------------------------- +# Texture +#---------------------------------------------------------------------------- + +# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled. 
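+# (The mip-enabled and mip-free paths use separate autograd functions so that the simple
+# path does not have to thread mip wrappers and mip stacks through save_for_backward.)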
+class _texture_func_mip(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack): + empty = torch.tensor([]) + if uv_da is None: + uv_da = empty + if mip_level_bias is None: + mip_level_bias = empty + if mip_wrapper is None: + mip_wrapper = _get_plugin().TextureMipWrapper() + out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv, uv_da, mip_level_bias, *mip_stack) + ctx.saved_misc = filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv, uv_da, mip_level_bias, *mip_stack = ctx.saved_variables + filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear-mipmap-linear': + g_tex, g_uv, g_uv_da, g_mip_level_bias, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None) + tuple(g_mip_stack) + else: # linear-mipmap-nearest + g_tex, g_uv, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, None, None, None, None, None) + tuple(g_mip_stack) + +# Linear and nearest: Mipmaps disabled. +class _texture_func(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv) + ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv = ctx.saved_variables + filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear': + g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, g_uv, None, None + else: # nearest + g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, None, None, None + +# Op wrapper. +def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None): + """Perform texture sampling. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape + [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, + must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where + tex_width and tex_height are equal. Note that `boundary_mode` must also be set + to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis. + uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture, + must have shape [minibatch_size, height, width, 2]. When sampling a cube map + texture, must have shape [minibatch_size, height, width, 3]. + uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates. + Must have same shape as `uv` except for the last dimension that is to be twice + as long. 
+ mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted, + determines mip level directly. Must have shape [minibatch_size, height, width]. + mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list + of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, + the tensors in the list must follow the same format as `tex` except for width and + height that must follow the usual rules for mipmap sizes. The base level texture + is still supplied in `tex` and must not be included in the list. Gradients of a + custom mipmap stack are not automatically propagated to base texture but the mipmap + tensors will receive gradients of their own. If a mipmap stack is not specified + but the chosen filter mode requires it, the mipmap stack is constructed internally + and discarded afterwards. + filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest', + 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' + selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and + 'linear-mipmap-linear' when at least one of them is specified, these being + the highest-quality modes possible depending on the availability of the + image-space derivatives of the texture coordinates or direct mip level information. + boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a + cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional + part of texture coordinates. Mode 'clamp' clamps texture coordinates to the + centers of the boundary texels. Mode 'zero' virtually extends the texture with + all-zero values in all directions. + max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based + filter modes. + + Returns: + A tensor containing the results of the texture sampling with shape + [minibatch_size, height, width, tex_channels]. + """ + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear' + + # Sanitize inputs. + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Check inputs. + assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor) + if 'mipmap' in filter_mode: + assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor) + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Construct a mipmap if necessary. + if 'mipmap' in filter_mode: + mip_wrapper, mip_stack = None, [] + if mip is not None: + assert isinstance(mip, (_get_plugin().TextureMipWrapper, list)) + if isinstance(mip, list): + assert all(isinstance(x, torch.Tensor) for x in mip) + mip_stack = mip + else: + mip_wrapper = mip + else: + mip_wrapper = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube') + + # Choose stub. 
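+    # Custom mip levels are passed through *mip_stack so that autograd can return a
+    # separate gradient tensor for each level (the tuple(g_mip_stack) in the backward pass).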
+ if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest': + return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack) + else: + return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum) + +# Mipmap precalculation for cases where the texture stays constant. +def texture_construct_mip(tex, max_mip_level=None, cube_mode=False): + """Construct a mipmap stack for a texture. + + This function can be used for constructing a mipmap stack for a texture that is known to remain + constant. This avoids reconstructing it every time `texture()` is called. + + Args: + tex: Texture tensor with the same constraints as in `texture()`. + max_mip_level: If specified, limits the number of mipmaps constructed. + cube_mode: Must be set to True if `tex` specifies a cube map texture. + + Returns: + An opaque object containing the mipmap stack. This can be supplied in a call to `texture()` + in the `mip` argument. + """ + + assert isinstance(tex, torch.Tensor) + assert cube_mode is True or cube_mode is False + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +class _antialias_func(torch.autograd.Function): + @staticmethod + def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost): + out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash) + ctx.save_for_backward(color, rast, pos, tri) + ctx.saved_misc = pos_gradient_boost, work_buffer + return out + + @staticmethod + def backward(ctx, dy): + color, rast, pos, tri = ctx.saved_variables + pos_gradient_boost, work_buffer = ctx.saved_misc + g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + g_pos = g_pos * pos_gradient_boost + return g_color, None, g_pos, None, None, None + +# Op wrapper. +def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + """Perform antialiasing. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + color: Input image to antialias with shape [minibatch_size, height, width, num_channels]. + rast: Main output tensor from `rasterize()`. + pos: Vertex position tensor used in the rasterization operation. + tri: Triangle tensor used in the rasterization operation. + topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. If not + specified, the topology hash is constructed internally and discarded afterwards. + pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`. + + Returns: + A tensor containing the antialiased image with the same shape as `color` input tensor. + """ + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri)) + + # Construct topology hash unless provided by user. + if topology_hash is not None: + assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper) + else: + topology_hash = _get_plugin().antialias_construct_topology_hash(tri) + + # Instantiate the function. 
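+    # The forward op also produces an auxiliary work buffer; it is kept in ctx.saved_misc
+    # and consumed again by antialias_grad in the backward pass.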
+ return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost) + +# Topology hash precalculation for cases where the triangle array stays constant. +def antialias_construct_topology_hash(tri): + """Construct a topology hash for a triangle tensor. + + This function can be used for constructing a topology hash for a triangle tensor that is + known to remain constant. This avoids reconstructing it every time `antialias()` is called. + + Args: + tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in + GPU memory. + + Returns: + An opaque object containing the topology hash. This can be supplied in a call to + `antialias()` in the `topology_hash` argument. + """ + assert isinstance(tri, torch.Tensor) + return _get_plugin().antialias_construct_topology_hash(tri) + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a926adc7dc68eb30811de6a3571a0a545c7b2a20 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp @@ -0,0 +1,239 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/antialias.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void AntialiasFwdMeshKernel (const AntialiasKernelParams p); +void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p); +void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p); +void AntialiasGradKernel (const AntialiasKernelParams p); + +//------------------------------------------------------------------------ +// Topology hash construction. + +TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tri)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + + // Check inputs. + NVDR_CHECK_DEVICE(tri); + NVDR_CHECK_CONTIGUOUS(tri); + NVDR_CHECK_I32(tri); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Fill in kernel parameters. + p.numTriangles = tri.size(0); + p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check. + p.tri = tri.data_ptr(); + + // Kernel parameters. + p.allocTriangles = p.allocTriangles < 64 ? 64 : p.allocTriangles; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Construct the hash tensor and get pointer. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + torch::Tensor ev_hash = torch::zeros({p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * 4}, opts); + p.evHash = (uint4*)(ev_hash.data_ptr()); + + // Check alignment. 
+ NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4"); + + // Populate the hash. + void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return. + TopologyHashWrapper hash_wrap; + hash_wrap.ev_hash = ev_hash; + return hash_wrap; +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap. + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash); + NVDR_CHECK_F32(color, rast, pos); + NVDR_CHECK_I32(tri, topology_hash); + + // Sanity checks. + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.evHash = (uint4*)(topology_hash.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE); + + // Allocate output tensors. + torch::Tensor out = color.detach().clone(); // Use color as base. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel. + p.output = out.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Clear the work counters. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. 
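+ // (The kernels read pos as float4, the rasterizer output as float2 and the work buffer as
+ // int4, so the checks below enforce the 16/8-byte alignment those vectorized accesses need.)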
+ NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4"); + + // Choose launch parameters for the discontinuity finder kernel and launch. + void* args[] = {&p}; + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel and launch. + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. + return std::tuple(out, work_buffer); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer); + NVDR_CHECK_F32(color, rast, pos, dy, work_buffer); + NVDR_CHECK_I32(tri); + + // Sanity checks. 
+ NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions"); + NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) ==color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Ensure dy is contiguous. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.dy = dy_.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensors. + torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base. + torch::Tensor grad_pos = torch::zeros_like(pos); + p.gradColor = grad_color.data_ptr(); + p.gradPos = grad_pos.data_ptr(); + + // Clear gradient kernel work counter. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + + // Determine optimum block size for the gradient kernel and launch. 
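+ // (The gradient kernel is launched as a persistent grid sized to fill the device: the CUDA
+ // occupancy API reports resident blocks per SM, and numCTA * numSM blocks then consume the
+ // work items that the forward pass recorded in work_buffer.)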
+ void* args[] = {&p};
+ int device = 0;
+ int numCTA = 0;
+ int numSM = 0;
+ NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
+ NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+
+ // Return results.
+ return std::tuple<torch::Tensor, torch::Tensor>(grad_color, grad_pos);
+}
+
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..ed0ae0645a5ed82e4a0760c3e3a5f92aea8f85e6
--- /dev/null
+++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "torch_common.inl"
+#include "torch_types.h"
+#include <tuple>
+
+//------------------------------------------------------------------------
+// Op prototypes. Return type macros for readability.
+
+#define OP_RETURN_T torch::Tensor
+#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTV std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
+#define OP_RETURN_TTTTV std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
+
+OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int depth_idx);
+OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
+OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
+OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
+OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
+OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy);
+OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
+TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
+OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
+OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
+OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
+OP_RETURN_TTV texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+OP_RETURN_TTTTV texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode);
+TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
+OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
+OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ // State classes.
+ pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
+ .def("set_context", &RasterizeGLStateWrapper::setContext)
+ .def("release_context", &RasterizeGLStateWrapper::releaseContext);
+ pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper").def(pybind11::init<>());
+ pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
+
+ // Plumbing to torch/c10 logging system.
+ m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level");
+ m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");
+
+ // Ops.
+ m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
+ m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
+ m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
+ m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op without attribute derivatives");
+ m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op with attribute derivatives");
+ m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op without attribute derivatives");
+ m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op with attribute derivatives");
+ m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
+ m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
+ m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
+ m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
+ m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
+ m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
+ m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode");
+ m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction");
+ m.def("antialias_fwd", &antialias_fwd, "antialias forward op");
+ m.def("antialias_grad", &antialias_grad, "antialias gradient op");
+}
+
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_common.inl b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_common.inl
new file mode 100755
index 0000000000000000000000000000000000000000..74dea41528822294878d9ee5d36d1230d1df7ae6
--- /dev/null
+++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_common.inl
@@ -0,0 +1,29 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "../common/framework.h"
+
+//------------------------------------------------------------------------
+// Input check helpers.
+//------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#define __func__ __FUNCTION__
+#endif
+
+#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device"); } while(0)
+#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
+#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
+#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
+#define NVDR_CHECK_I32(...) do { nvdr_check_i32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be int32 tensors"); } while(0)
+inline void nvdr_check_cpu(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.device().type() == c10::DeviceType::CPU, func, err_msg); }
+inline void nvdr_check_contiguous(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.is_contiguous(), func, err_msg); }
+inline void nvdr_check_f32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kFloat32, func, err_msg); }
+inline void nvdr_check_i32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kInt32, func, err_msg); }
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..b2c99fccfe0b11b71018e2c0ddcf637a337522b8
--- /dev/null
+++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp
@@ -0,0 +1,250 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "torch_common.inl"
+#include "../common/common.h"
+#include "../common/interpolate.h"
+
+//------------------------------------------------------------------------
+// Kernel prototypes.
+ +void InterpolateFwdKernel (const InterpolateKernelParams p); +void InterpolateFwdKernelDa (const InterpolateKernelParams p); +void InterpolateGradKernel (const InterpolateKernelParams p); +void InterpolateGradKernelDa(const InterpolateKernelParams p); + +//------------------------------------------------------------------------ +// Helper + +static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + if (diff_attrs_all) + { + p.numDiffAttr = p.numAttr; + p.diff_attrs_all = 1; + } + else + { + NVDR_CHECK(diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, "too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, rast_db); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, rast_db); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast); + NVDR_CHECK_I32(tri); + } + + // Sanity checks. + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK( tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + if (p.instance_mode) + NVDR_CHECK(attr.size(0) == rast.size(0) || attr.size(0) == 1, "minibatch size mismatch between inputs rast, attr"); + if (enable_da) + { + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0; + + // Allocate output tensors. 
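+ // (out holds the interpolated attributes, shape [depth, height, width, numAttr]; out_da holds
+ // the image-space derivative pair (d/dX, d/dY) for each selected attribute, hence the
+ // 2 * numDiffAttr channels, and is only written when attribute differentials are enabled.)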
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts); + torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts); + + p.out = out.data_ptr(); + p.outDA = enable_da ? out_da.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.outDA & 7), "out_da output tensor not aligned to float2"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(out, out_da); +} + +// Version without derivatives. +std::tuple interpolate_fwd(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + return interpolate_fwd_da(attr, rast, tri, empty_tensor, false, empty_vec); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy, rast_db, dda); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, dy, rast_db, dda); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast, dy); + NVDR_CHECK_I32(tri); + } + + // Depth of attributes. + int attr_depth = p.instance_mode ? (attr.sizes().size() > 1 ? attr.size(0) : 0) : 1; + + // Sanity checks. 
+ NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) == rast.size(1) && dy.size(2) == rast.size(2) && dy.size(3) > 0, "dy must have shape [>0, height, width, >0]"); + NVDR_CHECK(dy.size(3) == attr.size(attr.sizes().size() - 1), "argument count mismatch between inputs dy, attr"); + NVDR_CHECK((attr_depth == rast.size(0) || attr_depth == 1) && dy.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, dy, attr"); + if (enable_da) + { + NVDR_CHECK(dda.sizes().size() == 4 && dda.size(0) > 0 && dda.size(1) == rast.size(1) && dda.size(2) == rast.size(2), "dda must have shape [>0, height, width, ?]"); + NVDR_CHECK(dda.size(0) == rast.size(0), "minibatch size mismatch between rast, dda"); + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor dda_; + if (enable_da) + dda_ = dda.contiguous(); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.dy = dy_.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.dda = enable_da ? dda_.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor gradAttr = torch::zeros_like(attr); + torch::Tensor gradRaster = torch::empty_like(rast); + torch::Tensor gradRasterDB; + if (enable_da) + gradRasterDB = torch::empty_like(rast_db); + + p.gradAttr = gradAttr.data_ptr(); + p.gradRaster = gradRaster.data_ptr(); + p.gradRasterDB = enable_da ? gradRasterDB.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dda & 7), "dda input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradRaster & 15), "grad_rast output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradRasterDB & 15), "grad_rast_db output tensor not aligned to float4"); + + // Choose launch parameters. 
+ dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(gradAttr, gradRaster, gradRasterDB); +} + +// Version without derivatives. +std::tuple interpolate_grad(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + std::tuple result = interpolate_grad_da(attr, rast, tri, dy, empty_tensor, empty_tensor, false, empty_vec); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a52960347e04cfb84e1762b2d401106ce25ee609 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp @@ -0,0 +1,223 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void RasterizeGradKernel(const RasterizeGradParams p); +void RasterizeGradKernelDb(const RasterizeGradParams p); + +//------------------------------------------------------------------------ +// Python GL state wrapper methods. + +RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_) +{ + pState = new RasterizeGLState(); + automatic = automatic_; + cudaDeviceIdx = cudaDeviceIdx_; + memset(pState, 0, sizeof(RasterizeGLState)); + pState->enableDB = enableDB ? 1 : 0; + rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_); + releaseGLContext(); +} + +RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void) +{ + destroyGLContext(pState->glctx); + delete pState; +} + +void RasterizeGLStateWrapper::setContext(void) +{ + setGLContext(pState->glctx); +} + +void RasterizeGLStateWrapper::releaseContext(void) +{ + releaseGLContext(); +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGLState& s = *stateWrapper.pState; + + // Check inputs. 
+ NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that GL context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors"); + + // Determine number of outputs + int num_outputs = s.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height = std::get<0>(resolution); + int width = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); + NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]"); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1); + int triCount = 3 * tri.size(0); + + // Set the GL context unless manual context. + if (stateWrapper.automatic) + setGLContext(s.glctx); + + // Resize all buffers. + rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth); + + // Copy input data to GL and render. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. + const int32_t* triPtr = tri.data_ptr(); + int vtxPerInstance = instance_mode ? pos.size(1) : 0; + rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx); + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height, width, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts); + float* outputPtr[2]; + outputPtr[0] = out.data_ptr(); + outputPtr[1] = s.enableDB ? out_db.data_ptr() : NULL; + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth); + + // Done. Release GL context and return. + if (stateWrapper.automatic) + releaseGLContext(); + + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ +// Gradient op. + +torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGradParams p; + bool enable_db = ddb.defined(); + + // Check inputs. 
+ if (enable_db) + { + NVDR_CHECK_DEVICE(pos, tri, out, dy, ddb); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy, ddb); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(pos, tri, out, dy); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy); + NVDR_CHECK_I32(tri); + } + + // Determine instance mode. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + NVDR_CHECK(out.sizes().size() == 4, "tensor out must be rank-4"); + p.depth = out.size(0); + p.height = out.size(1); + p.width = out.size(2); + NVDR_CHECK(p.depth > 0 && p.height > 0 && p.width > 0, "resolution must be [>0, >0, >0]"); + + // Check other shapes. + if (p.instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) == p.depth && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [depth, >0, 4]"); + else + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(out.sizes().size() == 4 && out.size(0) == p.depth && out.size(1) == p.height && out.size(2) == p.width && out.size(3) == 4, "out must have shape [depth, height, width, 4]"); + NVDR_CHECK( dy.sizes().size() == 4 && dy.size(0) == p.depth && dy.size(1) == p.height && dy.size(2) == p.width && dy.size(3) == 4, "dy must have shape [depth, height, width, 4]"); + if (enable_db) + NVDR_CHECK(ddb.sizes().size() == 4 && ddb.size(0) == p.depth && ddb.size(1) == p.height && ddb.size(2) == p.width && ddb.size(3) == 4, "ddb must have shape [depth, height, width, 4]"); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor ddb_; + if (enable_db) + ddb_ = ddb.contiguous(); + + // Populate parameters. + p.numTriangles = tri.size(0); + p.numVertices = p.instance_mode ? pos.size(1) : pos.size(0); + p.pos = pos.data_ptr(); + p.tri = tri.data_ptr(); + p.out = out.data_ptr(); + p.dy = dy_.data_ptr(); + p.ddb = enable_db ? ddb_.data_ptr() : NULL; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + torch::Tensor grad = torch::zeros_like(pos); + p.grad = grad.data_ptr(); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.ddb & 15), "ddb input tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_db ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return the gradients. + return grad; +} + +// Version without derivatives. 
+torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy) +{ + torch::Tensor empty_tensor; + return rasterize_grad_db(pos, tri, out, dy, empty_tensor); +} + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_texture.cpp b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_texture.cpp new file mode 100755 index 0000000000000000000000000000000000000000..2257f566623495c7044ea3f532ef00e327477dc7 --- /dev/null +++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_texture.cpp @@ -0,0 +1,718 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/texture.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void MipBuildKernel1 (const TextureKernelParams p); +void MipBuildKernel2 (const TextureKernelParams p); +void MipBuildKernel4 (const TextureKernelParams p); +void TextureFwdKernelNearest1 (const TextureKernelParams p); +void TextureFwdKernelNearest2 (const TextureKernelParams p); +void TextureFwdKernelNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO4 
(const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p); +void MipGradKernel1 (const TextureKernelParams p); +void MipGradKernel2 (const TextureKernelParams p); +void MipGradKernel4 (const TextureKernelParams p); +void TextureGradKernelNearest (const TextureKernelParams p); +void TextureGradKernelLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelCubeNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinear (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p); + +//------------------------------------------------------------------------ +// Modeselektor. + +static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode, int max_mip_level) +{ + // Mip and filter modes. + p.filterMode = filter_mode; + NVDR_CHECK(p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, "filter_mode unsupported"); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + p.mipLevelLimit = max_mip_level; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + } + + // Boundary mode. + p.boundaryMode = boundary_mode; + NVDR_CHECK(p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, "boundary_mode unsupported"); +} + +//------------------------------------------------------------------------ +// Mipmap construction. + +TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + p.mipLevelLimit = max_mip_level; + p.boundaryMode = cube_mode ? TEX_BOUNDARY_MODE_CUBE : TEX_BOUNDARY_MODE_WRAP; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + + // Check inputs. + NVDR_CHECK_DEVICE(tex); + NVDR_CHECK_CONTIGUOUS(tex); + NVDR_CHECK_F32(tex); + + // Populate parameters and sanity check tex shape. 
+ if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + } + p.texDepth = tex.size(0); + p.texHeight = tex.size(cube_mode ? 2 : 1); + p.texWidth = tex.size(cube_mode ? 3 : 2); + p.channels = tex.size(cube_mode ? 4 : 3); + + // Set texture pointer. + p.tex[0] = tex.data_ptr(); + + // Generate mip offsets and calculate total size. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + + // Allocate and set mip tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor mip = torch::empty({mipTotal}, opts); + float* pmip = mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Build mip levels. + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + + // Return the mip tensor in a wrapper. + TextureMipWrapper mip_wrapper; + mip_wrapper.mip = mip; + mip_wrapper.max_mip_level = max_mip_level; + mip_wrapper.texture_size = tex.sizes().vec(); + mip_wrapper.cube_mode = cube_mode; + return mip_wrapper; +} + +//------------------------------------------------------------------------ +// Forward op. + +torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. 
+ NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.n, p.imgHeight, p.imgWidth, p.channels}, opts); + p.out = out.data_ptr(); + + // Choose kernel variants based on channel count. 
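+ // (channel_div_idx picks the 1-, 2- or 4-wide kernel variant: when the channel count is
+ // divisible by 4 or 2, the kernels can use vectorized float4/float2 loads and stores.)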
+ void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match and assign. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in custom mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + p.tex[i] = t.data_ptr(); + } + } + else + { + // Generate mip offsets, check mipmap size, and set mip data pointer. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "wrapped mip tensor size mismatch"); + pmip = mip_w.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + } + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + else + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype. 
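+ // (func_tbl is indexed as ((bias_only * 2 + cube) * TEX_MODE_COUNT + filter_mode) * 3 +
+ // channel_div_idx, matching the computation below; entries that cannot occur, e.g. a
+ // bias-only variant of a non-mipmap filter mode, are NULL.)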
+    void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = {
+        (void*)TextureFwdKernelNearest1,
+        (void*)TextureFwdKernelNearest2,
+        (void*)TextureFwdKernelNearest4,
+        (void*)TextureFwdKernelLinear1,
+        (void*)TextureFwdKernelLinear2,
+        (void*)TextureFwdKernelLinear4,
+        (void*)TextureFwdKernelLinearMipmapNearest1,
+        (void*)TextureFwdKernelLinearMipmapNearest2,
+        (void*)TextureFwdKernelLinearMipmapNearest4,
+        (void*)TextureFwdKernelLinearMipmapLinear1,
+        (void*)TextureFwdKernelLinearMipmapLinear2,
+        (void*)TextureFwdKernelLinearMipmapLinear4,
+        (void*)TextureFwdKernelCubeNearest1,
+        (void*)TextureFwdKernelCubeNearest2,
+        (void*)TextureFwdKernelCubeNearest4,
+        (void*)TextureFwdKernelCubeLinear1,
+        (void*)TextureFwdKernelCubeLinear2,
+        (void*)TextureFwdKernelCubeLinear4,
+        (void*)TextureFwdKernelCubeLinearMipmapNearest1,
+        (void*)TextureFwdKernelCubeLinearMipmapNearest2,
+        (void*)TextureFwdKernelCubeLinearMipmapNearest4,
+        (void*)TextureFwdKernelCubeLinearMipmapLinear1,
+        (void*)TextureFwdKernelCubeLinearMipmapLinear2,
+        (void*)TextureFwdKernelCubeLinearMipmapLinear4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelLinearMipmapLinearBO4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO4,
+    };
+
+    // Function index.
+    int func_idx = p.filterMode;
+    if (cube_mode)
+        func_idx += TEX_MODE_COUNT; // Cube variant.
+    if (p.enableMip && !has_uv_da)
+        func_idx += TEX_MODE_COUNT * 2; // Bias-only variant.
+    func_idx = func_idx * 3 + channel_div_idx; // Choose vector size.
+
+    // Launch kernel.
+    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
+
+    // Return output tensor.
+    return out;
+}
+
+// Version without mipmaps.
+torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode)
+{
+    torch::Tensor empty_tensor;
+    std::vector<torch::Tensor> empty_vector;
+    return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode);
+}
+
+//------------------------------------------------------------------------
+// Gradient op.
+
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> > texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode)
+{
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    TextureKernelParams p = {}; // Initialize all fields to zero.
+    bool has_mip_stack = (mip_stack.size() > 0);
+    torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap.
+    int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level;
+    set_modes(p, filter_mode, boundary_mode, max_mip_level);
+
+    // See if we have these tensors or not.
+ bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. + NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have 
shape [minibatch_size, height, width, channels]");
+
+    // Get contiguous version of dy.
+    torch::Tensor dy_ = dy.contiguous();
+
+    // Get input pointers.
+    p.tex[0] = tex.data_ptr();
+    p.uv = uv.data_ptr();
+    p.dy = dy_.data_ptr();
+    p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL;
+    p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL;
+
+    // Allocate output tensor for tex gradient.
+    torch::Tensor grad_tex = torch::zeros_like(tex);
+    p.gradTex[0] = grad_tex.data_ptr();
+
+    // Allocate output tensor for uv gradient.
+    torch::Tensor grad_uv;
+    torch::Tensor grad_uv_da;
+    torch::Tensor grad_mip_level_bias;
+    if (p.filterMode != TEX_MODE_NEAREST)
+    {
+        grad_uv = torch::empty_like(uv);
+        p.gradUV = grad_uv.data_ptr();
+
+        // Gradients for things affecting mip level.
+        if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            // Allocate output tensor for uv_da gradient.
+            if (has_uv_da)
+            {
+                grad_uv_da = torch::empty_like(uv_da);
+                p.gradUVDA = grad_uv_da.data_ptr();
+            }
+
+            // Allocate output tensor for mip_level_bias gradient.
+            if (has_mip_level_bias)
+            {
+                grad_mip_level_bias = torch::empty_like(mip_level_bias);
+                p.gradMipLevelBias = grad_mip_level_bias.data_ptr();
+            }
+        }
+    }
+
+    // Choose kernel variants based on channel count.
+    int channel_div_idx = 0;
+    if (!(p.channels & 3))
+        channel_div_idx = 2; // Channel count divisible by 4.
+    else if (!(p.channels & 1))
+        channel_div_idx = 1; // Channel count divisible by 2.
+
+    // Mip-related setup.
+    torch::Tensor grad_mip;
+    std::vector<torch::Tensor> grad_mip_stack;
+    float* pmip = 0;
+    float* pgradMip = 0;
+    if (p.enableMip)
+    {
+        if (has_mip_stack)
+        {
+            // Custom mip stack supplied. Check that sizes match, assign, construct gradient tensors.
+            p.mipLevelMax = max_mip_level;
+            for (int i=1; i <= p.mipLevelMax; i++)
+            {
+                torch::Tensor& t = mip_stack[i-1];
+                int2 sz = mipLevelSize(p, i);
+                if (!cube_mode)
+                    NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in mip stack");
+                else
+                    NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack");
+                if (sz.x == 1 && sz.y == 1)
+                    NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack");
+
+                torch::Tensor g = torch::zeros_like(t);
+                grad_mip_stack.push_back(g);
+
+                p.tex[i] = t.data_ptr();
+                p.gradTex[i] = g.data_ptr();
+            }
+        }
+        else
+        {
+            // Generate mip offsets and get space for temporary mip gradients.
+            int mipOffsets[TEX_MAX_MIP_LEVEL];
+            int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets);
+            NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size");
+            NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "mip tensor size mismatch");
+            grad_mip = torch::zeros_like(mip_w);
+            pmip = (float*)mip_w.data_ptr();
+            pgradMip = (float*)grad_mip.data_ptr();
+            for (int i=1; i <= p.mipLevelMax; i++)
+            {
+                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.
+                p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients.
+            }
+        }
+    }
+
+    // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
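+    // Same alignment rules as in the forward pass: float4 kernel variants require 16-byte
+    // aligned buffers and float2 variants 8-byte aligned buffers, here including the
+    // gradient outputs.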
+ if (!cube_mode) + { + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUV & 7), "grad_uv output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 15), "grad_uv_da output tensor not aligned to float4"); + } + else + { + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 7), "grad_uv_da output tensor not aligned to float2"); + } + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 15), "grad_tex output tensor not aligned to float4"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 15), "dy input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pgradMip & 15), "internal mip gradient tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 7), "grad_tex output tensor not aligned to float2"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pgradMip & 7), "internal mip gradient tensor not aligned to float2"); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2 * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + NULL, + NULL, + (void*)TextureGradKernelLinearMipmapNearestBO, + (void*)TextureGradKernelLinearMipmapLinearBO, + NULL, + NULL, + (void*)TextureGradKernelCubeLinearMipmapNearestBO, + (void*)TextureGradKernelCubeLinearMipmapLinearBO, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. + + // Launch main gradient kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. Don't do this if mip stack was supplied - individual level gradients are already there. + if (p.enableMip && !has_mip_stack) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 
6 : 1));
+        int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);
+
+        void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
+        NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
+    }
+
+    // Return output tensors.
+    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias, grad_mip_stack);
+}
+
+// Version for nearest filter mode.
+torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
+{
+    torch::Tensor empty_tensor;
+    std::vector<torch::Tensor> empty_vector;
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode);
+    return std::get<0>(result);
+}
+
+// Version for linear filter mode.
+std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
+{
+    torch::Tensor empty_tensor;
+    std::vector<torch::Tensor> empty_vector;
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode);
+    return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
+}
+
+// Version for linear-mipmap-nearest mode.
+std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> > texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector<torch::Tensor> mip_stack, int filter_mode, int boundary_mode)
+{
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> > result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode, boundary_mode);
+    return std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >(std::get<0>(result), std::get<1>(result), std::get<4>(result));
+}
+
+//------------------------------------------------------------------------
diff --git a/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_types.h b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_types.h
new file mode 100755
index 0000000000000000000000000000000000000000..d047cc67d4c901f26ab59bb8eb93c7a209368fc4
--- /dev/null
+++ b/pose_estimation/nvdiffrast/nvdiffrast/torch/torch_types.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "torch_common.inl"
+
+//------------------------------------------------------------------------
+// Python GL state wrapper.
+
+class RasterizeGLState;
+class RasterizeGLStateWrapper
+{
+public:
+    RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
+    ~RasterizeGLStateWrapper (void);
+
+    void setContext (void);
+    void releaseContext (void);
+
+    RasterizeGLState* pState;
+    bool automatic;
+    int cudaDeviceIdx;
+};
+
+//------------------------------------------------------------------------
+// Mipmap wrapper to prevent intrusion from Python side.
+
+class TextureMipWrapper
+{
+public:
+    torch::Tensor mip;
+    int max_mip_level;
+    std::vector<int64_t> texture_size; // For error checking.
+    bool cube_mode; // For error checking.
+}; + + +//------------------------------------------------------------------------ +// Antialias topology hash wrapper to prevent intrusion from Python side. + +class TopologyHashWrapper +{ +public: + torch::Tensor ev_hash; +}; + +//------------------------------------------------------------------------ diff --git a/pose_estimation/nvdiffrast/run_sample.sh b/pose_estimation/nvdiffrast/run_sample.sh new file mode 100755 index 0000000000000000000000000000000000000000..9878ca061b8e5777d6a7851ff0c372b97441c3d1 --- /dev/null +++ b/pose_estimation/nvdiffrast/run_sample.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +function print_help { + echo "Usage: `basename $0` [--build-container] " + echo "" + echo "Option --build-container will build the Docker container based on" + echo "docker/Dockerfile and tag the image with gltorch:latest." + echo "" + echo "Example: `basename $0` samples/torch/envphong.py" +} + +build_container=0 +sample="" +while [[ "$#" -gt 0 ]]; do + case $1 in + --build-container) build_container=1;; + -h|--help) print_help; exit 0 ;; + --*) echo "Unknown parameter passed: $1"; exit 1 ;; + *) sample="$1"; shift; break; + esac + shift +done + +rest=$@ + +# Build the docker container +if [ "$build_container" = "1" ]; then + docker build --tag gltorch:latest -f docker/Dockerfile . + docker build --tag gltensorflow:latest --build-arg BASE_IMAGE=tensorflow/tensorflow:1.15.0-gpu-py3 -f docker/Dockerfile . +fi + +if [ ! -f "$sample" ]; then + echo + echo "No python sample given or file '$sample' not found. Exiting." + exit 1 +fi + +image="gltorch:latest" +TENSORFLOW_CUDA_CACHE="" +# Magically choose the tensorflow container if running a sample from the samples/tensorflow/ directory +if [[ $sample == *"/tensorflow/"* ]]; then + image="gltensorflow:latest" + TENSORFLOW_CUDA_CACHE="-e NVDIFFRAST_CACHE_DIR=/app/tmp" +fi + +echo "Using container image: $image" +echo "Running command: $sample $rest" + +# Run a sample with docker +docker run --rm -it --gpus all --user $(id -u):$(id -g) \ + -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $TENSORFLOW_CUDA_CACHE $image python3 $sample $rest diff --git a/pose_estimation/nvdiffrast/samples/__init__.py b/pose_estimation/nvdiffrast/samples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/samples/data/NOTICE.txt b/pose_estimation/nvdiffrast/samples/data/NOTICE.txt new file mode 100755 index 0000000000000000000000000000000000000000..1c4fe0a6bcee80c13237e4e3640bfbbd1ca8309a --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/NOTICE.txt @@ -0,0 +1,225 @@ + +Environment map stored as part of samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under MIT License that is reproduced below. 
+Original material: https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap +Original license: https://github.com/WaveEngine/Samples/blob/master/LICENSE.md + +Mesh and texture stored as part of samples/data/earth.npz are derived from "3D Earth Photorealistic 2K" model originally made available under TurboSquid 3D Model License that is reproduced below. +Original material: https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 +Original license: https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license + + + +MIT License + +Copyright (c) 2016 Wave Coorporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + +TurboSquid 3D Model License + +This is a legally binding agreement between licensee ("you"), and TurboSquid regarding your rights to use 3D Models from the Site under this license. "You" refers to the purchasing entity, whether that is a natural person who must be at least 18 years of age, or a corporate entity. The rights granted in this agreement are granted to the purchasing entity, its parent company, and its majority owned affiliates on a "royalty free" basis, which means that after a Purchase, there are no future royalties or payments that are required. This agreement incorporates by reference the Terms of Use as well as the Site's policies and procedures as such. +I. Introduction & Definitions + +Definitions + +This agreement is intended to be easy to understand, and to provide clarity for using 3D Models in the work you create ("Creations"). Over the years, TurboSquid has been asked many questions about how 3D Models may be used in Creations, and we have attempted to answer those questions in this agreement. + +Some words in this agreement are given specific meanings. Words that appear initially in quotations, such as "you" and "Creations", are defined in the text preceding the word. Other capitalized words are defined below: + +"3D Model" is the collection of one or more digital files, packaged in the form of a product on the Site that can be identified by a 3D Model ID, and that is made available to you for Purchase on the Site. A 3D Model may include 3D Model files, geometry, texture maps, materials, motion captures, renderings and other constituent files related to the 3D Model data and its representation. 
+ +"Site" refers to the TurboSquid websites, API's, software applications or any approved means or utility either currently in existence or in the future; the software and source code used by TurboSquid to provide such services; user interface layouts, designs, images, text, knowledgebase articles, program offers; site information provided in reports (such as popular keyword searches); and all other intellectual property protected under copyright, trademark, patent, publicity, or any other proprietary right. + +"Purchase" is the acquisition of a 3D Model by you from the Site under this agreement, whether as a purchase of 3D Model made available at a price of greater than $0, or a download of 3D Model made available at no charge. + +"TurboSquid" includes TurboSquid, Inc. and all licensed affiliates and partners that distribute 3D Models on behalf of TurboSquid, Inc. + +"Product Page" is the product page or interface that displays 3D Models available for Purchase on the Site. + +"Computer Game" is a type of Creation that includes digital games, computer-based games, handheld electronic games, mobile games, online games, web-games, social games, game mods, and console-based games. + +"Imagery" is a Creation made of any single image or sequence of images. + +"Depicted Intellectual Property" means any intellectual property depicted in the 3D Model, including any copyright, trademark, trade dress, right of publicity, or any other proprietary right throughout the world that may apply. For purposes of clarity, this does not refer to the copyrights owned by the creator of the 3D Model that are licensed in this agreement. + +To make reading this agreement easier and less repetitive, the following constructions are used: + +"Include," including," and "such as" are considered to be followed with "but not limited to." Examples are used in this agreement to illustrate, rather than limit, the scope of the terms. + +"The following restrictions", "the foregoing restrictions", and "subject to the restrictions" are considered to be followed with "in addition to all other restrictions applicable within this agreement." +II. License Rights + +1. Ownership. TurboSquid does not grant title or ownership in 3D Models. All rights in 3D Models not expressly granted in this agreement are reserved by TurboSquid for itself and its licensors. + +2. Rights Granted. For 3D Models, TurboSquid grants to you a non-exclusive, perpetual, worldwide right and license to copy, distribute, reproduce, adapt, publicly display, publicly perform, digitally perform, transmit, broadcast, telecast, advertise, create derivative works, and market 3D Models within Creations in the uses authorized in this agreement. You may request authorization for a use not covered by this agreement ("New Use") by writing use@turbosquid.com. TurboSquid is authorized to approve a New Use if TurboSquid finds in its sole judgment that the New Use is substantially similar to another established use in this agreement and authorizes the New Use in writing. + +3. Rights Granted When Sharing 3D Models. If you Purchase as an employee of a corporate entity, sharing Purchased 3D Models with other employees of your corporate entity is allowed. Examples of allowed sharing include storing files on a networked hard drive, and aggregating 3D Models for later use in future Creations. You are responsible for any downstream distribution, use, or misuse by a recipient of a shared 3D Models. 
In all cases, sharing 3D Models with external people or entities is only allowed in the following situations, and with the following restrictions: + +a. In the production of a Creation owned by you, if you are working in collaboration with external parties, and there is a need to share 3D Models for the development and production of your Creation, sharing 3D Models with those external parties is allowed. Any external party that receives 3D Models may only use 3D Models on your Creations and must take reasonable care to secure and limit access to 3D Models to that purpose. + +b. In the production of a Creation owned by another entity ("your Client"), if you are working as a contractor and need to share 3D Models with your Client, or any external parties working with your Client, sharing 3D Models is allowed, subject to the restriction that all parties may use 3D Models only for your Client's particular Creation, and for successive versions of your Client's Creation, such as sequel Computer Games or movies that utilize the same 3D Models. All parties must take reasonable care to secure and limit access to 3D Models to the parties working on your Client's Creation. For all other use by any party, 3D Models must be Purchased again to create a new license agreement governing that use + +4. Editorial Use Restriction for Some 3D Models. The following restrictions apply to any 3D Model with an "Editorial Uses Only" label on its Product Page. Permitted use of Depicted Intellectual Property in such 3D Models is limited to news reporting in Creations of some cultural, editorial, journalistic, or otherwise newsworthy value, including news reporting on television and the internet. A second permitted use is use within an academic setting, limited to teaching, scholarship, and research. This restriction does not apply if you have the needed authorization to use the Depicted Intellectual Property for your Creation, such as if you are owner of the Depicted Intellectual Property, or the advertising team, hired party, or licensee of the Depicted Intellectual Property owner. + +5. Depicted Intellectual Property. TurboSquid does not own or license any Depicted Intellectual Property. TurboSquid does not in any way make any representations or warranties about Depicted Intellectual Property associated with 3D Models. You are solely responsible for determining the need for and, if appropriate, obtaining any needed clearance, consent, or release to use any Depicted Intellectual Property in your Creations. + +6. Creations of Imagery. + +Permitted Uses of Creations of Imagery. Subject to the following restrictions, you may use Creations of Imagery within news, film, movies, television programs, video projects, multi-media projects, theatrical display, software user interfaces; architectural renderings, Computer Games, virtual worlds, simulation and training environments; corporate communications, marketing collateral, tradeshow promotional items, booth decorations and presentations; pre-visualizations, product prototyping and research; mobile, web, print, television, and billboard advertising; online and electronic publications of blogs, literature, social media, and email campaigns; website designs and layouts, desktop and mobile wallpapers, screensavers, toolbar skins; books, magazines, posters, greeting cards; apparel items, brochures, framed or printed artwork, household items, office items, lenticular prints, product packaging and manufactured products. + +Restrictions on Permitted Uses of Creations of Imagery. 
+ +a. Stock Media Clearinghouse. You may NOT publish or distribute Creations of Imagery through another stock media clearinghouse, for example as part of an online marketplace for photography, clip art, or design templates. + +b. Promotional Images. Images displayed for the promotion a 3D Model on its Product Page ("Promotional Images") may be used in Creations of Imagery, provided that the 3D Model itself has been Purchased and subject to the following restrictions: + +i. You may NOT use a Promotional Image that has any added element which is not included as part of the 3D Model. An example of this type of restricted use is if the 3D Model contains an airplane, and there is a Promotional Image of that airplane rendered over a blue sky; however, the blue sky image is not included as part of the 3D Model. Other prohibited examples include use of Promotional Images from movies or advertisements that may have used 3D Model. + +ii. You may NOT use any Promotional Image that has a logo, mark, watermark, attribution, copyright or other notice superimposed on the image without prior approval from TurboSquid Support. + +c. Business Logos. You may NOT use Imagery in any Creation that is a trademark, servicemark, or business logo. This restriction is included because the owners of these types of Creations typically seek exclusivity on the use of the imagery in their Creation, which is incompatible with the non-exclusive license granted to you under this agreement. + + +7. Creations of Computer Games and Software + +Permitted Uses in Creations of Computer Games and Software. Subject to the following restrictions, you may incorporate 3D Models in Creations of Computer Games, virtual worlds, simulation and training environments; mobile, desktop and web applications; and interactive electronic publications of literature such as e-books and electronic textbooks. + +Restrictions on Permitted Uses of 3D Models in Creations of Games and Software. + +a. Interactivity. Your inclusion of 3D Models within any such Creation is limited to uses where 3D Model is contained in an interactive experience for the user and not made available outside of the interactive experience. Such a permitted example of this use would be to include a 3D Model of human anatomy in a medical training application in a way that the 3D Model or its environment may be manipulated or interacted with. + +b. Access to 3D Models. You must take all reasonable and industry standard measures to incorporate 3D Models within Creations to prevent other parties from gaining access to 3D Models. 3D Models must be contained in proprietary formats so that they cannot be opened or imported in a publicly available software application or framework, or extracted without reverse engineering. WebGL exports from Unity, Unreal, and Lumberyard are permitted. Any other open format or format encrypted with decryptable open standards (such as an encrypted compression archive or other WebGL programs not listed here) are prohibited from using 3D Models. If your Creation uses WebGL and you are not sure if it qualifies, please contact use@turbosquid.com and describe your Creation in detail. + +c. Open Systems. You typically may NOT include 3D Models in Creations that have the general functionality for importing and/or exporting 3D Models. Please contact use@turbosquid.com and describe your Creation in detail if this is your desired use. 
An example of such a prohibited use is to include 3D Models as a starter library within a standard retail Software Creation that allows users to generally work with 3D Models, even if the 3D Model itself is somehow protected and is not capable of being exported. An allowed use is for custom or enterprise software in certain circumstances. + +d. Virtual Good Sales. You may NOT import, upload, reproduce, make available, publish, transmit, distribute, or sublicense 3D Models in Creations of virtual goods or worlds for any 3D community ("Virtual World"), unless you or your Client owns the Virtual World platform and it complies with the previous restrictions. + + +8. Creations of Physical Form. + +Permitted Uses in Creations of Physical Form. Subject to the following restrictions, you may use 3D Models to make Physical Creations such as 3D printed works, articles of manufacture, custom vehicles, furniture, jewelry, sculptural artwork, toys, and physical entertainment goods ("Creations of Physical Form"). + +Restrictions on Permitted Uses in Creations of Physical Form. + +a. Substantially Similar Creations. Permitted use of any Creation of Physical Form in which a 3D Model is untransformed or substantially similar to the 3D Model is limited to personal use, gifts, or charitable donations, with a maximum of 5 instances of such Creation per Purchase; unless the 3D Model is a small part of a much larger array of other physical objects in the Creation. For example, if you are creating a real-world, physical human skeleton for manufacture for sale, it is permitted to add a 3D printed human head that exactly resembles the Purchased 3D Model, but it is not permitted to sell the 3D printed head by itself. Another permitted example of a 3D Model being a small part of a larger array is using a 3D Model that ends up within an automobile as a part of the automobile. + +b. No Depicted Intellectual Property. You may NOT reproduce Depicted Intellectual Property in any Creation of Physical Form for any purpose. For example, you may NOT make Physical Form Creations of a copyrighted character (Spiderman, Elsa, Slimer), or branded technology (Apple, Toshiba, Samsung). + +9. 3D Industry Promotional Use. If TurboSquid has granted you, as a hardware or software partner, access to priced 3D Models on a free-of-charge basis, your use of 3D Models is restricted to internal testing for your 3D software or hardware products, and to the promotion of your software or hardware products with Creations of Imagery provided that an attribution of the artist's name and the Site are included. You agree that should any 3D Models be used outside of these purposes in ways that are normally allowed after a Purchase, that you will notify TurboSquid and promptly Purchase the 3D Models and otherwise comply with the terms herein. + +10. Unauthorized Use. If you use 3D Models in an unauthorized way, TurboSquid may terminate your account and pursue other penalties, damages, losses, and profits TurboSquid is entitled to under this agreement or at law or equity. The following are unauthorized uses that are explicitly prohibited: + +a. Competition. You may NOT use 3D Models in a way that competes with the Site, including distributing through 3D Model Clearinghouses. You may NOT publish, distribute, or make 3D Models available through any online clearinghouse infrastructure. 
You may not redistribute 3D Models as part of any design template, After Effects template, stock photography, video or clip art for distribution or licensing through any online stock media clearinghouse whatever. + +b. Re-Distribution. You may NOT re-distribute, publish, or make 3D Models available to any third party except in the form of a permitted Creation, or shared as authorized in this agreement. + +c. Group Buying. You may NOT aggregate funds to Purchase 3D Models with one or more other parties. An example of this prohibited use is a website membership where members pool their money to make a single Purchase that is shared by the members of the group. Each such member must Purchase individually. + +d. No Obscene or Unlawful Use. You may NOT use 3D Models for any defamatory, harassing, pornographic, obscene, or racist purpose, or to infringe any party's Depicted Intellectual Property rights. + +e. False Attribution. You may NOT misrepresent yourself as the creator of 3D Models. + +11. Resellers. The license granted herein is wholly transferable by an authorized reseller ("Reseller") to another party ("Transferee"). Each transferred license must be transferred entirely and all transferred 3D Models must be permanently deleted from the Reseller's systems after the transfer. When transferring the license, Reseller represents and warrants that the Reseller has the authority to bind the Transferee to these terms. The Reseller is jointly and severally responsible with any Transferee and each are liable for the transferee's use and compliance with TurboSquid's Terms of Use and Site's policies and procedures as well as any financial obligations hereunder. +III. License Term & Termination + +1. Term. Your right and license to 3D Models is perpetual, unless terminated as described herein. + +2. Termination. Your license grant is terminated immediately and without notice in the cases below. In such termination, you and any recipients of 3D Models must cease use, distribution, and destroy all copies of 3D Models. + +a. Reversal of Purchase. Your right and license to 3D Models are contingent on your Purchase of 3D Models. Any payment reversal of a Purchase for any reason immediately terminates all rights granted under this agreement. Potential Reasons for a payment reversal include: + +i. TurboSquid reverses your Purchase at your request. + +ii. TurboSquid receives a charge back or other notice from your bank or credit card cancelling your Purchase and/or withdrawing the funds used for your Purchase. + +iii. TurboSquid determines in its sole discretion that your Purchase was fraudulent. + +iv. When you are granted delayed payment terms, and fail to make payments such that TurboSquid sends you notice and terminates your account. + +b. Failure to Abide by the License Grant. Material failure to abide by the terms of this agreement immediately terminates your right and license to 3D Models. If you detect a violation of the license grant by you or any recipient of shared 3D Models, and promptly report the violation to agent@turbosquid.com, TurboSquid will make a good faith effort to find an appropriate remedy to preserve your license grant. +IV. Warranties + +You covenant, represent, and warrant to TurboSquid that: + + You have full right, power, legal capacity, and authority to enter into and perform this agreement, have obtained any third-party consent needed to do so, and, prior to any Purchase, had an opportunity to seek independent legal counsel. 
+ You will not use 3D Models except pursuant to the terms of this agreement. Should you use 3D Models in an unauthorized way, you agree to any reasonable fee or penalty exercised by TurboSquid under this agreement or applicable law. + You will, prior to Purchase, determine the need for and, if appropriate, obtain any needed third-party clearance, consent, or release to use Depicted Intellectual Property shown in the digital rendering of 3D Models, and shall not use 3D Models to infringe any party's Depicted Intellectual Property rights. + You will immediately notify TurboSquid of any legal claim or challenge against your use of 3D Models or any other rights issue, before disclosing such issue to any third-party. + +V. Limitation of Liability + +1. 3D Models are provided on an "as is", "as available", and "with all faults" basis. TurboSquid makes no representations, warranties, conditions, or guarantees as to the usefulness, quality, suitability, truth, fitness for a particular purpose, non-infringement, merchantability, or cosmetic attributes of 3D Models, and does not guarantee the accuracy or completeness of specifications associated with 3D Models, including measurements, weight, durability, strength, materials, general physical properties, regulatory compliance, other engineering or construction attributes. + +2. TurboSquid disclaims all express or implied conditions, representations, and warranties of any kind regarding 3D Models, including any implied warranty or condition of merchantability. TurboSquid allows your Purchase to be refunded under certain reasonable time frames and conditions, subject to the Site's policies. + +3. You assume all risk for any damage to your computer systems and network for any damage to your computer system by obtaining 3D Models, including any damages resulting from computer viruses. + +4. To the fullest extent permitted by law, TurboSquid shall not be liable for (A) any direct, indirect, punitive, special, incidental, consequential, or exemplary damages (including loss of business, revenue, profits, goodwill, use, data, electronically transmitted orders, or other economic advantage) arising out of or in connection with 3D Models, even if TurboSquid has previously been advised of, or reasonably could have foreseen, the possibility of such damages, however they arise, whether in breach of contract or in tort (including negligence) or (B) any damages in excess of $1,000. To the extent that any jurisdiction does not allow the exclusion or limitation of direct, incidental, or consequential damages, portions of the preceding limitation or exclusion may not apply, but should be construed to the greatest extent applicable in such jurisdictions. Notwithstanding anything to the contrary herein, the TurboSquid indemnification obligation set forth below shall be limited to the following depending on the licensing tier: + +Tier 0: 3D Models acquired at free-of-charge are not indemnified. + +Tier 1: Standard License indemnity limitation is ten thousand ($10,000) dollars for all 3D Models acquired with payment. This indemnity is in aggregate for all 3D Models acquired under the Standard License. + +Tier 2: Small Business License indemnity limitation is two hundred and fifty thousand ($250,000) dollars for any 3D Model. This indemnity is in aggregate for all 3D Models acquired under the Small Business License. + +Tier 3: Enterprise License indemnity limitation is one million ($1,000,000) dollars for any 3D Model. 
This indemnity is in aggregate for all 3D Models acquired under the Enterprise License. + +For any 3D Model labeled Editorial, the above indemnities shall only apply if the model is properly used within the editorial license set forth herein (i.e. for news and editorial purposes in association with newsworthy media.) For use outside the Editorial scope, no indemnification from TurboSquid shall apply. + +5. You agree to indemnify and hold TurboSquid and its subsidiaries, affiliates, shareholders, officers, directors, agents, licensors, licensee, suppliers, alliance members, other partners, employees and representatives ("TurboSquid Parties") harmless from any claim or demand, including reasonable attorneys' fees, made by any third party due to, or arising out of your use of 3D Models or Creations. + +6. Subject to sections 4 and 5 above, TurboSquid shall indemnify, defend, and hold you harmless from and against any claim or demand, including reasonable attorneys' fees made by any third party for copyright or trademark infringement due to or arising out of your use of the 3D Models in accordance with these Terms, but excluding any modifications made by You, if such infringement was caused by the modification. This indemnity shall not apply to any 3D Model labeled for Editorial Use or a brand name, logo, or other Depicted Intellectual Property prior identified in a 3D Model. + +7. In the event of an indemnification claim by You, you agree to provide notice to TurboSquid within thirty days' of receiving any claim and allowing TurboSquid to fully control such claim, including but not limited to, selection of counsel, reasonable diligence into the claim, and if necessary litigation and/or settlement. Notice must be given via email to: agent@turbosquid.com. Notice is not considered made until it is acknowledged in writing by TurboSquid. +VI. Other Terms + +1. Entire Agreement. This agreement constitutes the entire agreement between you and TurboSquid relating to your Purchase, unless you have a corporate license agreement with TurboSquid. Corporate licenses are available with additional protections for additional fees. Please contact enterprise@turbosquid.com if your organization requires a corporate license. TurboSquid does not otherwise offer any other changes, additions, variations, or additional signed forms related to this agreement. No modification to this agreement will be binding, unless in writing and signed by an authorized TurboSquid representative. + +2. Material Breach and Injunction. + +Your rights hereunder vary by licensing tier as follows: + +For the Standard License, you agree that any material breach of these Terms will result in irreparable harm to TurboSquid for which damages would be an inadequate remedy and, therefore, in addition to its rights and remedies otherwise available at law, TurboSquid will be entitled to equitable relief, including both a preliminary and permanent injunction, if such a breach occurs. You waive any requirement for the posting of a bond or other security if TurboSquid seeks such an injunction. + +For the Enterprise License, TurboSquid may not seek injunctive relief hereunder for any 3D Model. It hereby waives all right to equitable and injunctive relief and its damages shall be limited to monetary damages. + +Notwithstanding anything to the contrary herein, TurboSquid would be irreparably harmed and shall be entitled to equitable relief including injunctive relief for any hacking, theft, or misuse of the Site. + +3. Import/Export Regulations. 
3D Models may be subject to the U.S. export laws and the export or import laws of other countries. You agree to comply strictly with all such laws and, in particular, shall with 3D Models: (a) obtain any export, re-export, or import authorizations required by U.S. or Your local laws; (b) not design, develop or produce missile, chemical/biological, or nuclear weaponry; and (c) not provide 3D Models to prohibited countries and entities identified in the U.S. export regulations. + +4. Governing Law. This agreement is governed by New York law, excluding conflict of law principles. Any action or proceeding arising out of or related to this agreement must be brought in a state or federal court located in New York, New York, and both parties irrevocably submit to the exclusive jurisdiction of such courts. All notices, requests and other communications under this agreement must be in writing (e-mail messages shall be deemed writings). + +5. LIMITED INTERNAL USER ARBITRATION. You acknowledge and agree that TurboSquid may, in its sole discretion, arbitrate disputes between TurboSquid users involving 3D Models (including any purchaser or supplier of 3D Models), and such findings shall be final and non-appealable. Either party may request that TurboSquid arbitrate the dispute, or TurboSquid may elect, at its option, to arbitrate the dispute. After TurboSquid elects to arbitrate any dispute hereunder, TurboSquid will waive any rights to a commission from both the Purchase and arbitration, and the parties must keep the results and process confidential and may not disclose anything related to the dispute to any other party (whether by oral, written, or other type of disclosure). To resolve disputes, TurboSquid may decide to terminate or suspend users, revoke the license, offer replacement 3D Models, reestablish the licensee, or surrender or reallocate fees (whether by refund, charitable donation, or otherwise). TurboSquid may award up to 3X the Purchase price to either party depending on the circumstances. YOU UNDERSTAND, ACKNOWLEDGE, AND AGREE THAT ACCEPTING THIS ARBITRATION PROVISION WAIVES RIGHTS TO JUDICIAL RESOLUTION, TRIAL BY JURY AND RIGHTS YOU WOULD OTHERWISE HAVE IF YOU HAD NOT AGREED TO THIS ARBITRATION PROVISION. + +6. Notice. Any notice under this agreement shall be via email to agent@turbosquid.com, provided that you receive an acknowledgement email from a TurboSquid representative within 5 business days. If no such acknowledgement email is received, notice must be in writing and delivered by mail to the following address. + +TurboSquid, Inc. +c/o TurboSquid Support +935 Gravier St., Suite 1600 +New Orleans, LA 70112 + +7. Assignment. TurboSquid may not assign its rights under this agreement without providing you notice, except in the case of a bankruptcy, merger, acquisition, sale of all or substantially all of TurboSquid's assets to a subsequent owner or operator, or similar event. + +Your assignment rights vary based on the licensing tier of your purchase: + +For the Standard License, you may not assign your rights under this agreement without the prior written consent of TurboSquid. + +For Small Business or Enterprise Licenses, you may assign your rights under this agreement without the notice and consent of TurboSquid. + +8. English. This agreement may be translated into other languages, but English is the official language of this agreement and in any conflict between the English language version and any other version, the English language version shall control. + +9. Publicity. 
The following advertising, marketing, and publicity rights are granted to TurboSquid for each licensing tier: + +Standard License purchases may be fully publicized by TurboSquid and you hereby grant TurboSquid the right to use you and your company's name, logo, and project name on the TurboSquid website and in its related marketing and advertising materials. + +Small Business and Enterprise License purchase may not be publicized by TurboSquid in any way without prior written permission of the purchaser. + +10. Time limitations on any claim hereunder. Any claim by you hereunder, including without limitation a claim for indemnification under section V must be made within two years of purchasing the 3D Model. + +This 3D Model License is effective for use with 3D Models for use on or after June 17, 2020. diff --git a/pose_estimation/nvdiffrast/samples/data/cube_c.npz b/pose_estimation/nvdiffrast/samples/data/cube_c.npz new file mode 100755 index 0000000000000000000000000000000000000000..2bd3bd52eb17a835a3b8d211b50849b498865731 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/cube_c.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b2f01b657a726a292c936918c685fbc15b415de87c637f257f0cbfa7fc14e9 +size 1582 diff --git a/pose_estimation/nvdiffrast/samples/data/cube_d.npz b/pose_estimation/nvdiffrast/samples/data/cube_d.npz new file mode 100755 index 0000000000000000000000000000000000000000..a624cbb706618b69740b76966a648d59e11c6a59 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/cube_d.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561634bcfc8f982f3c9a4805538708e5aea3eab2abfa76543eb496bb87844baf +size 1966 diff --git a/pose_estimation/nvdiffrast/samples/data/cube_p.npz b/pose_estimation/nvdiffrast/samples/data/cube_p.npz new file mode 100755 index 0000000000000000000000000000000000000000..f176340ecba628cad37ec94598a56ff95eb5e075 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/cube_p.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb2dd23c1614e3936f9f8bfe99427bfaf71c783f50775e066e28212d50cbad5a +size 824 diff --git a/pose_estimation/nvdiffrast/samples/data/earth.npz b/pose_estimation/nvdiffrast/samples/data/earth.npz new file mode 100755 index 0000000000000000000000000000000000000000..96fa4b1d6ba51a12d1fc4ebfa854d33a5c766a58 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/earth.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8f8d4cdc1b55daacb73aad5ae109924025d605b786b643ba5b5126e893f4a0 +size 4209146 diff --git a/pose_estimation/nvdiffrast/samples/data/envphong.npz b/pose_estimation/nvdiffrast/samples/data/envphong.npz new file mode 100755 index 0000000000000000000000000000000000000000..10ef0057c01a7951d5887d346f7146d5ca3ce8aa --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/data/envphong.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e73baa94fe2476f5bbbd3221fb4f90e8c39852bb1f03a29ad6f38c6a4bad48c +size 3459785 diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/__init__.py b/pose_estimation/nvdiffrast/samples/tensorflow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/cube.py b/pose_estimation/nvdiffrast/samples/tensorflow/cube.py new file mode 100755 index 0000000000000000000000000000000000000000..cd830d4afe6de6ea057b3ebf671af7e0765a252a --- /dev/null +++ 
b/pose_estimation/nvdiffrast/samples/tensorflow/cube.py @@ -0,0 +1,200 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import os +import sys +import pathlib + +import util +import tensorflow as tf + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Cube shape fitter. +#---------------------------------------------------------------------------- + +def fit_cube(max_iter = 5000, + resolution = 4, + discontinuous = False, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + fn = 'cube_%s.npz' % ('d' if discontinuous else 'c') + with np.load(f'{datadir}/{fn}') as f: + pos_idx, vtxp, col_idx, vtxc = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Setup TF graph for reference. + vtxw = np.concatenate([vtxp, np.ones([vtxp.shape[0], 1])], axis=1).astype(np.float32) + pos_clip = tf.matmul(vtxw, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False) + color, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + + # Optimized variables. + vtxc_opt = tf.get_variable('vtxc', initializer=tf.zeros_initializer(), shape=vtxc.shape) + vtxp_opt = tf.get_variable('vtxp', initializer=tf.zeros_initializer(), shape=vtxp.shape) + + # Optimization variable setters for initialization. + vtxc_opt_in = tf.placeholder(tf.float32, vtxc.shape) + vtxp_opt_in = tf.placeholder(tf.float32, vtxp.shape) + opt_set = tf.group(tf.assign(vtxc_opt, vtxc_opt_in), tf.assign(vtxp_opt, vtxp_opt_in)) + + # Setup TF graph for what we optimize result. + vtxw_opt = tf.concat([vtxp_opt, tf.ones([vtxp.shape[0], 1], tf.float32)], axis=1) + pos_clip_opt = tf.matmul(vtxw_opt, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False) + color_opt, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_opt, col_idx) + color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx) + + # Image-space loss and optimizer. + loss = tf.reduce_mean((color_opt - color)**2) + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[vtxp_opt, vtxc_opt]) + + # Setup TF graph for display. 
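+    # The display graph re-renders both the optimized cube and the reference cube at
+    # display_res, so the two can be shown side by side while the fit progresses.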
+ rast_out_disp, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[display_res, display_res], output_db=False) + color_disp, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_disp, col_idx) + color_disp = dr.antialias(color_disp, rast_out_disp, pos_clip_opt, pos_idx) + rast_out_disp_ref, _ = dr.rasterize(pos_clip, pos_idx, resolution=[display_res, display_res], output_db=False) + color_disp_ref, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out_disp_ref, col_idx) + color_disp_ref = dr.antialias(color_disp_ref, rast_out_disp_ref, pos_clip, pos_idx) + + # Geometric error calculation + geom_loss = tf.reduce_mean(tf.reduce_sum((tf.abs(vtxp_opt) - .5)**2, axis=1)**0.5) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Repeats. + for rep in range(repeats): + + # Optimize. + ang = 0.0 + gl_avg = [] + util.init_uninitialized_vars() + for it in range(max_iter + 1): + # Initialize optimization. + if it == 0: + vtxp_init = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp + vtxc_init = np.random.uniform(0.0, 1.0, size=vtxc.shape) + util.run(opt_set, {vtxc_opt_in: vtxc_init.astype(np.float32), vtxp_opt_in: vtxp_init.astype(np.float32)}) + + # Learning rate ramp. + lr = 1e-2 + lr = lr * max(0.01, 10**(-it*0.0005)) + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Run training and measure geometric error. + gl_val, _ = util.run([geom_loss, train_op], {mtx_in: r_mvp, lr_in: lr}) + gl_avg.append(gl_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + gl_val, gl_avg = np.mean(np.asarray(gl_avg)), [] + s = ("rep=%d," % rep) if repeats > 1 else "" + s += "iter=%d,err=%f" % (it, gl_val) + print(s) + if log_file: + log_file.write(s + "\n") + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + ang = ang + 0.1 + img_o = util.run(color_opt, {mtx_in: r_mvp})[0] + img_b = util.run(color, {mtx_in: r_mvp})[0] + img_d = util.run(color_disp, {mtx_in: a_mvp})[0] + img_r = util.run(color_disp_ref, {mtx_in: a_mvp})[0] + + scl = display_res // img_o.shape[0] + img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1) + img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1) + result_image = np.concatenate([img_o, img_b, img_d, img_r], axis=1) + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + # All repeats done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. 
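+# Command line: python cube.py [-v] [-discontinuous] resolution
+# -v opens an interactive preview window every 100 iterations; the positional
+# resolution argument sets the rasterization size in pixels.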
+#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + discontinuous = False + resolution = 0 + + def usage(): + print("Usage: python cube.py [-v] [-discontinuous] resolution") + exit() + + for a in sys.argv[1:]: + if a == '-v': + display_interval = 100 + elif a == '-discontinuous': + discontinuous = True + elif a.isdecimal(): + resolution = int(a) + else: + usage() + + if resolution <= 0: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + out_dir = 'out/cube_%s_%d' % (('d' if discontinuous else 'c'), resolution) + fit_cube(max_iter=5000, resolution=resolution, discontinuous=discontinuous, log_interval=10, display_interval=display_interval, out_dir=out_dir, log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%06d.png') + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/earth.py b/pose_estimation/nvdiffrast/samples/tensorflow/earth.py new file mode 100755 index 0000000000000000000000000000000000000000..8ef5870d764d70a291aaea7132a6d96f51c707ea --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/tensorflow/earth.py @@ -0,0 +1,186 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import pathlib + +import util + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Texture learning with/without mipmaps. +#---------------------------------------------------------------------------- + +def fit_earth(max_iter = 20000, + log_interval = 10, + display_interval = None, + display_res = 1024, + enable_mip = True, + res = 512, + ref_res = 4096, + lr_base = 1e-2, + lr_ramp = 0.1, + out_dir = '.', + log_fn = None, + texsave_interval = None, + texsave_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at + # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/earth.npz') as f: + pos_idx, pos, uv_idx, uv, tex = f.values() + tex = tex.astype(np.float32)/255.0 + max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps. + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Learned texture. + tex_var = tf.get_variable('tex', initializer=tf.constant_initializer(0.2), shape=tex.shape) + + # Setup TF graph for reference rendering in high resolution. + pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...] 
+ rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [ref_res, ref_res]) + texc, texd = dr.interpolate(uv[tf.newaxis, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = dr.texture(tex[np.newaxis], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + color = color * tf.clip_by_value(rast_out[..., -1:], 0, 1) # Mask out background. + + # Reduce the reference to correct size. + while color.shape[1] > res: + color = util.bilinear_downsample(color) + + # TF Graph for rendered candidate. + if enable_mip: + # With mipmaps. + rast_out_opt, rast_out_db_opt = dr.rasterize(pos_clip, pos_idx, [res, res]) + texc_opt, texd_opt = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx, rast_db=rast_out_db_opt, diff_attrs='all') + color_opt = dr.texture(tex_var[np.newaxis], texc_opt, texd_opt, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + # No mipmaps: no image-space derivatives anywhere. + rast_out_opt, _ = dr.rasterize(pos_clip, pos_idx, [res, res], output_db=False) + texc_opt, _ = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx) + color_opt = dr.texture(tex_var[np.newaxis], texc_opt, filter_mode='linear') + color_opt = color_opt * tf.clip_by_value(rast_out_opt[..., -1:], 0, 1) # Mask out background. + + # Measure only relevant portions of texture when calculating texture PSNR. + loss = tf.reduce_mean((color - color_opt)**2) + texmask = np.zeros_like(tex) + tr = tex.shape[1]//4 + texmask[tr+13:2*tr-13, 25:-25, :] += 1.0 + texmask[25:-25, tr+13:2*tr-13, :] += 1.0 + texloss = (tf.reduce_sum(texmask * (tex - tex_var)**2)/np.sum(texmask))**0.5 # RMSE within masked area. + + # Training driven by image-space loss. + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[tex_var]) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Render. + ang = 0.0 + util.init_uninitialized_vars() + texloss_avg = [] + for it in range(max_iter + 1): + lr = lr_base * lr_ramp**(float(it)/float(max_iter)) + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + ang = ang + 0.01 + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + dist = np.random.uniform(0.0, 48.5) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -1.5 - dist), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Run training and measure texture-space RMSE loss. + texloss_val, _ = util.run([texloss, train_op], {mtx_in: r_mvp, lr_in: lr}) + texloss_avg.append(texloss_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + texloss_val, texloss_avg = np.mean(np.asarray(texloss_avg)), [] + psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE. + s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save result images/textures. 
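+    # Note: the learned texture is flipped vertically ([::-1]) before saving below.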
+ display_image = display_interval and (it % display_interval) == 0 + save_image = imgsave_interval and (it % imgsave_interval) == 0 + save_texture = texsave_interval and (it % texsave_interval) == 0 + + if display_image or save_image: + result_image = util.run(color_opt, {mtx_in: a_mvp})[0] + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + if save_texture: + util.save_image(out_dir + '/' + (texsave_fn % it), util.run(tex_var)[::-1]) + + # Done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + enable_mip = None + + def usage(): + print("Usage: python earth.py [-v] [-mip|-nomip]") + exit() + + for a in sys.argv[1:]: + if a == '-v': display_interval = 10 + elif a == '-mip': enable_mip = True + elif a == '-nomip': enable_mip = False + else: usage() + + if enable_mip is None: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + out_dir = 'out/earth_mip' if enable_mip else 'out/earth_nomip' + fit_earth(max_iter=20000, log_interval=10, display_interval=display_interval, enable_mip=enable_mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png') + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/envphong.py b/pose_estimation/nvdiffrast/samples/tensorflow/envphong.py new file mode 100755 index 0000000000000000000000000000000000000000..06b1021852397de7af6c60c6a6dba7971511b043 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/tensorflow/envphong.py @@ -0,0 +1,181 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import pathlib + +import util + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Environment map and Phong BRDF learning. 
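+# Jointly recovers a cube-map environment texture and the Phong specular color and
+# exponent from rendered images, using only an image-space L2 loss.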
+#---------------------------------------------------------------------------- + +def fit_env_phong(max_iter = 1000, + log_interval = 10, + display_interval = None, + display_res = 1024, + res = 1024, + lr_base = 1e-2, + lr_ramp = 1.0, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + # Texture adapted from https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/envphong.npz') as f: + pos_idx, pos, normals, env = f.values() + env = env.astype(np.float32)/255.0 + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Target Phong parameters. + phong_rgb = np.asarray([1.0, 0.8, 0.6], np.float32) + phong_exp = 25.0 + + # Inputs to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + invmtx_in = tf.placeholder(tf.float32, [4, 4]) # Inverse. + campos_in = tf.placeholder(tf.float32, [3]) # Camera position in world space. + lightdir_in = tf.placeholder(tf.float32, [3]) # Light direction. + + # Learned variables: environment maps, phong color, phong exponent. + env_var = tf.get_variable('env_var', initializer=tf.constant_initializer(0.5), shape=env.shape) + phong_var_raw = tf.get_variable('phong_var', initializer=tf.random_uniform_initializer(0.0, 1.0), shape=[4]) # R, G, B, exp. + phong_var = phong_var_raw * [1.0, 1.0, 1.0, 10.0] # Faster learning rate for the exponent. + + # Transform and rasterize. + viewvec = pos[..., :3] - campos_in[np.newaxis, np.newaxis, :] # View vectors at vertices. + reflvec = viewvec - 2.0 * normals[tf.newaxis, ...] * tf.reduce_sum(normals[tf.newaxis, ...] * viewvec, axis=-1, keepdims=True) # Reflection vectors at vertices. + reflvec = reflvec / tf.reduce_sum(reflvec**2, axis=-1, keepdims=True)**0.5 # Normalize. + pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [res, res]) + refl, refld = dr.interpolate(reflvec, rast_out, pos_idx, rast_db=rast_out_db, diff_attrs='all') # Interpolated reflection vectors. + + # Phong light. + refl = refl / tf.reduce_sum(refl**2, axis=-1, keepdims=True)**0.5 # Normalize. + ldotr = tf.reduce_sum(-lightdir_in * refl, axis=-1, keepdims=True) # L dot R. + + # Reference color. No need for AA because we are not learning geometry. + env = np.stack(env)[:, ::-1] + color = dr.texture(env[np.newaxis, ...], refl, refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color = tf.reduce_sum(tf.stack(color), axis=0) + color = color + phong_rgb * tf.maximum(0.0, ldotr) ** phong_exp # Phong. + color = tf.maximum(color, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background. + + # Candidate rendering same up to this point, but uses learned texture and Phong parameters instead. + color_opt = dr.texture(env_var[tf.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color_opt = tf.reduce_sum(tf.stack(color_opt), axis=0) + color_opt = color_opt + phong_var[:3] * tf.maximum(0.0, ldotr) ** phong_var[3] # Phong. + color_opt = tf.maximum(color_opt, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background. + + # Training. + loss = tf.reduce_mean((color - color_opt)**2) # L2 pixel loss. 
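+    # A single Adam optimizer updates both the environment map and the raw Phong
+    # parameters from this image-space loss.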
+ lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[env_var, phong_var_raw]) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Render. + ang = 0.0 + util.init_uninitialized_vars() + imgloss_avg, phong_avg = [], [] + for it in range(max_iter + 1): + lr = lr_base * lr_ramp**(float(it)/float(max_iter)) + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + ang = ang + 0.01 + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Solve camera positions. + a_campos = np.linalg.inv(a_mv)[:3, 3] + r_campos = np.linalg.inv(r_mv)[:3, 3] + + # Random light direction. + lightdir = np.random.normal(size=[3]) + lightdir /= np.linalg.norm(lightdir) + 1e-8 + + # Run training and measure image-space RMSE loss. + imgloss_val, phong_val, _ = util.run([loss, phong_var, train_op], {mtx_in: r_mvp, invmtx_in: np.linalg.inv(r_mvp), campos_in: r_campos, lightdir_in: lightdir, lr_in: lr}) + imgloss_avg.append(imgloss_val**0.5) + phong_avg.append(phong_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + imgloss_val, imgloss_avg = np.mean(np.asarray(imgloss_avg, np.float32)), [] + phong_val, phong_avg = np.mean(np.asarray(phong_avg, np.float32), axis=0), [] + phong_rgb_rmse = np.mean((phong_val[:3] - phong_rgb)**2)**0.5 + phong_exp_rel_err = np.abs(phong_val[3] - phong_exp)/phong_exp + s = "iter=%d,phong_rgb_rmse=%f,phong_exp_rel_err=%f,img_rmse=%f" % (it, phong_rgb_rmse, phong_exp_rel_err, imgloss_val) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save result image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + result_image = util.run(color_opt, {mtx_in: a_mvp, invmtx_in: np.linalg.inv(a_mvp), campos_in: a_campos, lightdir_in: lightdir})[0] + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + # Done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + for a in sys.argv[1:]: + if a == '-v': + display_interval = 10 + else: + print("Usage: python envphong.py [-v]") + exit() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + fit_env_phong(max_iter=1500, log_interval=10, display_interval=display_interval, out_dir='out/env_phong', log_fn='log.txt', imgsave_interval=100, imgsave_fn='img_%06d.png') + + # Done. 
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/pose.py b/pose_estimation/nvdiffrast/samples/tensorflow/pose.py new file mode 100755 index 0000000000000000000000000000000000000000..af8fca6e7cf551e1f6257a1f4e867bd2621e7c69 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/tensorflow/pose.py @@ -0,0 +1,275 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import util +import pathlib + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Quaternion math. +#---------------------------------------------------------------------------- + +# Unit quaternion. +def q_unit(): + return np.asarray([1, 0, 0, 0], np.float32) + +# Get a random normalized quaternion. +def q_rnd(): + u, v, w = np.random.uniform(0.0, 1.0, size=[3]) + v *= 2.0 * np.pi + w *= 2.0 * np.pi + return np.asarray([(1.0-u)**0.5 * np.sin(v), (1.0-u)**0.5 * np.cos(v), u**0.5 * np.sin(w), u**0.5 * np.cos(w)], np.float32) + +# Get a random quaternion from the octahedral symmetric group S_4. +_r2 = 0.5**0.5 +_q_S4 = [[ 1.0, 0.0, 0.0, 0.0], [ 0.0, 1.0, 0.0, 0.0], [ 0.0, 0.0, 1.0, 0.0], [ 0.0, 0.0, 0.0, 1.0], + [-0.5, 0.5, 0.5, 0.5], [-0.5,-0.5,-0.5, 0.5], [ 0.5,-0.5, 0.5, 0.5], [ 0.5, 0.5,-0.5, 0.5], + [ 0.5, 0.5, 0.5, 0.5], [-0.5, 0.5,-0.5, 0.5], [ 0.5,-0.5,-0.5, 0.5], [-0.5,-0.5, 0.5, 0.5], + [ _r2,-_r2, 0.0, 0.0], [ _r2, _r2, 0.0, 0.0], [ 0.0, 0.0, _r2, _r2], [ 0.0, 0.0,-_r2, _r2], + [ 0.0, _r2, _r2, 0.0], [ _r2, 0.0, 0.0,-_r2], [ _r2, 0.0, 0.0, _r2], [ 0.0,-_r2, _r2, 0.0], + [ _r2, 0.0, _r2, 0.0], [ 0.0, _r2, 0.0, _r2], [ _r2, 0.0,-_r2, 0.0], [ 0.0,-_r2, 0.0, _r2]] +def q_rnd_S4(): + return np.asarray(_q_S4[np.random.randint(24)], np.float32) + +# Quaternion slerp. +def q_slerp(p, q, t): + d = np.dot(p, q) + if d < 0.0: + q = -q + d = -d + if d > 0.999: + a = p + t * (q-p) + return a / np.linalg.norm(a) + t0 = np.arccos(d) + tt = t0 * t + st = np.sin(tt) + st0 = np.sin(t0) + s1 = st / st0 + s0 = np.cos(tt) - d*s1 + return s0*p + s1*q + +# Quaterion scale (slerp vs. identity quaternion). +def q_scale(q, scl): + return q_slerp(q_unit(), q, scl) + +# Quaternion product. +def q_mul(p, q): + s1, V1 = p[0], p[1:] + s2, V2 = q[0], q[1:] + s = s1*s2 - np.dot(V1, V2) + V = s1*V2 + s2*V1 + np.cross(V1, V2) + return np.asarray([s, V[0], V[1], V[2]], np.float32) + +# Angular difference between two quaternions in degrees. +def q_angle_deg(p, q): + d = np.abs(np.dot(p, q)) + d = min(d, 1.0) + return np.degrees(2.0 * np.arccos(d)) + +# Quaternion product in TensorFlow. 
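+# Same Hamilton product as q_mul() above, written out componentwise with TF ops so
+# that composing the optimized pose with the mollification noise stays differentiable.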
+def q_mul_tf(p, q): + a = p[0]*q[0] - p[1]*q[1] - p[2]*q[2] - p[3]*q[3] + b = p[0]*q[1] + p[1]*q[0] + p[2]*q[3] - p[3]*q[2] + c = p[0]*q[2] + p[2]*q[0] + p[3]*q[1] - p[1]*q[3] + d = p[0]*q[3] + p[3]*q[0] + p[1]*q[2] - p[2]*q[1] + return tf.stack([a, b, c, d]) + +# Convert quaternion to 4x4 rotation matrix. TensorFlow. +def q_to_mtx_tf(q): + r0 = tf.stack([1.0-2.0*q[1]**2 - 2.0*q[2]**2, 2.0*q[0]*q[1] - 2.0*q[2]*q[3], 2.0*q[0]*q[2] + 2.0*q[1]*q[3]]) + r1 = tf.stack([2.0*q[0]*q[1] + 2.0*q[2]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[2]**2, 2.0*q[1]*q[2] - 2.0*q[0]*q[3]]) + r2 = tf.stack([2.0*q[0]*q[2] - 2.0*q[1]*q[3], 2.0*q[1]*q[2] + 2.0*q[0]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[1]**2]) + rr = tf.transpose(tf.stack([r0, r1, r2]), [1, 0]) + rr = tf.concat([rr, tf.convert_to_tensor([[0], [0], [0]], tf.float32)], axis=1) # Pad right column. + rr = tf.concat([rr, tf.convert_to_tensor([[0, 0, 0, 1]], tf.float32)], axis=0) # Pad bottom row. + return rr + +#---------------------------------------------------------------------------- +# Cube pose fitter. +#---------------------------------------------------------------------------- + +def fit_pose(max_iter = 10000, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + lr_base = 0.01, + lr_falloff = 1.0, + nr_base = 1.0, + nr_falloff = 1e-4, + grad_phase_start = 0.5, + resolution = 256, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/cube_p.npz') as f: + pos_idx, pos, col_idx, col = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Pose matrix input to TF graph. + pose_in = tf.placeholder(tf.float32, [4]) # Quaternion. + noise_in = tf.placeholder(tf.float32, [4]) # Mollification noise. + + # Setup TF graph for reference. + mtx_total = tf.matmul(mtx_in, q_to_mtx_tf(pose_in)) + pos_clip = tf.matmul(pos, mtx_total, transpose_b=True)[tf.newaxis, ...] + rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False) + color, _ = dr.interpolate(col[tf.newaxis, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + + # Setup TF graph for optimization candidate. + pose_var = tf.get_variable('pose', initializer=tf.zeros_initializer(), shape=[4]) + pose_var_in = tf.placeholder(tf.float32, [4]) + pose_set = tf.assign(pose_var, pose_var_in) + pose_norm_op = tf.assign(pose_var, pose_var / tf.reduce_sum(pose_var**2)**0.5) # Normalization operation. + pose_total = q_mul_tf(pose_var, noise_in) + mtx_total_opt = tf.matmul(mtx_in, q_to_mtx_tf(pose_total)) + pos_clip_opt = tf.matmul(pos, mtx_total_opt, transpose_b=True)[tf.newaxis, ...] + rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False) + color_opt, _ = dr.interpolate(col[tf.newaxis, ...], rast_out_opt, col_idx) + color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx) + + # Image-space loss. + diff = (color_opt - color)**2 # L2 norm. + diff = tf.tanh(5.0 * tf.reduce_max(diff, axis=-1)) # Add some oomph to the loss. + loss = tf.reduce_mean(diff) + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[pose_var]) + + # Open log file. 
+ log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Repeats. + for rep in range(repeats): + + # Optimize. + util.init_uninitialized_vars() + loss_best = np.inf + pose_best = None + for it in range(max_iter + 1): + # Modelview + projection matrix. + mvp = np.matmul(util.projection(x=0.4), util.translate(0, 0, -3.5)).astype(np.float32) + + # Learning and noise rate scheduling. + itf = 1.0 * it / max_iter + lr = lr_base * lr_falloff**itf + nr = nr_base * nr_falloff**itf + + # Noise input. + if itf >= grad_phase_start: + noise = q_unit() + else: + noise = q_scale(q_rnd(), nr) + noise = q_mul(noise, q_rnd_S4()) # Orientation noise. + + # Initialize optimization. + if it == 0: + pose_target = q_rnd() + util.run(pose_set, {pose_var_in: q_rnd()}) + util.run(pose_norm_op) + util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise}) # Pipecleaning pass. + + # Run gradient training step. + if itf >= grad_phase_start: + util.run(train_op, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr}) + util.run(pose_norm_op) + + # Measure image-space loss and update best found pose. + loss_val = util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr}) + if loss_val < loss_best: + pose_best = util.run(pose_total, {noise_in: noise}) + if loss_val > 0.0: + loss_best = loss_val + else: + # Return to best pose in the greedy phase. + if itf < grad_phase_start: + util.run(pose_set, {pose_var_in: pose_best}) + + # Print/save log. + if log_interval and (it % log_interval == 0): + err = q_angle_deg(util.run(pose_var), pose_target) + ebest = q_angle_deg(pose_best, pose_target) + s = "rep=%d,iter=%d,err=%f,err_best=%f,loss=%f,loss_best=%f,lr=%f,nr=%f" % (rep, it, err, ebest, loss_val, loss_best, lr, nr) + print(s) + if log_file: + log_file.write(s + "\n") + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + img_ref, img_opt = util.run([color, color_opt], {mtx_in: mvp, pose_in: pose_target, noise_in: noise}) + img_best, = util.run([color_opt], {mtx_in: mvp, pose_in: pose_best, noise_in: q_unit()}) + img_ref = img_ref[0] + img_opt = img_opt[0] + img_best = img_best[0] + result_image = np.concatenate([img_ref, img_best, img_opt], axis=1) + + if display_image: + util.display_image(result_image, size=display_res, title='(%d) %d / %d' % (rep, it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % (rep, it)), result_image) + + # All repeats done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + repeats = 1 + + def usage(): + print("Usage: python pose.py [-v] [repeats]") + exit() + + for a in sys.argv[1:]: + if a == '-v': + display_interval = 10 + elif a.isascii() and a.isdecimal(): + repeats = int(a) + else: + usage() + + if repeats <= 0: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + fit_pose(max_iter=1000, repeats=repeats, log_interval=100, display_interval=display_interval, out_dir='out/pose', log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%03d_%06d.png') + + # Done. 
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/triangle.py b/pose_estimation/nvdiffrast/samples/tensorflow/triangle.py new file mode 100755 index 0000000000000000000000000000000000000000..4d4c54426e569ee68d8a9f255f7495ed68f62f89 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/tensorflow/triangle.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import imageio +import logging +import os +import numpy as np +import tensorflow as tf +import nvdiffrast.tensorflow as dr + +# Silence deprecation warnings and debug level logging +logging.getLogger('tensorflow').setLevel(logging.ERROR) +os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '1') + +pos = tf.convert_to_tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=tf.float32) +col = tf.convert_to_tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=tf.float32) +tri = tf.convert_to_tensor([[0, 1, 2]], dtype=tf.int32) + +rast, _ = dr.rasterize(pos, tri, resolution=[256, 256]) +out, _ = dr.interpolate(col, rast, tri) + +with tf.Session() as sess: + img = sess.run(out) + +img = img[0, ::-1, :, :] # Flip vertically. +img = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8) # Quantize to np.uint8 + +print("Saving to 'tri.png'.") +imageio.imsave('tri.png', img) diff --git a/pose_estimation/nvdiffrast/samples/tensorflow/util.py b/pose_estimation/nvdiffrast/samples/tensorflow/util.py new file mode 100755 index 0000000000000000000000000000000000000000..64fc2d9ef2fd42eec4d5c2d62d038f5b17de8a4b --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/tensorflow/util.py @@ -0,0 +1,257 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + + +import os +import numpy as np +import tensorflow as tf + +# Silence deprecation warnings from TensorFlow 1.13 onwards +import logging +logging.getLogger('tensorflow').setLevel(logging.ERROR) + +from typing import Any, List + +#---------------------------------------------------------------------------- +# Projection and transformation matrix helpers. 
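+# All helpers return 4x4 float32 NumPy matrices. projection() builds an OpenGL-style
+# perspective matrix (with a flipped y axis); the samples compose it with the modelview
+# matrices via np.matmul.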
+#---------------------------------------------------------------------------- + +def projection(x=0.1, n=1.0, f=50.0): + return np.array([[n/x, 0, 0, 0], + [ 0, n/-x, 0, 0], + [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)], + [ 0, 0, -1, 0]]).astype(np.float32) + +def translate(x, y, z): + return np.array([[1, 0, 0, x], + [0, 1, 0, y], + [0, 0, 1, z], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_x(a): + s, c = np.sin(a), np.cos(a) + return np.array([[1, 0, 0, 0], + [0, c, s, 0], + [0, -s, c, 0], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_y(a): + s, c = np.sin(a), np.cos(a) + return np.array([[ c, 0, s, 0], + [ 0, 1, 0, 0], + [-s, 0, c, 0], + [ 0, 0, 0, 1]]).astype(np.float32) + +def random_rotation_translation(t): + m = np.random.normal(size=[3, 3]) + m[1] = np.cross(m[0], m[2]) + m[2] = np.cross(m[0], m[1]) + m = m / np.linalg.norm(m, axis=1, keepdims=True) + m = np.pad(m, [[0, 1], [0, 1]], mode='constant') + m[3, 3] = 1.0 + m[:3, 3] = np.random.uniform(-t, t, size=[3]) + return m + +#---------------------------------------------------------------------------- +# Bilinear downsample by 2x. +#---------------------------------------------------------------------------- + +def bilinear_downsample(x): + w = tf.constant([[1, 3, 3, 1], [3, 9, 9, 3], [3, 9, 9, 3], [1, 3, 3, 1]], dtype=tf.float32) / 64.0 + w = w[..., tf.newaxis, tf.newaxis] * tf.eye(x.shape[-1].value, batch_shape=[1, 1]) + x = tf.nn.conv2d(x, w, strides=2, padding='SAME') + return x + +#---------------------------------------------------------------------------- +# Image display function using OpenGL. +#---------------------------------------------------------------------------- + +_glfw_window = None +def display_image(image, zoom=None, size=None, title=None): # HWC + # Import OpenGL and glfw. + import OpenGL.GL as gl + import glfw + + # Zoom image if requested. + image = np.asarray(image) + if size is not None: + assert zoom is None + zoom = max(1, size // image.shape[0]) + if zoom is not None: + image = image.repeat(zoom, axis=0).repeat(zoom, axis=1) + height, width, channels = image.shape + + # Initialize window. + if title is None: + title = 'Debug window' + global _glfw_window + if _glfw_window is None: + glfw.init() + _glfw_window = glfw.create_window(width, height, title, None, None) + glfw.make_context_current(_glfw_window) + glfw.show_window(_glfw_window) + glfw.swap_interval(0) + else: + glfw.make_context_current(_glfw_window) + glfw.set_window_title(_glfw_window, title) + glfw.set_window_size(_glfw_window, width, height) + + # Update window. + glfw.poll_events() + gl.glClearColor(0, 0, 0, 1) + gl.glClear(gl.GL_COLOR_BUFFER_BIT) + gl.glWindowPos2f(0, 0) + gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1) + gl_format = {3: gl.GL_RGB, 2: gl.GL_RG, 1: gl.GL_LUMINANCE}[channels] + gl_dtype = {'uint8': gl.GL_UNSIGNED_BYTE, 'float32': gl.GL_FLOAT}[image.dtype.name] + gl.glDrawPixels(width, height, gl_format, gl_dtype, image[::-1]) + glfw.swap_buffers(_glfw_window) + if glfw.window_should_close(_glfw_window): + return False + return True + +#---------------------------------------------------------------------------- +# Image save helper. 
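+# Expects a float image in [0, 1]; values are scaled to 8 bits, rounded and clipped
+# before writing with imageio.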
+#---------------------------------------------------------------------------- + +def save_image(fn, x): + import imageio + x = np.rint(x * 255.0) + x = np.clip(x, 0, 255).astype(np.uint8) + imageio.imsave(fn, x) + +#---------------------------------------------------------------------------- + +# TensorFlow utilities + +#---------------------------------------------------------------------------- + +def _sanitize_tf_config(config_dict: dict = None) -> dict: + # Defaults. + cfg = dict() + cfg["rnd.np_random_seed"] = None # Random seed for NumPy. None = keep as is. + cfg["rnd.tf_random_seed"] = "auto" # Random seed for TensorFlow. 'auto' = derive from NumPy random state. None = keep as is. + cfg["env.TF_CPP_MIN_LOG_LEVEL"] = "1" # 0 = Print all available debug info from TensorFlow. 1 = Print warnings and errors, but disable debug info. + cfg["env.HDF5_USE_FILE_LOCKING"] = "FALSE" # Disable HDF5 file locking to avoid concurrency issues with network shares. + cfg["graph_options.place_pruned_graph"] = True # False = Check that all ops are available on the designated device. True = Skip the check for ops that are not used. + cfg["gpu_options.allow_growth"] = True # False = Allocate all GPU memory at the beginning. True = Allocate only as much GPU memory as needed. + + # Remove defaults for environment variables that are already set. + for key in list(cfg): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + if fields[1] in os.environ: + del cfg[key] + + # User overrides. + if config_dict is not None: + cfg.update(config_dict) + return cfg + + +def init_tf(config_dict: dict = None) -> None: + """Initialize TensorFlow session using good default settings.""" + # Skip if already initialized. + if tf.get_default_session() is not None: + return + + # Setup config dict and random seeds. + cfg = _sanitize_tf_config(config_dict) + np_random_seed = cfg["rnd.np_random_seed"] + if np_random_seed is not None: + np.random.seed(np_random_seed) + tf_random_seed = cfg["rnd.tf_random_seed"] + if tf_random_seed == "auto": + tf_random_seed = np.random.randint(1 << 31) + if tf_random_seed is not None: + tf.set_random_seed(tf_random_seed) + + # Setup environment variables. + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + os.environ[fields[1]] = str(value) + + # Create default TensorFlow session. + create_session(cfg, force_as_default=True) + + +def assert_tf_initialized(): + """Check that TensorFlow session has been initialized.""" + if tf.get_default_session() is None: + raise RuntimeError("No default TensorFlow session found. Please call util.init_tf().") + + +def create_session(config_dict: dict = None, force_as_default: bool = False) -> tf.Session: + """Create tf.Session based on config dict.""" + # Setup TensorFlow config proto. + cfg = _sanitize_tf_config(config_dict) + config_proto = tf.ConfigProto() + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] not in ["rnd", "env"]: + obj = config_proto + for field in fields[:-1]: + obj = getattr(obj, field) + setattr(obj, fields[-1], value) + + # Create session. 
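+    # When force_as_default is set, the session is entered as the default permanently
+    # (using TF internals and disabling the nesting check) so that run() and
+    # init_uninitialized_vars() can rely on tf.get_default_session().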
+ session = tf.Session(config=config_proto) + if force_as_default: + # pylint: disable=protected-access + session._default_session = session.as_default() + session._default_session.enforce_nesting = False + session._default_session.__enter__() + return session + + +def is_tf_expression(x: Any) -> bool: + """Check whether the input is a valid Tensorflow expression, i.e., Tensorflow Tensor, Variable, or Operation.""" + return isinstance(x, (tf.Tensor, tf.Variable, tf.Operation)) + + +def absolute_name_scope(scope: str) -> tf.name_scope: + """Forcefully enter the specified name scope, ignoring any surrounding scopes.""" + return tf.name_scope(scope + "/") + + +def init_uninitialized_vars(target_vars: List[tf.Variable] = None) -> None: + """Initialize all tf.Variables that have not already been initialized. + + Equivalent to the following, but more efficient and does not bloat the tf graph: + tf.variables_initializer(tf.report_uninitialized_variables()).run() + """ + assert_tf_initialized() + if target_vars is None: + target_vars = tf.global_variables() + + test_vars = [] + test_ops = [] + + with tf.control_dependencies(None): # ignore surrounding control_dependencies + for var in target_vars: + assert is_tf_expression(var) + + try: + tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/IsVariableInitialized:0")) + except KeyError: + # Op does not exist => variable may be uninitialized. + test_vars.append(var) + + with absolute_name_scope(var.name.split(":")[0]): + test_ops.append(tf.is_variable_initialized(var)) + + init_vars = [var for var, inited in zip(test_vars, run(test_ops)) if not inited] + run([var.initializer for var in init_vars]) + +def run(*args, **kwargs) -> Any: + """Run the specified ops in the default session.""" + assert_tf_initialized() + return tf.get_default_session().run(*args, **kwargs) diff --git a/pose_estimation/nvdiffrast/samples/torch/__init__.py b/pose_estimation/nvdiffrast/samples/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pose_estimation/nvdiffrast/samples/torch/cube.py b/pose_estimation/nvdiffrast/samples/torch/cube.py new file mode 100755 index 0000000000000000000000000000000000000000..28a3705300db5c5d06f66fb3b49e3e081470f6dc --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/cube.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import os +import pathlib +import numpy as np +import torch +import imageio + +import util + +import nvdiffrast.torch as dr + +# Transform vertex positions to clip space +def transform_pos(mtx, pos): + t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx + # (x,y,z) -> (x,y,z,1) + posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1) + return torch.matmul(posw, t_mtx.t())[None, ...] 
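+# Note: positions are multiplied as row vectors against the transposed matrix, and a
+# leading batch dimension is added as expected by nvdiffrast.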
+ +def render(glctx, mtx, pos, pos_idx, vtx_col, col_idx, resolution: int): + pos_clip = transform_pos(mtx, pos) + rast_out, _ = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution]) + color, _ = dr.interpolate(vtx_col[None, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + return color + +def make_grid(arr, ncols=2): + n, height, width, nc = arr.shape + nrows = n//ncols + assert n == nrows*ncols + return arr.reshape(nrows, ncols, height, width, nc).swapaxes(1,2).reshape(height*nrows, width*ncols, nc) + +def fit_cube(max_iter = 5000, + resolution = 4, + discontinuous = False, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + out_dir = None, + log_fn = None, + mp4save_interval = None, + mp4save_fn = None): + + log_file = None + writer = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(f'{out_dir}/{log_fn}', 'wt') + if mp4save_interval != 0: + writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M') + else: + mp4save_interval = None + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + fn = 'cube_%s.npz' % ('d' if discontinuous else 'c') + with np.load(f'{datadir}/{fn}') as f: + pos_idx, vtxp, col_idx, vtxc = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0])) + + # Create position/triangle index tensors + pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda() + col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda() + vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda() + vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda() + + glctx = dr.RasterizeGLContext() + + # Repeats. + for rep in range(repeats): + + ang = 0.0 + gl_avg = [] + + vtx_pos_rand = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp + vtx_col_rand = np.random.uniform(0.0, 1.0, size=vtxc.shape) + vtx_pos_opt = torch.tensor(vtx_pos_rand, dtype=torch.float32, device='cuda', requires_grad=True) + vtx_col_opt = torch.tensor(vtx_col_rand, dtype=torch.float32, device='cuda', requires_grad=True) + + # Adam optimizer for vertex position and color with a learning rate ramp. + optimizer = torch.optim.Adam([vtx_pos_opt, vtx_col_opt], lr=1e-2) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: max(0.01, 10**(-x*0.0005))) + + for it in range(max_iter + 1): + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Compute geometric error for logging. + with torch.no_grad(): + geom_loss = torch.mean(torch.sum((torch.abs(vtx_pos_opt) - .5)**2, dim=1)**0.5) + gl_avg.append(float(geom_loss)) + + # Print/save log. 
+ if log_interval and (it % log_interval == 0): + gl_val = np.mean(np.asarray(gl_avg)) + gl_avg = [] + s = ("rep=%d," % rep) if repeats > 1 else "" + s += "iter=%d,err=%f" % (it, gl_val) + print(s) + if log_file: + log_file.write(s + "\n") + + color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_col, col_idx, resolution) + color_opt = render(glctx, r_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, resolution) + + # Compute loss and train. + loss = torch.mean((color - color_opt)**2) # L2 pixel loss. + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_mp4 = mp4save_interval and (it % mp4save_interval == 0) + + if display_image or save_mp4: + ang = ang + 0.01 + + img_b = color[0].cpu().numpy() + img_o = color_opt[0].detach().cpu().numpy() + img_d = render(glctx, a_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, display_res)[0] + img_r = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_col, col_idx, display_res)[0] + + scl = display_res // img_o.shape[0] + img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1) + img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1) + result_image = make_grid(np.stack([img_o, img_b, img_d.detach().cpu().numpy(), img_r.cpu().numpy()])) + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_mp4: + writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8)) + + # Done. + if writer is not None: + writer.close() + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Cube fit example') + parser.add_argument('--outdir', help='Specify output directory', default='') + parser.add_argument('--discontinuous', action='store_true', default=False) + parser.add_argument('--resolution', type=int, default=0, required=True) + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--mp4save-interval', type=int, default=100) + parser.add_argument('--max-iter', type=int, default=1000) + args = parser.parse_args() + + # Set up logging. + if args.outdir: + ds = 'd' if args.discontinuous else 'c' + out_dir = f'{args.outdir}/cube_{ds}_{args.resolution}' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. + fit_cube( + max_iter=args.max_iter, + resolution=args.resolution, + discontinuous=args.discontinuous, + log_interval=10, + display_interval=args.display_interval, + out_dir=out_dir, + log_fn='log.txt', + mp4save_interval=args.mp4save_interval, + mp4save_fn='progress.mp4' + ) + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/torch/earth.py b/pose_estimation/nvdiffrast/samples/torch/earth.py new file mode 100755 index 0000000000000000000000000000000000000000..917aefa6c47b5c390e11586db37bb44ea7c709d4 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/earth.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import os +import pathlib +import numpy as np +import torch + +import util + +import nvdiffrast.torch as dr + +#---------------------------------------------------------------------------- +# Helpers. + +def transform_pos(mtx, pos): + t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx + posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1) + return torch.matmul(posw, t_mtx.t())[None, ...] + +def render(glctx, mtx, pos, pos_idx, uv, uv_idx, tex, resolution, enable_mip, max_mip_level): + pos_clip = transform_pos(mtx, pos) + rast_out, rast_out_db = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution]) + + if enable_mip: + texc, texd = dr.interpolate(uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = dr.texture(tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = dr.interpolate(uv[None, ...], rast_out, uv_idx) + color = dr.texture(tex[None, ...], texc, filter_mode='linear') + + color = color * torch.clamp(rast_out[..., -1:], 0, 1) # Mask out background. + return color + +#---------------------------------------------------------------------------- + +def fit_earth(max_iter = 20000, + log_interval = 10, + display_interval = None, + display_res = 1024, + enable_mip = True, + res = 512, + ref_res = 4096, + lr_base = 1e-2, + lr_ramp = 0.1, + out_dir = None, + log_fn = None, + texsave_interval = None, + texsave_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + log_file = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(out_dir + '/' + log_fn, 'wt') + else: + imgsave_interval, texsave_interval = None, None + + # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at + # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/earth.npz') as f: + pos_idx, pos, uv_idx, uv, tex = f.values() + tex = tex.astype(np.float32)/255.0 + max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps. + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Some input geometry contains vertex positions in (N, 4) (with v[:,3]==1). Drop + # the last column in that case. + if pos.shape[1] == 4: pos = pos[:, 0:3] + + # Create position/triangle index tensors + pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda() + vtx_pos = torch.from_numpy(pos.astype(np.float32)).cuda() + uv_idx = torch.from_numpy(uv_idx.astype(np.int32)).cuda() + vtx_uv = torch.from_numpy(uv.astype(np.float32)).cuda() + + tex = torch.from_numpy(tex.astype(np.float32)).cuda() + tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True) + glctx = dr.RasterizeGLContext() + + ang = 0.0 + + # Adam optimizer for texture with a learning rate ramp. + optimizer = torch.optim.Adam([tex_opt], lr=lr_base) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: lr_ramp**(float(x)/float(max_iter))) + + # Render. 
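+    # Each iteration samples a random rotation and a random camera distance, so the
+    # optimized texture is supervised across a wide range of minification levels
+    # (the regime where mipmapping matters).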
+ ang = 0.0 + texloss_avg = [] + for it in range(max_iter + 1): + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + dist = np.random.uniform(0.0, 48.5) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -1.5 - dist), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Measure texture-space RMSE loss + with torch.no_grad(): + texmask = torch.zeros_like(tex) + tr = tex.shape[1]//4 + texmask[tr+13:2*tr-13, 25:-25, :] += 1.0 + texmask[25:-25, tr+13:2*tr-13, :] += 1.0 + # Measure only relevant portions of texture when calculating texture + # PSNR. + texloss = (torch.sum(texmask * (tex - tex_opt)**2)/torch.sum(texmask))**0.5 # RMSE within masked area. + texloss_avg.append(float(texloss)) + + # Render reference and optimized frames. Always enable mipmapping for reference. + color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex, ref_res, True, max_mip_level) + color_opt = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level) + + # Reduce the reference to correct size. + while color.shape[1] > res: + color = util.bilinear_downsample(color) + + # Compute loss and perform a training step. + loss = torch.mean((color - color_opt)**2) # L2 pixel loss. + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + + # Print/save log. + if log_interval and (it % log_interval == 0): + texloss_val = np.mean(np.asarray(texloss_avg)) + texloss_avg = [] + psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE. + s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + save_texture = texsave_interval and (it % texsave_interval) == 0 + + if display_image or save_image: + ang = ang + 0.1 + + with torch.no_grad(): + result_image = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level)[0].cpu().numpy() + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + if save_texture: + texture = tex_opt.cpu().numpy()[::-1] + util.save_image(out_dir + '/' + (texsave_fn % it), texture) + + + # Done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Earth texture fitting example') + parser.add_argument('--outdir', help='Specify output directory', default='') + parser.add_argument('--mip', action='store_true', default=False) + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--max-iter', type=int, default=10000) + args = parser.parse_args() + + # Set up logging. + if args.outdir: + ms = 'mip' if args.mip else 'nomip' + out_dir = f'{args.outdir}/earth_{ms}' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. 
+ fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png') + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/torch/envphong.py b/pose_estimation/nvdiffrast/samples/torch/envphong.py new file mode 100755 index 0000000000000000000000000000000000000000..55befe782df7f1959da5379b7e1f1926abe990ec --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/envphong.py @@ -0,0 +1,227 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import numpy as np +import torch +import os +import pathlib +import imageio + +import util + +import nvdiffrast.torch as dr + +#---------------------------------------------------------------------------- +# Environment map and Phong BRDF learning. +#---------------------------------------------------------------------------- + +def fit_env_phong(max_iter = 1000, + log_interval = 10, + display_interval = None, + display_res = 1024, + res = 1024, + lr_base = 1e-2, + lr_ramp = 1.0, + out_dir = None, + log_fn = None, + mp4save_interval = None, + mp4save_fn = None): + + log_file = None + writer = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(out_dir + '/' + log_fn, 'wt') + if mp4save_interval != 0: + writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M') + else: + mp4save_interval = None + + # Texture adapted from https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/envphong.npz') as f: + pos_idx, pos, normals, env = f.values() + env = env.astype(np.float32)/255.0 + env = np.stack(env)[:, ::-1].copy() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Move all the stuff to GPU. + pos_idx = torch.as_tensor(pos_idx, dtype=torch.int32, device='cuda') + pos = torch.as_tensor(pos, dtype=torch.float32, device='cuda') + normals = torch.as_tensor(normals, dtype=torch.float32, device='cuda') + env = torch.as_tensor(env, dtype=torch.float32, device='cuda') + + # Target Phong parameters. + phong_rgb = np.asarray([1.0, 0.8, 0.6], np.float32) + phong_exp = 25.0 + phong_rgb_t = torch.as_tensor(phong_rgb, dtype=torch.float32, device='cuda') + + # Learned variables: environment maps, phong color, phong exponent. + env_var = torch.ones_like(env) * .5 + env_var.requires_grad_() + phong_var_raw = torch.as_tensor(np.random.uniform(size=[4]), dtype=torch.float32, device='cuda') + phong_var_raw.requires_grad_() + phong_var_mul = torch.as_tensor([1.0, 1.0, 1.0, 10.0], dtype=torch.float32, device='cuda') + + # Render. 
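+    # Optimization loop: each iteration samples a random camera pose and light
+    # direction, renders a reference image with the true environment map and Phong
+    # parameters and a candidate with the learned ones, and minimizes their L2 difference.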
+ ang = 0.0 + imgloss_avg, phong_avg = [], [] + glctx = dr.RasterizeGLContext() + zero_tensor = torch.as_tensor(0.0, dtype=torch.float32, device='cuda') + one_tensor = torch.as_tensor(1.0, dtype=torch.float32, device='cuda') + + # Adam optimizer for environment map and phong with a learning rate ramp. + optimizer = torch.optim.Adam([env_var, phong_var_raw], lr=lr_base) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: lr_ramp**(float(x)/float(max_iter))) + + for it in range(max_iter + 1): + phong_var = phong_var_raw * phong_var_mul + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + ang = ang + 0.01 + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + a_mvc = a_mvp + r_mvp = torch.as_tensor(r_mvp, dtype=torch.float32, device='cuda') + a_mvp = torch.as_tensor(a_mvp, dtype=torch.float32, device='cuda') + + # Solve camera positions. + a_campos = torch.as_tensor(np.linalg.inv(a_mv)[:3, 3], dtype=torch.float32, device='cuda') + r_campos = torch.as_tensor(np.linalg.inv(r_mv)[:3, 3], dtype=torch.float32, device='cuda') + + # Random light direction. + lightdir = np.random.normal(size=[3]) + lightdir /= np.linalg.norm(lightdir) + 1e-8 + lightdir = torch.as_tensor(lightdir, dtype=torch.float32, device='cuda') + + def render_refl(ldir, cpos, mvp): + # Transform and rasterize. + viewvec = pos[..., :3] - cpos[np.newaxis, np.newaxis, :] # View vectors at vertices. + reflvec = viewvec - 2.0 * normals[np.newaxis, ...] * torch.sum(normals[np.newaxis, ...] * viewvec, -1, keepdim=True) # Reflection vectors at vertices. + reflvec = reflvec / torch.sum(reflvec**2, -1, keepdim=True)**0.5 # Normalize. + pos_clip = torch.matmul(pos, mvp.t())[np.newaxis, ...] + rast_out, rast_out_db = dr.rasterize(glctx, pos_clip, pos_idx, [res, res]) + refl, refld = dr.interpolate(reflvec, rast_out, pos_idx, rast_db=rast_out_db, diff_attrs='all') # Interpolated reflection vectors. + + # Phong light. + refl = refl / (torch.sum(refl**2, -1, keepdim=True) + 1e-8)**0.5 # Normalize. + ldotr = torch.sum(-ldir * refl, -1, keepdim=True) # L dot R. + + # Return + return refl, refld, ldotr, (rast_out[..., -1:] == 0) + + # Render the reflections. + refl, refld, ldotr, mask = render_refl(lightdir, r_campos, r_mvp) + + # Reference color. No need for AA because we are not learning geometry. + color = dr.texture(env[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color = color + phong_rgb_t * torch.max(zero_tensor, ldotr) ** phong_exp # Phong. + color = torch.where(mask, one_tensor, color) # White background. + + # Candidate rendering same up to this point, but uses learned texture and Phong parameters instead. + color_opt = dr.texture(env_var[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color_opt = color_opt + phong_var[:3] * torch.max(zero_tensor, ldotr) ** phong_var[3] # Phong. + color_opt = torch.where(mask, one_tensor, color_opt) # White background. + + # Compute loss and train. + loss = torch.mean((color - color_opt)**2) # L2 pixel loss. 
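The render_refl closure above is the core of this sample: it mirrors the per-vertex view vector about the surface normal, R = V - 2N(N·V), interpolates R across the rasterized pixels, and adds a Phong specular term of the form max(0, -L·R)^k on top of the cube-map lookup. A self-contained NumPy check of that reflection/specular identity (the vectors and exponent below are arbitrary illustrative values, not taken from the sample):

```python
import numpy as np

def reflect(v, n):
    # Mirror direction v about the unit normal n: R = V - 2 N (N . V).
    n = n / np.linalg.norm(n)
    return v - 2.0 * n * np.dot(n, v)

def phong_specular(light_dir, view_dir, normal, rgb, exponent):
    # Specular term in the same form as the loop above: max(0, -L . R)^k * rgb.
    r = reflect(view_dir, normal)
    r = r / np.linalg.norm(r)
    l = light_dir / np.linalg.norm(light_dir)
    return rgb * max(0.0, float(np.dot(-l, r))) ** exponent

# With the light aligned to the mirrored view direction, the highlight is maximal.
spec = phong_specular(light_dir=np.array([0.0, 1.0, -1.0]),
                      view_dir=np.array([0.0, -1.0, -1.0]),
                      normal=np.array([0.0, 0.0, 1.0]),
                      rgb=np.array([1.0, 0.8, 0.6]),
                      exponent=25.0)
print(spec)  # -> approximately [1.0, 0.8, 0.6], i.e. the full Phong color
```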
+ optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + + # Collect losses. + imgloss_avg.append(loss.detach().cpu().numpy()) + phong_avg.append(phong_var.detach().cpu().numpy()) + + # Print/save log. + if log_interval and (it % log_interval == 0): + imgloss_val, imgloss_avg = np.mean(np.asarray(imgloss_avg, np.float32)), [] + phong_val, phong_avg = np.mean(np.asarray(phong_avg, np.float32), axis=0), [] + phong_rgb_rmse = np.mean((phong_val[:3] - phong_rgb)**2)**0.5 + phong_exp_rel_err = np.abs(phong_val[3] - phong_exp)/phong_exp + s = "iter=%d,phong_rgb_rmse=%f,phong_exp_rel_err=%f,img_rmse=%f" % (it, phong_rgb_rmse, phong_exp_rel_err, imgloss_val) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save result image. + display_image = display_interval and (it % display_interval == 0) + save_mp4 = mp4save_interval and (it % mp4save_interval == 0) + + if display_image or save_mp4: + lightdir = np.asarray([.8, -1., .5, 0.0]) + lightdir = np.matmul(a_mvc, lightdir)[:3] + lightdir /= np.linalg.norm(lightdir) + lightdir = torch.as_tensor(lightdir, dtype=torch.float32, device='cuda') + refl, refld, ldotr, mask = render_refl(lightdir, a_campos, a_mvp) + color_opt = dr.texture(env_var[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color_opt = color_opt + phong_var[:3] * torch.max(zero_tensor, ldotr) ** phong_var[3] + color_opt = torch.where(mask, one_tensor, color_opt) + result_image = color_opt.detach()[0].cpu().numpy() + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_mp4: + writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8)) + + # Done. + if writer is not None: + writer.close() + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Environment map fitting example') + parser.add_argument('--outdir', help='Specify output directory', default='') + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--mp4save-interval', type=int, default=10) + parser.add_argument('--max-iter', type=int, default=5000) + args = parser.parse_args() + + # Set up logging. + if args.outdir: + out_dir = f'{args.outdir}/env_phong' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. + fit_env_phong( + max_iter=args.max_iter, + log_interval=100, + display_interval=args.display_interval, + out_dir=out_dir, + mp4save_interval=args.mp4save_interval, + mp4save_fn='progress.mp4' + ) + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/torch/pose.py b/pose_estimation/nvdiffrast/samples/torch/pose.py new file mode 100755 index 0000000000000000000000000000000000000000..d193816f0bbad075a9fab04233e644020beced75 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/pose.py @@ -0,0 +1,291 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import os +import pathlib +import numpy as np +import torch +import imageio + +import util + +import nvdiffrast.torch as dr + +#---------------------------------------------------------------------------- +# Quaternion math. +#---------------------------------------------------------------------------- + +# Unit quaternion. +def q_unit(): + return np.asarray([1, 0, 0, 0], np.float32) + +# Get a random normalized quaternion. +def q_rnd(): + u, v, w = np.random.uniform(0.0, 1.0, size=[3]) + v *= 2.0 * np.pi + w *= 2.0 * np.pi + return np.asarray([(1.0-u)**0.5 * np.sin(v), (1.0-u)**0.5 * np.cos(v), u**0.5 * np.sin(w), u**0.5 * np.cos(w)], np.float32) + +# Get a random quaternion from the octahedral symmetric group S_4. +_r2 = 0.5**0.5 +_q_S4 = [[ 1.0, 0.0, 0.0, 0.0], [ 0.0, 1.0, 0.0, 0.0], [ 0.0, 0.0, 1.0, 0.0], [ 0.0, 0.0, 0.0, 1.0], + [-0.5, 0.5, 0.5, 0.5], [-0.5,-0.5,-0.5, 0.5], [ 0.5,-0.5, 0.5, 0.5], [ 0.5, 0.5,-0.5, 0.5], + [ 0.5, 0.5, 0.5, 0.5], [-0.5, 0.5,-0.5, 0.5], [ 0.5,-0.5,-0.5, 0.5], [-0.5,-0.5, 0.5, 0.5], + [ _r2,-_r2, 0.0, 0.0], [ _r2, _r2, 0.0, 0.0], [ 0.0, 0.0, _r2, _r2], [ 0.0, 0.0,-_r2, _r2], + [ 0.0, _r2, _r2, 0.0], [ _r2, 0.0, 0.0,-_r2], [ _r2, 0.0, 0.0, _r2], [ 0.0,-_r2, _r2, 0.0], + [ _r2, 0.0, _r2, 0.0], [ 0.0, _r2, 0.0, _r2], [ _r2, 0.0,-_r2, 0.0], [ 0.0,-_r2, 0.0, _r2]] +def q_rnd_S4(): + return np.asarray(_q_S4[np.random.randint(24)], np.float32) + +# Quaternion slerp. +def q_slerp(p, q, t): + d = np.dot(p, q) + if d < 0.0: + q = -q + d = -d + if d > 0.999: + a = p + t * (q-p) + return a / np.linalg.norm(a) + t0 = np.arccos(d) + tt = t0 * t + st = np.sin(tt) + st0 = np.sin(t0) + s1 = st / st0 + s0 = np.cos(tt) - d*s1 + return s0*p + s1*q + +# Quaterion scale (slerp vs. identity quaternion). +def q_scale(q, scl): + return q_slerp(q_unit(), q, scl) + +# Quaternion product. +def q_mul(p, q): + s1, V1 = p[0], p[1:] + s2, V2 = q[0], q[1:] + s = s1*s2 - np.dot(V1, V2) + V = s1*V2 + s2*V1 + np.cross(V1, V2) + return np.asarray([s, V[0], V[1], V[2]], np.float32) + +# Angular difference between two quaternions in degrees. +def q_angle_deg(p, q): + p = p.detach().cpu().numpy() + q = q.detach().cpu().numpy() + d = np.abs(np.dot(p, q)) + d = min(d, 1.0) + return np.degrees(2.0 * np.arccos(d)) + +# Quaternion product +def q_mul_torch(p, q): + a = p[0]*q[0] - p[1]*q[1] - p[2]*q[2] - p[3]*q[3] + b = p[0]*q[1] + p[1]*q[0] + p[2]*q[3] - p[3]*q[2] + c = p[0]*q[2] + p[2]*q[0] + p[3]*q[1] - p[1]*q[3] + d = p[0]*q[3] + p[3]*q[0] + p[1]*q[2] - p[2]*q[1] + return torch.stack([a, b, c, d]) + +# Convert quaternion to 4x4 rotation matrix. +def q_to_mtx(q): + r0 = torch.stack([1.0-2.0*q[1]**2 - 2.0*q[2]**2, 2.0*q[0]*q[1] - 2.0*q[2]*q[3], 2.0*q[0]*q[2] + 2.0*q[1]*q[3]]) + r1 = torch.stack([2.0*q[0]*q[1] + 2.0*q[2]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[2]**2, 2.0*q[1]*q[2] - 2.0*q[0]*q[3]]) + r2 = torch.stack([2.0*q[0]*q[2] - 2.0*q[1]*q[3], 2.0*q[1]*q[2] + 2.0*q[0]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[1]**2]) + rr = torch.transpose(torch.stack([r0, r1, r2]), 1, 0) + rr = torch.cat([rr, torch.tensor([[0], [0], [0]], dtype=torch.float32).cuda()], dim=1) # Pad right column. 
+ rr = torch.cat([rr, torch.tensor([[0, 0, 0, 1]], dtype=torch.float32).cuda()], dim=0) # Pad bottom row. + return rr + +# Transform vertex positions to clip space +def transform_pos(mtx, pos): + t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx + # (x,y,z) -> (x,y,z,1) + posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1) + return torch.matmul(posw, t_mtx.t())[None, ...] + +def render(glctx, mtx, pos, pos_idx, col, col_idx, resolution: int): + # Setup TF graph for reference. + pos_clip = transform_pos(mtx, pos) + rast_out, _ = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution]) + color , _ = dr.interpolate(col[None, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + return color + +#---------------------------------------------------------------------------- +# Cube pose fitter. +#---------------------------------------------------------------------------- + +def fit_pose(max_iter = 10000, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + lr_base = 0.01, + lr_falloff = 1.0, + nr_base = 1.0, + nr_falloff = 1e-4, + grad_phase_start = 0.5, + resolution = 256, + out_dir = None, + log_fn = None, + mp4save_interval = None, + mp4save_fn = None): + + log_file = None + writer = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(out_dir + '/' + log_fn, 'wt') + if mp4save_interval != 0: + writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M') + else: + mp4save_interval = None + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/cube_p.npz') as f: + pos_idx, pos, col_idx, col = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Some input geometry contains vertex positions in (N, 4) (with v[:,3]==1). Drop + # the last column in that case. + if pos.shape[1] == 4: pos = pos[:, 0:3] + + # Create position/triangle index tensors + pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda() + vtx_pos = torch.from_numpy(pos.astype(np.float32)).cuda() + col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda() + vtx_col = torch.from_numpy(col.astype(np.float32)).cuda() + + glctx = dr.RasterizeGLContext() + + for rep in range(repeats): + pose_target = torch.tensor(q_rnd(), device='cuda') + pose_init = q_rnd() + pose_opt = torch.tensor(pose_init / np.sum(pose_init**2)**0.5, dtype=torch.float32, device='cuda', requires_grad=True) + + loss_best = np.inf + pose_best = pose_opt.detach().clone() + + # Modelview + projection matrix. + mvp = torch.tensor(np.matmul(util.projection(x=0.4), util.translate(0, 0, -3.5)).astype(np.float32), device='cuda') + + # Adam optimizer for texture with a learning rate ramp. + optimizer = torch.optim.Adam([pose_opt], betas=(0.9, 0.999), lr=lr_base) + # Render. + for it in range(max_iter + 1): + # Set learning rate. + itf = 1.0 * it / max_iter + nr = nr_base * nr_falloff**itf + lr = lr_base * lr_falloff**itf + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # Noise input. + if itf >= grad_phase_start: + noise = q_unit() + else: + noise = q_scale(q_rnd(), nr) + noise = q_mul(noise, q_rnd_S4()) # Orientation noise. + + # Render. 
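The quaternion helpers above (q_mul, q_mul_torch, q_to_mtx) lean on the standard fact that the quaternion product corresponds to composition of the rotations it encodes. A quick self-contained NumPy check of that property, using the scalar-first [w, x, y, z] layout of q_mul and its own conversion helper (written here only for illustration, not imported from the sample and not identical to q_to_mtx's indexing):

```python
import numpy as np

def quat_mul(p, q):
    # Hamilton product for scalar-first quaternions [w, x, y, z].
    w1, v1 = p[0], p[1:]
    w2, v2 = q[0], q[1:]
    return np.concatenate([[w1 * w2 - np.dot(v1, v2)],
                           w1 * v2 + w2 * v1 + np.cross(v1, v2)])

def quat_to_rot(q):
    # 3x3 rotation matrix of a unit quaternion [w, x, y, z].
    w, x, y, z = q
    return np.array([
        [1 - 2*(y*y + z*z),     2*(x*y - w*z),     2*(x*z + w*y)],
        [    2*(x*y + w*z), 1 - 2*(x*x + z*z),     2*(y*z - w*x)],
        [    2*(x*z - w*y),     2*(y*z + w*x), 1 - 2*(x*x + y*y)]])

rng = np.random.default_rng(0)
p = rng.normal(size=4); p /= np.linalg.norm(p)
q = rng.normal(size=4); q /= np.linalg.norm(q)

# Rotating by p*q equals rotating by q and then by p.
print(np.allclose(quat_to_rot(quat_mul(p, q)), quat_to_rot(p) @ quat_to_rot(q)))  # True
```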
+ color = render(glctx, torch.matmul(mvp, q_to_mtx(pose_target)), vtx_pos, pos_idx, vtx_col, col_idx, resolution) + pose_total_opt = q_mul_torch(pose_opt, noise) + mtx_total_opt = torch.matmul(mvp, q_to_mtx(pose_total_opt)) + color_opt = render(glctx, mtx_total_opt, vtx_pos, pos_idx, vtx_col, col_idx, resolution) + + # Image-space loss. + diff = (color_opt - color)**2 # L2 norm. + diff = torch.tanh(5.0 * torch.max(diff, dim=-1)[0]) + loss = torch.mean(diff) + + # Measure image-space loss and update best found pose. + loss_val = float(loss) + if (loss_val < loss_best) and (loss_val > 0.0): + pose_best = pose_total_opt.detach().clone() + loss_best = loss_val + if itf < grad_phase_start: + with torch.no_grad(): pose_opt[:] = pose_best + + # Print/save log. + if log_interval and (it % log_interval == 0): + err = q_angle_deg(pose_opt, pose_target) + ebest = q_angle_deg(pose_best, pose_target) + s = "rep=%d,iter=%d,err=%f,err_best=%f,loss=%f,loss_best=%f,lr=%f,nr=%f" % (rep, it, err, ebest, loss_val, loss_best, lr, nr) + print(s) + if log_file: + log_file.write(s + "\n") + + # Run gradient training step. + if itf >= grad_phase_start: + optimizer.zero_grad() + loss.backward() + optimizer.step() + + with torch.no_grad(): + pose_opt /= torch.sum(pose_opt**2)**0.5 + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_mp4 = mp4save_interval and (it % mp4save_interval == 0) + + if display_image or save_mp4: + c = color[0].detach().cpu().numpy() + img_ref = color[0].detach().cpu().numpy() + img_opt = color_opt[0].detach().cpu().numpy() + img_best = render(glctx, torch.matmul(mvp, q_to_mtx(pose_best)), vtx_pos, pos_idx, vtx_col, col_idx, resolution)[0].detach().cpu().numpy() + result_image = np.concatenate([img_ref, img_best, img_opt], axis=1) + + if display_image: + util.display_image(result_image, size=display_res, title='(%d) %d / %d' % (rep, it, max_iter)) + if save_mp4: + writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8)) + + # Done. + if writer is not None: + writer.close() + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Cube pose fitting example') + parser.add_argument('--outdir', help='Specify output directory', default='') + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--mp4save-interval', type=int, default=10) + parser.add_argument('--max-iter', type=int, default=1000) + parser.add_argument('--repeats', type=int, default=1) + args = parser.parse_args() + + # Set up logging. + if args.outdir: + out_dir = f'{args.outdir}/pose' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. + fit_pose( + max_iter=args.max_iter, + repeats=args.repeats, + log_interval=100, + display_interval=args.display_interval, + out_dir=out_dir, + log_fn='log.txt', + mp4save_interval=args.mp4save_interval, + mp4save_fn='progress.mp4' + ) + + # Done. 
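Two details of the pose-fitting loop above are worth calling out: the per-pixel squared error is squashed with tanh(5·x), so pixels that are completely wrong saturate near 1 instead of dominating the mean, and gradient steps only begin once itf reaches grad_phase_start, with earlier iterations doing a random search driven by quaternion noise while tracking the best pose found. A tiny illustration of the saturation behaviour (the error values are made up):

```python
import torch

# Max-over-channels squared errors for a few hypothetical pixels.
err = torch.tensor([0.0, 0.05, 0.2, 1.0, 4.0])
print(torch.tanh(5.0 * err))
# tensor([0.0000, 0.2449, 0.7616, 0.9999, 1.0000])
```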
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/samples/torch/triangle.py b/pose_estimation/nvdiffrast/samples/torch/triangle.py new file mode 100755 index 0000000000000000000000000000000000000000..f4e74581cf865b39321d8fd2e266e33b55643fcd --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/triangle.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import imageio +import numpy as np +import torch +import nvdiffrast.torch as dr + +def tensor(*args, **kwargs): + return torch.tensor(*args, device='cuda', **kwargs) + +pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32) +col = tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32) +tri = tensor([[0, 1, 2]], dtype=torch.int32) + +glctx = dr.RasterizeGLContext() +rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256]) +out, _ = dr.interpolate(col, rast, tri) + +img = out.cpu().numpy()[0, ::-1, :, :] # Flip vertically. +img = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8) # Quantize to np.uint8 + +print("Saving to 'tri.png'.") +imageio.imsave('tri.png', img) diff --git a/pose_estimation/nvdiffrast/samples/torch/util.py b/pose_estimation/nvdiffrast/samples/torch/util.py new file mode 100755 index 0000000000000000000000000000000000000000..240c517b96cec78c836cd836aa1387ca492026e9 --- /dev/null +++ b/pose_estimation/nvdiffrast/samples/torch/util.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import torch + +#---------------------------------------------------------------------------- +# Projection and transformation matrix helpers. 
+#---------------------------------------------------------------------------- + +def projection(x=0.1, n=1.0, f=50.0): + return np.array([[n/x, 0, 0, 0], + [ 0, n/-x, 0, 0], + [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)], + [ 0, 0, -1, 0]]).astype(np.float32) + +def translate(x, y, z): + return np.array([[1, 0, 0, x], + [0, 1, 0, y], + [0, 0, 1, z], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_x(a): + s, c = np.sin(a), np.cos(a) + return np.array([[1, 0, 0, 0], + [0, c, s, 0], + [0, -s, c, 0], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_y(a): + s, c = np.sin(a), np.cos(a) + return np.array([[ c, 0, s, 0], + [ 0, 1, 0, 0], + [-s, 0, c, 0], + [ 0, 0, 0, 1]]).astype(np.float32) + +def random_rotation_translation(t): + m = np.random.normal(size=[3, 3]) + m[1] = np.cross(m[0], m[2]) + m[2] = np.cross(m[0], m[1]) + m = m / np.linalg.norm(m, axis=1, keepdims=True) + m = np.pad(m, [[0, 1], [0, 1]], mode='constant') + m[3, 3] = 1.0 + m[:3, 3] = np.random.uniform(-t, t, size=[3]) + return m + +#---------------------------------------------------------------------------- +# Bilinear downsample by 2x. +#---------------------------------------------------------------------------- + +def bilinear_downsample(x): + w = torch.tensor([[1, 3, 3, 1], [3, 9, 9, 3], [3, 9, 9, 3], [1, 3, 3, 1]], dtype=torch.float32, device=x.device) / 64.0 + w = w.expand(x.shape[-1], 1, 4, 4) + x = torch.nn.functional.conv2d(x.permute(0, 3, 1, 2), w, padding=1, stride=2, groups=x.shape[-1]) + return x.permute(0, 2, 3, 1) + +#---------------------------------------------------------------------------- +# Image display function using OpenGL. +#---------------------------------------------------------------------------- + +_glfw_window = None +def display_image(image, zoom=None, size=None, title=None): # HWC + # Import OpenGL and glfw. + import OpenGL.GL as gl + import glfw + + # Zoom image if requested. + image = np.asarray(image) + if size is not None: + assert zoom is None + zoom = max(1, size // image.shape[0]) + if zoom is not None: + image = image.repeat(zoom, axis=0).repeat(zoom, axis=1) + height, width, channels = image.shape + + # Initialize window. + if title is None: + title = 'Debug window' + global _glfw_window + if _glfw_window is None: + glfw.init() + _glfw_window = glfw.create_window(width, height, title, None, None) + glfw.make_context_current(_glfw_window) + glfw.show_window(_glfw_window) + glfw.swap_interval(0) + else: + glfw.make_context_current(_glfw_window) + glfw.set_window_title(_glfw_window, title) + glfw.set_window_size(_glfw_window, width, height) + + # Update window. + glfw.poll_events() + gl.glClearColor(0, 0, 0, 1) + gl.glClear(gl.GL_COLOR_BUFFER_BIT) + gl.glWindowPos2f(0, 0) + gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1) + gl_format = {3: gl.GL_RGB, 2: gl.GL_RG, 1: gl.GL_LUMINANCE}[channels] + gl_dtype = {'uint8': gl.GL_UNSIGNED_BYTE, 'float32': gl.GL_FLOAT}[image.dtype.name] + gl.glDrawPixels(width, height, gl_format, gl_dtype, image[::-1]) + glfw.swap_buffers(_glfw_window) + if glfw.window_should_close(_glfw_window): + return False + return True + +#---------------------------------------------------------------------------- +# Image save helper. 
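The bilinear_downsample helper above filters with a separable [1, 3, 3, 1] binomial kernel (outer product, normalized by 64) at stride 2, so each call halves the spatial resolution; earth.py applies it repeatedly to bring the high-resolution reference down to the training resolution. A quick shape check with NHWC tensors, mirroring the helper (the random input is illustrative only):

```python
import torch

def bilinear_downsample(x):
    # Same idea as util.bilinear_downsample: separable [1, 3, 3, 1] kernel, stride 2, NHWC in/out.
    w = torch.tensor([[1, 3, 3, 1], [3, 9, 9, 3], [3, 9, 9, 3], [1, 3, 3, 1]], dtype=torch.float32) / 64.0
    w = w.expand(x.shape[-1], 1, 4, 4)
    y = torch.nn.functional.conv2d(x.permute(0, 3, 1, 2), w, padding=1, stride=2, groups=x.shape[-1])
    return y.permute(0, 2, 3, 1)

img = torch.rand(1, 512, 512, 3)      # NHWC
once = bilinear_downsample(img)
twice = bilinear_downsample(once)
print(once.shape, twice.shape)        # (1, 256, 256, 3) then (1, 128, 128, 3)
```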
+#---------------------------------------------------------------------------- + +def save_image(fn, x): + import imageio + x = np.rint(x * 255.0) + x = np.clip(x, 0, 255).astype(np.uint8) + imageio.imsave(fn, x) + +#---------------------------------------------------------------------------- diff --git a/pose_estimation/nvdiffrast/setup.py b/pose_estimation/nvdiffrast/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..889c87521642305e62815f32d6a42c7b307852a0 --- /dev/null +++ b/pose_estimation/nvdiffrast/setup.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from pose_estimation.nvdiffrast import nvdiffrast +import setuptools +import os + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="nvdiffrast", + version=nvdiffrast.__version__, + author="Samuli Laine", + author_email="slaine@nvidia.com", + description="nvdiffrast - modular primitives for high-performance differentiable rendering", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/NVlabs/nvdiffrast", + packages=setuptools.find_packages(), + package_data={ + 'nvdiffrast': [ + 'common/*.h', + 'common/*.inl', + 'common/*.cu', + 'common/*.cpp', + 'lib/*.h', + 'torch/*.h', + 'torch/*.inl', + 'torch/*.cpp', + 'tensorflow/*.cu', + ] + (['lib/*.lib'] if os.name == 'nt' else []) + }, + include_package_data=True, + install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container + classifiers=[ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/pose_estimation/options/__init__.py b/pose_estimation/options/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e7eedebe54aa70169fd25951b3034d819e396c90 --- /dev/null +++ b/pose_estimation/options/__init__.py @@ -0,0 +1 @@ +"""This package options includes option modules: training options, test options, and basic options (used in both training and test).""" diff --git a/pose_estimation/options/base_options.py b/pose_estimation/options/base_options.py new file mode 100755 index 0000000000000000000000000000000000000000..67375d0801197b864c613a7fb0e44e93d3fd991f --- /dev/null +++ b/pose_estimation/options/base_options.py @@ -0,0 +1,169 @@ +"""This script contains base options for Deep3DFaceRecon_pytorch +""" + +import argparse +import os +from util import util +import numpy as np +import torch +import models +import data + + +class BaseOptions(): + """This class defines options used during both training and test time. + + It also implements several helper functions such as parsing, printing, and saving the options. + It also gathers additional options defined in functions in both dataset class and model class. 
+ """ + + def __init__(self, cmd_line=None): + """Reset the class; indicates the class hasn't been initailized""" + self.initialized = False + self.cmd_line = None + if cmd_line is not None: + self.cmd_line = cmd_line.split() + + def initialize(self, parser): + """Define the common options that are used in both training and test.""" + # basic parameters + parser.add_argument('--name', type=str, default='face_recon', help='name of the experiment. It decides where to store samples and models') + parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU') + parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints', help='models are saved here') + parser.add_argument('--vis_batch_nums', type=float, default=1, help='batch nums of images for visulization') + parser.add_argument('--eval_batch_nums', type=float, default=float('inf'), help='batch nums of images for evaluation') + parser.add_argument('--use_ddp', type=util.str2bool, nargs='?', const=True, default=True, help='whether use distributed data parallel') + parser.add_argument('--ddp_port', type=str, default='12355', help='ddp port') + parser.add_argument('--display_per_batch', type=util.str2bool, nargs='?', const=True, default=True, help='whether use batch to show losses') + parser.add_argument('--add_image', type=util.str2bool, nargs='?', const=True, default=True, help='whether add image to tensorboard') + parser.add_argument('--world_size', type=int, default=1, help='batch nums of images for evaluation') + + # model parameters + parser.add_argument('--model', type=str, default='facerecon', help='chooses which model to use.') + + # additional parameters + parser.add_argument('--epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model') + parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information') + parser.add_argument('--suffix', default='', type=str, help='customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}') + + self.initialized = True + return parser + + def gather_options(self): + """Initialize our parser with basic options(only once). + Add additional model-specific and dataset-specific options. + These options are defined in the function + in model and dataset classes. 
+ """ + if not self.initialized: # check if it has been initialized + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = self.initialize(parser) + + # get the basic options + if self.cmd_line is None: + opt, _ = parser.parse_known_args() + else: + opt, _ = parser.parse_known_args(self.cmd_line) + + # set cuda visible devices + os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_ids + + # modify model-related parser options + model_name = opt.model + model_option_setter = models.get_option_setter(model_name) + parser = model_option_setter(parser, self.isTrain) + if self.cmd_line is None: + opt, _ = parser.parse_known_args() # parse again with new defaults + else: + opt, _ = parser.parse_known_args(self.cmd_line) # parse again with new defaults + + # modify dataset-related parser options + if opt.dataset_mode: + dataset_name = opt.dataset_mode + dataset_option_setter = data.get_option_setter(dataset_name) + parser = dataset_option_setter(parser, self.isTrain) + + # save and return the parser + self.parser = parser + if self.cmd_line is None: + return parser.parse_args() + else: + return parser.parse_args(self.cmd_line) + + def print_options(self, opt): + """Print and save options + + It will print both current options and default values(if different). + It will save options into a text file / [checkpoints_dir] / opt.txt + """ + message = '' + message += '----------------- Options ---------------\n' + for k, v in sorted(vars(opt).items()): + comment = '' + default = self.parser.get_default(k) + if v != default: + comment = '\t[default: %s]' % str(default) + message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) + message += '----------------- End -------------------' + print(message) + + # save to the disk + expr_dir = os.path.join(opt.checkpoints_dir, opt.name) + util.mkdirs(expr_dir) + file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase)) + try: + with open(file_name, 'wt') as opt_file: + opt_file.write(message) + opt_file.write('\n') + except PermissionError as error: + print("permission error {}".format(error)) + pass + + def parse(self): + """Parse our options, create checkpoints directory suffix, and set up gpu device.""" + opt = self.gather_options() + opt.isTrain = self.isTrain # train or test + + # process opt.suffix + if opt.suffix: + suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' + opt.name = opt.name + suffix + + + # set gpu ids + str_ids = opt.gpu_ids.split(',') + gpu_ids = [] + for str_id in str_ids: + id = int(str_id) + if id >= 0: + gpu_ids.append(id) + opt.world_size = len(gpu_ids) + # if len(opt.gpu_ids) > 0: + # torch.cuda.set_device(gpu_ids[0]) + if opt.world_size == 1: + opt.use_ddp = False + + if opt.phase != 'test': + # set continue_train automatically + if opt.pretrained_name is None: + model_dir = os.path.join(opt.checkpoints_dir, opt.name) + else: + model_dir = os.path.join(opt.checkpoints_dir, opt.pretrained_name) + if os.path.isdir(model_dir): + model_pths = [i for i in os.listdir(model_dir) if i.endswith('pth')] + if os.path.isdir(model_dir) and len(model_pths) != 0: + opt.continue_train= True + + # update the latest epoch count + if opt.continue_train: + if opt.epoch == 'latest': + epoch_counts = [int(i.split('.')[0].split('_')[-1]) for i in model_pths if 'latest' not in i] + if len(epoch_counts) != 0: + opt.epoch_count = max(epoch_counts) + 1 + else: + opt.epoch_count = int(opt.epoch) + 1 + + + self.print_options(opt) + self.opt = opt + return self.opt diff --git 
a/pose_estimation/options/test_options.py b/pose_estimation/options/test_options.py new file mode 100755 index 0000000000000000000000000000000000000000..d06de6443c72b47591fdf65f54df4b2580f4ea3f --- /dev/null +++ b/pose_estimation/options/test_options.py @@ -0,0 +1,23 @@ +"""This script contains the test options for Deep3DFaceRecon_pytorch +""" + +from .base_options import BaseOptions + + +class TestOptions(BaseOptions): + """This class includes test options. + + It also includes shared options defined in BaseOptions. + """ + + def initialize(self, parser): + parser = BaseOptions.initialize(self, parser) # define shared options + parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') + parser.add_argument('--dataset_mode', type=str, default=None, help='chooses how datasets are loaded. [None | flist]') + parser.add_argument('--img_folder', type=str, default='examples', help='folder for test images.') + parser.add_argument('--start', type=int, default=0, help='start folder') + parser.add_argument('--skip_model', action='store_true', help='whether to run model') + + # Dropout and Batchnorm has different behavior during training and test. + self.isTrain = False + return parser diff --git a/pose_estimation/options/train_options.py b/pose_estimation/options/train_options.py new file mode 100755 index 0000000000000000000000000000000000000000..1337bfdd5f372b5c686a91b394a2aadbe5741f44 --- /dev/null +++ b/pose_estimation/options/train_options.py @@ -0,0 +1,53 @@ +"""This script contains the training options for Deep3DFaceRecon_pytorch +""" + +from .base_options import BaseOptions +from util import util + +class TrainOptions(BaseOptions): + """This class includes training options. + + It also includes shared options defined in BaseOptions. + """ + + def initialize(self, parser): + parser = BaseOptions.initialize(self, parser) + # dataset parameters + # for train + parser.add_argument('--data_root', type=str, default='./', help='dataset root') + parser.add_argument('--flist', type=str, default='datalist/train/masks.txt', help='list of mask names of training set') + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--dataset_mode', type=str, default='flist', help='chooses how datasets are loaded. [None | flist]') + parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly') + parser.add_argument('--num_threads', default=4, type=int, help='# threads for loading data') + parser.add_argument('--max_dataset_size', type=int, default=float("inf"), help='Maximum number of samples allowed per dataset. 
If the dataset directory contains more than max_dataset_size, only a subset is loaded.') + parser.add_argument('--preprocess', type=str, default='shift_scale_rot_flip', help='scaling and cropping of images at load time [shift_scale_rot_flip | shift_scale | shift | shift_rot_flip ]') + parser.add_argument('--use_aug', type=util.str2bool, nargs='?', const=True, default=True, help='whether use data augmentation') + + # for val + parser.add_argument('--flist_val', type=str, default='datalist/val/masks.txt', help='list of mask names of val set') + parser.add_argument('--batch_size_val', type=int, default=32) + + + # visualization parameters + parser.add_argument('--display_freq', type=int, default=1000, help='frequency of showing training results on screen') + parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') + + # network saving and loading parameters + parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results') + parser.add_argument('--save_epoch_freq', type=int, default=1, help='frequency of saving checkpoints at the end of epochs') + parser.add_argument('--evaluation_freq', type=int, default=5000, help='evaluation freq') + parser.add_argument('--save_by_iter', action='store_true', help='whether saves model by iteration') + parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') + parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count, we save the model by , +, ...') + parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') + parser.add_argument('--pretrained_name', type=str, default=None, help='resume training from another checkpoint') + + # training parameters + parser.add_argument('--n_epochs', type=int, default=20, help='number of epochs with the initial learning rate') + parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate for adam') + parser.add_argument('--lr_policy', type=str, default='step', help='learning rate policy. [linear | step | plateau | cosine]') + parser.add_argument('--lr_decay_epochs', type=int, default=10, help='multiply by a gamma every lr_decay_epochs epoches') + + self.isTrain = True + return parser diff --git a/pose_estimation/process_test_images.py b/pose_estimation/process_test_images.py new file mode 100644 index 0000000000000000000000000000000000000000..cd903278bd7c078a1aaaa4de09979fd8810f0184 --- /dev/null +++ b/pose_estimation/process_test_images.py @@ -0,0 +1,48 @@ +""" + Processes a directory containing *.jpg/png and outputs crops and poses. +""" +import glob +import os +import subprocess +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--input_dir', default='/media/data6/ericryanchan/mafu/Deep3DFaceRecon_pytorch/test_images') +parser.add_argument('--gpu', default=0) +args = parser.parse_args() + +# print('Processing images:', sorted(glob.glob(os.path.join(args.input_dir, "*")))) + +# Compute facial landmarks. +print("Computing facial landmarks for model...") +cmd = "python batch_mtcnn.py" +input_flag = " --in_root " + args.input_dir +cmd += input_flag +subprocess.run([cmd], shell=True, check=True) + + +# Run model inference to produce crops and raw poses. 
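For reference, the test.py invocation that the script assembles below by string concatenation can also be expressed as an argument list, which sidesteps shell quoting; the flag names mirror the ones used in this file, while the helper name and call pattern are only an illustrative sketch:

```python
import subprocess

def run_inference(input_dir: str, gpu: int = 0) -> None:
    # Equivalent to the string-built command below, passed as an argv list instead.
    subprocess.run(
        ["python", "test.py",
         f"--img_folder={input_dir}",
         f"--gpu_ids={gpu}",
         "--name=pretrained",
         "--epoch=20"],
        check=True,
    )

# Usage (hypothetical): run_inference(args.input_dir, args.gpu)
```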
+print("Running model inference...") +cmd = "python test.py" +input_flag = " --img_folder=" + args.input_dir +gpu_flag = " --gpu_ids=" + str(args.gpu) +model_name_flag = " --name=pretrained" +model_file_flag = " --epoch=20 " +cmd += input_flag + gpu_flag + model_name_flag + model_file_flag +subprocess.run([cmd], shell=True, check=True) + +# Perform final cropping of 1024x1024 images. +print("Processing final crops...") +cmd = "python crop_images.py" +input_flag = " --indir " + args.input_dir +output_flag = " --outdir " + os.path.join(args.input_dir, 'cropped_images') +cmd += input_flag + output_flag +subprocess.run([cmd], shell=True, check=True) + +# Process poses into our representation -- produces a cameras.json file. +print("Processing final poses...") +cmd = "python 3dface2idr.py" +input_flag = " --in_root " + os.path.join(args.input_dir, "epoch_20_000000") +output_flag = " --out_root " + os.path.join(args.input_dir, "cropped_images") + +cmd += input_flag + output_flag +subprocess.run([cmd], shell=True, check=True) \ No newline at end of file diff --git a/pose_estimation/test.py b/pose_estimation/test.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f87a907dd10c68de36427cec540ca8eaff6a08 --- /dev/null +++ b/pose_estimation/test.py @@ -0,0 +1,108 @@ +"""This script is the test script for Deep3DFaceRecon_pytorch +""" + +import os +from options.test_options import TestOptions +from models import create_model +from util.visualizer import MyVisualizer +from util.preprocess import align_img +from PIL import Image +import numpy as np +from util.load_mats import load_lm3d +import torch +import json + +def get_data_path(root='examples'): + im_path = [os.path.join(root, i) for i in sorted(os.listdir(root)) if i.endswith('png') or i.endswith('jpg')] + lm_path = [i.replace('png', 'txt').replace('jpg', 'txt') for i in im_path] + lm_path = [os.path.join(i.replace(i.split(os.path.sep)[-1],''),'detections',i.split(os.path.sep)[-1]) for i in lm_path] + return im_path, lm_path + +def read_data(im_path, lm_path, lm3d_std, to_tensor=True, rescale_factor=466.285): + im = Image.open(im_path).convert('RGB') + _, H = im.size + lm = np.loadtxt(lm_path).astype(np.float32) + lm = lm.reshape([-1, 2]) + lm[:, -1] = H - 1 - lm[:, -1] + _, im_pil, lm, _, im_high = align_img(im, lm, lm3d_std, rescale_factor=rescale_factor) + if to_tensor: + im = torch.tensor(np.array(im_pil)/255., dtype=torch.float32).permute(2, 0, 1).unsqueeze(0) + lm = torch.tensor(lm).unsqueeze(0) + else: + im = im_pil + return im, lm, im_pil, im_high + +def main(rank, opt, name='examples'): + device = torch.device(rank) + torch.cuda.set_device(device) + model = create_model(opt) + model.setup(opt) + model.device = device + model.parallelize() + model.eval() + visualizer = MyVisualizer(opt) + print("ROOT") + print(name) + im_path, lm_path = get_data_path(name) + lm3d_std = load_lm3d(opt.bfm_folder) + + cropping_params = {} + + out_dir_crop1024 = os.path.join(name, "crop_1024") + if not os.path.exists(out_dir_crop1024): + os.makedirs(out_dir_crop1024) + out_dir = os.path.join(name, 'epoch_%s_%06d'%(opt.epoch, 0)) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + for i in range(len(im_path)): + print(i, im_path[i]) + img_name = im_path[i].split(os.path.sep)[-1].replace('.png','').replace('.jpg','') + if not os.path.isfile(lm_path[i]): + continue + + # 2 passes for cropping image for NeRF and for pose extraction + for r in range(2): + if r==0: + rescale_factor = 300 # optimized for NeRF training + center_crop_size = 
700 + output_size = 512 + + # left = int(im_high.size[0]/2 - center_crop_size/2) + # upper = int(im_high.size[1]/2 - center_crop_size/2) + # right = left + center_crop_size + # lower = upper + center_crop_size + # im_cropped = im_high.crop((left, upper, right,lower)) + # im_cropped = im_cropped.resize((output_size, output_size), resample=Image.LANCZOS) + cropping_params[os.path.basename(im_path[i])] = { + 'lm': np.loadtxt(lm_path[i]).astype(np.float32).tolist(), + 'lm3d_std': lm3d_std.tolist(), + 'rescale_factor': rescale_factor, + 'center_crop_size': center_crop_size, + 'output_size': output_size} + + # im_high.save(os.path.join(out_dir_crop1024, img_name+'.png'), compress_level=0) + # im_cropped.save(os.path.join(out_dir_crop1024, img_name+'.png'), compress_level=0) + elif not opt.skip_model: + rescale_factor = 466.285 + im_tensor, lm_tensor, _, im_high = read_data(im_path[i], lm_path[i], lm3d_std, rescale_factor=rescale_factor) + + data = { + 'imgs': im_tensor, + 'lms': lm_tensor + } + model.set_input(data) # unpack data from data loader + model.test() # run inference + # visuals = model.get_current_visuals() # get image results + # visualizer.display_current_results(visuals, 0, opt.epoch, dataset=name.split(os.path.sep)[-1], + # save_results=True, count=i, name=img_name, add_image=False) + # import pdb; pdb.set_trace() + model.save_mesh(os.path.join(out_dir,img_name+'.obj')) + model.save_coeff(os.path.join(out_dir,img_name+'.mat')) # save predicted coefficients + + with open(os.path.join(name, 'cropping_params.json'), 'w') as outfile: + json.dump(cropping_params, outfile, indent=4) + +if __name__ == '__main__': + opt = TestOptions().parse() # get test options + main(0, opt,opt.img_folder) + diff --git a/pose_estimation/util/BBRegressorParam_r.mat b/pose_estimation/util/BBRegressorParam_r.mat new file mode 100755 index 0000000000000000000000000000000000000000..a0da99af145c400a5216d9f6fb251d9412565921 --- /dev/null +++ b/pose_estimation/util/BBRegressorParam_r.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a5a07b8ce75a39d96b918dc0fc6e110a72e090da16f5f056a0ef7bfbc3f4560 +size 22019 diff --git a/pose_estimation/util/__init__.py b/pose_estimation/util/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..718f8f67264ef6124c9628dba97f4e556d77e435 --- /dev/null +++ b/pose_estimation/util/__init__.py @@ -0,0 +1,2 @@ +"""This package includes a miscellaneous collection of useful helper functions.""" +from util import * diff --git a/pose_estimation/util/detect_lm68.py b/pose_estimation/util/detect_lm68.py new file mode 100755 index 0000000000000000000000000000000000000000..b7e40997289e17405e1fb6c408d21adce7b626ce --- /dev/null +++ b/pose_estimation/util/detect_lm68.py @@ -0,0 +1,106 @@ +import os +import cv2 +import numpy as np +from scipy.io import loadmat +import tensorflow as tf +from util.preprocess import align_for_lm +from shutil import move + +mean_face = np.loadtxt('util/test_mean_face.txt') +mean_face = mean_face.reshape([68, 2]) + +def save_label(labels, save_path): + np.savetxt(save_path, labels) + +def draw_landmarks(img, landmark, save_name): + landmark = landmark + lm_img = np.zeros([img.shape[0], img.shape[1], 3]) + lm_img[:] = img.astype(np.float32) + landmark = np.round(landmark).astype(np.int32) + + for i in range(len(landmark)): + for j in range(-1, 1): + for k in range(-1, 1): + if img.shape[0] - 1 - landmark[i, 1]+j > 0 and \ + img.shape[0] - 1 - landmark[i, 1]+j < img.shape[0] and \ + landmark[i, 0]+k > 0 and \ + 
landmark[i, 0]+k < img.shape[1]: + lm_img[img.shape[0] - 1 - landmark[i, 1]+j, landmark[i, 0]+k, + :] = np.array([0, 0, 255]) + lm_img = lm_img.astype(np.uint8) + + cv2.imwrite(save_name, lm_img) + + +def load_data(img_name, txt_name): + return cv2.imread(img_name), np.loadtxt(txt_name) + +# create tensorflow graph for landmark detector +def load_lm_graph(graph_filename): + with tf.gfile.GFile(graph_filename, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='net') + img_224 = graph.get_tensor_by_name('net/input_imgs:0') + output_lm = graph.get_tensor_by_name('net/lm:0') + lm_sess = tf.Session(graph=graph) + + return lm_sess,img_224,output_lm + +# landmark detection +def detect_68p(img_path,sess,input_op,output_op): + print('detecting landmarks......') + names = [i for i in sorted(os.listdir( + img_path)) if 'jpg' in i or 'png' in i or 'jpeg' in i or 'PNG' in i] + vis_path = os.path.join(img_path, 'vis') + remove_path = os.path.join(img_path, 'remove') + save_path = os.path.join(img_path, 'landmarks') + if not os.path.isdir(vis_path): + os.makedirs(vis_path) + if not os.path.isdir(remove_path): + os.makedirs(remove_path) + if not os.path.isdir(save_path): + os.makedirs(save_path) + + for i in range(0, len(names)): + name = names[i] + print('%05d' % (i), ' ', name) + full_image_name = os.path.join(img_path, name) + txt_name = '.'.join(name.split('.')[:-1]) + '.txt' + full_txt_name = os.path.join(img_path, 'detections', txt_name) # 5 facial landmark path for each image + + # if an image does not have detected 5 facial landmarks, remove it from the training list + if not os.path.isfile(full_txt_name): + move(full_image_name, os.path.join(remove_path, name)) + continue + + # load data + img, five_points = load_data(full_image_name, full_txt_name) + input_img, scale, bbox = align_for_lm(img, five_points) # align for 68 landmark detection + + # if the alignment fails, remove corresponding image from the training list + if scale == 0: + move(full_txt_name, os.path.join( + remove_path, txt_name)) + move(full_image_name, os.path.join(remove_path, name)) + continue + + # detect landmarks + input_img = np.reshape( + input_img, [1, 224, 224, 3]).astype(np.float32) + landmark = sess.run( + output_op, feed_dict={input_op: input_img}) + + # transform back to original image coordinate + landmark = landmark.reshape([68, 2]) + mean_face + landmark[:, 1] = 223 - landmark[:, 1] + landmark = landmark / scale + landmark[:, 0] = landmark[:, 0] + bbox[0] + landmark[:, 1] = landmark[:, 1] + bbox[1] + landmark[:, 1] = img.shape[0] - 1 - landmark[:, 1] + + if i % 100 == 0: + draw_landmarks(img, landmark, os.path.join(vis_path, name)) + save_label(landmark, os.path.join(save_path, txt_name)) diff --git a/pose_estimation/util/generate_list.py b/pose_estimation/util/generate_list.py new file mode 100755 index 0000000000000000000000000000000000000000..943d906781063c3584a7e5b5c784f8aac0694985 --- /dev/null +++ b/pose_estimation/util/generate_list.py @@ -0,0 +1,34 @@ +"""This script is to generate training list files for Deep3DFaceRecon_pytorch +""" + +import os + +# save path to training data +def write_list(lms_list, imgs_list, msks_list, mode='train',save_folder='datalist', save_name=''): + save_path = os.path.join(save_folder, mode) + if not os.path.isdir(save_path): + os.makedirs(save_path) + with open(os.path.join(save_path, save_name + 'landmarks.txt'), 'w') as fd: + fd.writelines([i + '\n' for i in 
lms_list]) + + with open(os.path.join(save_path, save_name + 'images.txt'), 'w') as fd: + fd.writelines([i + '\n' for i in imgs_list]) + + with open(os.path.join(save_path, save_name + 'masks.txt'), 'w') as fd: + fd.writelines([i + '\n' for i in msks_list]) + +# check if the path is valid +def check_list(rlms_list, rimgs_list, rmsks_list): + lms_list, imgs_list, msks_list = [], [], [] + for i in range(len(rlms_list)): + flag = 'false' + lm_path = rlms_list[i] + im_path = rimgs_list[i] + msk_path = rmsks_list[i] + if os.path.isfile(lm_path) and os.path.isfile(im_path) and os.path.isfile(msk_path): + flag = 'true' + lms_list.append(rlms_list[i]) + imgs_list.append(rimgs_list[i]) + msks_list.append(rmsks_list[i]) + print(i, rlms_list[i], flag) + return lms_list, imgs_list, msks_list diff --git a/pose_estimation/util/html.py b/pose_estimation/util/html.py new file mode 100755 index 0000000000000000000000000000000000000000..cc3262a1eafda34842e4dbad47bb6ba72f0c5a68 --- /dev/null +++ b/pose_estimation/util/html.py @@ -0,0 +1,86 @@ +import dominate +from dominate.tags import meta, h3, table, tr, td, p, a, img, br +import os + + +class HTML: + """This HTML class allows us to save images and write texts into a single HTML file. + + It consists of functions such as (add a text header to the HTML file), + (add a row of images to the HTML file), and (save the HTML to the disk). + It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API. + """ + + def __init__(self, web_dir, title, refresh=0): + """Initialize the HTML classes + + Parameters: + web_dir (str) -- a directory that stores the webpage. HTML file will be created at /index.html; images will be saved at 0: + with self.doc.head: + meta(http_equiv="refresh", content=str(refresh)) + + def get_image_dir(self): + """Return the directory that stores images""" + return self.img_dir + + def add_header(self, text): + """Insert a header to the HTML file + + Parameters: + text (str) -- the header text + """ + with self.doc: + h3(text) + + def add_images(self, ims, txts, links, width=400): + """add images to the HTML file + + Parameters: + ims (str list) -- a list of image paths + txts (str list) -- a list of image names shown on the website + links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page + """ + self.t = table(border=1, style="table-layout: fixed;") # Insert a table + self.doc.add(self.t) + with self.t: + with tr(): + for im, txt, link in zip(ims, txts, links): + with td(style="word-wrap: break-word;", halign="center", valign="top"): + with p(): + with a(href=os.path.join('images', link)): + img(style="width:%dpx" % width, src=os.path.join('images', im)) + br() + p(txt) + + def save(self): + """save the current content to the HMTL file""" + html_file = '%s/index.html' % self.web_dir + f = open(html_file, 'wt') + f.write(self.doc.render()) + f.close() + + +if __name__ == '__main__': # we show an example usage here. 
+ html = HTML('web/', 'test_html') + html.add_header('hello world') + + ims, txts, links = [], [], [] + for n in range(4): + ims.append('image_%d.png' % n) + txts.append('text_%d' % n) + links.append('image_%d.png' % n) + html.add_images(ims, txts, links) + html.save() diff --git a/pose_estimation/util/load_mats.py b/pose_estimation/util/load_mats.py new file mode 100755 index 0000000000000000000000000000000000000000..5b1f4a73c83035c6313969631eb2ff2b2322df7e --- /dev/null +++ b/pose_estimation/util/load_mats.py @@ -0,0 +1,117 @@ +"""This script is to load 3D face model for Deep3DFaceRecon_pytorch +""" + +import numpy as np +from PIL import Image +from scipy.io import loadmat, savemat +from array import array +import os.path as osp + +# load expression basis +def LoadExpBasis(bfm_folder='BFM'): + n_vertex = 53215 + Expbin = open(osp.join(bfm_folder, 'Exp_Pca.bin'), 'rb') + exp_dim = array('i') + exp_dim.fromfile(Expbin, 1) + expMU = array('f') + expPC = array('f') + expMU.fromfile(Expbin, 3*n_vertex) + expPC.fromfile(Expbin, 3*exp_dim[0]*n_vertex) + Expbin.close() + + expPC = np.array(expPC) + expPC = np.reshape(expPC, [exp_dim[0], -1]) + expPC = np.transpose(expPC) + + expEV = np.loadtxt(osp.join(bfm_folder, 'std_exp.txt')) + + return expPC, expEV + + +# transfer original BFM09 to our face model +def transferBFM09(bfm_folder='BFM'): + print('Transfer BFM09 to BFM_model_front......') + original_BFM = loadmat(osp.join(bfm_folder, '01_MorphableModel.mat')) + shapePC = original_BFM['shapePC'] # shape basis + shapeEV = original_BFM['shapeEV'] # corresponding eigen value + shapeMU = original_BFM['shapeMU'] # mean face + texPC = original_BFM['texPC'] # texture basis + texEV = original_BFM['texEV'] # eigen value + texMU = original_BFM['texMU'] # mean texture + + expPC, expEV = LoadExpBasis() + + # transfer BFM09 to our face model + + idBase = shapePC*np.reshape(shapeEV, [-1, 199]) + idBase = idBase/1e5 # unify the scale to decimeter + idBase = idBase[:, :80] # use only first 80 basis + + exBase = expPC*np.reshape(expEV, [-1, 79]) + exBase = exBase/1e5 # unify the scale to decimeter + exBase = exBase[:, :64] # use only first 64 basis + + texBase = texPC*np.reshape(texEV, [-1, 199]) + texBase = texBase[:, :80] # use only first 80 basis + + # our face model is cropped along face landmarks and contains only 35709 vertex. + # original BFM09 contains 53490 vertex, and expression basis provided by Guo et al. contains 53215 vertex. + # thus we select corresponding vertex to get our face model. 
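The transferBFM09 routine above mainly does two things: it truncates the PCA bases (first 80 identity, 64 expression, and 80 texture components, rescaled to decimeters) and then gathers the vertex subset shared by BFM09 and the expression basis via index arrays. A toy NumPy sketch of that slice / reshape / index pattern (the sizes below are made up and far smaller than the real model's):

```python
import numpy as np

n_vertex_full, n_keep, n_basis, n_basis_keep = 100, 40, 16, 8

full_basis = np.random.randn(3 * n_vertex_full, n_basis)   # (3V, K), like shapePC
keep_idx = np.sort(np.random.choice(n_vertex_full, n_keep, replace=False))

basis = full_basis[:, :n_basis_keep]                        # keep the leading components
basis = basis.reshape(n_vertex_full, 3, n_basis_keep)       # group xyz per vertex
basis = basis[keep_idx]                                     # select the cropped vertex set
basis = basis.reshape(-1, n_basis_keep)                     # back to (3 * n_keep, K')

print(basis.shape)  # (120, 8)
```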
+ + index_exp = loadmat(osp.join(bfm_folder, 'BFM_front_idx.mat')) + index_exp = index_exp['idx'].astype(np.int32) - 1 # starts from 0 (to 53215) + + index_shape = loadmat(osp.join(bfm_folder, 'BFM_exp_idx.mat')) + index_shape = index_shape['trimIndex'].astype( + np.int32) - 1 # starts from 0 (to 53490) + index_shape = index_shape[index_exp] + + idBase = np.reshape(idBase, [-1, 3, 80]) + idBase = idBase[index_shape, :, :] + idBase = np.reshape(idBase, [-1, 80]) + + texBase = np.reshape(texBase, [-1, 3, 80]) + texBase = texBase[index_shape, :, :] + texBase = np.reshape(texBase, [-1, 80]) + + exBase = np.reshape(exBase, [-1, 3, 64]) + exBase = exBase[index_exp, :, :] + exBase = np.reshape(exBase, [-1, 64]) + + meanshape = np.reshape(shapeMU, [-1, 3])/1e5 + meanshape = meanshape[index_shape, :] + meanshape = np.reshape(meanshape, [1, -1]) + + meantex = np.reshape(texMU, [-1, 3]) + meantex = meantex[index_shape, :] + meantex = np.reshape(meantex, [1, -1]) + + # other info contains triangles, region used for computing photometric loss, + # region used for skin texture regularization, and 68 landmarks index etc. + other_info = loadmat(osp.join(bfm_folder, 'facemodel_info.mat')) + frontmask2_idx = other_info['frontmask2_idx'] + skinmask = other_info['skinmask'] + keypoints = other_info['keypoints'] + point_buf = other_info['point_buf'] + tri = other_info['tri'] + tri_mask2 = other_info['tri_mask2'] + + # save our face model + savemat(osp.join(bfm_folder, 'BFM_model_front.mat'), {'meanshape': meanshape, 'meantex': meantex, 'idBase': idBase, 'exBase': exBase, 'texBase': texBase, + 'tri': tri, 'point_buf': point_buf, 'tri_mask2': tri_mask2, 'keypoints': keypoints, 'frontmask2_idx': frontmask2_idx, 'skinmask': skinmask}) + + +# load landmarks for standard face, which is used for image preprocessing +def load_lm3d(bfm_folder): + + Lm3D = loadmat(osp.join(bfm_folder, 'similarity_Lm3D_all.mat')) + Lm3D = Lm3D['lm'] + + # calculate 5 facial landmarks using 68 landmarks + lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 + Lm3D = np.stack([Lm3D[lm_idx[0], :], np.mean(Lm3D[lm_idx[[1, 2]], :], 0), np.mean( + Lm3D[lm_idx[[3, 4]], :], 0), Lm3D[lm_idx[5], :], Lm3D[lm_idx[6], :]], axis=0) + Lm3D = Lm3D[[1, 2, 0, 3, 4], :] + + return Lm3D + diff --git a/pose_estimation/util/nvdiffrast.py b/pose_estimation/util/nvdiffrast.py new file mode 100755 index 0000000000000000000000000000000000000000..08490cd190734489406e6f61810bd34629294ef9 --- /dev/null +++ b/pose_estimation/util/nvdiffrast.py @@ -0,0 +1,89 @@ +"""This script is the differentiable renderer for Deep3DFaceRecon_pytorch + Attention, antialiasing step is missing in current version. 
+""" + +import torch +import torch.nn.functional as F +import kornia +from kornia.geometry.camera import pixel2cam +import numpy as np +from typing import List +import nvdiffrast.torch as dr +from scipy.io import loadmat +from torch import nn + +def ndc_projection(x=0.1, n=1.0, f=50.0): + return np.array([[n/x, 0, 0, 0], + [ 0, n/-x, 0, 0], + [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)], + [ 0, 0, -1, 0]]).astype(np.float32) + +class MeshRenderer(nn.Module): + def __init__(self, + rasterize_fov, + znear=0.1, + zfar=10, + rasterize_size=224): + super(MeshRenderer, self).__init__() + + x = np.tan(np.deg2rad(rasterize_fov * 0.5)) * znear + self.ndc_proj = torch.tensor(ndc_projection(x=x, n=znear, f=zfar)).matmul( + torch.diag(torch.tensor([1., -1, -1, 1]))) + self.rasterize_size = rasterize_size + self.glctx = None + + def forward(self, vertex, tri, feat=None): + """ + Return: + mask -- torch.tensor, size (B, 1, H, W) + depth -- torch.tensor, size (B, 1, H, W) + features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None + + Parameters: + vertex -- torch.tensor, size (B, N, 3) + tri -- torch.tensor, size (B, M, 3) or (M, 3), triangles + feat(optional) -- torch.tensor, size (B, C), features + """ + device = vertex.device + rsize = int(self.rasterize_size) + ndc_proj = self.ndc_proj.to(device) + # trans to homogeneous coordinates of 3d vertices, the direction of y is the same as v + if vertex.shape[-1] == 3: + vertex = torch.cat([vertex, torch.ones([*vertex.shape[:2], 1]).to(device)], dim=-1) + vertex[..., 1] = -vertex[..., 1] + + + vertex_ndc = vertex @ ndc_proj.t() + if self.glctx is None: + self.glctx = dr.RasterizeGLContext(device=device) + print("create glctx on device cuda:%d"%device.index) + + ranges = None + if isinstance(tri, List) or len(tri.shape) == 3: + vum = vertex_ndc.shape[1] + fnum = torch.tensor([f.shape[0] for f in tri]).unsqueeze(1).to(device) + fstartidx = torch.cumsum(fnum, dim=0) - fnum + ranges = torch.cat([fstartidx, fnum], axis=1).type(torch.int32).cpu() + for i in range(tri.shape[0]): + tri[i] = tri[i] + i*vum + vertex_ndc = torch.cat(vertex_ndc, dim=0) + tri = torch.cat(tri, dim=0) + + # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3] + tri = tri.type(torch.int32).contiguous() + rast_out, _ = dr.rasterize(self.glctx, vertex_ndc.contiguous(), tri, resolution=[rsize, rsize], ranges=ranges) + + depth, _ = dr.interpolate(vertex.reshape([-1,4])[...,2].unsqueeze(1).contiguous(), rast_out, tri) + depth = depth.permute(0, 3, 1, 2) + mask = (rast_out[..., 3] > 0).float().unsqueeze(1) + depth = mask * depth + + + image = None + if feat is not None: + image, _ = dr.interpolate(feat, rast_out, tri) + image = image.permute(0, 3, 1, 2) + image = mask * image + + return mask, depth, image + diff --git a/pose_estimation/util/pose_template.npy b/pose_estimation/util/pose_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..14cf2ce45853cb5ea36651c326eb67b21f0e7883 --- /dev/null +++ b/pose_estimation/util/pose_template.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f8655702c6fc4b8f1e17d8307e3e18a4ef8e293ebeee7dd45f3740002ee0326 +size 228 diff --git a/pose_estimation/util/preprocess.py b/pose_estimation/util/preprocess.py new file mode 100755 index 0000000000000000000000000000000000000000..1d473b2ddc5dc26eec6e6f7a14b3b6fc2f896123 --- /dev/null +++ b/pose_estimation/util/preprocess.py @@ -0,0 +1,246 @@ +"""This script contains the image preprocessing code for 
Deep3DFaceRecon_pytorch +""" + +import numpy as np +from scipy.io import loadmat +from PIL import Image +import cv2 +import os +from skimage import transform as trans +import torch +import warnings +warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) +warnings.filterwarnings("ignore", category=FutureWarning) + + +# calculating least square problem for image alignment +def POS(xp, x): + npts = xp.shape[1] + + A = np.zeros([2*npts, 8]) + + A[0:2*npts-1:2, 0:3] = x.transpose() + A[0:2*npts-1:2, 3] = 1 + + A[1:2*npts:2, 4:7] = x.transpose() + A[1:2*npts:2, 7] = 1 + + b = np.reshape(xp.transpose(), [2*npts, 1]) + + k, _, _, _ = np.linalg.lstsq(A, b) + + R1 = k[0:3] + R2 = k[4:7] + sTx = k[3] + sTy = k[7] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2 + t = np.stack([sTx, sTy], axis=0) + + return t, s + +# bounding box for 68 landmark detection +def BBRegression(points, params): + + w1 = params['W1'] + b1 = params['B1'] + w2 = params['W2'] + b2 = params['B2'] + data = points.copy() + data = data.reshape([5, 2]) + data_mean = np.mean(data, axis=0) + x_mean = data_mean[0] + y_mean = data_mean[1] + data[:, 0] = data[:, 0] - x_mean + data[:, 1] = data[:, 1] - y_mean + + rms = np.sqrt(np.sum(data ** 2)/5) + data = data / rms + data = data.reshape([1, 10]) + data = np.transpose(data) + inputs = np.matmul(w1, data) + b1 + inputs = 2 / (1 + np.exp(-2 * inputs)) - 1 + inputs = np.matmul(w2, inputs) + b2 + inputs = np.transpose(inputs) + x = inputs[:, 0] * rms + x_mean + y = inputs[:, 1] * rms + y_mean + w = 224/inputs[:, 2] * rms + rects = [x, y, w, w] + return np.array(rects).reshape([4]) + +# utils for landmark detection +def img_padding(img, box): + success = True + bbox = box.copy() + res = np.zeros([2*img.shape[0], 2*img.shape[1], 3]) + res[img.shape[0] // 2: img.shape[0] + img.shape[0] // + 2, img.shape[1] // 2: img.shape[1] + img.shape[1]//2] = img + + bbox[0] = bbox[0] + img.shape[1] // 2 + bbox[1] = bbox[1] + img.shape[0] // 2 + if bbox[0] < 0 or bbox[1] < 0: + success = False + return res, bbox, success + +# utils for landmark detection +def crop(img, bbox): + padded_img, padded_bbox, flag = img_padding(img, bbox) + if flag: + crop_img = padded_img[padded_bbox[1]: padded_bbox[1] + + padded_bbox[3], padded_bbox[0]: padded_bbox[0] + padded_bbox[2]] + crop_img = cv2.resize(crop_img.astype(np.uint8), + (224, 224), interpolation=cv2.INTER_CUBIC) + scale = 224 / padded_bbox[3] + return crop_img, scale + else: + return padded_img, 0 + +# utils for landmark detection +def scale_trans(img, lm, t, s): + imgw = img.shape[1] + imgh = img.shape[0] + M_s = np.array([[1, 0, -t[0] + imgw//2 + 0.5], [0, 1, -imgh//2 + t[1]]], + dtype=np.float32) + img = cv2.warpAffine(img, M_s, (imgw, imgh)) + w = int(imgw / s * 100) + h = int(imgh / s * 100) + img = cv2.resize(img, (w, h)) + lm = np.stack([lm[:, 0] - t[0] + imgw // 2, lm[:, 1] - + t[1] + imgh // 2], axis=1) / s * 100 + + left = w//2 - 112 + up = h//2 - 112 + bbox = [left, up, 224, 224] + cropped_img, scale2 = crop(img, bbox) + assert(scale2!=0) + t1 = np.array([bbox[0], bbox[1]]) + + # back to raw img s * crop + s * t1 + t2 + t1 = np.array([w//2 - 112, h//2 - 112]) + scale = s / 100 + t2 = np.array([t[0] - imgw/2, t[1] - imgh / 2]) + inv = (scale/scale2, scale * t1 + t2.reshape([2])) + return cropped_img, inv + +# utils for landmark detection +def align_for_lm(img, five_points): + five_points = np.array(five_points).reshape([1, 10]) + params = loadmat('util/BBRegressorParam_r.mat') + bbox = BBRegression(five_points, params) + 
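+    # BBRegression predicts a square crop box [x, y, w, w] around the face from the five detected landmarks; a width of zero means the regression failed.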
assert(bbox[2] != 0) + bbox = np.round(bbox).astype(np.int32) + crop_img, scale = crop(img, bbox) + return crop_img, scale, bbox + + +# resize and crop images for face reconstruction +def resize_n_crop_img(img, lm, t, s, target_size=1024., mask=None): +#def resize_n_crop_img(img, lm, t, s, target_size=224., mask=None): + w0, h0 = img.size + w = (w0*s).astype(np.int32) + h = (h0*s).astype(np.int32) + left = (w/2 - target_size/2 + float((t[0] - w0/2)*s)).astype(np.int32) + right = left + target_size + up = (h/2 - target_size/2 + float((h0/2 - t[1])*s)).astype(np.int32) + below = up + target_size + # img.save("/home/koki/Projects/Deep3DFaceRecon_pytorch/checkpoints/pretrained/results/iphone/epoch_20_000000/img_debug.jpg") + img = img.resize((w, h), resample=Image.LANCZOS) + # img = np.asarray(img) + # cx = int(0.5 * left + 0.5 * right) + # cy = int(0.5 * up + 0.5 * below) + # img = cv2.circle(img, (cx, cy), 3, (255,0,0), 3) + # img = Image.fromarray(img) + # print(str(cx/s) + " " + str(cy/s)) + img = img.crop((left, up, right, below)) + + if mask is not None: + mask = mask.resize((w, h), resample=Image.LANCZOS) + mask = mask.crop((left, up, right, below)) + + lm = np.stack([lm[:, 0] - t[0] + w0/2, lm[:, 1] - + t[1] + h0/2], axis=1)*s + lm = lm - np.reshape( + np.array([(w/2 - target_size/2), (h/2-target_size/2)]), [1, 2]) + #img.save("/home/koki/Projects/Deep3DFaceRecon_pytorch/checkpoints/pretrained/results/iphone/epoch_20_000000/crop_low.jpg") + # mask.save("/home/koki/Projects/Deep3DFaceRecon_pytorch/checkpoints/pretrained/results/iphone/epoch_20_000000/mask.jpg") + #print(lm) + return img, lm, mask + +# utils for face reconstruction +def extract_5p(lm): + lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 + lm5p = np.stack([lm[lm_idx[0], :], np.mean(lm[lm_idx[[1, 2]], :], 0), np.mean( + lm[lm_idx[[3, 4]], :], 0), lm[lm_idx[5], :], lm[lm_idx[6], :]], axis=0) + lm5p = lm5p[[1, 2, 0, 3, 4], :] + return lm5p + +# utils for face reconstruction +def align_img(img, lm, lm3D, mask=None, target_size=1024., rescale_factor=466.285): +#def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.): + """ + Return: + transparams --numpy.array (raw_W, raw_H, scale, tx, ty) + img_new --PIL.Image (target_size, target_size, 3) + lm_new --numpy.array (68, 2), y direction is opposite to v direction + mask_new --PIL.Image (target_size, target_size) + + Parameters: + img --PIL.Image (raw_H, raw_W, 3) + lm --numpy.array (68, 2), y direction is opposite to v direction + lm3D --numpy.array (5, 3) + mask --PIL.Image (raw_H, raw_W, 3) + """ + + w0, h0 = img.size + if lm.shape[0] != 5: + lm5p = extract_5p(lm) + else: + lm5p = lm + + # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face + t, s = POS(lm5p.transpose(), lm3D.transpose()) + s = rescale_factor/s + + # processing the image + # img_new = img.resize((1024,1024),resample=Image.LANCZOS) + #lm_new = lm*1024.0/512.0 + # mask_new=None + img_new, lm_new, mask_new = resize_n_crop_img(img, lm, t, s, target_size=target_size, mask=mask) + # img.save("/home/koki/Projects/Deep3DFaceRecon_pytorch/checkpoints/pretrained/results/iphone/epoch_20_000000/img_new.jpg") + print(w0, h0, s, t[0][0], t[1][0]) + trans_params = np.array([w0, h0, s, t[0][0], t[1][0]]) + lm_new *= 224/1024.0 + img_new_low = img_new.resize((224, 224), resample=Image.LANCZOS) + + return trans_params, img_new_low, lm_new, mask_new, img_new + +# utils for face recognition model +def estimate_norm(lm_68p, H): + # from 
https://github.com/deepinsight/insightface/blob/c61d3cd208a603dfa4a338bd743b320ce3e94730/recognition/common/face_align.py#L68 + """ + Return: + trans_m --numpy.array (2, 3) + Parameters: + lm --numpy.array (68, 2), y direction is opposite to v direction + H --int/float , image height + """ + lm = extract_5p(lm_68p) + lm[:, -1] = H - 1 - lm[:, -1] + tform = trans.SimilarityTransform() + src = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + tform.estimate(lm, src) + M = tform.params + if np.linalg.det(M) == 0: + M = np.eye(3) + + return M[0:2, :] + +def estimate_norm_torch(lm_68p, H): + lm_68p_ = lm_68p.detach().cpu().numpy() + M = [] + for i in range(lm_68p_.shape[0]): + M.append(estimate_norm(lm_68p_[i], H)) + M = torch.tensor(np.array(M), dtype=torch.float32).to(lm_68p.device) + return M diff --git a/pose_estimation/util/skin_mask.py b/pose_estimation/util/skin_mask.py new file mode 100755 index 0000000000000000000000000000000000000000..a8a74e4c3b40d13b0258b83a12f56321a85bb179 --- /dev/null +++ b/pose_estimation/util/skin_mask.py @@ -0,0 +1,125 @@ +"""This script is to generate skin attention mask for Deep3DFaceRecon_pytorch +""" + +import math +import numpy as np +import os +import cv2 + +class GMM: + def __init__(self, dim, num, w, mu, cov, cov_det, cov_inv): + self.dim = dim # feature dimension + self.num = num # number of Gaussian components + self.w = w # weights of Gaussian components (a list of scalars) + self.mu= mu # mean of Gaussian components (a list of 1xdim vectors) + self.cov = cov # covariance matrix of Gaussian components (a list of dimxdim matrices) + self.cov_det = cov_det # pre-computed determinet of covariance matrices (a list of scalars) + self.cov_inv = cov_inv # pre-computed inverse covariance matrices (a list of dimxdim matrices) + + self.factor = [0]*num + for i in range(self.num): + self.factor[i] = (2*math.pi)**(self.dim/2) * self.cov_det[i]**0.5 + + def likelihood(self, data): + assert(data.shape[1] == self.dim) + N = data.shape[0] + lh = np.zeros(N) + + for i in range(self.num): + data_ = data - self.mu[i] + + tmp = np.matmul(data_,self.cov_inv[i]) * data_ + tmp = np.sum(tmp,axis=1) + power = -0.5 * tmp + + p = np.array([math.exp(power[j]) for j in range(N)]) + p = p/self.factor[i] + lh += p*self.w[i] + + return lh + + +def _rgb2ycbcr(rgb): + m = np.array([[65.481, 128.553, 24.966], + [-37.797, -74.203, 112], + [112, -93.786, -18.214]]) + shape = rgb.shape + rgb = rgb.reshape((shape[0] * shape[1], 3)) + ycbcr = np.dot(rgb, m.transpose() / 255.) + ycbcr[:, 0] += 16. + ycbcr[:, 1:] += 128. + return ycbcr.reshape(shape) + + +def _bgr2ycbcr(bgr): + rgb = bgr[..., ::-1] + return _rgb2ycbcr(rgb) + + +gmm_skin_w = [0.24063933, 0.16365987, 0.26034665, 0.33535415] +gmm_skin_mu = [np.array([113.71862, 103.39613, 164.08226]), + np.array([150.19858, 105.18467, 155.51428]), + np.array([183.92976, 107.62468, 152.71820]), + np.array([114.90524, 113.59782, 151.38217])] +gmm_skin_cov_det = [5692842.5, 5851930.5, 2329131., 1585971.] 
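+# Pre-computed inverse covariance matrices of the four skin-color Gaussian components (in YCbCr space), paired with the determinants above.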
+gmm_skin_cov_inv = [np.array([[0.0019472069, 0.0020450759, -0.00060243998],[0.0020450759, 0.017700525, 0.0051420014],[-0.00060243998, 0.0051420014, 0.0081308950]]), + np.array([[0.0027110141, 0.0011036990, 0.0023122299],[0.0011036990, 0.010707724, 0.010742856],[0.0023122299, 0.010742856, 0.017481629]]), + np.array([[0.0048026871, 0.00022935172, 0.0077668377],[0.00022935172, 0.011729696, 0.0081661865],[0.0077668377, 0.0081661865, 0.025374353]]), + np.array([[0.0011989699, 0.0022453172, -0.0010748957],[0.0022453172, 0.047758564, 0.020332102],[-0.0010748957, 0.020332102, 0.024502251]])] + +gmm_skin = GMM(3, 4, gmm_skin_w, gmm_skin_mu, [], gmm_skin_cov_det, gmm_skin_cov_inv) + +gmm_nonskin_w = [0.12791070, 0.31130761, 0.34245777, 0.21832393] +gmm_nonskin_mu = [np.array([99.200851, 112.07533, 140.20602]), + np.array([110.91392, 125.52969, 130.19237]), + np.array([129.75864, 129.96107, 126.96808]), + np.array([112.29587, 128.85121, 129.05431])] +gmm_nonskin_cov_det = [458703648., 6466488., 90611376., 133097.63] +gmm_nonskin_cov_inv = [np.array([[0.00085371657, 0.00071197288, 0.00023958916],[0.00071197288, 0.0025935620, 0.00076557708],[0.00023958916, 0.00076557708, 0.0015042332]]), + np.array([[0.00024650150, 0.00045542428, 0.00015019422],[0.00045542428, 0.026412144, 0.018419769],[0.00015019422, 0.018419769, 0.037497383]]), + np.array([[0.00037054974, 0.00038146760, 0.00040408765],[0.00038146760, 0.0085505722, 0.0079136286],[0.00040408765, 0.0079136286, 0.010982352]]), + np.array([[0.00013709733, 0.00051228428, 0.00012777430],[0.00051228428, 0.28237113, 0.10528370],[0.00012777430, 0.10528370, 0.23468947]])] + +gmm_nonskin = GMM(3, 4, gmm_nonskin_w, gmm_nonskin_mu, [], gmm_nonskin_cov_det, gmm_nonskin_cov_inv) + +prior_skin = 0.8 +prior_nonskin = 1 - prior_skin + + +# calculate skin attention mask +def skinmask(imbgr): + im = _bgr2ycbcr(imbgr) + + data = im.reshape((-1,3)) + + lh_skin = gmm_skin.likelihood(data) + lh_nonskin = gmm_nonskin.likelihood(data) + + tmp1 = prior_skin * lh_skin + tmp2 = prior_nonskin * lh_nonskin + post_skin = tmp1 / (tmp1+tmp2) # posterior probability + + post_skin = post_skin.reshape((im.shape[0],im.shape[1])) + + post_skin = np.round(post_skin*255) + post_skin = post_skin.astype(np.uint8) + post_skin = np.tile(np.expand_dims(post_skin,2),[1,1,3]) # reshape to H*W*3 + + return post_skin + + +def get_skin_mask(img_path): + print('generating skin masks......') + names = [i for i in sorted(os.listdir( + img_path)) if 'jpg' in i or 'png' in i or 'jpeg' in i or 'PNG' in i] + save_path = os.path.join(img_path, 'mask') + if not os.path.isdir(save_path): + os.makedirs(save_path) + + for i in range(0, len(names)): + name = names[i] + print('%05d' % (i), ' ', name) + full_image_name = os.path.join(img_path, name) + img = cv2.imread(full_image_name).astype(np.float32) + skin_img = skinmask(img) + cv2.imwrite(os.path.join(save_path, name), skin_img.astype(np.uint8)) diff --git a/pose_estimation/util/test_mean_face.txt b/pose_estimation/util/test_mean_face.txt new file mode 100755 index 0000000000000000000000000000000000000000..3a46d4db7699ffed8f898fcee64099631509946d --- /dev/null +++ b/pose_estimation/util/test_mean_face.txt @@ -0,0 +1,136 @@ +-5.228591537475585938e+01 +2.078247070312500000e-01 +-5.064269638061523438e+01 +-1.315765380859375000e+01 +-4.952939224243164062e+01 +-2.592591094970703125e+01 +-4.793047332763671875e+01 +-3.832135772705078125e+01 +-4.512159729003906250e+01 +-5.059623336791992188e+01 +-3.917720794677734375e+01 +-6.043736648559570312e+01 
+-2.929953765869140625e+01 +-6.861183166503906250e+01 +-1.719801330566406250e+01 +-7.572736358642578125e+01 +-1.961936950683593750e+00 +-7.862001037597656250e+01 +1.467941284179687500e+01 +-7.607844543457031250e+01 +2.744073486328125000e+01 +-6.915261840820312500e+01 +3.855677795410156250e+01 +-5.950350570678710938e+01 +4.478240966796875000e+01 +-4.867547225952148438e+01 +4.714337158203125000e+01 +-3.800830078125000000e+01 +4.940315246582031250e+01 +-2.496297454833984375e+01 +5.117234802246093750e+01 +-1.241538238525390625e+01 +5.190507507324218750e+01 +8.244247436523437500e-01 +-4.150688934326171875e+01 +2.386329650878906250e+01 +-3.570307159423828125e+01 +3.017010498046875000e+01 +-2.790358734130859375e+01 +3.212951660156250000e+01 +-1.941773223876953125e+01 +3.156523132324218750e+01 +-1.138106536865234375e+01 +2.841992187500000000e+01 +5.993263244628906250e+00 +2.895182800292968750e+01 +1.343590545654296875e+01 +3.189880371093750000e+01 +2.203153991699218750e+01 +3.302221679687500000e+01 +2.992478942871093750e+01 +3.099150085449218750e+01 +3.628388977050781250e+01 +2.765748596191406250e+01 +-1.933914184570312500e+00 +1.405374145507812500e+01 +-2.153038024902343750e+00 +5.772636413574218750e+00 +-2.270050048828125000e+00 +-2.121643066406250000e+00 +-2.218330383300781250e+00 +-1.068978118896484375e+01 +-1.187252044677734375e+01 +-1.997912597656250000e+01 +-6.879402160644531250e+00 +-2.143579864501953125e+01 +-1.227821350097656250e+00 +-2.193494415283203125e+01 +4.623237609863281250e+00 +-2.152721405029296875e+01 +9.721397399902343750e+00 +-1.953671264648437500e+01 +-3.648714447021484375e+01 +9.811126708984375000e+00 +-3.130242919921875000e+01 +1.422447967529296875e+01 +-2.212834930419921875e+01 +1.493019866943359375e+01 +-1.500880432128906250e+01 +1.073588562011718750e+01 +-2.095037078857421875e+01 +9.054298400878906250e+00 +-3.050099182128906250e+01 +8.704177856445312500e+00 +1.173237609863281250e+01 +1.054329681396484375e+01 +1.856353759765625000e+01 +1.535009765625000000e+01 +2.893331909179687500e+01 +1.451992797851562500e+01 +3.452944946289062500e+01 +1.065280151367187500e+01 +2.875990295410156250e+01 +8.654792785644531250e+00 +1.942100524902343750e+01 +9.422447204589843750e+00 +-2.204488372802734375e+01 +-3.983994293212890625e+01 +-1.324458312988281250e+01 +-3.467377471923828125e+01 +-6.749649047851562500e+00 +-3.092894744873046875e+01 +-9.183349609375000000e-01 +-3.196458435058593750e+01 +4.220649719238281250e+00 +-3.090406036376953125e+01 +1.089889526367187500e+01 +-3.497008514404296875e+01 +1.874589538574218750e+01 +-4.065438079833984375e+01 +1.124106597900390625e+01 +-4.438417816162109375e+01 +5.181709289550781250e+00 +-4.649170684814453125e+01 +-1.158607482910156250e+00 +-4.680406951904296875e+01 +-7.918922424316406250e+00 +-4.671575164794921875e+01 +-1.452505493164062500e+01 +-4.416526031494140625e+01 +-2.005007171630859375e+01 +-3.997841644287109375e+01 +-1.054919433593750000e+01 +-3.849683380126953125e+01 +-1.051826477050781250e+00 +-3.794863128662109375e+01 +6.412681579589843750e+00 +-3.804645538330078125e+01 +1.627674865722656250e+01 +-4.039697265625000000e+01 +6.373878479003906250e+00 +-4.087213897705078125e+01 +-8.551712036132812500e-01 +-4.157129669189453125e+01 +-1.014953613281250000e+01 +-4.128469085693359375e+01 diff --git a/pose_estimation/util/util.py b/pose_estimation/util/util.py new file mode 100755 index 0000000000000000000000000000000000000000..bb9a703dbc2285e750c0b3b00aa6cfde6eb5dea7 --- /dev/null +++ b/pose_estimation/util/util.py @@ -0,0 +1,208 @@ +"""This 
script contains basic utilities for Deep3DFaceRecon_pytorch +""" +from __future__ import print_function +import numpy as np +import torch +from PIL import Image +import os +import importlib +import argparse +from argparse import Namespace +import torchvision + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def copyconf(default_opt, **kwargs): + conf = Namespace(**vars(default_opt)) + for key in kwargs: + setattr(conf, key, kwargs[key]) + return conf + +def genvalconf(train_opt, **kwargs): + conf = Namespace(**vars(train_opt)) + attr_dict = train_opt.__dict__ + for key, value in attr_dict.items(): + if 'val' in key and key.split('_')[0] in attr_dict: + setattr(conf, key.split('_')[0], value) + + for key in kwargs: + setattr(conf, key, kwargs[key]) + + return conf + +def find_class_in_module(target_cls_name, module): + target_cls_name = target_cls_name.replace('_', '').lower() + clslib = importlib.import_module(module) + cls = None + for name, clsobj in clslib.__dict__.items(): + if name.lower() == target_cls_name: + cls = clsobj + + assert cls is not None, "In %s, there should be a class whose name matches %s in lowercase without underscore(_)" % (module, target_cls_name) + + return cls + + +def tensor2im(input_image, imtype=np.uint8): + """"Converts a Tensor array into a numpy image array. + + Parameters: + input_image (tensor) -- the input image tensor array, range(0, 1) + imtype (type) -- the desired type of the converted numpy array + """ + if not isinstance(input_image, np.ndarray): + if isinstance(input_image, torch.Tensor): # get the data from a variable + image_tensor = input_image.data + else: + return input_image + image_numpy = image_tensor.clamp(0.0, 1.0).cpu().float().numpy() # convert it into a numpy array + if image_numpy.shape[0] == 1: # grayscale to RGB + image_numpy = np.tile(image_numpy, (3, 1, 1)) + image_numpy = np.transpose(image_numpy, (1, 2, 0)) * 255.0 # post-processing: tranpose and scaling + else: # if it is a numpy array, do nothing + image_numpy = input_image + return image_numpy.astype(imtype) + + +def diagnose_network(net, name='network'): + """Calculate and print the mean of average absolute(gradients) + + Parameters: + net (torch network) -- Torch network + name (str) -- the name of the network + """ + mean = 0.0 + count = 0 + for param in net.parameters(): + if param.grad is not None: + mean += torch.mean(torch.abs(param.grad.data)) + count += 1 + if count > 0: + mean = mean / count + print(name) + print(mean) + + +def save_image(image_numpy, image_path, aspect_ratio=1.0): + """Save a numpy image to the disk + + Parameters: + image_numpy (numpy array) -- input numpy array + image_path (str) -- the path of the image + """ + + image_pil = Image.fromarray(image_numpy) + h, w, _ = image_numpy.shape + + if aspect_ratio is None: + pass + elif aspect_ratio > 1.0: + image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.LANCZOS) + elif aspect_ratio < 1.0: + image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.LANCZOS) + image_pil.save(image_path) + + +def print_numpy(x, val=True, shp=False): + """Print the mean, min, max, median, std, and size of a numpy array + + Parameters: + val (bool) -- if print the values of the numpy array + shp (bool) -- if print the shape of the numpy array + """ + x = x.astype(np.float64) + if shp: + 
print('shape,', x.shape) + if val: + x = x.flatten() + print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % ( + np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x))) + + +def mkdirs(paths): + """create empty directories if they don't exist + + Parameters: + paths (str list) -- a list of directory paths + """ + if isinstance(paths, list) and not isinstance(paths, str): + for path in paths: + mkdir(path) + else: + mkdir(paths) + + +def mkdir(path): + """create a single empty directory if it didn't exist + + Parameters: + path (str) -- a single directory path + """ + if not os.path.exists(path): + os.makedirs(path) + + +def correct_resize_label(t, size): + device = t.device + t = t.detach().cpu() + resized = [] + for i in range(t.size(0)): + one_t = t[i, :1] + one_np = np.transpose(one_t.numpy().astype(np.uint8), (1, 2, 0)) + one_np = one_np[:, :, 0] + one_image = Image.fromarray(one_np).resize(size, Image.NEAREST) + resized_t = torch.from_numpy(np.array(one_image)).long() + resized.append(resized_t) + return torch.stack(resized, dim=0).to(device) + + +def correct_resize(t, size, mode=Image.LANCZOS): + device = t.device + t = t.detach().cpu() + resized = [] + for i in range(t.size(0)): + one_t = t[i:i + 1] + one_image = Image.fromarray(tensor2im(one_t)).resize(size, Image.LANCZOS) + resized_t = torchvision.transforms.functional.to_tensor(one_image) * 2 - 1.0 + resized.append(resized_t) + return torch.stack(resized, dim=0).to(device) + +def draw_landmarks(img, landmark, color='r', step=2): + """ + Return: + img -- numpy.array, (B, H, W, 3) img with landmark, RGB order, range (0, 255) + + + Parameters: + img -- numpy.array, (B, H, W, 3), RGB order, range (0, 255) + landmark -- numpy.array, (B, 68, 2), y direction is opposite to v direction + color -- str, 'r' or 'b' (red or blue) + """ + if color =='r': + c = np.array([255., 0, 0]) + else: + c = np.array([0, 0, 255.]) + + _, H, W, _ = img.shape + img, landmark = img.copy(), landmark.copy() + landmark[..., 1] = H - 1 - landmark[..., 1] + landmark = np.round(landmark).astype(np.int32) + for i in range(landmark.shape[1]): + x, y = landmark[:, i, 0], landmark[:, i, 1] + for j in range(-step, step): + for k in range(-step, step): + u = np.clip(x + j, 0, W - 1) + v = np.clip(y + k, 0, H - 1) + for m in range(landmark.shape[0]): + img[m, v[m], u[m]] = c + return img diff --git a/pose_estimation/util/visualizer.py b/pose_estimation/util/visualizer.py new file mode 100755 index 0000000000000000000000000000000000000000..5dac564b2b8f36dd55eadf455301e111ad07b734 --- /dev/null +++ b/pose_estimation/util/visualizer.py @@ -0,0 +1,226 @@ +"""This script defines the visualizer for Deep3DFaceRecon_pytorch +""" + +import os +import ntpath +import time +from . import util, html +from torch.utils.tensorboard import SummaryWriter + +def save_images(webpage, visuals, image_path, aspect_ratio=1.0, width=256): + """Save images to the disk. + + Parameters: + webpage (the HTML class) -- the HTML webpage class that stores these imaegs (see html.py for more details) + visuals (OrderedDict) -- an ordered dictionary that stores (name, images (either tensor or numpy) ) pairs + image_path (str) -- the string is used to create image paths + aspect_ratio (float) -- the aspect ratio of saved images + width (int) -- the images will be resized to width x width + + This function will save images stored in 'visuals' to the HTML file specified by 'webpage'. 
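+    Each image is written to '<image_dir>/<label>/<name>.png' and the set is added to the page with a single webpage.add_images call.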
+ """ + image_dir = webpage.get_image_dir() + short_path = ntpath.basename(image_path[0]) + name = os.path.splitext(short_path)[0] + + webpage.add_header(name) + ims, txts, links = [], [], [] + + for label, im_data in visuals.items(): + im = util.tensor2im(im_data) + image_name = '%s/%s.png' % (label, name) + os.makedirs(os.path.join(image_dir, label), exist_ok=True) + save_path = os.path.join(image_dir, image_name) + util.save_image(im, save_path, aspect_ratio=aspect_ratio) + ims.append(image_name) + txts.append(label) + links.append(image_name) + webpage.add_images(ims, txts, links, width=width) + + +class Visualizer(): + """This class includes several functions that can display/save images and print/save logging information. + + It uses a Python library tensprboardX for display, and a Python library 'dominate' (wrapped in 'HTML') for creating HTML files with images. + """ + + def __init__(self, opt): + """Initialize the Visualizer class + + Parameters: + opt -- stores all the experiment flags; needs to be a subclass of BaseOptions + Step 1: Cache the training/test options + Step 2: create a tensorboard writer + Step 3: create an HTML object for saveing HTML filters + Step 4: create a logging file to store training losses + """ + self.opt = opt # cache the option + self.use_html = opt.isTrain and not opt.no_html + self.writer = SummaryWriter(os.path.join(opt.checkpoints_dir, 'logs', opt.name)) + self.win_size = opt.display_winsize + self.name = opt.name + self.saved = False + if self.use_html: # create an HTML object at /web/; images will be saved under /web/images/ + self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web') + self.img_dir = os.path.join(self.web_dir, 'images') + print('create web directory %s...' % self.web_dir) + util.mkdirs([self.web_dir, self.img_dir]) + # create a logging file to store training losses + self.log_name = os.path.join(opt.checkpoints_dir, opt.name, 'loss_log.txt') + with open(self.log_name, "a") as log_file: + now = time.strftime("%c") + log_file.write('================ Training Loss (%s) ================\n' % now) + + def reset(self): + """Reset the self.saved status""" + self.saved = False + + + def display_current_results(self, visuals, total_iters, epoch, save_result): + """Display current results on tensorboad; save current results to an HTML file. + + Parameters: + visuals (OrderedDict) - - dictionary of images to display or save + total_iters (int) -- total iterations + epoch (int) - - the current epoch + save_result (bool) - - if save the current results to an HTML file + """ + for label, image in visuals.items(): + self.writer.add_image(label, util.tensor2im(image), total_iters, dataformats='HWC') + + if self.use_html and (save_result or not self.saved): # save images to an HTML file if they haven't been saved. 
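+            # remember that results have been written so the HTML page is not rebuilt on every call (reset() clears this flag)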
+ self.saved = True + # save images to the disk + for label, image in visuals.items(): + image_numpy = util.tensor2im(image) + img_path = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label)) + util.save_image(image_numpy, img_path) + + # update website + webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, refresh=0) + for n in range(epoch, 0, -1): + webpage.add_header('epoch [%d]' % n) + ims, txts, links = [], [], [] + + for label, image_numpy in visuals.items(): + image_numpy = util.tensor2im(image) + img_path = 'epoch%.3d_%s.png' % (n, label) + ims.append(img_path) + txts.append(label) + links.append(img_path) + webpage.add_images(ims, txts, links, width=self.win_size) + webpage.save() + + def plot_current_losses(self, total_iters, losses): + # G_loss_collection = {} + # D_loss_collection = {} + # for name, value in losses.items(): + # if 'G' in name or 'NCE' in name or 'idt' in name: + # G_loss_collection[name] = value + # else: + # D_loss_collection[name] = value + # self.writer.add_scalars('G_collec', G_loss_collection, total_iters) + # self.writer.add_scalars('D_collec', D_loss_collection, total_iters) + for name, value in losses.items(): + self.writer.add_scalar(name, value, total_iters) + + # losses: same format as |losses| of plot_current_losses + def print_current_losses(self, epoch, iters, losses, t_comp, t_data): + """print current losses on console; also save the losses to the disk + + Parameters: + epoch (int) -- current epoch + iters (int) -- current training iteration during this epoch (reset to 0 at the end of every epoch) + losses (OrderedDict) -- training losses stored in the format of (name, float) pairs + t_comp (float) -- computational time per data point (normalized by batch_size) + t_data (float) -- data loading time per data point (normalized by batch_size) + """ + message = '(epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % (epoch, iters, t_comp, t_data) + for k, v in losses.items(): + message += '%s: %.3f ' % (k, v) + + print(message) # print the message + with open(self.log_name, "a") as log_file: + log_file.write('%s\n' % message) # save the message + + +class MyVisualizer: + def __init__(self, opt): + """Initialize the Visualizer class + + Parameters: + opt -- stores all the experiment flags; needs to be a subclass of BaseOptions + Step 1: Cache the training/test options + Step 2: create a tensorboard writer + Step 3: create an HTML object for saveing HTML filters + Step 4: create a logging file to store training losses + """ + self.opt = opt # cache the optio + self.name = opt.name + self.img_folder = opt.img_folder + self.img_dir = os.path.join(opt.checkpoints_dir, opt.name, 'results') + + if opt.phase != 'test': + self.writer = SummaryWriter(os.path.join(opt.checkpoints_dir, opt.name, 'logs')) + # create a logging file to store training losses + self.log_name = os.path.join(opt.checkpoints_dir, opt.name, 'loss_log.txt') + with open(self.log_name, "a") as log_file: + now = time.strftime("%c") + log_file.write('================ Training Loss (%s) ================\n' % now) + + + def display_current_results(self, visuals, total_iters, epoch, dataset='train', save_results=False, count=0, name=None, + add_image=True): + """Display current results on tensorboad; save current results to an HTML file. 
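+        (Note: unlike Visualizer above, this class writes results as PNG files under the configured image folder and, when add_image is True, to the tensorboard writer; no HTML page is generated.)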
+ + Parameters: + visuals (OrderedDict) - - dictionary of images to display or save + total_iters (int) -- total iterations + epoch (int) - - the current epoch + dataset (str) - - 'train' or 'val' or 'test' + """ + # if (not add_image) and (not save_results): return + for label, image in visuals.items(): + for i in range(image.shape[0]): + image_numpy = util.tensor2im(image[i]) + if add_image: + self.writer.add_image(label + '%s_%02d'%(dataset, i + count), + image_numpy, total_iters, dataformats='HWC') + + if save_results: + #save_path = os.path.join(self.img_dir, dataset, 'epoch_%s_%06d'%(epoch, total_iters)) + save_path = os.path.join(self.img_folder, 'epoch_%s_%06d'%(epoch, total_iters)) + #print(self.img_folder) + if not os.path.isdir(save_path): + os.makedirs(save_path) + + if name is not None: + img_path = os.path.join(save_path, '%s.png' % name) + else: + img_path = os.path.join(save_path, '%s_%03d.png' % (label, i + count)) + util.save_image(image_numpy, img_path) + + + def plot_current_losses(self, total_iters, losses, dataset='train'): + for name, value in losses.items(): + self.writer.add_scalar(name + '/%s'%dataset, value, total_iters) + + # losses: same format as |losses| of plot_current_losses + def print_current_losses(self, epoch, iters, losses, t_comp, t_data, dataset='train'): + """print current losses on console; also save the losses to the disk + + Parameters: + epoch (int) -- current epoch + iters (int) -- current training iteration during this epoch (reset to 0 at the end of every epoch) + losses (OrderedDict) -- training losses stored in the format of (name, float) pairs + t_comp (float) -- computational time per data point (normalized by batch_size) + t_data (float) -- data loading time per data point (normalized by batch_size) + """ + message = '(dataset: %s, epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % ( + dataset, epoch, iters, t_comp, t_data) + for k, v in losses.items(): + message += '%s: %.3f ' % (k, v) + + print(message) # print the message + with open(self.log_name, "a") as log_file: + log_file.write('%s\n' % message) # save the message diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dac22083c6f603cfb5f917be4d6723bed2b10ace --- /dev/null +++ b/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu116 +pip==23.1.2 +torch==1.12.1 +torchvision==0.13.1 +accelerate==0.18.0 +git+https://github.com/huggingface/transformers.git +click==8.1.3 +diffusers[torch]==0.14.0 +dominate==2.7.0 +face_alignment==1.3.4 +gdown==4.7.1 +glfw==2.2.0 +imageio==2.9.0 +imgui>=1.3.0 +kornia==0.6.11 +matplotlib==3.4.2 +mrcfile==1.4.3 +mtcnn==0.1.1 +ninja==1.10.2 +numpy>=1.22.4 +pandas==1.5.3 +Pillow==9.3.0 +ply==3.11 +psutil==5.9.4 +PyOpenGL==3.1.5 +pyspng==0.1.1 +Requests==2.26.0 +scikit_image==0.20.0 +scipy==1.9.1 +setuptools==67.6.1 +tqdm==4.62.2 +trimesh==3.21.3 +tensorflow==2.11.0 +
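A minimal usage sketch of the preprocessing utilities added above, for orientation only; the import paths, the 'BFM' folder name and the input files are assumptions for this example, not part of the diff:

import numpy as np
from PIL import Image
# Assumed import locations; adjust to wherever load_lm3d and align_img actually live in this repo.
from pose_estimation.util.load_mats import load_lm3d
from pose_estimation.util.preprocess import align_img

lm3d_std = load_lm3d('BFM')                    # 5 standard 3D landmarks used for alignment
img = Image.open('face.png').convert('RGB')    # placeholder input image
lm68 = np.load('face_landmarks.npy')           # placeholder (68, 2) landmarks, y opposite to v
trans_params, img_224, lm_new, mask_new, img_1024 = align_img(img, lm68, lm3d_std)
# trans_params is [raw_W, raw_H, scale, tx, ty]; img_224 is the 224x224 crop fed to the
# reconstruction network and img_1024 the full-resolution crop (target_size defaults to 1024).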