Spaces: Sleeping
kyleleey committed · 98a77e0
Parent(s): 9df3c71
first commit
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +184 -0
- ckpts/configs.yml +354 -0
- ckpts/iter0800000.pth +3 -0
- video3d/__init__.py +6 -0
- video3d/cages/cages.py +218 -0
- video3d/cub_dataloaders.py +404 -0
- video3d/cub_dataloaders_ddp.py +434 -0
- video3d/dataloaders.py +375 -0
- video3d/dataloaders_ddp.py +1210 -0
- video3d/diffusion/sd.py +252 -0
- video3d/diffusion/sd_utils.py +123 -0
- video3d/diffusion/vsd.py +323 -0
- video3d/discriminator_architecture.py +83 -0
- video3d/flow/__init__.py +0 -0
- video3d/flow/flow.py +51 -0
- video3d/flow/utils.py +23 -0
- video3d/geometry/dlmesh.py +85 -0
- video3d/geometry/dmtet.py +361 -0
- video3d/model.py +1526 -0
- video3d/model_ddp.py +0 -0
- video3d/networks.py +1724 -0
- video3d/render/light.py +191 -0
- video3d/render/material.py +282 -0
- video3d/render/mesh.py +377 -0
- video3d/render/mlptexture.py +122 -0
- video3d/render/obj.py +288 -0
- video3d/render/regularizer.py +93 -0
- video3d/render/render.py +369 -0
- video3d/render/renderutils/__init__.py +11 -0
- video3d/render/renderutils/bsdf.py +151 -0
- video3d/render/renderutils/c_src/bsdf.cu +710 -0
- video3d/render/renderutils/c_src/bsdf.h +84 -0
- video3d/render/renderutils/c_src/common.cpp +74 -0
- video3d/render/renderutils/c_src/common.h +41 -0
- video3d/render/renderutils/c_src/cubemap.cu +350 -0
- video3d/render/renderutils/c_src/cubemap.h +38 -0
- video3d/render/renderutils/c_src/loss.cu +210 -0
- video3d/render/renderutils/c_src/loss.h +38 -0
- video3d/render/renderutils/c_src/mesh.cu +94 -0
- video3d/render/renderutils/c_src/mesh.h +23 -0
- video3d/render/renderutils/c_src/normal.cu +182 -0
- video3d/render/renderutils/c_src/normal.h +27 -0
- video3d/render/renderutils/c_src/tensor.h +92 -0
- video3d/render/renderutils/c_src/torch_bindings.cpp +1062 -0
- video3d/render/renderutils/c_src/vec3f.h +109 -0
- video3d/render/renderutils/c_src/vec4f.h +25 -0
- video3d/render/renderutils/loss.py +41 -0
- video3d/render/renderutils/ops.py +554 -0
- video3d/render/renderutils/tests/test_bsdf.py +296 -0
- video3d/render/renderutils/tests/test_cubemap.py +47 -0
.gitignore
ADDED
@@ -0,0 +1,184 @@
__pycache__
data
data/*/
data/*/*
!data/preprocessing/
pretrained/*/
results
neural_renderer
*.zip
unchanged/
cvpr23_results/
# slurm.bash
results
results/*/
results/*
results/*/*
results/dor_checkpoints/*
results/dor_checkpoints/*/*
results/dor_checkpoints/*/*/*


.vscode
.vscode/

dor_bash_files/
zzli_bash_files/
ray_bash_files/

config/dor_exp/
config/zzli_exp/
config/ray_exp/

wandb
wandb/*/
wandb/*/*
wandb/*/*/*
canon/out/*
canon/out/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
/.idea

# dependencies
# nvdiffrast/
data/preprocessing/videos/RAFT/
preprocessing_data/RAFT/
preprocessing_data/RAFT/*
preprocessing_data/preprocessing/videos/RAFT/
# debug


DINO_v2_check/out_dor
DINO_v2_check/out_dor/*

eval/*/
scripts/vis/
eval/
ckpts/configs.yml
ADDED
@@ -0,0 +1,354 @@
amb_diff_max:
- 1.0
- 1.0
amb_diff_min:
- 0.0
- 0.5
arti_reg_loss_epochs:
- 8
- 276
arti_reg_loss_weight: 0.2
articulation_arch: attention
articulation_epochs:
- 2
- 276
articulation_feature_mode: sample+global
articulation_multiplier: 0.1
attach_legs_to_body_epochs:
- 8
- 276
avg_seqshape_epochs:
- 0
- 0
avg_texture_epochs:
- 0
- 0
background_mode: none
backward_prior: true
bank_mean_dist_loss_weight: 0.0
batch_size: 6
best_pose_start_iter: 10000
blur_mask: false
body_bone_idx_preset:
  0:
  - 0
  - 0
  - 0
  - 0
  500000:
  - 0
  - 0
  - 0
  - 0
body_bones_type: z_minmax_y+
body_rotate_reg_mode: all-bones
bone_y_thresh: 0.4
bsdf: diffuse
cam_pos_z_offset: 10
checkpoint_dir: /viscam/u/zzli/workspace/4DAnimalKingdom_dev/results/paper_exp/same_dino_1109/mb_all_data_1k_artiID_r500k
clip_tex: false
clip_tex_loss_weight: 0.0
combine_dataset: true
config: config/zzli_exp/same_dino_1109/mb_data1k_artiID_r500k.yml
constrain_legs: false
crop_fov_approx: 25
data_loader_mode: n_frame
dataset: video
debug_seq: false
deform_epochs:
- 0
- 276
deformation_reg_loss_weight: 10.0
device: cuda:0
diffusion_albedo_ratio: 0.2
diffusion_angle_front: 60
diffusion_angle_overhead: 30
diffusion_append_prompt_directions: true
diffusion_guidance_scale: 100
diffusion_light_ambient: 0.5
diffusion_light_diffuse: 0.8
diffusion_loss_weight: 0.0001
diffusion_max_step: 0.6
diffusion_num_random_cameras: 1
diffusion_phi_offset: 180
diffusion_precision: float16
diffusion_prompt: an elephant
diffusion_radius_range:
- 9
- 11
diffusion_random_light: true
diffusion_resolution: 256
diffusion_shading_ratio: 0.4
diffusion_theta_range:
- 0
- 100
diffusion_uniform_sphere_rate: 1
dim_of_classes: 128
dino_feat_im_loss_weight:
  0: 10.0
  300000: 1.0
dino_feature_dim: 16
dino_feature_input: false
dino_feature_recon_dim: 16
dino_max: 1.0
dino_min: 0.0
disable_fewshot: false
disc_gt: false
disc_iv: true
disc_iv_label: Real
disc_reg_mul: 10.0
discriminator_loss_weight: 1.0
dmtet_grid: 256
dmtet_grid_smaller: 256
dmtet_grid_smaller_epoch: 1
embed_concat_pts: true
embedder_freq_arti: 8
embedder_freq_deform: 10
embedder_freq_dino: 8
embedder_freq_shape: 8
embedder_freq_tex: 10
enable_articulation: true
enable_articulation_bone_threshold: true
enable_articulation_idadd: true
enable_deform: true
enable_disc: true
enable_encoder: true
enable_lighting: true
enable_mask_distribution: true
enable_memory_bank: true
enable_pose: true
enable_prior: true
enable_sds: false
encoder_arch: vit
encoder_frozen: true
encoder_pretrained: true
enhance_back_view: true
enhance_back_view_path: /viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/segmented_back_view_data
extra_renders:
  instance:
  - geo_normal
  - diffuse
  - gray
faces_per_pixel: 10
few_shot_category_num: -1
few_shot_class_vector_init: copy
few_shot_data_dir:
- /viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/few_shot_data_all
- /viscam/projects/articulated/dor/Animal-Data-Engine/data/data_resize_update/train_with_classes_filtered
few_shot_iteration_save: true
few_shot_iteration_save_freq: 2000
few_shot_lr: 0.0001
few_shot_optimize: exp
few_shot_optimize_bank: all
few_shot_original_classes_num: 7
few_shot_resume: true
few_shot_test_category_names:
- caracal
- impala
- ox
- squirrel
- wolf
few_shot_test_category_num: 5
few_shot_val_image_num: 5
fix_viz_batch: false
flow_loss_epochs:
- 0
- 0
flow_loss_weight: 0.0
forbid_leg_rotate: true
fov_w: 60
full_size_h: 1080
full_size_w: 1920
gamma: 1e-6
gan_tex: false
grid_scale: 7
hidden_size: 256
in_image_size: 256
init_sdf: ellipsoid
is_dry_run: false
iter_arti_reg_loss_start: 60000
iter_articulation_start: 20000
iter_attach_leg_to_body_start: 60000
iter_deformation_start: 500000
iter_leg_rotation_start: 300000
iter_nozeroy_start: 20000
jitter_grid: 0.05
kd_max:
- 1.0
- 1.0
- 1.0
- 1.0
kd_min:
- 0.0
- 0.0
- 0.0
- 0.0
keep_num_checkpoint: 1
ks_max:
- 0.0
- 0.0
- 0.0
ks_min:
- 0.0
- 0.0
- 0.0
latent_dim: 256
load_dino_cluster: false
load_dino_feature: true
log_freq_images: 501
log_freq_losses: 50
log_train_images: true
logit_loss_dino_feat_im_loss_multiplier:
  0: 50.0
  300000: 500.0
logit_loss_weight: 1.0
lookat_init:
- 0.0
- 0.0
- 0.0
lookat_zeroy: true
lr: 6.0e-05
mask_disc_loss_feat_condition: true
mask_disc_loss_weight: 0.1
mask_discriminator_iter:
- 80000
- 300000
mask_distribution_loss_freq: 1
mask_distribution_loss_weight: 0.0
mask_distribution_path: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/mask_distribution
max_arti_angle: 60
max_trans_xy_range_ratio: 0.5
max_trans_z_range_ratio: 0.5
memory_bank_init: copy
memory_bank_size: 60
memory_bank_topk: 10
memory_encoder: DINO
memory_retrieve: cos-linear
mesh_edge_length_loss_weight: 0.0
mesh_normal_consistency_loss_weight: 0.0
min_seq_len: 1
nrm_max:
- 1.0
- 1.0
- 1.0
nrm_min:
- -1.0
- -1.0
- 0.0
num_body_bones: 8
num_epochs: 1375
num_iterations: 10000000
num_layers_arti: 4
num_layers_deform: 5
num_layers_dino: 5
num_layers_light: 5
num_layers_tex: 8
num_leg_bones: 3
num_legs: 4
num_sample_frames: 1
num_workers: 8
out_image_size: 256
perturb_articulation_epochs:
- 0
- 0
perturb_normal: false
perturb_sdf: false
pose_arch: encoder_dino_patch_key
pose_entropy_loss_weight: 0.0
pose_epochs:
- 0
- 0
pose_xflip_recon_epochs:
- 0
- 0
pose_xflip_reg_loss_weight: 0.0
prior_condition_choice: mod
prior_lr: 0.0006
prior_sdf_mode: mlp
pyplot_metrics: false
random_flip_train: true
random_mask_law: random_azimuth
random_sample_train_frames: false
random_sample_val_frames: true
rank: 0
reg_body_rotate_mult: 0.1
render_dino_mode: feature_mlp
renderer_spp: 4
resume: true
resume_prior_optim: true
rgb_loss_weight: 1.0
rgb_suffix: .png
root_dir: /viscam/u/zzli
rot_all_quad_epochs:
- 0
- 276
rot_rand_quad_epochs:
- 0
- 0
rot_rep: quadlookat
rot_temp_scalar: 1.0
run_few_shot: true
run_train: true
save_checkpoint_freq: 1
save_result_freq: 501
sdf_bce_reg_loss_min_weight: 0
sdf_bce_reg_loss_weight: 0
sdf_gradient_reg_loss_min_weight: 0.1
sdf_gradient_reg_loss_weight: 0.1
sdf_inflate_reg_loss_epochs:
- 0
- 0
sdf_reg_decay_start_iter: 10000
seed: 0
seqshape_epochs:
- 0
- 0
shuffle_train_seqs: true
sigma: 1e-6
silhouette_dt_loss_weight: 0.0
silhouette_inv_dt_loss_weight: 50.0
silhouette_loss_weight: 5.0
skinning_temperature: 0.05
skip_beginning: 0
skip_end: 0
small_leg_angle: true
smooth_deformation_loss_weight: 10.0
static_root_bones: false
sym_deform: true
sym_dino: false
sym_prior_shape: true
sym_texture: true
temp_clip_high: 10.0
temp_clip_low: 1.0
tex_im_size: 256
texture_epochs:
- 0
- 276
texture_mode: mlp
train_data_dir:
  bear: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/bear_comb_dinov2_new/train
  cow: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/cow_comb_dinov2_new/train
  elephant: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/elephant_comb_dinov2_new/train
  giraffe: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/giraffe_comb_dinov2_new/train
  horse: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/horse_comb_dinov2_new/train
  sheep: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/sheep_comb_dinov2_new/train
  zebra: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/zebra_comb_dinov2_new/train
train_with_cub: false
use_logger: true
use_scheduler: false
use_wandb: false
val_data_dir:
  bear: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/bear_comb_dinov2_new/val
  cow: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/cow_comb_dinov2_new/val
  elephant: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/elephant_comb_dinov2_new/val
  giraffe: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/giraffe_comb_dinov2_new/val
  horse: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/horse_comb_dinov2_new/val
  sheep: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/sheep_comb_dinov2_new/val
  zebra: /viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new/zebra_comb_dinov2_new/val
visualize_validation: true
vit_final_layer_type: conv
which_vit: dino_vits8
world_size: 1
zflip_epochs:
- 0
- 0
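The config is a flat YAML map with a few nested blocks (e.g. `train_data_dir`, `dino_feat_im_loss_weight`), so it round-trips through a generic YAML loader. A minimal sketch, not part of this commit, assuming PyYAML is available; the repo's own config handling (e.g. `setup_runtime` in `video3d/utils/misc.py`) may differ:

import yaml

# Load the training configuration shipped alongside the checkpoint.
with open("ckpts/configs.yml") as f:
    cfgs = yaml.safe_load(f)

print(cfgs["batch_size"])                # 6
print(cfgs["articulation_epochs"])       # [2, 276]
print(cfgs["train_data_dir"]["horse"])   # path to the horse training split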
ckpts/iter0800000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8c7b090f1ff3e76e2ba608a25a2bd79af2892d6bb307132c9d038082395c1d57
size 306560367
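These three lines are a Git LFS pointer, not the checkpoint itself: the real ~306 MB file is fetched by `git lfs pull` and verified against the SHA-256 oid above. A minimal inspection sketch, assuming the fetched file is an ordinary PyTorch checkpoint (its key layout is an assumption, not documented in this commit):

import torch

# Assumes `git lfs pull` has replaced the pointer file with the real weights.
ckpt = torch.load("ckpts/iter0800000.pth", map_location="cpu")
if isinstance(ckpt, dict):
    print(sorted(ckpt.keys())[:10])  # peek at the top-level entries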
video3d/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .utils.misc import setup_runtime
from .trainer import Trainer
from .trainer_ddp import TrainerDDP
from .model import Unsup3D
from .model_ddp import Unsup3DDDP
from .trainer_few_shot import Fewshot_Trainer
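These re-exports make the trainers and models importable from the package root; a trivial sketch, assuming the repo directory is on `PYTHONPATH`:

# Top-level imports enabled by this __init__.py.
from video3d import setup_runtime, Trainer, TrainerDDP, Unsup3D, Unsup3DDDP, Fewshot_Trainer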
video3d/cages/cages.py
ADDED
@@ -0,0 +1,218 @@
# Cages code used from https://github.com/yifita/deep_cage
import torch
import numpy as np
import trimesh


def deform_with_MVC(cage, cage_deformed, cage_face, query, verbose=False):
    """
    cage (B,C,3)
    cage_deformed (B,C,3)
    cage_face (B,F,3) int64
    query (B,Q,3)
    """
    weights, weights_unnormed = mean_value_coordinates_3D(query, cage, cage_face, verbose=True)
    # weights = weights.detach()
    deformed = torch.sum(weights.unsqueeze(-1)*cage_deformed.unsqueeze(1), dim=2)
    if verbose:
        return deformed, weights, weights_unnormed
    return deformed


def loadInitCage(template):
    init_cage_V, init_cage_F = read_trimesh(template)
    init_cage_V = torch.from_numpy(init_cage_V[:,:3].astype(np.float32)).unsqueeze(0)*2.0
    init_cage_F = torch.from_numpy(init_cage_F[:,:3].astype(np.int64)).unsqueeze(0)
    return init_cage_V, init_cage_F


def read_trimesh(path):
    mesh = trimesh.load(path)
    return mesh.vertices, mesh.faces


# util functions from pytorch_points
PI = 3.1415927

def normalize_to_box(input):
    """
    normalize point cloud to unit bounding box
    center = (max - min)/2
    scale = max(abs(x))
    input: pc [N, P, dim] or [P, dim]
    output: pc, centroid, furthest_distance
    """
    if len(input.shape) == 2:
        axis = 0
        P = input.shape[0]
        D = input.shape[1]
    elif len(input.shape) == 3:
        axis = 1
        P = input.shape[1]
        D = input.shape[2]
    if isinstance(input, np.ndarray):
        maxP = np.amax(input, axis=axis, keepdims=True)
        minP = np.amin(input, axis=axis, keepdims=True)
        centroid = (maxP+minP)/2
        input = input - centroid
        furthest_distance = np.amax(np.abs(input), axis=(axis, -1), keepdims=True)
        input = input / furthest_distance
    elif isinstance(input, torch.Tensor):
        maxP = torch.max(input, dim=axis, keepdim=True)[0]
        minP = torch.min(input, dim=axis, keepdim=True)[0]
        centroid = (maxP+minP)/2
        input = input - centroid
        in_shape = list(input.shape[:axis])+[P*D]
        furthest_distance = torch.max(torch.abs(input).view(in_shape), dim=axis, keepdim=True)[0]
        furthest_distance = furthest_distance.unsqueeze(-1)
        input = input / furthest_distance

    return input, centroid, furthest_distance

def normalize(tensor, dim=-1):
    """normalize tensor in specified dimension"""
    return torch.nn.functional.normalize(tensor, p=2, dim=dim, eps=1e-12, out=None)


def check_values(tensor):
    """return true if tensor doesn't contain NaN or Inf"""
    return not (torch.any(torch.isnan(tensor)).item() or torch.any(torch.isinf(tensor)).item())


class ScatterAdd(torch.autograd.Function):
    @staticmethod
    def forward(ctx, src, idx, dim, out_size, fill=0.0):
        out = torch.full(out_size, fill, device=src.device, dtype=src.dtype)
        ctx.save_for_backward(idx)
        out.scatter_add_(dim, idx, src)
        ctx.mark_non_differentiable(idx)
        ctx.dim = dim
        return out

    @staticmethod
    def backward(ctx, ograd):
        idx, = ctx.saved_tensors
        grad = torch.gather(ograd, ctx.dim, idx)
        return grad, None, None, None, None


_scatter_add = ScatterAdd.apply


def scatter_add(src, idx, dim, out_size=None, fill=0.0):
    if out_size is None:
        out_size = list(src.size())
        dim_size = idx.max().item()+1
        out_size[dim] = dim_size
    return _scatter_add(src, idx, dim, out_size, fill)


def mean_value_coordinates_3D(query, vertices, faces, verbose=False):
    """
    Tao Ju et.al. MVC for 3D triangle meshes
    params:
        query    (B,P,3)
        vertices (B,N,3)
        faces    (B,F,3)
    return:
        wj       (B,P,N)
    """
    B, F, _ = faces.shape
    _, P, _ = query.shape
    _, N, _ = vertices.shape
    # u_i = p_i - x (B,P,N,3)
    uj = vertices.unsqueeze(1) - query.unsqueeze(2)
    # \|u_i\| (B,P,N,1)
    dj = torch.norm(uj, dim=-1, p=2, keepdim=True)
    uj = normalize(uj, dim=-1)
    # gather triangle B,P,F,3,3
    ui = torch.gather(uj.unsqueeze(2).expand(-1,-1,F,-1,-1),
                      3,
                      faces.unsqueeze(1).unsqueeze(-1).expand(-1,P,-1,-1,3))
    # li = \|u_{i+1}-u_{i-1}\| (B,P,F,3)
    li = torch.norm(ui[:,:,:,[1, 2, 0],:] - ui[:, :, :,[2, 0, 1],:], dim=-1, p=2)
    eps = 2e-5
    li = torch.where(li>=2, li-(li.detach()-(2-eps)), li)
    li = torch.where(li<=-2, li-(li.detach()+(2-eps)), li)
    # asin(x) is inf at +/-1
    # θi = 2arcsin[li/2] (B,P,F,3)
    theta_i = 2*torch.asin(li/2)
    assert(check_values(theta_i))
    # B,P,F,1
    h = torch.sum(theta_i, dim=-1, keepdim=True)/2
    # wi ← sin[θi]d{i−1}d{i+1}
    # (B,P,F,3) ci ← (2sin[h]sin[h−θi])/(sin[θ_{i+1}]sin[θ_{i−1}])−1
    ci = 2*torch.sin(h)*torch.sin(h-theta_i)/(torch.sin(theta_i[:,:,:,[1, 2, 0]])*torch.sin(theta_i[:,:,:,[2, 0, 1]]))-1

    # NOTE: because of floating point ci can be slightly larger than 1, causing problem with sqrt(1-ci^2)
    # NOTE: sqrt(x)' is nan for x=0, hence use eps
    eps = 1e-5
    ci = torch.where(ci>=1, ci-(ci.detach()-(1-eps)), ci)
    ci = torch.where(ci<=-1, ci-(ci.detach()+(1-eps)), ci)
    # si ← sign[det[u1,u2,u3]]sqrt(1-ci^2)
    # (B,P,F)*(B,P,F,3)
    si = torch.sign(torch.det(ui)).unsqueeze(-1)*torch.sqrt(1-ci**2)  # sqrt gradient nan for 0
    assert(check_values(si))
    # (B,P,F,3)
    di = torch.gather(dj.unsqueeze(2).squeeze(-1).expand(-1,-1,F,-1), 3,
                      faces.unsqueeze(1).expand(-1,P,-1,-1))
    assert(check_values(di))
    # if si.requires_grad:
    #     vertices.register_hook(save_grad("mvc/dv"))
    #     li.register_hook(save_grad("mvc/dli"))
    #     theta_i.register_hook(save_grad("mvc/dtheta"))
    #     ci.register_hook(save_grad("mvc/dci"))
    #     si.register_hook(save_grad("mvc/dsi"))
    #     di.register_hook(save_grad("mvc/ddi"))

    # wi ← (θi −c[i+1]θ[i−1] −c[i−1]θ[i+1])/(disin[θi+1]s[i−1])
    # B,P,F,3
    # CHECK is there a 2* in the denominator
    wi = (theta_i-ci[:,:,:,[1,2,0]]*theta_i[:,:,:,[2,0,1]]-ci[:,:,:,[2,0,1]]*theta_i[:,:,:,[1,2,0]])/(di*torch.sin(theta_i[:,:,:,[1,2,0]])*si[:,:,:,[2,0,1]])
    # if ∃i,|si| ≤ ε, set wi to 0. coplaner with T but outside
    # ignore coplaner outside triangle
    # alternative check
    # (B,F,3,3)
    # triangle_points = torch.gather(vertices.unsqueeze(1).expand(-1,F,-1,-1), 2, faces.unsqueeze(-1).expand(-1,-1,-1,3))
    # # (B,P,F,3), (B,1,F,3) -> (B,P,F,1)
    # determinant = dot_product(triangle_points[:,:,:,0].unsqueeze(1)-query.unsqueeze(2),
    #                           torch.cross(triangle_points[:,:,:,1]-triangle_points[:,:,:,0],
    #                                       triangle_points[:,:,:,2]-triangle_points[:,:,:,0], dim=-1).unsqueeze(1), dim=-1, keepdim=True).detach()
    # # (B,P,F,1)
    # sqrdist = determinant*determinant / (4 * sqrNorm(torch.cross(triangle_points[:,:,:,1]-triangle_points[:,:,:,0], triangle_points[:,:,:,2]-triangle_points[:,:,:,0], dim=-1), keepdim=True))

    wi = torch.where(torch.any(torch.abs(si) <= 1e-5, keepdim=True, dim=-1), torch.zeros_like(wi), wi)
    # wi = torch.where(sqrdist <= 1e-5, torch.zeros_like(wi), wi)

    # if π −h < ε, x lies on t, use 2D barycentric coordinates
    # inside triangle
    inside_triangle = (PI-h).squeeze(-1)<1e-4
    # set all F for this P to zero
    wi = torch.where(torch.any(inside_triangle, dim=-1, keepdim=True).unsqueeze(-1), torch.zeros_like(wi), wi)
    # CHECK is it di https://www.cse.wustl.edu/~taoju/research/meanvalue.pdf or li http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.516.1856&rep=rep1&type=pdf
    wi = torch.where(inside_triangle.unsqueeze(-1).expand(-1,-1,-1,wi.shape[-1]), torch.sin(theta_i)*di[:,:,:,[2,0,1]]*di[:,:,:,[1,2,0]], wi)

    # sum over all faces face -> vertex (B,P,F*3) -> (B,P,N)
    wj = scatter_add(wi.reshape(B,P,-1).contiguous(), faces.unsqueeze(1).expand(-1,P,-1,-1).reshape(B,P,-1), 2, out_size=(B,P,N))

    # close to vertex (B,P,N)
    close_to_point = dj.squeeze(-1) < 1e-8
    # set all F for this P to zero
    wj = torch.where(torch.any(close_to_point, dim=-1, keepdim=True), torch.zeros_like(wj), wj)
    wj = torch.where(close_to_point, torch.ones_like(wj), wj)

    # (B,P,1)
    sumWj = torch.sum(wj, dim=-1, keepdim=True)
    sumWj = torch.where(sumWj==0, torch.ones_like(sumWj), sumWj)

    wj_normalised = wj / sumWj
    # if wj.requires_grad:
    #     saved_variables["mvc/wi"] = wi
    #     wi.register_hook(save_grad("mvc/dwi"))
    #     wj.register_hook(save_grad("mvc/dwj"))
    if verbose:
        return wj_normalised, wi
    else:
        return wj_normalised
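A small self-contained sketch of driving `deform_with_MVC`, added here for illustration; the tetrahedral cage and random perturbation are hypothetical test data, not from the repo:

import torch

# Hypothetical test data: one tetrahedral cage with 4 vertices and 4 faces.
cage = torch.tensor([[[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]])  # (B=1, C=4, 3)
cage_face = torch.tensor([[[0, 2, 1], [0, 1, 3], [0, 3, 2], [1, 2, 3]]])         # (1, F=4, 3) int64
cage_deformed = cage + 0.05 * torch.randn_like(cage)   # slightly perturbed cage
query = 0.1 + 0.1 * torch.rand(1, 16, 3)               # points strictly inside the cage

deformed = deform_with_MVC(cage, cage_deformed, cage_face, query)
print(deformed.shape)  # torch.Size([1, 16, 3])

Because the mean-value weights form a partition of unity, an identity deformation (`cage_deformed = cage`) returns the query points unchanged up to floating-point error.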
video3d/cub_dataloaders.py
ADDED
@@ -0,0 +1,404 @@
import os.path as osp
import cv2
import numpy as np
import scipy.io as sio
import torch
from PIL import Image
from torch.utils.data import Dataset
from types import SimpleNamespace


def get_cub_loader(data_dir, split='test', is_validation=False, batch_size=256, num_workers=4, image_size=256):
    opts = SimpleNamespace()
    opts.data_dir = data_dir
    opts.padding_frac = 0.05
    opts.jitter_frac = 0.05
    opts.input_size = image_size
    opts.split = split

    dataset = CUBDataset(opts)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=not is_validation,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader


class CUBDataset(Dataset):
    def __init__(self, opts):
        super().__init__()

        self.opts = opts
        self.img_size = opts.input_size
        self.jitter_frac = opts.jitter_frac
        self.padding_frac = opts.padding_frac
        self.split = opts.split
        self.data_dir = opts.data_dir
        self.data_cache_dir = osp.join(self.data_dir, 'cachedir/cub')
        self.img_dir = osp.join(self.data_dir, 'images')

        self.anno_path = osp.join(self.data_cache_dir, 'data', '%s_cub_cleaned.mat' % self.split)
        self.anno_sfm_path = osp.join(self.data_cache_dir, 'sfm', 'anno_%s.mat' % self.split)

        if not osp.exists(self.anno_path):
            print('%s doesnt exist!' % self.anno_path)
            import pdb; pdb.set_trace()

        # Load the annotation file.
        print('loading %s' % self.anno_path)
        self.anno = sio.loadmat(
            self.anno_path, struct_as_record=False, squeeze_me=True)['images']
        self.anno_sfm = sio.loadmat(
            self.anno_sfm_path, struct_as_record=False, squeeze_me=True)['sfm_anno']

        self.kp_perm = np.array([1, 2, 3, 4, 5, 6, 11, 12, 13, 10, 7, 8, 9, 14, 15]) - 1

        self.num_imgs = len(self.anno)
        print('%d images' % self.num_imgs)

    def forward_img(self, index):
        data = self.anno[index]
        data_sfm = self.anno_sfm[0]

        # sfm_pose = (sfm_c, sfm_t, sfm_r)
        sfm_pose = [np.copy(data_sfm.scale), np.copy(data_sfm.trans), np.copy(data_sfm.rot)]

        sfm_rot = np.pad(sfm_pose[2], (0,1), 'constant')
        sfm_rot[3, 3] = 1
        sfm_pose[2] = quaternion_from_matrix(sfm_rot, isprecise=True)

        img_path = osp.join(self.img_dir, str(data.rel_path))
        #img_path = img_path.replace("JPEG", "jpg")
        img = np.array(Image.open(img_path))

        # Some are grayscale:
        if len(img.shape) == 2:
            img = np.repeat(np.expand_dims(img, 2), 3, axis=2)
        mask = data.mask
        mask = np.expand_dims(mask, 2)
        h,w,_ = mask.shape

        # Adjust to 0 indexing
        bbox = np.array(
            [data.bbox.x1, data.bbox.y1, data.bbox.x2, data.bbox.y2],
            float) - 1

        parts = data.parts.T.astype(float)
        kp = np.copy(parts)
        vis = kp[:, 2] > 0
        kp[vis, :2] -= 1

        # Peturb bbox
        if self.split == 'train':
            bbox = peturb_bbox(
                bbox, pf=self.padding_frac, jf=self.jitter_frac)
        else:
            bbox = peturb_bbox(
                bbox, pf=self.padding_frac, jf=0)
        bbox = square_bbox(bbox)

        # crop image around bbox, translate kps
        img, mask, kp, sfm_pose = self.crop_image(img, mask, bbox, kp, vis, sfm_pose)

        # scale image, and mask. And scale kps.
        img, mask, kp, sfm_pose = self.scale_image(img, mask, kp, vis, sfm_pose)

        # Mirror image on random.
        if self.split == 'train':
            img, mask, kp, sfm_pose = self.mirror_image(img, mask, kp, sfm_pose)

        # Normalize kp to be [-1, 1]
        img_h, img_w = img.shape[:2]
        kp_norm, sfm_pose = self.normalize_kp(kp, sfm_pose, img_h, img_w)

        # img = Image.fromarray(np.asarray(img, np.uint8))
        mask = np.asarray(mask, np.float32)
        return img, kp_norm, mask, sfm_pose, img_path

    def normalize_kp(self, kp, sfm_pose, img_h, img_w):
        vis = kp[:, 2, None] > 0
        new_kp = np.stack([2 * (kp[:, 0] / img_w) - 1,
                           2 * (kp[:, 1] / img_h) - 1,
                           kp[:, 2]]).T
        sfm_pose[0] *= (1.0/img_w + 1.0/img_h)
        sfm_pose[1][0] = 2.0 * (sfm_pose[1][0] / img_w) - 1
        sfm_pose[1][1] = 2.0 * (sfm_pose[1][1] / img_h) - 1
        new_kp = vis * new_kp

        return new_kp, sfm_pose

    def crop_image(self, img, mask, bbox, kp, vis, sfm_pose):
        # crop image and mask and translate kps
        img = crop(img, bbox, bgval=1)
        mask = crop(mask, bbox, bgval=0)
        kp[vis, 0] -= bbox[0]
        kp[vis, 1] -= bbox[1]
        sfm_pose[1][0] -= bbox[0]
        sfm_pose[1][1] -= bbox[1]
        return img, mask, kp, sfm_pose

    def scale_image(self, img, mask, kp, vis, sfm_pose):
        # Scale image so largest bbox size is img_size
        bwidth = np.shape(img)[0]
        bheight = np.shape(img)[1]
        scale = self.img_size / float(max(bwidth, bheight))
        img_scale, _ = resize_img(img, scale)
        # if img_scale.shape[0] != self.img_size:
        #     print('bad!')
        #     import ipdb; ipdb.set_trace()
        # mask_scale, _ = resize_img(mask, scale)
        # mask_scale, _ = resize_img(mask, scale, interpolation=cv2.INTER_NEAREST)
        mask_scale, _ = resize_img(mask, scale)
        kp[vis, :2] *= scale
        sfm_pose[0] *= scale
        sfm_pose[1] *= scale

        return img_scale, mask_scale, kp, sfm_pose

    def mirror_image(self, img, mask, kp, sfm_pose):
        kp_perm = self.kp_perm
        if np.random.rand(1) > 0.5:
            # Need copy bc torch collate doesnt like neg strides
            img_flip = img[:, ::-1, :].copy()
            mask_flip = mask[:, ::-1].copy()

            # Flip kps.
            new_x = img.shape[1] - kp[:, 0] - 1
            kp_flip = np.hstack((new_x[:, None], kp[:, 1:]))
            kp_flip = kp_flip[kp_perm, :]
            # Flip sfm_pose Rot.
            R = quaternion_matrix(sfm_pose[2])
            flip_R = np.diag([-1, 1, 1, 1]).dot(R.dot(np.diag([-1, 1, 1, 1])))
            sfm_pose[2] = quaternion_from_matrix(flip_R, isprecise=True)
            # Flip tx
            tx = img.shape[1] - sfm_pose[1][0] - 1
            sfm_pose[1][0] = tx
            return img_flip, mask_flip, kp_flip, sfm_pose
        else:
            return img, mask, kp, sfm_pose

    def __len__(self):
        return self.num_imgs

    def __getitem__(self, index):
        img, kp, mask, sfm_pose, img_path = self.forward_img(index)
        sfm_pose[0].shape = 1
        mask = np.expand_dims(mask, 2)

        images = torch.FloatTensor(img /255.).permute(2,0,1).unsqueeze(0)
        masks = torch.FloatTensor(mask).permute(2,0,1).repeat(1,3,1,1)
        mask_dt = compute_distance_transform(masks)
        # flows = torch.zeros(1,2, self.img_size, self.img_size)
        flows = torch.zeros(1)
        bboxs = torch.FloatTensor([0, 0, 0, self.img_size, self.img_size, 1, 1, 0]).unsqueeze(0)  # frame_id, crop_x0, crop_y0, crop_w, crop_h, resize_sx, resize_sy, sharpness
        bg_image = images[0]
        seq_idx = torch.LongTensor([index])
        frame_idx = torch.LongTensor([0])
        return images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx


def compute_distance_transform(mask):
    mask_dt = []
    for m in mask:
        dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        inv_dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(1 - m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        mask_dt += [torch.stack([dt, inv_dt], 0)]
    return torch.stack(mask_dt, 0)  # Bx2xHxW


def resize_img(img, scale_factor):
    new_size = (np.round(np.array(img.shape[:2]) * scale_factor)).astype(int)
    new_img = cv2.resize(img, (new_size[1], new_size[0]))
    # This is scale factor of [height, width] i.e. [y, x]
    actual_factor = [new_size[0] / float(img.shape[0]),
                     new_size[1] / float(img.shape[1])]
    return new_img, actual_factor


def peturb_bbox(bbox, pf=0, jf=0):
    '''
    Jitters and pads the input bbox.
    Args:
        bbox: Zero-indexed tight bbox.
        pf: padding fraction.
        jf: jittering fraction.
    Returns:
        pet_bbox: Jittered and padded box. Might have -ve or out-of-image coordinates
    '''
    pet_bbox = [coord for coord in bbox]
    bwidth = bbox[2] - bbox[0] + 1
    bheight = bbox[3] - bbox[1] + 1

    pet_bbox[0] -= (pf*bwidth) + (1-2*np.random.random())*jf*bwidth
    pet_bbox[1] -= (pf*bheight) + (1-2*np.random.random())*jf*bheight
    pet_bbox[2] += (pf*bwidth) + (1-2*np.random.random())*jf*bwidth
    pet_bbox[3] += (pf*bheight) + (1-2*np.random.random())*jf*bheight

    return pet_bbox


def square_bbox(bbox):
    '''
    Converts a bbox to have a square shape by increasing size along non-max dimension.
    '''
    sq_bbox = [int(round(coord)) for coord in bbox]
    bwidth = sq_bbox[2] - sq_bbox[0] + 1
    bheight = sq_bbox[3] - sq_bbox[1] + 1
    maxdim = float(max(bwidth, bheight))

    dw_b_2 = int(round((maxdim-bwidth)/2.0))
    dh_b_2 = int(round((maxdim-bheight)/2.0))

    sq_bbox[0] -= dw_b_2
    sq_bbox[1] -= dh_b_2
    sq_bbox[2] = sq_bbox[0] + maxdim - 1
    sq_bbox[3] = sq_bbox[1] + maxdim - 1

    return sq_bbox


def crop(img, bbox, bgval=0):
    '''
    Crops a region from the image corresponding to the bbox.
    If some regions specified go outside the image boundaries, the pixel values are set to bgval.
    Args:
        img: image to crop
        bbox: bounding box to crop
        bgval: default background for regions outside image
    '''
    bbox = [int(round(c)) for c in bbox]
    bwidth = bbox[2] - bbox[0] + 1
    bheight = bbox[3] - bbox[1] + 1

    im_shape = np.shape(img)
    im_h, im_w = im_shape[0], im_shape[1]

    nc = 1 if len(im_shape) < 3 else im_shape[2]

    img_out = np.ones((bheight, bwidth, nc))*bgval
    x_min_src = max(0, bbox[0])
    x_max_src = min(im_w, bbox[2]+1)
    y_min_src = max(0, bbox[1])
    y_max_src = min(im_h, bbox[3]+1)

    x_min_trg = x_min_src - bbox[0]
    x_max_trg = x_max_src - x_min_src + x_min_trg
    y_min_trg = y_min_src - bbox[1]
    y_max_trg = y_max_src - y_min_src + y_min_trg

    img_out[y_min_trg:y_max_trg, x_min_trg:x_max_trg, :] = img[y_min_src:y_max_src, x_min_src:x_max_src, :]
    return img_out


# https://github.com/akanazawa/cmr/blob/master/utils/transformations.py
import math
import numpy
_EPS = numpy.finfo(float).eps * 4.0

def quaternion_matrix(quaternion):
    """Return homogeneous rotation matrix from quaternion.
    >>> M = quaternion_matrix([0.99810947, 0.06146124, 0, 0])
    >>> numpy.allclose(M, rotation_matrix(0.123, [1, 0, 0]))
    True
    >>> M = quaternion_matrix([1, 0, 0, 0])
    >>> numpy.allclose(M, numpy.identity(4))
    True
    >>> M = quaternion_matrix([0, 1, 0, 0])
    >>> numpy.allclose(M, numpy.diag([1, -1, -1, 1]))
    True
    """
    q = numpy.array(quaternion, dtype=numpy.float64, copy=True)
    n = numpy.dot(q, q)
    if n < _EPS:
        return numpy.identity(4)
    q *= math.sqrt(2.0 / n)
    q = numpy.outer(q, q)
    return numpy.array([
        [1.0-q[2, 2]-q[3, 3],     q[1, 2]-q[3, 0],     q[1, 3]+q[2, 0], 0.0],
        [    q[1, 2]+q[3, 0], 1.0-q[1, 1]-q[3, 3],     q[2, 3]-q[1, 0], 0.0],
        [    q[1, 3]-q[2, 0],     q[2, 3]+q[1, 0], 1.0-q[1, 1]-q[2, 2], 0.0],
        [                0.0,                 0.0,                 0.0, 1.0]])

def quaternion_from_matrix(matrix, isprecise=False):
    """Return quaternion from rotation matrix.
    If isprecise is True, the input matrix is assumed to be a precise rotation
    matrix and a faster algorithm is used.
    >>> q = quaternion_from_matrix(numpy.identity(4), True)
    >>> numpy.allclose(q, [1, 0, 0, 0])
    True
    >>> q = quaternion_from_matrix(numpy.diag([1, -1, -1, 1]))
    >>> numpy.allclose(q, [0, 1, 0, 0]) or numpy.allclose(q, [0, -1, 0, 0])
    True
    >>> R = rotation_matrix(0.123, (1, 2, 3))
    >>> q = quaternion_from_matrix(R, True)
    >>> numpy.allclose(q, [0.9981095, 0.0164262, 0.0328524, 0.0492786])
    True
    >>> R = [[-0.545, 0.797, 0.260, 0], [0.733, 0.603, -0.313, 0],
    ...      [-0.407, 0.021, -0.913, 0], [0, 0, 0, 1]]
    >>> q = quaternion_from_matrix(R)
    >>> numpy.allclose(q, [0.19069, 0.43736, 0.87485, -0.083611])
    True
    >>> R = [[0.395, 0.362, 0.843, 0], [-0.626, 0.796, -0.056, 0],
    ...      [-0.677, -0.498, 0.529, 0], [0, 0, 0, 1]]
    >>> q = quaternion_from_matrix(R)
    >>> numpy.allclose(q, [0.82336615, -0.13610694, 0.46344705, -0.29792603])
    True
    >>> R = random_rotation_matrix()
    >>> q = quaternion_from_matrix(R)
    >>> is_same_transform(R, quaternion_matrix(q))
    True
    >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False),
    ...                    quaternion_from_matrix(R, isprecise=True))
    True
    >>> R = euler_matrix(0.0, 0.0, numpy.pi/2.0)
    >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False),
    ...                    quaternion_from_matrix(R, isprecise=True))
    True
    """
    M = numpy.array(matrix, dtype=numpy.float64, copy=False)[:4, :4]
    if isprecise:
        q = numpy.empty((4, ))
        t = numpy.trace(M)
        if t > M[3, 3]:
            q[0] = t
            q[3] = M[1, 0] - M[0, 1]
            q[2] = M[0, 2] - M[2, 0]
            q[1] = M[2, 1] - M[1, 2]
        else:
            i, j, k = 0, 1, 2
            if M[1, 1] > M[0, 0]:
                i, j, k = 1, 2, 0
            if M[2, 2] > M[i, i]:
                i, j, k = 2, 0, 1
            t = M[i, i] - (M[j, j] + M[k, k]) + M[3, 3]
            q[i] = t
            q[j] = M[i, j] + M[j, i]
            q[k] = M[k, i] + M[i, k]
            q[3] = M[k, j] - M[j, k]
            q = q[[3, 0, 1, 2]]
        q *= 0.5 / math.sqrt(t * M[3, 3])
    else:
        m00 = M[0, 0]
        m01 = M[0, 1]
        m02 = M[0, 2]
        m10 = M[1, 0]
        m11 = M[1, 1]
        m12 = M[1, 2]
        m20 = M[2, 0]
        m21 = M[2, 1]
        m22 = M[2, 2]
        # symmetric matrix K
        K = numpy.array([[m00-m11-m22, 0.0, 0.0, 0.0],
                         [m01+m10, m11-m00-m22, 0.0, 0.0],
                         [m02+m20, m12+m21, m22-m00-m11, 0.0],
                         [m21-m12, m02-m20, m10-m01, m00+m11+m22]])
        K /= 3.0
        # quaternion is eigenvector of K that corresponds to largest eigenvalue
        w, V = numpy.linalg.eigh(K)
        q = V[[3, 0, 1, 2], numpy.argmax(w)]
        if q[0] < 0.0:
            numpy.negative(q, q)
    return q
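A minimal usage sketch for this loader, added for illustration; the dataset path is a placeholder, and it assumes the CUB images plus the `cachedir/cub` annotation `.mat` files referenced in `__init__` are in place:

# Placeholder path; requires CUB_200_2011 with the cachedir annotations.
loader = get_cub_loader("/path/to/CUB_200_2011", split="train",
                        batch_size=16, num_workers=4, image_size=256)
images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx = next(iter(loader))
print(images.shape)  # e.g. torch.Size([16, 1, 3, 256, 256])

Each sample mimics the video interface of the other dataloaders: a singleton frame dimension, a precomputed distance transform of the mask, and dummy flow.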
video3d/cub_dataloaders_ddp.py
ADDED
@@ -0,0 +1,434 @@
1 |
+
import os.path as osp
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
import scipy.io as sio
|
5 |
+
import torch
|
6 |
+
from PIL import Image
|
7 |
+
from torch.utils.data import Dataset
|
8 |
+
from types import SimpleNamespace
|
9 |
+
|
10 |
+
|
11 |
+
def get_cub_loader(data_dir, split='test', is_validation=False, batch_size=256, num_workers=4, image_size=256):
|
12 |
+
opts = SimpleNamespace()
|
13 |
+
opts.data_dir = data_dir
|
14 |
+
opts.padding_frac = 0.05
|
15 |
+
opts.jitter_frac = 0.05
|
16 |
+
opts.input_size = image_size
|
17 |
+
opts.split = split
|
18 |
+
|
19 |
+
dataset = CUBDataset(opts)
|
20 |
+
loader = torch.utils.data.DataLoader(
|
21 |
+
dataset,
|
22 |
+
batch_size=batch_size,
|
23 |
+
shuffle=not is_validation,
|
24 |
+
num_workers=num_workers,
|
25 |
+
pin_memory=True
|
26 |
+
)
|
27 |
+
return loader
|
28 |
+
|
29 |
+
|
30 |
+
def get_cub_loader_ddp(data_dir, world_size, rank, split='test', is_validation=False, batch_size=256, num_workers=4, image_size=256):
|
31 |
+
opts = SimpleNamespace()
|
32 |
+
opts.data_dir = data_dir
|
33 |
+
opts.padding_frac = 0.05
|
34 |
+
opts.jitter_frac = 0.05
|
35 |
+
opts.input_size = image_size
|
36 |
+
opts.split = split
|
37 |
+
|
38 |
+
dataset = CUBDataset(opts)
|
39 |
+
|
40 |
+
sampler = torch.utils.data.distributed.DistributedSampler(
|
41 |
+
dataset,
|
42 |
+
num_replicas=world_size,
|
43 |
+
rank=rank,
|
44 |
+
)
|
45 |
+
|
46 |
+
loader = torch.utils.data.DataLoader(
|
47 |
+
dataset,
|
48 |
+
sampler=sampler,
|
49 |
+
batch_size=batch_size,
|
50 |
+
shuffle=not is_validation,
|
51 |
+
drop_last=True,
|
52 |
+
num_workers=num_workers,
|
53 |
+
pin_memory=True
|
54 |
+
)
|
55 |
+
return loader
|
56 |
+
|
57 |
+
|
58 |
+
class CUBDataset(Dataset):
|
59 |
+
def __init__(self, opts):
|
60 |
+
super().__init__()
|
61 |
+
|
62 |
+
self.opts = opts
|
63 |
+
self.img_size = opts.input_size
|
64 |
+
self.jitter_frac = opts.jitter_frac
|
65 |
+
self.padding_frac = opts.padding_frac
|
66 |
+
self.split = opts.split
|
67 |
+
self.data_dir = opts.data_dir
|
68 |
+
self.data_cache_dir = osp.join(self.data_dir, 'cachedir/cub')
|
69 |
+
self.img_dir = osp.join(self.data_dir, 'images')
|
70 |
+
|
71 |
+
self.anno_path = osp.join(self.data_cache_dir, 'data', '%s_cub_cleaned.mat' % self.split)
|
72 |
+
self.anno_sfm_path = osp.join(self.data_cache_dir, 'sfm', 'anno_%s.mat' % self.split)
|
73 |
+
|
74 |
+
if not osp.exists(self.anno_path):
|
75 |
+
print('%s doesnt exist!' % self.anno_path)
|
76 |
+
import pdb; pdb.set_trace()
|
77 |
+
|
78 |
+
# Load the annotation file.
|
79 |
+
print('loading %s' % self.anno_path)
|
80 |
+
self.anno = sio.loadmat(
|
81 |
+
self.anno_path, struct_as_record=False, squeeze_me=True)['images']
|
82 |
+
self.anno_sfm = sio.loadmat(
|
83 |
+
self.anno_sfm_path, struct_as_record=False, squeeze_me=True)['sfm_anno']
|
84 |
+
|
85 |
+
self.kp_perm = np.array([1, 2, 3, 4, 5, 6, 11, 12, 13, 10, 7, 8, 9, 14, 15]) - 1;
|
86 |
+
|
87 |
+
self.num_imgs = len(self.anno)
|
88 |
+
print('%d images' % self.num_imgs)
|
89 |
+
|
90 |
+
def forward_img(self, index):
|
91 |
+
data = self.anno[index]
|
92 |
+
data_sfm = self.anno_sfm[0]
|
93 |
+
|
94 |
+
# sfm_pose = (sfm_c, sfm_t, sfm_r)
|
95 |
+
sfm_pose = [np.copy(data_sfm.scale), np.copy(data_sfm.trans), np.copy(data_sfm.rot)]
|
96 |
+
|
97 |
+
sfm_rot = np.pad(sfm_pose[2], (0,1), 'constant')
|
98 |
+
sfm_rot[3, 3] = 1
|
99 |
+
sfm_pose[2] = quaternion_from_matrix(sfm_rot, isprecise=True)
|
100 |
+
|
101 |
+
img_path = osp.join(self.img_dir, str(data.rel_path))
|
102 |
+
#img_path = img_path.replace("JPEG", "jpg")
|
103 |
+
img = np.array(Image.open(img_path))
|
104 |
+
|
105 |
+
# Some are grayscale:
|
106 |
+
if len(img.shape) == 2:
|
107 |
+
img = np.repeat(np.expand_dims(img, 2), 3, axis=2)
|
108 |
+
mask = data.mask
|
109 |
+
mask = np.expand_dims(mask, 2)
|
110 |
+
h,w,_ = mask.shape
|
111 |
+
|
112 |
+
# Adjust to 0 indexing
|
113 |
+
bbox = np.array(
|
114 |
+
[data.bbox.x1, data.bbox.y1, data.bbox.x2, data.bbox.y2],
|
115 |
+
float) - 1
|
116 |
+
|
117 |
+
parts = data.parts.T.astype(float)
|
118 |
+
kp = np.copy(parts)
|
119 |
+
vis = kp[:, 2] > 0
|
120 |
+
kp[vis, :2] -= 1
|
121 |
+
|
122 |
+
# Peturb bbox
|
123 |
+
if self.split == 'train':
|
124 |
+
bbox = peturb_bbox(
|
125 |
+
bbox, pf=self.padding_frac, jf=self.jitter_frac)
|
126 |
+
else:
|
127 |
+
bbox = peturb_bbox(
|
128 |
+
bbox, pf=self.padding_frac, jf=0)
|
129 |
+
bbox = square_bbox(bbox)
|
130 |
+
|
131 |
+
# crop image around bbox, translate kps
|
132 |
+
img, mask, kp, sfm_pose = self.crop_image(img, mask, bbox, kp, vis, sfm_pose)
|
133 |
+
|
134 |
+
# scale image, and mask. And scale kps.
|
135 |
+
img, mask, kp, sfm_pose = self.scale_image(img, mask, kp, vis, sfm_pose)
|
136 |
+
|
137 |
+
# Mirror image on random.
|
138 |
+
if self.split == 'train':
|
139 |
+
img, mask, kp, sfm_pose = self.mirror_image(img, mask, kp, sfm_pose)
|
140 |
+
|
141 |
+
# Normalize kp to be [-1, 1]
|
142 |
+
img_h, img_w = img.shape[:2]
|
143 |
+
kp_norm, sfm_pose = self.normalize_kp(kp, sfm_pose, img_h, img_w)
|
144 |
+
|
145 |
+
# img = Image.fromarray(np.asarray(img, np.uint8))
|
146 |
+
mask = np.asarray(mask, np.float32)
|
147 |
+
return img, kp_norm, mask, sfm_pose, img_path
|
148 |
+
|
149 |
+
    def normalize_kp(self, kp, sfm_pose, img_h, img_w):
        vis = kp[:, 2, None] > 0
        new_kp = np.stack([2 * (kp[:, 0] / img_w) - 1,
                           2 * (kp[:, 1] / img_h) - 1,
                           kp[:, 2]]).T
        sfm_pose[0] *= (1.0 / img_w + 1.0 / img_h)
        sfm_pose[1][0] = 2.0 * (sfm_pose[1][0] / img_w) - 1
        sfm_pose[1][1] = 2.0 * (sfm_pose[1][1] / img_h) - 1
        new_kp = vis * new_kp

        return new_kp, sfm_pose
    def crop_image(self, img, mask, bbox, kp, vis, sfm_pose):
        # crop image and mask and translate kps
        img = crop(img, bbox, bgval=1)
        mask = crop(mask, bbox, bgval=0)
        kp[vis, 0] -= bbox[0]
        kp[vis, 1] -= bbox[1]
        sfm_pose[1][0] -= bbox[0]
        sfm_pose[1][1] -= bbox[1]
        return img, mask, kp, sfm_pose
    def scale_image(self, img, mask, kp, vis, sfm_pose):
        # Scale image so largest bbox size is img_size
        bwidth = np.shape(img)[0]
        bheight = np.shape(img)[1]
        scale = self.img_size / float(max(bwidth, bheight))
        img_scale, _ = resize_img(img, scale)
        # if img_scale.shape[0] != self.img_size:
        #     print('bad!')
        #     import ipdb; ipdb.set_trace()
        # mask_scale, _ = resize_img(mask, scale)
        # mask_scale, _ = resize_img(mask, scale, interpolation=cv2.INTER_NEAREST)
        mask_scale, _ = resize_img(mask, scale)
        kp[vis, :2] *= scale
        sfm_pose[0] *= scale
        sfm_pose[1] *= scale

        return img_scale, mask_scale, kp, sfm_pose
    def mirror_image(self, img, mask, kp, sfm_pose):
        kp_perm = self.kp_perm
        if np.random.rand(1) > 0.5:
            # Need copy because torch collate doesn't like negative strides
            img_flip = img[:, ::-1, :].copy()
            mask_flip = mask[:, ::-1].copy()

            # Flip kps.
            new_x = img.shape[1] - kp[:, 0] - 1
            kp_flip = np.hstack((new_x[:, None], kp[:, 1:]))
            kp_flip = kp_flip[kp_perm, :]
            # Flip sfm_pose rot.
            R = quaternion_matrix(sfm_pose[2])
            flip_R = np.diag([-1, 1, 1, 1]).dot(R.dot(np.diag([-1, 1, 1, 1])))
            sfm_pose[2] = quaternion_from_matrix(flip_R, isprecise=True)
            # Flip tx
            tx = img.shape[1] - sfm_pose[1][0] - 1
            sfm_pose[1][0] = tx
            return img_flip, mask_flip, kp_flip, sfm_pose
        else:
            return img, mask, kp, sfm_pose
    def __len__(self):
        return self.num_imgs

    def __getitem__(self, index):
        img, kp, mask, sfm_pose, img_path = self.forward_img(index)
        sfm_pose[0].shape = 1
        mask = np.expand_dims(mask, 2)

        images = torch.FloatTensor(img / 255.).permute(2, 0, 1).unsqueeze(0)
        masks = torch.FloatTensor(mask).permute(2, 0, 1).repeat(1, 3, 1, 1)
        mask_dt = compute_distance_transform(masks)
        # flows = torch.zeros(1, 2, self.img_size, self.img_size)
        flows = torch.zeros(1)
        bboxs = torch.FloatTensor([0, 0, 0, self.img_size, self.img_size, 1, 1, 0]).unsqueeze(0)  # frame_id, crop_x0, crop_y0, crop_w, crop_h, resize_sx, resize_sy, sharpness
        bg_image = images[0]
        seq_idx = torch.LongTensor([index])
        frame_idx = torch.LongTensor([0])
        return images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx
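As a quick orientation for the tuple layout above, a minimal consumption sketch (not part of the commit); `CUBDataset` and `opts` are hypothetical stand-ins for the class these methods belong to and its config object:

# Hypothetical usage of the dataset defined above (names are placeholders).
dataset = CUBDataset(opts)
images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx = dataset[0]
print(images.shape)   # 1x3xHxW, RGB in [0, 1]
print(mask_dt.shape)  # 1x2xHxW: distance transforms inside/outside the mask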
def compute_distance_transform(mask):
    mask_dt = []
    for m in mask:
        dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        inv_dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(1 - m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        mask_dt += [torch.stack([dt, inv_dt], 0)]
    return torch.stack(mask_dt, 0)  # Bx2xHxW
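A minimal usage sketch (not part of the commit): each mask yields two L2 distance maps, one measured inside the foreground and one inside the background.

import numpy as np
import torch

# Hypothetical toy input: one 1-channel 8x8 binary mask with a centered square.
mask = torch.zeros(1, 1, 8, 8)
mask[0, 0, 2:6, 2:6] = 1.0
dt = compute_distance_transform(mask)
print(dt.shape)  # torch.Size([1, 2, 8, 8])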
def resize_img(img, scale_factor):
    new_size = (np.round(np.array(img.shape[:2]) * scale_factor)).astype(int)
    new_img = cv2.resize(img, (new_size[1], new_size[0]))
    # This is scale factor of [height, width] i.e. [y, x]
    actual_factor = [new_size[0] / float(img.shape[0]),
                     new_size[1] / float(img.shape[1])]
    return new_img, actual_factor
def peturb_bbox(bbox, pf=0, jf=0):
    '''
    Jitters and pads the input bbox.

    Args:
        bbox: Zero-indexed tight bbox.
        pf: padding fraction.
        jf: jittering fraction.
    Returns:
        pet_bbox: Jittered and padded box. Might have -ve or out-of-image coordinates
    '''
    pet_bbox = [coord for coord in bbox]
    bwidth = bbox[2] - bbox[0] + 1
    bheight = bbox[3] - bbox[1] + 1

    pet_bbox[0] -= (pf * bwidth) + (1 - 2 * np.random.random()) * jf * bwidth
    pet_bbox[1] -= (pf * bheight) + (1 - 2 * np.random.random()) * jf * bheight
    pet_bbox[2] += (pf * bwidth) + (1 - 2 * np.random.random()) * jf * bwidth
    pet_bbox[3] += (pf * bheight) + (1 - 2 * np.random.random()) * jf * bheight

    return pet_bbox
def square_bbox(bbox):
    '''
    Converts a bbox to have a square shape by increasing size along non-max dimension.
    '''
    sq_bbox = [int(round(coord)) for coord in bbox]
    bwidth = sq_bbox[2] - sq_bbox[0] + 1
    bheight = sq_bbox[3] - sq_bbox[1] + 1
    maxdim = float(max(bwidth, bheight))

    dw_b_2 = int(round((maxdim - bwidth) / 2.0))
    dh_b_2 = int(round((maxdim - bheight) / 2.0))

    sq_bbox[0] -= dw_b_2
    sq_bbox[1] -= dh_b_2
    sq_bbox[2] = sq_bbox[0] + maxdim - 1
    sq_bbox[3] = sq_bbox[1] + maxdim - 1

    return sq_bbox
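A worked sketch (not part of the commit) of the bbox pipeline used in forward_img: pad the tight box, optionally jitter, then square it up. The numbers below are hypothetical.

bbox = [10.0, 20.0, 59.0, 99.0]            # 50x80 tight box
padded = peturb_bbox(bbox, pf=0.05, jf=0)  # jf=0: pure padding, as used for the val split
square = square_bbox(padded)               # widened along x until both sides match
assert square[2] - square[0] == square[3] - square[1]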
def crop(img, bbox, bgval=0):
    '''
    Crops a region from the image corresponding to the bbox.
    If some regions specified go outside the image boundaries, the pixel values are set to bgval.

    Args:
        img: image to crop
        bbox: bounding box to crop
        bgval: default background for regions outside image
    '''
    bbox = [int(round(c)) for c in bbox]
    bwidth = bbox[2] - bbox[0] + 1
    bheight = bbox[3] - bbox[1] + 1

    im_shape = np.shape(img)
    im_h, im_w = im_shape[0], im_shape[1]

    nc = 1 if len(im_shape) < 3 else im_shape[2]

    img_out = np.ones((bheight, bwidth, nc)) * bgval
    x_min_src = max(0, bbox[0])
    x_max_src = min(im_w, bbox[2] + 1)
    y_min_src = max(0, bbox[1])
    y_max_src = min(im_h, bbox[3] + 1)

    x_min_trg = x_min_src - bbox[0]
    x_max_trg = x_max_src - x_min_src + x_min_trg
    y_min_trg = y_min_src - bbox[1]
    y_max_trg = y_max_src - y_min_src + y_min_trg

    img_out[y_min_trg:y_max_trg, x_min_trg:x_max_trg, :] = img[y_min_src:y_max_src, x_min_src:x_max_src, :]
    return img_out
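A short sketch (not part of the commit) showing the out-of-bounds behavior: the target crop is pre-filled with bgval and only the overlapping region is copied from the source image.

import numpy as np

img = np.zeros((4, 4, 3))
out = crop(img, [-2, -2, 1, 1], bgval=1)  # 4x4 crop, top-left quadrant off-image
print(out.shape)     # (4, 4, 3)
print(out[0, 0, 0])  # 1.0 -> background fill
print(out[3, 3, 0])  # 0.0 -> copied image pixel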
# https://github.com/akanazawa/cmr/blob/master/utils/transformations.py
import math
import numpy

_EPS = numpy.finfo(float).eps * 4.0


def quaternion_matrix(quaternion):
    """Return homogeneous rotation matrix from quaternion.

    >>> M = quaternion_matrix([0.99810947, 0.06146124, 0, 0])
    >>> numpy.allclose(M, rotation_matrix(0.123, [1, 0, 0]))
    True
    >>> M = quaternion_matrix([1, 0, 0, 0])
    >>> numpy.allclose(M, numpy.identity(4))
    True
    >>> M = quaternion_matrix([0, 1, 0, 0])
    >>> numpy.allclose(M, numpy.diag([1, -1, -1, 1]))
    True
    """
    q = numpy.array(quaternion, dtype=numpy.float64, copy=True)
    n = numpy.dot(q, q)
    if n < _EPS:
        return numpy.identity(4)
    q *= math.sqrt(2.0 / n)
    q = numpy.outer(q, q)
    return numpy.array([
        [1.0-q[2, 2]-q[3, 3], q[1, 2]-q[3, 0], q[1, 3]+q[2, 0], 0.0],
        [q[1, 2]+q[3, 0], 1.0-q[1, 1]-q[3, 3], q[2, 3]-q[1, 0], 0.0],
        [q[1, 3]-q[2, 0], q[2, 3]+q[1, 0], 1.0-q[1, 1]-q[2, 2], 0.0],
        [0.0, 0.0, 0.0, 1.0]])


def quaternion_from_matrix(matrix, isprecise=False):
    """Return quaternion from rotation matrix.

    If isprecise is True, the input matrix is assumed to be a precise rotation
    matrix and a faster algorithm is used.

    >>> q = quaternion_from_matrix(numpy.identity(4), True)
    >>> numpy.allclose(q, [1, 0, 0, 0])
    True
    >>> q = quaternion_from_matrix(numpy.diag([1, -1, -1, 1]))
    >>> numpy.allclose(q, [0, 1, 0, 0]) or numpy.allclose(q, [0, -1, 0, 0])
    True
    >>> R = rotation_matrix(0.123, (1, 2, 3))
    >>> q = quaternion_from_matrix(R, True)
    >>> numpy.allclose(q, [0.9981095, 0.0164262, 0.0328524, 0.0492786])
    True
    >>> R = [[-0.545, 0.797, 0.260, 0], [0.733, 0.603, -0.313, 0],
    ...      [-0.407, 0.021, -0.913, 0], [0, 0, 0, 1]]
    >>> q = quaternion_from_matrix(R)
    >>> numpy.allclose(q, [0.19069, 0.43736, 0.87485, -0.083611])
    True
    >>> R = [[0.395, 0.362, 0.843, 0], [-0.626, 0.796, -0.056, 0],
    ...      [-0.677, -0.498, 0.529, 0], [0, 0, 0, 1]]
    >>> q = quaternion_from_matrix(R)
    >>> numpy.allclose(q, [0.82336615, -0.13610694, 0.46344705, -0.29792603])
    True
    >>> R = random_rotation_matrix()
    >>> q = quaternion_from_matrix(R)
    >>> is_same_transform(R, quaternion_matrix(q))
    True
    >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False),
    ...                    quaternion_from_matrix(R, isprecise=True))
    True
    >>> R = euler_matrix(0.0, 0.0, numpy.pi/2.0)
    >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False),
    ...                    quaternion_from_matrix(R, isprecise=True))
    True
    """
    M = numpy.array(matrix, dtype=numpy.float64, copy=False)[:4, :4]
    if isprecise:
        q = numpy.empty((4, ))
        t = numpy.trace(M)
        if t > M[3, 3]:
            q[0] = t
            q[3] = M[1, 0] - M[0, 1]
            q[2] = M[0, 2] - M[2, 0]
            q[1] = M[2, 1] - M[1, 2]
        else:
            i, j, k = 0, 1, 2
            if M[1, 1] > M[0, 0]:
                i, j, k = 1, 2, 0
            if M[2, 2] > M[i, i]:
                i, j, k = 2, 0, 1
            t = M[i, i] - (M[j, j] + M[k, k]) + M[3, 3]
            q[i] = t
            q[j] = M[i, j] + M[j, i]
            q[k] = M[k, i] + M[i, k]
            q[3] = M[k, j] - M[j, k]
            q = q[[3, 0, 1, 2]]
        q *= 0.5 / math.sqrt(t * M[3, 3])
    else:
        m00 = M[0, 0]
        m01 = M[0, 1]
        m02 = M[0, 2]
        m10 = M[1, 0]
        m11 = M[1, 1]
        m12 = M[1, 2]
        m20 = M[2, 0]
        m21 = M[2, 1]
        m22 = M[2, 2]
        # symmetric matrix K
        K = numpy.array([[m00-m11-m22, 0.0, 0.0, 0.0],
                         [m01+m10, m11-m00-m22, 0.0, 0.0],
                         [m02+m20, m12+m21, m22-m00-m11, 0.0],
                         [m21-m12, m02-m20, m10-m01, m00+m11+m22]])
        K /= 3.0
        # quaternion is eigenvector of K that corresponds to largest eigenvalue
        w, V = numpy.linalg.eigh(K)
        q = V[[3, 0, 1, 2], numpy.argmax(w)]
        if q[0] < 0.0:
            numpy.negative(q, q)
    return q
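A quick round-trip sketch (not part of the commit): quaternion_matrix and quaternion_from_matrix invert each other, which is what mirror_image relies on when it conjugates the rotation by the reflection diag(-1, 1, 1, 1) and re-encodes the result as a quaternion.

import numpy as np

q = np.array([0.9981095, 0.0164262, 0.0328524, 0.0492786])  # taken from the doctest above
R = quaternion_matrix(q)
q_back = quaternion_from_matrix(R, isprecise=True)
assert np.allclose(q, q_back, atol=1e-6)

# The mirrored rotation used in mirror_image is still a proper rotation:
S = np.diag([-1.0, 1.0, 1.0, 1.0])
flip_R = S.dot(R).dot(S)
assert np.isclose(np.linalg.det(flip_R[:3, :3]), 1.0)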
video3d/dataloaders.py
ADDED
@@ -0,0 +1,375 @@
import os
from glob import glob
import random
import numpy as np
from PIL import Image
import cv2
import torch
from torch.utils.data import Dataset
import torchvision.datasets.folder
import torchvision.transforms as transforms
from einops import rearrange
def compute_distance_transform(mask):
    mask_dt = []
    for m in mask:
        dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        inv_dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(1 - m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        mask_dt += [torch.stack([dt, inv_dt], 0)]
    return torch.stack(mask_dt, 0)  # Bx2xHxW
def crop_image(image, boxs, size):
    crops = []
    for box in boxs:
        crop_x0, crop_y0, crop_w, crop_h = box
        crop = transforms.functional.resized_crop(image, crop_y0, crop_x0, crop_h, crop_w, size)
        crop = transforms.functional.to_tensor(crop)
        crops += [crop]
    return torch.stack(crops, 0)
def box_loader(fpath):
    box = np.loadtxt(fpath, 'str')
    box[0] = box[0].split('_')[0]
    return box.astype(np.float32)
def read_feat_from_img(path, n_channels):
    feat = np.array(Image.open(path))
    return dencode_feat_from_img(feat, n_channels)


def dencode_feat_from_img(img, n_channels):
    n_addon_channels = int(np.ceil(n_channels / 3) * 3) - n_channels
    n_tiles = int((n_channels + n_addon_channels) / 3)
    feat = rearrange(img, 'h (t w) c -> h w (t c)', t=n_tiles, c=3)
    if n_addon_channels != 0:  # guard as in dataloaders_ddp.py; feat[:, :, :-0] would drop everything
        feat = feat[:, :, :-n_addon_channels]
    feat = feat.astype('float32') / 255
    return feat.transpose(2, 0, 1)


def dino_loader(fpath, n_channels):
    dino_map = read_feat_from_img(fpath, n_channels)
    return dino_map
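The decoder above assumes a C-channel feature map was quantized to uint8 and packed into an RGB image as ceil(C/3) tiles laid out side by side. A round-trip sketch (not part of the commit; `encode_feat_to_img` is a hypothetical counterpart, not a function from this repo):

import numpy as np
from einops import rearrange

def encode_feat_to_img(feat):
    # feat: CxHxW float in [0, 1] -> Hx(t*W)x3 uint8, padding C up to a multiple of 3.
    c, h, w = feat.shape
    pad = (-c) % 3
    feat = np.concatenate([feat, np.zeros((pad, h, w), feat.dtype)], 0)
    img = rearrange(feat, '(t c) h w -> h (t w) c', c=3)
    return np.round(img * 255).astype(np.uint8)

feat = np.random.rand(64, 4, 4).astype('float32')
img = encode_feat_to_img(feat)                    # 4 x (22*4) x 3
back = dencode_feat_from_img(img, n_channels=64)  # 64 x 4 x 4
assert np.abs(back - feat).max() <= 1 / 255 + 1e-6  # only quantization error remains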
def get_valid_mask(boxs, image_size):
    valid_masks = []
    for box in boxs:
        crop_x0, crop_y0, crop_w, crop_h, full_w, full_h = box[1:7].int().numpy()
        # Discard a small margin near the boundary.
        margin_w = int(crop_w * 0.02)
        margin_h = int(crop_h * 0.02)
        mask_full = torch.ones(full_h - margin_h * 2, full_w - margin_w * 2)
        mask_full_pad = torch.nn.functional.pad(mask_full, (crop_w + margin_w, crop_w + margin_w, crop_h + margin_h, crop_h + margin_h), mode='constant', value=0.0)
        mask_full_crop = mask_full_pad[crop_y0 + crop_h:crop_y0 + crop_h * 2, crop_x0 + crop_w:crop_x0 + crop_w * 2]
        mask_crop = torch.nn.functional.interpolate(mask_full_crop[None, None, :, :], image_size, mode='nearest')[0, 0]
        valid_masks += [mask_crop]
    return torch.stack(valid_masks, 0)  # NxHxW
def horizontal_flip_box(box):
    frame_id, crop_x0, crop_y0, crop_w, crop_h, full_w, full_h, sharpness, label = box.unbind(1)
    box[:, 1] = full_w - crop_x0 - crop_w  # x0
    return box


def horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features=None, dino_clusters=None):
    images = images.flip(3)  # NxCxHxW
    masks = masks.flip(3)  # NxCxHxW
    mask_dt = mask_dt.flip(3)  # NxCxHxW
    mask_valid = mask_valid.flip(2)  # NxHxW
    if flows.dim() > 1:
        flows = flows.flip(3)  # (N-1)x(x,y)xHxW
        flows[:, 0] *= -1  # invert delta x
    bboxs = horizontal_flip_box(bboxs)  # NxK
    bg_images = bg_images.flip(3)  # NxCxHxW
    if dino_features.dim() > 1:
        dino_features = dino_features.flip(3)
    if dino_clusters.dim() > 1:
        dino_clusters = dino_clusters.flip(3)
    return images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters
class BaseSequenceDataset(Dataset):
    def __init__(self, root, skip_beginning=4, skip_end=4, min_seq_len=10, debug_seq=False):
        super().__init__()

        self.skip_beginning = skip_beginning
        self.skip_end = skip_end
        self.min_seq_len = min_seq_len
        # self.pattern = "{:07d}_{}"
        self.sequences = self._make_sequences(root)

        if debug_seq:
            # self.sequences = [self.sequences[0][20:160]] * 100
            seq_len = 0
            while seq_len < min_seq_len:
                i = np.random.randint(len(self.sequences))
                rand_seq = self.sequences[i]
                seq_len = len(rand_seq)
            self.sequences = [rand_seq]

        self.samples = []

    def _make_sequences(self, path):
        result = []
        for d in sorted(os.scandir(path), key=lambda e: e.name):
            if d.is_dir():
                files = self._parse_folder(d)
                if len(files) >= self.min_seq_len:
                    result.append(files)
        return result

    def _parse_folder(self, path):
        result = sorted(glob(os.path.join(path, '*' + self.image_loaders[0][0])))
        result = [p.replace(self.image_loaders[0][0], '{}') for p in result]

        if len(result) <= self.skip_beginning + self.skip_end:
            return []
        if self.skip_end == 0:
            return result[self.skip_beginning:]
        return result[self.skip_beginning:-self.skip_end]

    def _load_ids(self, path_patterns, loaders, transform=None):
        result = []
        for loader in loaders:
            for p in path_patterns:
                x = loader[1](p.format(loader[0]), *loader[2:])
                if transform:
                    x = transform(x)
                result.append(x)
        return tuple(result)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        raise NotImplementedError("This is a base class and should not be used directly")
class NFrameSequenceDataset(BaseSequenceDataset):
    def __init__(self, root, cat_name=None, num_sample_frames=2, skip_beginning=4, skip_end=4, min_seq_len=10, in_image_size=256, out_image_size=256, debug_seq=False, random_sample=False, shuffle=False, dense_sample=True, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.png', load_dino_feature=False, load_dino_cluster=False, dino_feature_dim=64, **kwargs):
        self.cat_name = cat_name
        self.image_loaders = [("rgb" + rgb_suffix, torchvision.datasets.folder.default_loader)]
        self.mask_loaders = [("mask.png", torchvision.datasets.folder.default_loader)]
        self.bbox_loaders = [("box.txt", box_loader)]
        super().__init__(root, skip_beginning, skip_end, min_seq_len, debug_seq)
        if num_sample_frames > 1:
            self.flow_loaders = [("flow.png", cv2.imread, cv2.IMREAD_UNCHANGED)]
        else:
            self.flow_loaders = None

        self.num_sample_frames = num_sample_frames
        self.random_sample = random_sample
        if self.random_sample:
            if shuffle:
                random.shuffle(self.sequences)
            self.samples = self.sequences
        else:
            for i, s in enumerate(self.sequences):
                stride = 1 if dense_sample else self.num_sample_frames
                self.samples += [(i, k) for k in range(0, len(s), stride)]
            if shuffle:
                random.shuffle(self.samples)

        self.in_image_size = in_image_size
        self.out_image_size = out_image_size
        self.load_background = load_background
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.in_image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.out_image_size, interpolation=Image.NEAREST), transforms.ToTensor()])
        if self.flow_loaders is not None:
            self.flow_transform = lambda x: (torch.FloatTensor(x.astype(np.float32)).flip(2)[:, :, :2] / 65535.) * 2 - 1
        self.random_flip = random_flip
        self.load_dino_feature = load_dino_feature
        if load_dino_feature:
            self.dino_feature_loaders = [(f"feat{dino_feature_dim}.png", dino_loader, dino_feature_dim)]
        self.load_dino_cluster = load_dino_cluster
        if load_dino_cluster:
            self.dino_cluster_loaders = [("clusters.png", torchvision.datasets.folder.default_loader)]

    def __getitem__(self, index):
        if self.random_sample:
            seq_idx = index % len(self.sequences)
            seq = self.sequences[seq_idx]
            if len(seq) < self.num_sample_frames:
                start_frame_idx = 0
            else:
                start_frame_idx = np.random.randint(len(seq) - self.num_sample_frames + 1)
            paths = seq[start_frame_idx:start_frame_idx + self.num_sample_frames]
        else:
            seq_idx, start_frame_idx = self.samples[index % len(self.samples)]
            seq = self.sequences[seq_idx]
            # Handle edge case: when only the last frame is left, sample the last two frames, unless the sequence has only one frame
            if len(seq) <= start_frame_idx + 1:
                start_frame_idx = max(0, start_frame_idx - 1)
            paths = seq[start_frame_idx:start_frame_idx + self.num_sample_frames]

        masks = torch.stack(self._load_ids(paths, self.mask_loaders, transform=self.mask_transform), 0)  # load all masks
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_fg), 0)
            images_bg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_bg), 0)
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = torch.stack(self._load_ids(paths, self.image_loaders, transform=self.image_transform), 0)  # load all images
        if len(paths) > 1:
            flows = torch.stack(self._load_ids(paths[:-1], self.flow_loaders, transform=self.flow_transform), 0).permute(0, 3, 1, 2)  # load flow for all but the last image, (N-1)x(x,y)xHxW, -1~1
            flows = torch.nn.functional.interpolate(flows, size=self.out_image_size, mode="bilinear")
        else:
            flows = torch.zeros(1)
        bboxs = torch.stack(self._load_ids(paths, self.bbox_loaders, transform=torch.FloatTensor), 0)  # load bounding boxes for all images
        mask_valid = get_valid_mask(bboxs, (self.out_image_size, self.out_image_size))  # exclude pixels cropped outside the original image
        if self.load_background:
            bg_image = torchvision.datasets.folder.default_loader(os.path.join(os.path.dirname(paths[0]), 'background_frame.jpg'))
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_images = crop_image(bg_image, bboxs[:, 1:5].int().numpy(), (self.in_image_size, self.in_image_size))
        else:
            bg_images = torch.zeros_like(images)
        if self.load_dino_feature:
            dino_features = torch.stack(self._load_ids(paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)  # BxFx64x224x224
        else:
            dino_features = torch.zeros(1)
        if self.load_dino_cluster:
            dino_clusters = torch.stack(self._load_ids(paths, self.dino_cluster_loaders, transform=transforms.ToTensor()), 0)  # BxFx3x55x55
        else:
            dino_clusters = torch.zeros(1)
        seq_idx = torch.LongTensor([seq_idx])
        frame_idx = torch.arange(start_frame_idx, start_frame_idx + len(paths)).long()

        if self.random_flip and np.random.rand() < 0.5:
            images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters = horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters)

        ## pad shorter sequence
        if len(paths) < self.num_sample_frames:
            num_pad = self.num_sample_frames - len(paths)
            images = torch.cat([images[:1]] * num_pad + [images], 0)
            masks = torch.cat([masks[:1]] * num_pad + [masks], 0)
            mask_dt = torch.cat([mask_dt[:1]] * num_pad + [mask_dt], 0)
            mask_valid = torch.cat([mask_valid[:1]] * num_pad + [mask_valid], 0)
            if flows.dim() > 1:
                flows = torch.cat([flows[:1] * 0] * num_pad + [flows], 0)
            bboxs = torch.cat([bboxs[:1]] * num_pad + [bboxs], 0)
            bg_images = torch.cat([bg_images[:1]] * num_pad + [bg_images], 0)
            if dino_features.dim() > 1:
                dino_features = torch.cat([dino_features[:1]] * num_pad + [dino_features], 0)
            if dino_clusters.dim() > 1:
                dino_clusters = torch.cat([dino_clusters[:1]] * num_pad + [dino_clusters], 0)
            frame_idx = torch.cat([frame_idx[:1]] * num_pad + [frame_idx], 0)

        return images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, self.cat_name
def get_sequence_loader(data_dir, **kwargs):
    if isinstance(data_dir, dict):
        loaders = []
        for k, v in data_dir.items():
            dataset = NFrameSequenceDataset(v, cat_name=k, **kwargs)
            loader = torch.utils.data.DataLoader(dataset, batch_size=kwargs['batch_size'], shuffle=kwargs['shuffle'], num_workers=kwargs['num_workers'], pin_memory=True)
            loaders += [loader]
        return loaders
    else:
        return [get_sequence_loader_single(data_dir, **kwargs)]


def get_sequence_loader_single(data_dir, mode='all_frame', is_validation=False, batch_size=256, num_workers=4, in_image_size=256, out_image_size=256, debug_seq=False, num_sample_frames=2, skip_beginning=4, skip_end=4, min_seq_len=10, max_seq_len=256, random_sample=False, shuffle=False, dense_sample=True, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.jpg', load_dino_feature=False, load_dino_cluster=False, dino_feature_dim=64):
    if mode == 'n_frame':
        dataset = NFrameSequenceDataset(data_dir, num_sample_frames=num_sample_frames, skip_beginning=skip_beginning, skip_end=skip_end, min_seq_len=min_seq_len, in_image_size=in_image_size, out_image_size=out_image_size, debug_seq=debug_seq, random_sample=random_sample, shuffle=shuffle, dense_sample=dense_sample, color_jitter=color_jitter, load_background=load_background, random_flip=random_flip, rgb_suffix=rgb_suffix, load_dino_feature=load_dino_feature, load_dino_cluster=load_dino_cluster, dino_feature_dim=dino_feature_dim)
    else:
        raise NotImplementedError
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=not is_validation,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader
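A usage sketch (not part of the commit; the directory paths below are hypothetical): passing a dict fans out into one DataLoader per category, with the dict keys becoming cat_name labels. Note that get_sequence_loader_single defaults to mode='all_frame', which raises NotImplementedError, so 'n_frame' has to be requested explicitly on that path.

data_dirs = {'horse': 'data/horse_seqs', 'cow': 'data/cow_seqs'}  # hypothetical paths
loaders = get_sequence_loader(data_dirs, batch_size=8, shuffle=True, num_workers=4,
                              num_sample_frames=2, load_dino_feature=False)
for images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, \
        dino_features, dino_clusters, seq_idx, frame_idx, cat_name in loaders[0]:
    break  # each batch: images is B x num_sample_frames x 3 x H x W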
class ImageDataset(Dataset):
    def __init__(self, root, is_validation=False, image_size=256, color_jitter=None):
        super().__init__()
        self.image_loader = ("rgb.jpg", torchvision.datasets.folder.default_loader)
        self.mask_loader = ("mask.png", torchvision.datasets.folder.default_loader)
        self.bbox_loader = ("box.txt", np.loadtxt, 'str')
        self.samples = self._parse_folder(root)
        self.image_size = image_size
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.image_size, interpolation=Image.NEAREST), transforms.ToTensor()])

    def _parse_folder(self, path):
        result = sorted(glob(os.path.join(path, '**/*' + self.image_loader[0]), recursive=True))
        result = [p.replace(self.image_loader[0], '{}') for p in result]
        return result

    def _load_ids(self, path, loader, transform=None):
        x = loader[1](path.format(loader[0]), *loader[2:])
        if transform:
            x = transform(x)
        return x

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        path = self.samples[index % len(self.samples)]
        masks = self._load_ids(path, self.mask_loader, transform=self.mask_transform).unsqueeze(0)
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = self._load_ids(path, self.image_loader, transform=image_transform_fg).unsqueeze(0)
            images_bg = self._load_ids(path, self.image_loader, transform=image_transform_bg).unsqueeze(0)
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = self._load_ids(path, self.image_loader, transform=self.image_transform).unsqueeze(0)
        flows = torch.zeros(1)
        bboxs = self._load_ids(path, self.bbox_loader, transform=None)
        bboxs[0] = '0'
        bboxs = torch.FloatTensor(bboxs.astype('float')).unsqueeze(0)
        bg_fpath = os.path.join(os.path.dirname(path), 'background_frame.jpg')
        if os.path.isfile(bg_fpath):
            bg_image = torchvision.datasets.folder.default_loader(bg_fpath)
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_image = transforms.ToTensor()(bg_image)
        else:
            bg_image = images[0]
        seq_idx = torch.LongTensor([index])
        frame_idx = torch.LongTensor([0])
        return images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx
def get_image_loader(data_dir, is_validation=False, batch_size=256, num_workers=4, image_size=256, color_jitter=None):
    dataset = ImageDataset(data_dir, is_validation=is_validation, image_size=image_size, color_jitter=color_jitter)

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader
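A short usage sketch (not part of the commit; the path is hypothetical). ImageDataset globs recursively for files ending in rgb.jpg and expects sibling mask.png and box.txt files with the same prefix:

# Hypothetical layout: data/test_images/**/xxx_rgb.jpg (+ xxx_mask.png, xxx_box.txt)
loader = get_image_loader('data/test_images', batch_size=4, num_workers=0, image_size=256)
for images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx in loader:
    print(images.shape)  # B x 1 x 3 x H x W: each image is a length-1 "sequence"
    break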
video3d/dataloaders_ddp.py
ADDED
@@ -0,0 +1,1210 @@
import os
from glob import glob
import random
import numpy as np
from PIL import Image
import cv2
import itertools
import torch
import copy
from torch.utils.data import Dataset
import torchvision.datasets.folder
import torchvision.transforms as transforms
from einops import rearrange


def compute_distance_transform(mask):
    mask_dt = []
    for m in mask:
        dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        inv_dt = torch.FloatTensor(cv2.distanceTransform(np.uint8(1 - m[0]), cv2.DIST_L2, cv2.DIST_MASK_PRECISE))
        mask_dt += [torch.stack([dt, inv_dt], 0)]
    return torch.stack(mask_dt, 0)  # Bx2xHxW


def crop_image(image, boxs, size):
    crops = []
    for box in boxs:
        crop_x0, crop_y0, crop_w, crop_h = box
        crop = transforms.functional.resized_crop(image, crop_y0, crop_x0, crop_h, crop_w, size)
        crop = transforms.functional.to_tensor(crop)
        crops += [crop]
    return torch.stack(crops, 0)


def box_loader(fpath):
    box = np.loadtxt(fpath, 'str')
    box[0] = box[0].split('_')[0]
    return box.astype(np.float32)


def read_feat_from_img(path, n_channels):
    feat = np.array(Image.open(path))
    return dencode_feat_from_img(feat, n_channels)


def dencode_feat_from_img(img, n_channels):
    n_addon_channels = int(np.ceil(n_channels / 3) * 3) - n_channels
    n_tiles = int((n_channels + n_addon_channels) / 3)
    feat = rearrange(img, 'h (t w) c -> h w (t c)', t=n_tiles, c=3)
    if n_addon_channels != 0:
        feat = feat[:, :, :-n_addon_channels]
    feat = feat.astype('float32') / 255
    return feat.transpose(2, 0, 1)


def dino_loader(fpath, n_channels):
    dino_map = read_feat_from_img(fpath, n_channels)
    return dino_map


def get_valid_mask(boxs, image_size):
    valid_masks = []
    for box in boxs:
        crop_x0, crop_y0, crop_w, crop_h, full_w, full_h = box[1:7].int().numpy()
        margin_w = int(crop_w * 0.02)
        margin_h = int(crop_h * 0.02)
        mask_full = torch.ones(full_h - margin_h * 2, full_w - margin_w * 2)
        mask_full_pad = torch.nn.functional.pad(mask_full, (crop_w + margin_w, crop_w + margin_w, crop_h + margin_h, crop_h + margin_h), mode='constant', value=0.0)
        mask_full_crop = mask_full_pad[(crop_y0 + crop_h):crop_y0 + (crop_h * 2), (crop_x0 + crop_w):crop_x0 + (crop_w * 2)]
        mask_crop = torch.nn.functional.interpolate(mask_full_crop[None, None, :, :], image_size, mode='nearest')[0, 0]
        valid_masks += [mask_crop]
    return torch.stack(valid_masks, 0)  # NxHxW


def horizontal_flip_box(box):
    frame_id, crop_x0, crop_y0, crop_w, crop_h, full_w, full_h, sharpness, label = box.unbind(1)
    box[:, 1] = full_w - crop_x0 - crop_w  # x0
    return box


def horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features=None, dino_clusters=None):
    images = images.flip(3)  # NxCxHxW
    masks = masks.flip(3)  # NxCxHxW
    mask_dt = mask_dt.flip(3)  # NxCxHxW
    mask_valid = mask_valid.flip(2)  # NxHxW
    if flows.dim() > 1:
        flows = flows.flip(3)  # (N-1)x(x,y)xHxW
        flows[:, 0] *= -1  # invert delta x
    bboxs = horizontal_flip_box(bboxs)  # NxK
    bg_images = bg_images.flip(3)  # NxCxHxW
    if dino_features.dim() > 1:
        dino_features = dino_features.flip(3)
    if dino_clusters.dim() > 1:
        dino_clusters = dino_clusters.flip(3)
    return images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters
def none_to_nan(x):
    return torch.FloatTensor([float('nan')]) if x is None else x
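A brief sketch (not part of the commit) of why none_to_nan exists: the default DataLoader collate cannot batch None, so optional fields are mapped to a NaN tensor before being returned, which is what the datasets below do with *map(none_to_nan, ...).

import torch
from torch.utils.data.dataloader import default_collate

samples = [(torch.ones(2), None), (torch.ones(2), None)]
# default_collate(samples) would raise a TypeError on the None field;
# replacing None with a NaN tensor keeps the tuple collatable:
safe = [tuple(map(none_to_nan, s)) for s in samples]
batch = default_collate(safe)
print(torch.isnan(batch[1]).all())  # tensor(True)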
class BaseSequenceDataset(Dataset):
    def __init__(self, root, skip_beginning=4, skip_end=4, min_seq_len=10, debug_seq=False):
        super().__init__()

        self.skip_beginning = skip_beginning
        self.skip_end = skip_end
        self.min_seq_len = min_seq_len
        # self.pattern = "{:07d}_{}"
        self.sequences = self._make_sequences(root)

        if debug_seq:
            # self.sequences = [self.sequences[0][20:160]] * 100
            seq_len = 0
            while seq_len < min_seq_len:
                i = np.random.randint(len(self.sequences))
                rand_seq = self.sequences[i]
                seq_len = len(rand_seq)
            self.sequences = [rand_seq]

        self.samples = []

    def _make_sequences(self, path):
        result = []
        for d in sorted(os.scandir(path), key=lambda e: e.name):
            if d.is_dir():
                files = self._parse_folder(d)
                if len(files) >= self.min_seq_len:
                    result.append(files)
        return result

    def _parse_folder(self, path):
        result = sorted(glob(os.path.join(path, '*' + self.image_loaders[0][0])))
        result = [p.replace(self.image_loaders[0][0], '{}') for p in result]

        if len(result) <= self.skip_beginning + self.skip_end:
            return []
        if self.skip_end == 0:
            return result[self.skip_beginning:]
        return result[self.skip_beginning:-self.skip_end]

    def _load_ids(self, path_patterns, loaders, transform=None):
        result = []
        for loader in loaders:
            for p in path_patterns:
                x = loader[1](p.format(loader[0]), *loader[2:])
                if transform:
                    x = transform(x)
                result.append(x)
        return tuple(result)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        raise NotImplementedError("This is a base class and should not be used directly")
class NFrameSequenceDataset(BaseSequenceDataset):
    def __init__(self, root, cat_name=None, num_sample_frames=2, skip_beginning=4, skip_end=4, min_seq_len=10, in_image_size=256, out_image_size=256, debug_seq=False, random_sample=False, shuffle=False, dense_sample=True, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.png', load_dino_feature=False, load_dino_cluster=False, dino_feature_dim=64, flow_bool=False, **kwargs):
        self.cat_name = cat_name
        self.flow_bool = flow_bool

        self.image_loaders = [("rgb" + rgb_suffix, torchvision.datasets.folder.default_loader)]
        self.mask_loaders = [("mask.png", torchvision.datasets.folder.default_loader)]
        self.bbox_loaders = [("box.txt", box_loader)]
        super().__init__(root, skip_beginning, skip_end, min_seq_len, debug_seq)
        # from IPython import embed; embed()
        if flow_bool and num_sample_frames > 1:
            self.flow_loaders = [("flow.png", cv2.imread, cv2.IMREAD_UNCHANGED)]
        else:
            self.flow_loaders = None

        self.num_sample_frames = num_sample_frames
        self.random_sample = random_sample
        if self.random_sample:
            if shuffle:
                random.shuffle(self.sequences)
            self.samples = self.sequences
        else:
            for i, s in enumerate(self.sequences):
                stride = 1 if dense_sample else self.num_sample_frames
                self.samples += [(i, k) for k in range(0, len(s), stride)]
            if shuffle:
                random.shuffle(self.samples)

        self.in_image_size = in_image_size
        self.out_image_size = out_image_size
        self.load_background = load_background
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.in_image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.out_image_size, interpolation=Image.NEAREST), transforms.ToTensor()])
        if self.flow_loaders is not None:
            self.flow_transform = lambda x: (torch.FloatTensor(x.astype(np.float32)).flip(2)[:, :, :2] / 65535.) * 2 - 1
        self.random_flip = random_flip
        self.load_dino_feature = load_dino_feature
        if load_dino_feature:
            self.dino_feature_loaders = [(f"feat{dino_feature_dim}.png", dino_loader, dino_feature_dim)]
        self.load_dino_cluster = load_dino_cluster
        if load_dino_cluster:
            self.dino_cluster_loaders = [("clusters.png", torchvision.datasets.folder.default_loader)]

    def __getitem__(self, index):
        if self.random_sample:
            seq_idx = index % len(self.sequences)
            seq = self.sequences[seq_idx]
            if len(seq) < self.num_sample_frames:
                start_frame_idx = 0
            else:
                start_frame_idx = np.random.randint(len(seq) - self.num_sample_frames + 1)
            paths = seq[start_frame_idx:start_frame_idx + self.num_sample_frames]
        else:
            seq_idx, start_frame_idx = self.samples[index % len(self.samples)]
            seq = self.sequences[seq_idx]
            # Handle edge case: when only the last frame is left, sample the last two frames, unless the sequence has only one frame
            if len(seq) <= start_frame_idx + 1:
                start_frame_idx = max(0, start_frame_idx - 1)
            paths = seq[start_frame_idx:start_frame_idx + self.num_sample_frames]

        masks = torch.stack(self._load_ids(paths, self.mask_loaders, transform=self.mask_transform), 0)  # load all masks
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_fg), 0)
            images_bg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_bg), 0)
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = torch.stack(self._load_ids(paths, self.image_loaders, transform=self.image_transform), 0)  # load all images
        if self.flow_bool and len(paths) > 1:
            flows = torch.stack(self._load_ids(paths[:-1], self.flow_loaders, transform=self.flow_transform), 0).permute(0, 3, 1, 2)  # load flow for all but the last image, (N-1)x(x,y)xHxW, -1~1
            flows = torch.nn.functional.interpolate(flows, size=self.out_image_size, mode="bilinear")
        else:
            flows = torch.zeros(1)
        bboxs = torch.stack(self._load_ids(paths, self.bbox_loaders, transform=torch.FloatTensor), 0)  # load bounding boxes for all images
        mask_valid = get_valid_mask(bboxs, (self.out_image_size, self.out_image_size))  # exclude pixels cropped outside the original image
        if self.load_background:
            bg_image = torchvision.datasets.folder.default_loader(os.path.join(os.path.dirname(paths[0]), 'background_frame.jpg'))
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_images = crop_image(bg_image, bboxs[:, 1:5].int().numpy(), (self.in_image_size, self.in_image_size))
        else:
            bg_images = torch.zeros_like(images)
        if self.load_dino_feature:
            dino_paths = [
                x.replace(
                    "/viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new",
                    "/viscam/projects/articulated/zzli/data_dino_5000/7_cat"
                )
                for x in paths
            ]
            dino_features = torch.stack(self._load_ids(dino_paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)
            # dino_features = torch.stack(self._load_ids(paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)  # BxFx64x224x224
        else:
            dino_features = torch.zeros(1)
        if self.load_dino_cluster:
            dino_clusters = torch.stack(self._load_ids(paths, self.dino_cluster_loaders, transform=transforms.ToTensor()), 0)  # BxFx3x55x55
        else:
            dino_clusters = torch.zeros(1)
        seq_idx = torch.LongTensor([seq_idx])
        frame_idx = torch.arange(start_frame_idx, start_frame_idx + len(paths)).long()

        if self.random_flip and np.random.rand() < 0.5:
            images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters = horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters)

        ## pad shorter sequence
        if len(paths) < self.num_sample_frames:
            num_pad = self.num_sample_frames - len(paths)
            images = torch.cat([images[:1]] * num_pad + [images], 0)
            masks = torch.cat([masks[:1]] * num_pad + [masks], 0)
            mask_dt = torch.cat([mask_dt[:1]] * num_pad + [mask_dt], 0)
            mask_valid = torch.cat([mask_valid[:1]] * num_pad + [mask_valid], 0)
            if flows.dim() > 1:
                flows = torch.cat([flows[:1] * 0] * num_pad + [flows], 0)
            bboxs = torch.cat([bboxs[:1]] * num_pad + [bboxs], 0)
            bg_images = torch.cat([bg_images[:1]] * num_pad + [bg_images], 0)
            if dino_features.dim() > 1:
                dino_features = torch.cat([dino_features[:1]] * num_pad + [dino_features], 0)
            if dino_clusters.dim() > 1:
                dino_clusters = torch.cat([dino_clusters[:1]] * num_pad + [dino_clusters], 0)
            frame_idx = torch.cat([frame_idx[:1]] * num_pad + [frame_idx], 0)

        out = (*map(none_to_nan, (images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, self.cat_name)),)
        return out
        # return images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, self.cat_name
def few_shot_box_loader(fpath):
    box = np.loadtxt(fpath, 'str')
    # box[0] = box[0].split('_')[0]
    return box.astype(np.float32)
class FewShotImageDataset(Dataset):
|
303 |
+
def __init__(self, root, cat_name=None, cat_num=0, num_sample_frames=2, in_image_size=256, out_image_size=256, shuffle=False, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.png', load_dino_feature=False, dino_feature_dim=64, flow_bool=False, **kwargs):
|
304 |
+
super().__init__()
|
305 |
+
self.cat_name = cat_name
|
306 |
+
self.cat_num = cat_num # this is actually useless
|
307 |
+
self.flow_bool=flow_bool
|
308 |
+
|
309 |
+
self.image_loaders = [("rgb"+rgb_suffix, torchvision.datasets.folder.default_loader)]
|
310 |
+
self.mask_loaders = [("mask.png", torchvision.datasets.folder.default_loader)]
|
311 |
+
self.bbox_loaders = [("box.txt", few_shot_box_loader)]
|
312 |
+
self.flow_loaders = None
|
313 |
+
|
314 |
+
# get all the valid paths, since it's just image-wise, in get_item, we will make it like a len=1 sequence
|
315 |
+
result = sorted(glob(os.path.join(root, '*'+self.image_loaders[0][0])))
|
316 |
+
result = [p.replace(self.image_loaders[0][0], '{}') for p in result]
|
317 |
+
self.sequences = result
|
318 |
+
|
319 |
+
self.num_sample_frames = num_sample_frames
|
320 |
+
if shuffle:
|
321 |
+
random.shuffle(self.sequences)
|
322 |
+
self.samples = self.sequences
|
323 |
+
|
324 |
+
self.in_image_size = in_image_size
|
325 |
+
self.out_image_size = out_image_size
|
326 |
+
self.load_background = load_background
|
327 |
+
self.color_jitter = color_jitter
|
328 |
+
self.image_transform = transforms.Compose([transforms.Resize(self.in_image_size), transforms.ToTensor()])
|
329 |
+
self.mask_transform = transforms.Compose([transforms.Resize(self.out_image_size, interpolation=Image.NEAREST), transforms.ToTensor()])
|
330 |
+
self.random_flip = random_flip
|
331 |
+
self.load_dino_feature = load_dino_feature
|
332 |
+
if load_dino_feature:
|
333 |
+
self.dino_feature_loaders = [(f"feat{dino_feature_dim}.png", dino_loader, dino_feature_dim)]
|
334 |
+
|
335 |
+
def _load_ids(self, path_patterns, loaders, transform=None):
|
336 |
+
result = []
|
337 |
+
for loader in loaders:
|
338 |
+
for p in path_patterns:
|
339 |
+
x = loader[1](p.format(loader[0]), *loader[2:])
|
340 |
+
if transform:
|
341 |
+
x = transform(x)
|
342 |
+
result.append(x)
|
343 |
+
return tuple(result)
|
344 |
+
|
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        paths = [self.samples[index]]  # len-1 sequence

        masks = torch.stack(self._load_ids(paths, self.mask_loaders, transform=self.mask_transform), 0)  # load all masks
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_fg), 0)  # load all images
            images_bg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_bg), 0)  # load all images
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = torch.stack(self._load_ids(paths, self.image_loaders, transform=self.image_transform), 0)  # load all images

        flows = torch.zeros(1)
        bboxs = torch.stack(self._load_ids(paths, self.bbox_loaders, transform=torch.FloatTensor), 0)  # load bounding boxes for all images
        bboxs = torch.cat([bboxs, torch.Tensor([[self.cat_num]]).float()], dim=-1)  # append the category label as an extra field

        mask_valid = get_valid_mask(bboxs, (self.out_image_size, self.out_image_size))  # exclude pixels cropped outside the original image
        if self.load_background:
            bg_image = torchvision.datasets.folder.default_loader(os.path.join(os.path.dirname(paths[0]), 'background_frame.jpg'))
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_images = crop_image(bg_image, bboxs[:, 1:5].int().numpy(), (self.in_image_size, self.in_image_size))
        else:
            bg_images = torch.zeros_like(images)
        if self.load_dino_feature:
            dino_features = torch.stack(self._load_ids(paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)  # BxFx64x224x224
        else:
            dino_features = torch.zeros(1)

        dino_clusters = torch.zeros(1)

        # These are unused for image datasets
        seq_idx = 0
        seq_idx = torch.LongTensor([seq_idx])
        frame_idx = torch.arange(0, 1).long()

        if self.random_flip and np.random.rand() < 0.5:
            images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters = horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters)

        ## pad shorter sequence
        if len(paths) < self.num_sample_frames:
            num_pad = self.num_sample_frames - len(paths)
            images = torch.cat([images[:1]] * num_pad + [images], 0)
            masks = torch.cat([masks[:1]] * num_pad + [masks], 0)
            mask_dt = torch.cat([mask_dt[:1]] * num_pad + [mask_dt], 0)
            mask_valid = torch.cat([mask_valid[:1]] * num_pad + [mask_valid], 0)
            if flows.dim() > 1:
                flows = torch.cat([flows[:1] * 0] * num_pad + [flows], 0)
            bboxs = torch.cat([bboxs[:1]] * num_pad + [bboxs], 0)
            bg_images = torch.cat([bg_images[:1]] * num_pad + [bg_images], 0)
            if dino_features.dim() > 1:
                dino_features = torch.cat([dino_features[:1]] * num_pad + [dino_features], 0)
            if dino_clusters.dim() > 1:
                dino_clusters = torch.cat([dino_clusters[:1]] * num_pad + [dino_clusters], 0)
            frame_idx = torch.cat([frame_idx[:1]] * num_pad + [frame_idx], 0)

        out = (*map(none_to_nan, (images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, self.cat_name)), )
        return out


class Quadrupeds_Image_Dataset(Dataset):
    def __init__(self, original_data_dirs, few_shot_data_dirs, original_num=7, few_shot_num=93, num_sample_frames=2,
                 in_image_size=256, out_image_size=256, is_validation=False, val_image_num=5, shuffle=False, color_jitter=None,
                 load_background=False, random_flip=False, rgb_suffix='.png', load_dino_feature=False, dino_feature_dim=64,
                 flow_bool=False, disable_fewshot=False, dataset_split_num=-1, **kwargs):
        self.original_data_dirs = original_data_dirs
        self.few_shot_data_dirs = few_shot_data_dirs
        self.original_num = original_num
        self.few_shot_num = few_shot_num

        self.image_loaders = [("rgb"+rgb_suffix, torchvision.datasets.folder.default_loader)]
        self.mask_loaders = [("mask.png", torchvision.datasets.folder.default_loader)]
        self.original_bbox_loaders = [("box.txt", box_loader)]
        self.few_shot_bbox_loaders = [("box.txt", few_shot_box_loader)]

        assert len(self.original_data_dirs.keys()) == self.original_num
        assert len(self.few_shot_data_dirs.keys()) == self.few_shot_num
        self.num_sample_frames = num_sample_frames

        self.batch_size = kwargs['batch_size']  # a hack: the index layout in __getitem__ needs the batch size

        # for debugging, optionally restrict to a subset of categories
        if "override_categories" in kwargs:
            self.override_categories = kwargs["override_categories"]
        else:
            self.override_categories = None

        # original dataset
        original_data_paths = {}
        for k, v in self.original_data_dirs.items():

            # categories override
            if self.override_categories is not None:
                if k not in self.override_categories:
                    continue

            sequences = self._make_sequences(v)
            samples = []
            for seq in sequences:
                samples += seq
            if shuffle:
                random.shuffle(samples)
            original_data_paths.update({k: samples})

        # few-shot dataset
        enhance_back_view = kwargs['enhance_back_view']
        if enhance_back_view:
            enhance_back_view_path = kwargs['enhance_back_view_path']

        few_shot_data_paths = {}
        for k, v in self.few_shot_data_dirs.items():

            # categories override
            if self.override_categories is not None:
                if k not in self.override_categories:
                    continue
            if k.startswith('_'):
                # a leading underscore marks new-data categories that share a name with one of the 7 original categories
                v = v.replace(k, k[1:])

            if isinstance(v, str):
                result = sorted(glob(os.path.join(v, '*'+self.image_loaders[0][0])))
            elif isinstance(v, list):
                result = []
                for _v in v:
                    result = result + sorted(glob(os.path.join(_v, '*'+self.image_loaders[0][0])))
            else:
                raise NotImplementedError

            result = [p.replace(self.image_loaders[0][0], '{}') for p in result]
            sequences = result

            # The original 7 categories use pre-defined paths to separate train and test;
            # for the few-shot data, is_validation decides whether this dataset is train or test.
            # If enhanced back views are used, the multiplied back-view image paths are prepended
            # to the sequence, i.e., back-view images are never used for validation.
            if enhance_back_view:
                back_view_dir = os.path.join(enhance_back_view_path, k, 'train')
                back_view_result = sorted(glob(os.path.join(back_view_dir, '*'+self.image_loaders[0][0])))
                back_view_result = [p.replace(self.image_loaders[0][0], '{}') for p in back_view_result]
                mul_bv_sequences = self._more_back_views(back_view_result, result)
                sequences = mul_bv_sequences + sequences

            if is_validation:
                sequences = sequences[-val_image_num:]
            else:
                sequences = sequences[:-val_image_num]

            if shuffle:
                random.shuffle(sequences)
            few_shot_data_paths.update({k: sequences})

        # for visualization purposes
        self.pure_ori_data_path = original_data_paths
        self.pure_fs_data_path = few_shot_data_paths

        self.few_shot_data_length = self._get_data_length(few_shot_data_paths)  # original length of each few-shot category

        if disable_fewshot:
            few_shot_data_paths = {}

        self.dataset_split_num = dataset_split_num  # if -1, pad to the longest category; otherwise pad and split with this number
        if is_validation:
            self.dataset_split_num = -1  # never split the dataset for validation

        if self.dataset_split_num == -1:
            self.all_data_paths, self.one_category_num = self._pad_paths(original_data_paths, few_shot_data_paths)
            self.all_category_num = len(self.all_data_paths.keys())
            self.all_category_names = list(self.all_data_paths.keys())
            self.original_category_names = list(self.original_data_dirs.keys())
        elif self.dataset_split_num > 0:
            self.all_data_paths, self.one_category_num, self.original_category_names = self._pad_paths_withnum(original_data_paths, few_shot_data_paths, self.dataset_split_num)
            self.all_category_num = len(self.all_data_paths.keys())
            self.all_category_names = list(self.all_data_paths.keys())
        else:
            raise NotImplementedError

        self.in_image_size = in_image_size
        self.out_image_size = out_image_size
        self.load_background = load_background
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.in_image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.out_image_size, interpolation=Image.NEAREST), transforms.ToTensor()])
        self.random_flip = random_flip
        self.load_dino_feature = load_dino_feature
        if load_dino_feature:
            self.dino_feature_loaders = [(f"feat{dino_feature_dim}.png", dino_loader, dino_feature_dim)]

    def _more_back_views(self, back_view_seq, seq):
        if len(back_view_seq) == 0:
            # category without back views
            return []
        factor = 5
        length = (len(seq) // factor) * (factor - 1)
        mul_f = length // len(back_view_seq)
        pad_f = length % len(back_view_seq)
        new_seq = mul_f * back_view_seq + back_view_seq[:pad_f]
        return new_seq

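    # Note on _more_back_views: with factor = 5, the returned list has
    # (factor - 1) / factor = 4/5 as many entries as the regular sequence, filled
    # by repeating (and truncating) the available back-view paths. Prepending it
    # therefore makes back views roughly 4 out of every 9 training images for the
    # category, compensating for how rare back views are in the raw data.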
    def _get_data_length(self, paths):
        data_length = {}
        for k, v in paths.items():
            length = len(v)
            data_length.update({k: length})
        return data_length

    def _make_sequences(self, path):
        result = []
        for d in sorted(os.scandir(path), key=lambda e: e.name):
            if d.is_dir():
                files = self._parse_folder(d)
                if len(files) >= 1:
                    result.append(files)
        return result

    def _parse_folder(self, path):
        result = sorted(glob(os.path.join(path, '*'+self.image_loaders[0][0])))
        result = [p.replace(self.image_loaders[0][0], '{}') for p in result]

        if len(result) <= 0:
            return []
        return result

    def _pad_paths(self, ori_paths, fs_paths):
        img_nums = []
        all_paths = copy.deepcopy(ori_paths)
        all_paths.update(fs_paths)
        for _, v in all_paths.items():
            img_nums.append(len(v))

        img_num = max(img_nums)
        img_num = (img_num // self.batch_size) * self.batch_size

        for k, v in all_paths.items():
            if len(v) < img_num:
                mul_time = img_num // len(v)
                pad_time = img_num % len(v)
                # for each v, shuffle it between repetitions
                shuffle_v = copy.deepcopy(v)
                new_v = []
                for i in range(mul_time):
                    new_v = new_v + shuffle_v
                    random.shuffle(shuffle_v)
                del shuffle_v
                new_v = new_v + v[0:pad_time]
                all_paths[k] = new_v
            elif len(v) > img_num:
                all_paths[k] = v[:img_num]
            else:
                continue

        return all_paths, img_num

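    # _pad_paths equalizes category sizes: every category is padded (by repeated,
    # reshuffled copies) or truncated to the largest category size, rounded down
    # to a multiple of batch_size. E.g. with batch_size=8 and categories of 100,
    # 37 and 250 images, all three end up with (250 // 8) * 8 = 248 entries, so
    # each category contributes exactly 31 full batches per epoch.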
    def _pad_paths_withnum(self, ori_paths, fs_paths, split_num=1000):
        img_num = (split_num // self.batch_size) * self.batch_size
        all_paths = {}
        orig_cat_names = []

        for k, v in ori_paths.items():
            total_num = ((len(v) // img_num) + 1) * img_num
            pad_num = total_num - len(v)
            split_num = total_num // img_num

            new_v = copy.deepcopy(v)
            random.shuffle(new_v)
            all_v = v + new_v[:pad_num]
            del new_v

            for sn in range(split_num):
                split_cat_name = f'{k}_' + '%03d' % sn
                all_paths.update({
                    split_cat_name: all_v[sn*img_num: (sn+1)*img_num]
                })
                orig_cat_names.append(split_cat_name)

        for k, v in fs_paths.items():
            if len(v) < img_num:
                mul_time = img_num // len(v)
                pad_time = img_num % len(v)
                # for each v, shuffle it between repetitions
                shuffle_v = copy.deepcopy(v)
                new_v = []
                for i in range(mul_time):
                    new_v = new_v + shuffle_v
                    random.shuffle(shuffle_v)
                del shuffle_v
                new_v = new_v + v[0:pad_time]
                all_paths.update({
                    k: new_v
                })
            elif len(v) > img_num:
                all_paths.update({
                    k: v[:img_num]
                })
            else:
                continue

        return all_paths, img_num, orig_cat_names

    def _load_ids(self, path_patterns, loaders, transform=None):
        result = []
        for loader in loaders:
            for p in path_patterns:
                x = loader[1](p.format(loader[0]), *loader[2:])
                if transform:
                    x = transform(x)
                result.append(x)
        return tuple(result)

    def _shuffle_all(self):
        for k, v in self.all_data_paths.items():
            new_v = copy.deepcopy(v)
            random.shuffle(new_v)
            self.all_data_paths[k] = new_v
        return None

    def __len__(self):
        return self.all_category_num * self.one_category_num

    def __getitem__(self, index):
        '''
        This dataset must have non-shuffled index!!
        '''
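        # Index layout: consecutive blocks of batch_size indices cycle through the
        # categories, so every batch drawn in order contains a single category.
        # E.g. with batch_size=2 and 3 categories A, B, C:
        #   index 0,1 -> A[0],A[1]; index 2,3 -> B[0],B[1]; index 4,5 -> C[0],C[1];
        #   index 6,7 -> A[2],A[3]; and so on.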
        category_idx = (index % (self.batch_size * self.all_category_num)) // self.batch_size
        path_idx = (index // (self.batch_size * self.all_category_num)) * self.batch_size + (index % (self.batch_size * self.all_category_num)) - category_idx * self.batch_size
        category_name = self.all_category_names[category_idx]
        paths = [self.all_data_paths[category_name][path_idx]]  # len-1 sequence

        if category_name in self.original_category_names:
            bbox_loaders = self.original_bbox_loaders
            use_original_bbox = True
        else:
            bbox_loaders = self.few_shot_bbox_loaders
            use_original_bbox = False

        masks = torch.stack(self._load_ids(paths, self.mask_loaders, transform=self.mask_transform), 0)  # load all masks
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_fg), 0)  # load all images
            images_bg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_bg), 0)  # load all images
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = torch.stack(self._load_ids(paths, self.image_loaders, transform=self.image_transform), 0)  # load all images

        flows = torch.zeros(1)
        bboxs = torch.stack(self._load_ids(paths, bbox_loaders, transform=torch.FloatTensor), 0)  # load bounding boxes for all images
        if not use_original_bbox:
            bboxs = torch.cat([bboxs, torch.Tensor([[category_idx]]).float()], dim=-1)  # append the category label as an extra field

        mask_valid = get_valid_mask(bboxs, (self.out_image_size, self.out_image_size))  # exclude pixels cropped outside the original image
        if self.load_background:
            bg_image = torchvision.datasets.folder.default_loader(os.path.join(os.path.dirname(paths[0]), 'background_frame.jpg'))
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_images = crop_image(bg_image, bboxs[:, 1:5].int().numpy(), (self.in_image_size, self.in_image_size))
        else:
            bg_images = torch.zeros_like(images)
        if self.load_dino_feature:
            new_dino_data_name = "data_dino_5000"
            new_dino_data_path = os.path.join("/viscam/projects/articulated/dor/combine_all_data_for_ablation_magicpony", new_dino_data_name)

            # TODO: use another version of DINO here by changing the path
            if paths[0].startswith("/viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new"):
                # 7-category data
                new_dino_path = paths[0].replace(
                    "/viscam/projects/articulated/dor/AnimalsMotionDataset/splitted_data/Combine_data/dinov2_new",
                    "/viscam/projects/articulated/zzli/data_dino_5000/7_cat"
                )
                dino_paths = [new_dino_path]
            elif paths[0].startswith("/viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/few_shot_data_all"):
                # 100 categories
                dino_path = paths[0].replace(
                    "/viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/few_shot_data_all",
                    os.path.join(new_dino_data_path, "100_cat")
                )
                dino_path_list = dino_path.split("/")
                new_dino_path = dino_path_list[:-2] + dino_path_list[-1:]  # remove "/train/"
                new_dino_path = '/'.join(new_dino_path)
                dino_paths = [new_dino_path]

            elif paths[0].startswith("/viscam/projects/articulated/zzli/fs_data/data_resize_update/few_shot_data_all"):
                # 100 categories
                dino_path = paths[0].replace(
                    "/viscam/projects/articulated/zzli/fs_data/data_resize_update/few_shot_data_all",
                    os.path.join(new_dino_data_path, "100_cat")
                )
                dino_path_list = dino_path.split("/")
                new_dino_path = dino_path_list[:-2] + dino_path_list[-1:]  # remove "/train/"
                new_dino_path = '/'.join(new_dino_path)
                dino_paths = [new_dino_path]

            elif paths[0].startswith("/viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/segmented_back_view_data"):
                # back views, 100 categories
                dino_path = paths[0].replace(
                    "/viscam/u/zzli/workspace/Animal-Data-Engine/data/data_resize_update/segmented_back_view_data",
                    os.path.join(new_dino_data_path, "back_100_cat")
                )
                dino_path_list = dino_path.split("/")
                new_dino_path = dino_path_list[:-2] + dino_path_list[-1:]  # remove "/train/"
                new_dino_path = '/'.join(new_dino_path)
                dino_paths = [new_dino_path]

            elif paths[0].startswith("/viscam/projects/articulated/dor/Animal-Data-Engine/data/data_resize_update/train_with_classes_filtered"):
                # animal3d
                dino_path = paths[0].replace(
                    "/viscam/projects/articulated/dor/Animal-Data-Engine/data/data_resize_update/train_with_classes_filtered",
                    os.path.join(new_dino_data_path, "animal3D")
                )
                dino_path_list = dino_path.split("/")
                new_dino_path = dino_path_list[:-2] + dino_path_list[-1:]  # remove "/train/"
                new_dino_path = '/'.join(new_dino_path)
                dino_paths = [new_dino_path]
            else:
                raise NotImplementedError
            dino_features = torch.stack(self._load_ids(dino_paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)  # BxFx64x224x224
        else:
            dino_features = torch.zeros(1)

        dino_clusters = torch.zeros(1)

        # These are unused for image datasets
        seq_idx = 0
        seq_idx = torch.LongTensor([seq_idx])
        frame_idx = torch.arange(0, 1).long()

        if self.random_flip and np.random.rand() < 0.5:
            images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters = horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters)

        ## pad shorter sequence
        if len(paths) < self.num_sample_frames:
            num_pad = self.num_sample_frames - len(paths)
            images = torch.cat([images[:1]] * num_pad + [images], 0)
            masks = torch.cat([masks[:1]] * num_pad + [masks], 0)
            mask_dt = torch.cat([mask_dt[:1]] * num_pad + [mask_dt], 0)
            mask_valid = torch.cat([mask_valid[:1]] * num_pad + [mask_valid], 0)
            if flows.dim() > 1:
                flows = torch.cat([flows[:1] * 0] * num_pad + [flows], 0)
            bboxs = torch.cat([bboxs[:1]] * num_pad + [bboxs], 0)
            bg_images = torch.cat([bg_images[:1]] * num_pad + [bg_images], 0)
            if dino_features.dim() > 1:
                dino_features = torch.cat([dino_features[:1]] * num_pad + [dino_features], 0)
            if dino_clusters.dim() > 1:
                dino_clusters = torch.cat([dino_clusters[:1]] * num_pad + [dino_clusters], 0)
            frame_idx = torch.cat([frame_idx[:1]] * num_pad + [frame_idx], 0)

        out = (*map(none_to_nan, (images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, category_name)), )
        return out


def get_sequence_loader_quadrupeds(original_data_dirs, few_shot_data_dirs, original_num, few_shot_num, rank, world_size, **kwargs):
    dataset = Quadrupeds_Image_Dataset(original_data_dirs, few_shot_data_dirs, original_num, few_shot_num, **kwargs)
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=False
    )
    loaders = []
    loaders += [torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=kwargs['batch_size'], shuffle=False, drop_last=True, num_workers=kwargs['num_workers'], pin_memory=True)]

    return loaders

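# Note: the DistributedSampler above is deliberately created with shuffle=False.
# Quadrupeds_Image_Dataset.__getitem__ decodes the category from the raw index,
# so shuffling indices would break the one-category-per-batch guarantee;
# epoch-level randomness comes from reshuffling the path lists themselves
# (see _shuffle_all).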

class Quadrupeds_Image_Test_Dataset(Dataset):
    def __init__(self, test_data_dirs, num_sample_frames=2, in_image_size=256, out_image_size=256, shuffle=False, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.png', load_dino_feature=False, dino_feature_dim=64, flow_bool=False, **kwargs):
        self.few_shot_data_dirs = test_data_dirs

        self.image_loaders = [("rgb"+rgb_suffix, torchvision.datasets.folder.default_loader)]
        self.mask_loaders = [("mask.png", torchvision.datasets.folder.default_loader)]
        self.original_bbox_loaders = [("box.txt", box_loader)]
        self.few_shot_bbox_loaders = [("box.txt", few_shot_box_loader)]

        self.num_sample_frames = num_sample_frames

        self.batch_size = kwargs['batch_size']  # a hack: the index layout in __getitem__ needs the batch size

        few_shot_data_paths = {}
        for k, v in self.few_shot_data_dirs.items():

            if k.startswith('_'):
                # a leading underscore marks new-data categories that share a name with one of the 7 original categories
                v = v.replace(k, k[1:])

            if isinstance(v, str):
                result = sorted(glob(os.path.join(v, '*'+self.image_loaders[0][0])))
            elif isinstance(v, list):
                result = []
                for _v in v:
                    result = result + sorted(glob(os.path.join(_v, '*'+self.image_loaders[0][0])))
            else:
                raise NotImplementedError

            result = [p.replace(self.image_loaders[0][0], '{}') for p in result]
            sequences = result

            if shuffle:
                random.shuffle(sequences)
            few_shot_data_paths.update({k: sequences})

        # for visualization purposes
        self.pure_fs_data_path = few_shot_data_paths

        self.all_data_paths, self.one_category_num = self._pad_paths(few_shot_data_paths)
        self.all_category_num = len(self.all_data_paths.keys())
        self.all_category_names = list(self.all_data_paths.keys())

        self.in_image_size = in_image_size
        self.out_image_size = out_image_size
        self.load_background = load_background
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.in_image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.out_image_size, interpolation=Image.NEAREST), transforms.ToTensor()])
        self.random_flip = random_flip
        self.load_dino_feature = load_dino_feature
        if load_dino_feature:
            self.dino_feature_loaders = [(f"feat{dino_feature_dim}.png", dino_loader, dino_feature_dim)]

    def _pad_paths(self, fs_paths):
        img_nums = []
        all_paths = copy.deepcopy(fs_paths)
        for _, v in all_paths.items():
            img_nums.append(len(v))

        img_num = max(img_nums)
        img_num = (img_num // self.batch_size) * self.batch_size

        for k, v in all_paths.items():
            if len(v) < img_num:
                mul_time = img_num // len(v)
                pad_time = img_num % len(v)
                # for each v, shuffle it between repetitions
                shuffle_v = copy.deepcopy(v)
                new_v = []
                for i in range(mul_time):
                    new_v = new_v + shuffle_v
                    random.shuffle(shuffle_v)
                del shuffle_v
                new_v = new_v + v[0:pad_time]
                all_paths[k] = new_v
            elif len(v) > img_num:
                all_paths[k] = v[:img_num]
            else:
                continue

        return all_paths, img_num

    def _load_ids(self, path_patterns, loaders, transform=None):
        result = []
        for loader in loaders:
            for p in path_patterns:
                x = loader[1](p.format(loader[0]), *loader[2:])
                if transform:
                    x = transform(x)
                result.append(x)
        return tuple(result)

    def _shuffle_all(self):
        for k, v in self.all_data_paths.items():
            new_v = copy.deepcopy(v)
            random.shuffle(new_v)
            self.all_data_paths[k] = new_v
        return None

    def __len__(self):
        return self.all_category_num * self.one_category_num

    def __getitem__(self, index):
        '''
        This dataset must have non-shuffled index!!
        '''
        category_idx = (index % (self.batch_size * self.all_category_num)) // self.batch_size
        path_idx = (index // (self.batch_size * self.all_category_num)) * self.batch_size + (index % (self.batch_size * self.all_category_num)) - category_idx * self.batch_size
        category_name = self.all_category_names[category_idx]
        paths = [self.all_data_paths[category_name][path_idx]]  # len-1 sequence

        # if category_name in self.original_category_names:
        #     bbox_loaders = self.original_bbox_loaders
        #     use_original_bbox = True
        # else:
        bbox_loaders = self.few_shot_bbox_loaders
        use_original_bbox = False

        masks = torch.stack(self._load_ids(paths, self.mask_loaders, transform=self.mask_transform), 0)  # load all masks
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.in_image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_fg), 0)  # load all images
            images_bg = torch.stack(self._load_ids(paths, self.image_loaders, transform=image_transform_bg), 0)  # load all images
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = torch.stack(self._load_ids(paths, self.image_loaders, transform=self.image_transform), 0)  # load all images

        flows = torch.zeros(1)
        bboxs = torch.stack(self._load_ids(paths, bbox_loaders, transform=torch.FloatTensor), 0)  # load bounding boxes for all images
        if not use_original_bbox:
            bboxs = torch.cat([bboxs, torch.Tensor([[category_idx]]).float()], dim=-1)  # append the category label as an extra field

        mask_valid = get_valid_mask(bboxs, (self.out_image_size, self.out_image_size))  # exclude pixels cropped outside the original image
        if self.load_background:
            bg_image = torchvision.datasets.folder.default_loader(os.path.join(os.path.dirname(paths[0]), 'background_frame.jpg'))
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_images = crop_image(bg_image, bboxs[:, 1:5].int().numpy(), (self.in_image_size, self.in_image_size))
        else:
            bg_images = torch.zeros_like(images)
        if self.load_dino_feature:
            dino_features = torch.stack(self._load_ids(paths, self.dino_feature_loaders, transform=torch.FloatTensor), 0)  # BxFx64x224x224
        else:
            dino_features = torch.zeros(1)

        dino_clusters = torch.zeros(1)

        # These are unused for image datasets
        seq_idx = 0
        seq_idx = torch.LongTensor([seq_idx])
        frame_idx = torch.arange(0, 1).long()

        if self.random_flip and np.random.rand() < 0.5:
            images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters = horizontal_flip_all(images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters)

        ## pad shorter sequence
        if len(paths) < self.num_sample_frames:
            num_pad = self.num_sample_frames - len(paths)
            images = torch.cat([images[:1]] * num_pad + [images], 0)
            masks = torch.cat([masks[:1]] * num_pad + [masks], 0)
            mask_dt = torch.cat([mask_dt[:1]] * num_pad + [mask_dt], 0)
            mask_valid = torch.cat([mask_valid[:1]] * num_pad + [mask_valid], 0)
            if flows.dim() > 1:
                flows = torch.cat([flows[:1] * 0] * num_pad + [flows], 0)
            bboxs = torch.cat([bboxs[:1]] * num_pad + [bboxs], 0)
            bg_images = torch.cat([bg_images[:1]] * num_pad + [bg_images], 0)
            if dino_features.dim() > 1:
                dino_features = torch.cat([dino_features[:1]] * num_pad + [dino_features], 0)
            if dino_clusters.dim() > 1:
                dino_clusters = torch.cat([dino_clusters[:1]] * num_pad + [dino_clusters], 0)
            frame_idx = torch.cat([frame_idx[:1]] * num_pad + [frame_idx], 0)

        out = (*map(none_to_nan, (images, masks, mask_dt, mask_valid, flows, bboxs, bg_images, dino_features, dino_clusters, seq_idx, frame_idx, category_name)), )
        return out


def get_test_loader_quadrupeds(test_data_dirs, rank, world_size, **kwargs):
    dataset = Quadrupeds_Image_Test_Dataset(test_data_dirs, **kwargs)
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=False
    )
    loaders = []
    loaders += [torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=kwargs['batch_size'], shuffle=False, drop_last=True, num_workers=kwargs['num_workers'], pin_memory=True)]

    return loaders


def get_sequence_loader(data_dir, **kwargs):
    if isinstance(data_dir, dict):
        loaders = []
        for k, v in data_dir.items():
            dataset = NFrameSequenceDataset(v, cat_name=k, **kwargs)
            loader = torch.utils.data.DataLoader(dataset, batch_size=kwargs['batch_size'], shuffle=kwargs['shuffle'], num_workers=kwargs['num_workers'], pin_memory=True)
            loaders += [loader]
        return loaders
    else:
        return [get_sequence_loader_single(data_dir, **kwargs)]


def get_sequence_loader_single(data_dir, mode='all_frame', is_validation=False, batch_size=256, num_workers=4, in_image_size=256, out_image_size=256, debug_seq=False, num_sample_frames=2, skip_beginning=4, skip_end=4, min_seq_len=10, max_seq_len=256, random_sample=False, shuffle=False, dense_sample=True, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.jpg', load_dino_feature=False, load_dino_cluster=False, dino_feature_dim=64):
    if mode == 'n_frame':
        dataset = NFrameSequenceDataset(data_dir, num_sample_frames=num_sample_frames, skip_beginning=skip_beginning, skip_end=skip_end, min_seq_len=min_seq_len, in_image_size=in_image_size, out_image_size=out_image_size, debug_seq=debug_seq, random_sample=random_sample, shuffle=shuffle, dense_sample=dense_sample, color_jitter=color_jitter, load_background=load_background, random_flip=random_flip, rgb_suffix=rgb_suffix, load_dino_feature=load_dino_feature, load_dino_cluster=load_dino_cluster, dino_feature_dim=dino_feature_dim)
    else:
        raise NotImplementedError
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=not is_validation,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader


def get_sequence_loader_ddp(data_dir, world_size, rank, use_few_shot=False, **kwargs):
    original_classes_num = 0
    if isinstance(data_dir, list) and len(data_dir) == 2 and isinstance(data_dir[-1], dict):
        # a hack for the few-shot experiment: data_dir = [original_classes_num, {category: dir, ...}]
        original_classes_num = data_dir[0]
        data_dir = data_dir[-1]
    if isinstance(data_dir, dict):
        loaders = []
        cnt = original_classes_num
        for k, v in data_dir.items():
            if use_few_shot:
                dataset = FewShotImageDataset(v, cat_name=k, cat_num=cnt, **kwargs)
                cnt += 1
            else:
                dataset = NFrameSequenceDataset(v, cat_name=k, **kwargs)
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset,
                num_replicas=world_size,
                rank=rank,
            )
            loaders += [torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=kwargs['batch_size'], shuffle=False, drop_last=True, num_workers=kwargs['num_workers'], pin_memory=True)]
        return loaders
    else:
        return [get_sequence_loader_single_ddp(data_dir, world_size, rank, **kwargs)]

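# Note: when data_dir is given as [original_classes_num, {category: dir, ...}],
# the integer offsets the FewShotImageDataset category labels so that few-shot
# categories are numbered after the original classes (cnt starts at
# original_classes_num and increments once per category).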

def get_sequence_loader_single_ddp(data_dir, world_size, rank, mode='all_frame', is_validation=False, batch_size=256, num_workers=4, in_image_size=256, out_image_size=256, debug_seq=False, num_sample_frames=2, skip_beginning=4, skip_end=4, min_seq_len=10, max_seq_len=256, random_sample=False, shuffle=False, dense_sample=True, color_jitter=None, load_background=False, random_flip=False, rgb_suffix='.jpg', load_dino_feature=False, load_dino_cluster=False, dino_feature_dim=64, flow_bool=False):
    if mode == 'n_frame':
        dataset = NFrameSequenceDataset(data_dir, num_sample_frames=num_sample_frames, skip_beginning=skip_beginning, skip_end=skip_end, min_seq_len=min_seq_len, in_image_size=in_image_size, out_image_size=out_image_size, debug_seq=debug_seq, random_sample=random_sample, shuffle=shuffle, dense_sample=dense_sample, color_jitter=color_jitter, load_background=load_background, random_flip=random_flip, rgb_suffix=rgb_suffix, load_dino_feature=load_dino_feature, load_dino_cluster=load_dino_cluster, dino_feature_dim=dino_feature_dim, flow_bool=flow_bool)
    else:
        raise NotImplementedError
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=world_size,
        rank=rank,
    )
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        shuffle=False,
        drop_last=True,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader


class ImageDataset(Dataset):
    def __init__(self, root, is_validation=False, image_size=256, color_jitter=None):
        super().__init__()
        self.image_loader = ("rgb.jpg", torchvision.datasets.folder.default_loader)
        self.mask_loader = ("mask.png", torchvision.datasets.folder.default_loader)
        self.bbox_loader = ("box.txt", np.loadtxt, 'str')
        self.samples = self._parse_folder(root)
        self.image_size = image_size
        self.color_jitter = color_jitter
        self.image_transform = transforms.Compose([transforms.Resize(self.image_size), transforms.ToTensor()])
        self.mask_transform = transforms.Compose([transforms.Resize(self.image_size, interpolation=Image.NEAREST), transforms.ToTensor()])

    def _parse_folder(self, path):
        result = sorted(glob(os.path.join(path, '**/*'+self.image_loader[0]), recursive=True))
        result = [p.replace(self.image_loader[0], '{}') for p in result]
        return result

    def _load_ids(self, path, loader, transform=None):
        x = loader[1](path.format(loader[0]), *loader[2:])
        if transform:
            x = transform(x)
        return x

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        path = self.samples[index % len(self.samples)]
        masks = self._load_ids(path, self.mask_loader, transform=self.mask_transform).unsqueeze(0)
        mask_dt = compute_distance_transform(masks)
        jitter = False
        if self.color_jitter is not None:
            prob, b, h = self.color_jitter
            if np.random.rand() < prob:
                jitter = True
                color_jitter_tsf_fg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_fg = transforms.Compose([transforms.Resize(self.image_size), color_jitter_tsf_fg, transforms.ToTensor()])
                color_jitter_tsf_bg = transforms.ColorJitter.get_params(brightness=(1-b, 1+b), contrast=None, saturation=None, hue=(-h, h))
                image_transform_bg = transforms.Compose([transforms.Resize(self.image_size), color_jitter_tsf_bg, transforms.ToTensor()])
        if jitter:
            images_fg = self._load_ids(path, self.image_loader, transform=image_transform_fg).unsqueeze(0)
            images_bg = self._load_ids(path, self.image_loader, transform=image_transform_bg).unsqueeze(0)
            images = images_fg * masks + images_bg * (1 - masks)
        else:
            images = self._load_ids(path, self.image_loader, transform=self.image_transform).unsqueeze(0)
        flows = torch.zeros(1)
        bboxs = self._load_ids(path, self.bbox_loader, transform=None)
        bboxs[0] = '0'
        bboxs = torch.FloatTensor(bboxs.astype('float')).unsqueeze(0)
        bg_fpath = os.path.join(os.path.dirname(path), 'background_frame.jpg')
        if os.path.isfile(bg_fpath):
            bg_image = torchvision.datasets.folder.default_loader(bg_fpath)
            if jitter:
                bg_image = color_jitter_tsf_bg(bg_image)
            bg_image = transforms.ToTensor()(bg_image)
        else:
            bg_image = images[0]
        seq_idx = torch.LongTensor([index])
        frame_idx = torch.LongTensor([0])
        return images, masks, mask_dt, flows, bboxs, bg_image, seq_idx, frame_idx


def get_image_loader(data_dir, is_validation=False, batch_size=256, num_workers=4, image_size=256, color_jitter=None):
    dataset = ImageDataset(data_dir, is_validation=is_validation, image_size=image_size, color_jitter=color_jitter)

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader


def get_image_loader_ddp(data_dir, world_size, rank, is_validation=False, batch_size=256, num_workers=4, image_size=256, color_jitter=None):
    dataset = ImageDataset(data_dir, is_validation=is_validation, image_size=image_size, color_jitter=color_jitter)

    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=world_size,
        rank=rank,
    )
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        shuffle=False,
        drop_last=True,
        num_workers=num_workers,
        pin_memory=True
    )
    return loader
video3d/diffusion/sd.py
ADDED
@@ -0,0 +1,252 @@
import os
# os.environ['HUGGINGFACE_HUB_CACHE'] = '/work/tomj/cache/huggingface_hub'
# os.environ['HF_HOME'] = '/work/tomj/cache/huggingface_hub'
os.environ['HUGGINGFACE_HUB_CACHE'] = '/viscam/u/zzli'
os.environ['HF_HOME'] = '/viscam/u/zzli'

from transformers import CLIPTextModel, CLIPTokenizer, logging
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler

# Suppress partial model loading warning
logging.set_verbosity_error()

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.cuda.amp import custom_bwd, custom_fwd

class SpecifyGradient(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, input_tensor, gt_grad):
        ctx.save_for_backward(gt_grad)
        return torch.zeros([1], device=input_tensor.device, dtype=input_tensor.dtype)  # dummy loss value

    @staticmethod
    @custom_bwd
    def backward(ctx, grad):
        gt_grad, = ctx.saved_tensors
        batch_size = len(gt_grad)
        return gt_grad / batch_size, None

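# SpecifyGradient implements the score-distillation trick: the forward pass
# returns a dummy scalar "loss", and the backward pass injects a precomputed
# gradient (gt_grad) directly into the input tensor, bypassing autograd through
# the diffusion model. train_step below achieves the same effect without a
# custom Function by regressing the latents onto the detached target
# (latents - grad).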
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


class StableDiffusion(nn.Module):
    def __init__(self, device, sd_version='2.1', hf_key=None, torch_dtype=torch.float32):
        super().__init__()

        self.device = device
        self.sd_version = sd_version
        self.torch_dtype = torch_dtype

        print('[INFO] loading stable diffusion...')

        if hf_key is not None:
            print(f'[INFO] using hugging face custom model key: {hf_key}')
            model_key = hf_key
        elif self.sd_version == '2.1':
            model_key = "stabilityai/stable-diffusion-2-1-base"
        elif self.sd_version == '2.0':
            model_key = "stabilityai/stable-diffusion-2-base"
        elif self.sd_version == '1.5':
            model_key = "runwayml/stable-diffusion-v1-5"
        else:
            raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')

        # Create model
        self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", torch_dtype=torch_dtype).to(self.device)
        self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
        self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
        self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)

        self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
        # self.scheduler = PNDMScheduler.from_pretrained(model_key, subfolder="scheduler")

        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

        print('[INFO] loaded stable diffusion!')

    def get_text_embeds(self, prompt, negative_prompt):
        # prompt, negative_prompt: [str]

        # Tokenize text and get embeddings
        text_input = self.tokenizer(prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')

        with torch.no_grad():
            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]

        # Do the same for unconditional embeddings
        uncond_input = self.tokenizer(negative_prompt, padding='max_length', max_length=self.tokenizer.model_max_length, return_tensors='pt')

        with torch.no_grad():
            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

        # Concatenate for final embeddings
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
        return text_embeddings

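    # Note: the [uncond, text] concatenation order above is load-bearing; both
    # train_step and produce_latents rely on it when splitting the batched
    # prediction with noise_pred.chunk(2) for classifier-free guidance.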
    def train_step(self, text_embeddings, pred_rgb,
                   guidance_scale=100, loss_weight=1.0, min_step_pct=0.02, max_step_pct=0.98, return_aux=False):
        pred_rgb = pred_rgb.to(self.torch_dtype)
        text_embeddings = text_embeddings.to(self.torch_dtype)
        b = pred_rgb.shape[0]

        # interpolate to 512x512 to be fed into the vae
        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)

        # timestep ~ U(min_step_pct, max_step_pct) to avoid very high/low noise levels
        min_step = int(self.num_train_timesteps * min_step_pct)
        max_step = int(self.num_train_timesteps * max_step_pct)
        t = torch.randint(min_step, max_step + 1, [b], dtype=torch.long, device=self.device)

        # encode image into latents with the vae; requires grad!
        latents = self.encode_imgs(pred_rgb_512)

        # predict the noise residual with the unet; NO grad!
        with torch.no_grad():
            # add noise
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler.add_noise(latents, noise, t)
            # predict noise
            latent_model_input = torch.cat([latents_noisy] * 2)
            t_input = torch.cat([t, t])
            noise_pred = self.unet(latent_model_input, t_input, encoder_hidden_states=text_embeddings).sample

        # perform guidance (high scale from the paper!)
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        # noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # w(t), sigma_t^2
        w = (1 - self.alphas[t])
        # w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
        grad = loss_weight * w[:, None, None, None] * (noise_pred - noise)

        # clip grad for stable training?
        # grad = grad.clamp(-10, 10)
        grad = torch.nan_to_num(grad)

        # since an item is omitted in grad, autograd cannot produce it directly; either
        # inject it via loss = SpecifyGradient.apply(latents, grad), or use the
        # equivalent detached-target formulation below
        targets = (latents - grad).detach()
        loss = 0.5 * F.mse_loss(latents.float(), targets, reduction='sum') / latents.shape[0]

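        # Why this works: d(loss)/d(latents) = (latents - targets) / B = grad / B,
        # so backpropagating this MSE loss delivers exactly the SDS gradient
        # w(t) * (noise_pred - noise) to the latents (and through the VAE encoder
        # to the rendered image), matching SpecifyGradient's behavior.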
        if return_aux:
            aux = {'grad': grad, 't': t, 'w': w}
            return loss, aux
        else:
            return loss


    def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):

        if latents is None:
            latents = torch.randn((text_embeddings.shape[0] // 2, self.unet.config.in_channels, height // 8, width // 8), device=self.device)

        self.scheduler.set_timesteps(num_inference_steps)

        with torch.autocast('cuda'):
            for i, t in enumerate(self.scheduler.timesteps):
                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes
                latent_model_input = torch.cat([latents] * 2)

                # predict the noise residual
                with torch.no_grad():
                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

                # perform guidance (same formulation as in train_step)
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents)['prev_sample']

        return latents

    def decode_latents(self, latents):

        latents = 1 / self.vae.config.scaling_factor * latents

        with torch.no_grad():
            imgs = self.vae.decode(latents).sample

        imgs = (imgs / 2 + 0.5).clamp(0, 1)

        return imgs

    def encode_imgs(self, imgs):
        # imgs: [B, 3, H, W] in [0, 1]

        imgs = 2 * imgs - 1

        posterior = self.vae.encode(imgs).latent_dist
        latents = posterior.sample() * self.vae.config.scaling_factor

        return latents

    def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):

        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        # Prompts -> text embeds
        text_embeds = self.get_text_embeds(prompts, negative_prompts)  # [2, 77, 768]

        # Text embeds -> img latents
        latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)  # [1, 4, 64, 64]

        # Img latents -> imgs
        imgs = self.decode_latents(latents)  # [1, 3, 512, 512]

        # Img to numpy
        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype('uint8')

        return imgs


if __name__ == '__main__':
    import argparse
    import matplotlib.pyplot as plt

    parser = argparse.ArgumentParser()
    parser.add_argument('prompt', type=str)
    parser.add_argument('--negative', default='', type=str)
    parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1'], help="stable diffusion version")
    parser.add_argument('--hf_key', type=str, default=None, help="hugging face Stable diffusion model key")
    parser.add_argument('-H', type=int, default=512)
    parser.add_argument('-W', type=int, default=512)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--steps', type=int, default=50)
    opt = parser.parse_args()

    seed_everything(opt.seed)

    device = torch.device('cuda')

    sd = StableDiffusion(device, opt.sd_version, opt.hf_key)

    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)

    # visualize image
    plt.imshow(imgs[0])
    plt.show()
    plt.savefig(f'{opt.prompt}.png')
video3d/diffusion/sd_utils.py
ADDED
@@ -0,0 +1,123 @@
import torch
import numpy as np
import random
import torch.nn.functional as F

from ..render.light import DirectionalLight

def safe_normalize(x, eps=1e-20):
    return x / torch.sqrt(torch.clamp(torch.sum(x * x, -1, keepdim=True), min=eps))

def get_view_direction(thetas, phis, overhead, front, phi_offset=0):
    # phis [B,]; thetas: [B,]
    # front = 0         [360 - front / 2, front / 2)
    # side (left) = 1   [front / 2, 180 - front / 2)
    # back = 2          [180 - front / 2, 180 + front / 2)
    # side (right) = 3  [180 + front / 2, 360 - front / 2)
    # top = 4           [0, overhead]
    # bottom = 5        [180 - overhead, 180]
    res = torch.zeros(thetas.shape[0], dtype=torch.long)

    # first determine by phis
    phi_offset = np.deg2rad(phi_offset)
    phis = phis + phi_offset
    phis = phis % (2 * np.pi)
    half_front = front / 2

    res[(phis >= (2 * np.pi - half_front)) | (phis < half_front)] = 0
    res[(phis >= half_front) & (phis < (np.pi - half_front))] = 1
    res[(phis >= (np.pi - half_front)) & (phis < (np.pi + half_front))] = 2
    res[(phis >= (np.pi + half_front)) & (phis < (2 * np.pi - half_front))] = 3

    # override by thetas
    res[thetas <= overhead] = 4
    res[thetas >= (np.pi - overhead)] = 5
    return res

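# Example: with front = 60 deg and overhead = 30 deg, a camera at phi = 10 deg,
# theta = 90 deg falls in bin 0 ('front'), phi = 170 deg gives bin 2 ('back'),
# and any camera with theta <= 30 deg is overridden to bin 4 ('overhead').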

def view_direction_id_to_text(view_direction_id):
    dir_texts = ['front', 'side', 'back', 'side', 'overhead', 'bottom']
    return [dir_texts[i] for i in view_direction_id]


def append_text_direction(prompts, dir_texts):
    return [f'{prompt}, {dir_text} view' for prompt, dir_text in zip(prompts, dir_texts)]


def rand_lights(camera_dir, fixed_ambient, fixed_diffuse):
    size = camera_dir.shape[0]
    device = camera_dir.device
    random_fixed_dir = F.normalize(torch.randn_like(camera_dir) + camera_dir, dim=-1)  # centered around camera_dir
    random_fixed_intensity = torch.tensor([fixed_ambient, fixed_diffuse], device=device)[None, :].repeat(size, 1)  # ambient, diffuse
    return DirectionalLight(mlp_in=1, mlp_layers=1, mlp_hidden_size=1,  # dummy values
                            intensity_min_max=[0.5, 1], fixed_dir=random_fixed_dir, fixed_intensity=random_fixed_intensity).to(device)

def rand_poses(size, device, radius_range=[1, 1], theta_range=[0, 120], phi_range=[0, 360], cam_z_offset=10, return_dirs=False, angle_overhead=30, angle_front=60, phi_offset=0, jitter=False, uniform_sphere_rate=0.5):
    ''' generate random poses from an orbit camera
    Args:
        size: batch size of generated poses.
        device: where to allocate the output.
        radius_range: [min, max]
        theta_range: [min, max], should be in [0, pi]
        phi_range: [min, max], should be in [0, 2 * pi]
    Return:
        poses: [size, 4, 4]
    '''

    theta_range = np.deg2rad(theta_range)
    phi_range = np.deg2rad(phi_range)
    angle_overhead = np.deg2rad(angle_overhead)
    angle_front = np.deg2rad(angle_front)

    radius = torch.rand(size, device=device) * (radius_range[1] - radius_range[0]) + radius_range[0]

    phis = torch.rand(size, device=device) * (phi_range[1] - phi_range[0]) + phi_range[0]
    if random.random() < uniform_sphere_rate:
        # based on http://corysimon.github.io/articles/uniformdistn-on-sphere/
        # acos takes values in [-1, 1]; first convert the theta range to fit in [-1, 1]
        theta_range = torch.from_numpy(np.array(theta_range)).to(device)
        theta_amplitude_range = torch.cos(theta_range)
        # sample uniformly in the amplitude (cosine) range
        thetas_amplitude = torch.rand(size, device=device) * (theta_amplitude_range[1] - theta_amplitude_range[0]) + theta_amplitude_range[0]
        # convert back
        thetas = torch.acos(thetas_amplitude)
    else:
        thetas = torch.rand(size, device=device) * (theta_range[1] - theta_range[0]) + theta_range[0]

87 |
+
centers = -torch.stack([
|
88 |
+
radius * torch.sin(thetas) * torch.sin(phis),
|
89 |
+
radius * torch.cos(thetas),
|
90 |
+
radius * torch.sin(thetas) * torch.cos(phis),
|
91 |
+
], dim=-1) # [B, 3]
|
92 |
+
|
93 |
+
targets = 0
|
94 |
+
|
95 |
+
# jitters
|
96 |
+
if jitter:
|
97 |
+
centers = centers + (torch.rand_like(centers) * 0.2 - 0.1)
|
98 |
+
targets = targets + torch.randn_like(centers) * 0.2
|
99 |
+
|
100 |
+
# lookat
|
101 |
+
forward_vector = safe_normalize(targets - centers)
|
102 |
+
up_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1)
|
103 |
+
right_vector = safe_normalize(torch.cross(up_vector, forward_vector, dim=-1))
|
104 |
+
|
105 |
+
if jitter:
|
106 |
+
up_noise = torch.randn_like(up_vector) * 0.02
|
107 |
+
else:
|
108 |
+
up_noise = 0
|
109 |
+
|
110 |
+
up_vector = safe_normalize(torch.cross(forward_vector, right_vector, dim=-1) + up_noise)
|
111 |
+
|
112 |
+
poses = torch.stack([right_vector, up_vector, forward_vector], dim=-1)
|
113 |
+
radius = radius[..., None] - cam_z_offset
|
114 |
+
translations = torch.cat([torch.zeros_like(radius), torch.zeros_like(radius), radius], dim=-1)
|
115 |
+
poses = torch.cat([poses.view(-1, 9), translations], dim=-1)
|
116 |
+
|
117 |
+
if return_dirs:
|
118 |
+
dirs = get_view_direction(thetas, phis, angle_overhead, angle_front, phi_offset=phi_offset)
|
119 |
+
dirs = view_direction_id_to_text(dirs)
|
120 |
+
else:
|
121 |
+
dirs = None
|
122 |
+
|
123 |
+
return poses, dirs
|
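Note (illustration, not part of the commit): a minimal usage sketch for the samplers above. The prompt string and batch size are arbitrary; CPU is used so the sketch runs anywhere.

# Sketch: sample 4 random orbit cameras plus view-dependent prompt suffixes.
import torch
from video3d.diffusion.sd_utils import rand_poses, append_text_direction

device = torch.device('cpu')  # any device works
poses, dirs = rand_poses(4, device, radius_range=[0.8, 1.2], return_dirs=True)
print(poses.shape)  # torch.Size([4, 12]): flattened 3x3 rotation + translation
prompts = append_text_direction(['a photo of a horse'] * 4, dirs)
# -> e.g. ['a photo of a horse, side view', ...]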
video3d/diffusion/vsd.py
ADDED
@@ -0,0 +1,323 @@
import os
os.environ['HUGGINGFACE_HUB_CACHE'] = '/viscam/u/zzli'
os.environ['HF_HOME'] = '/viscam/u/zzli'

from transformers import CLIPTextModel, CLIPTokenizer, logging
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler

from diffusers.loaders import AttnProcsLayers
from diffusers.models.attention_processor import LoRAAttnProcessor
from diffusers.models.embeddings import TimestepEmbedding
from diffusers.utils.import_utils import is_xformers_available

# Suppress partial model loading warning
logging.set_verbosity_error()

import gc
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import tinycudann as tcnn
from video3d.diffusion.sd import StableDiffusion
from torch.cuda.amp import custom_bwd, custom_fwd


def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    tcnn.free_temporary_memory()

class StableDiffusion_VSD(StableDiffusion):
    def __init__(self, device, sd_version='2.1', hf_key=None, torch_dtype=torch.float32, lora_n_timestamp_samples=1):
        super().__init__(device, sd_version=sd_version, hf_key=hf_key, torch_dtype=torch_dtype)

        # self.device = device
        # self.sd_version = sd_version
        # self.torch_dtype = torch_dtype

        if hf_key is not None:
            print(f'[INFO] using hugging face custom model key: {hf_key}')
            model_key = hf_key
        elif self.sd_version == '2.1':
            model_key = "stabilityai/stable-diffusion-2-1-base"
        elif self.sd_version == '2.0':
            model_key = "stabilityai/stable-diffusion-2-base"
        elif self.sd_version == '1.5':
            model_key = "runwayml/stable-diffusion-v1-5"
        else:
            raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')

        # # Create model
        # self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", torch_dtype=torch_dtype).to(self.device)
        # self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
        # self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
        # self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)

        # self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
        # # self.scheduler = PNDMScheduler.from_pretrained(model_key, subfolder="scheduler")

        # self.num_train_timesteps = self.scheduler.config.num_train_timesteps
        # self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

        print(f'[INFO] loading stable diffusion VSD modules...')

        self.unet_lora = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)
        cleanup()

        for p in self.vae.parameters():
            p.requires_grad_(False)
        for p in self.text_encoder.parameters():
            p.requires_grad_(False)
        for p in self.unet.parameters():
            p.requires_grad_(False)
        for p in self.unet_lora.parameters():
            p.requires_grad_(False)

        # set up LoRA layers
        lora_attn_procs = {}
        for name in self.unet_lora.attn_processors.keys():
            cross_attention_dim = (
                None
                if name.endswith("attn1.processor")
                else self.unet_lora.config.cross_attention_dim
            )
            if name.startswith("mid_block"):
                hidden_size = self.unet_lora.config.block_out_channels[-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(self.unet_lora.config.block_out_channels))[
                    block_id
                ]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = self.unet_lora.config.block_out_channels[block_id]

            lora_attn_procs[name] = LoRAAttnProcessor(
                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
            )

        self.unet_lora.set_attn_processor(lora_attn_procs)

        self.lora_layers = AttnProcsLayers(self.unet_lora.attn_processors).to(
            self.device
        )
        self.lora_layers._load_state_dict_pre_hooks.clear()
        self.lora_layers._state_dict_hooks.clear()
        self.lora_n_timestamp_samples = lora_n_timestamp_samples
        self.scheduler_lora = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")

        print(f'[INFO] loaded stable diffusion VSD modules!')

    def train_lora(
        self,
        latents,
        text_embeddings,
        camera_condition
    ):
        B = latents.shape[0]
        lora_n_timestamp_samples = self.lora_n_timestamp_samples
        latents = latents.detach().repeat(lora_n_timestamp_samples, 1, 1, 1)

        t = torch.randint(
            int(self.num_train_timesteps * 0.0),
            int(self.num_train_timesteps * 1.0),
            [B * lora_n_timestamp_samples],
            dtype=torch.long,
            device=self.device,
        )

        noise = torch.randn_like(latents)
        noisy_latents = self.scheduler_lora.add_noise(latents, noise, t)
        if self.scheduler_lora.config.prediction_type == "epsilon":
            target = noise
        elif self.scheduler_lora.config.prediction_type == "v_prediction":
            target = self.scheduler_lora.get_velocity(latents, noise, t)
        else:
            raise ValueError(
                f"Unknown prediction type {self.scheduler_lora.config.prediction_type}"
            )

        # use view-independent text embeddings in LoRA
        _, text_embeddings_cond = text_embeddings.chunk(2)

        if random.random() < 0.1:
            camera_condition = torch.zeros_like(camera_condition)

        noise_pred = self.unet_lora(
            noisy_latents,
            t,
            encoder_hidden_states=text_embeddings_cond.repeat(
                lora_n_timestamp_samples, 1, 1
            ),
            class_labels=camera_condition.reshape(B, -1).repeat(
                lora_n_timestamp_samples, 1
            ),
            cross_attention_kwargs={"scale": 1.0}
        ).sample

        loss_lora = 0.5 * F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
        return loss_lora


    def train_step(
        self,
        text_embeddings,
        text_embeddings_vd,
        pred_rgb,
        camera_condition,
        im_features,
        guidance_scale=7.5,
        guidance_scale_lora=7.5,
        loss_weight=1.0,
        min_step_pct=0.02,
        max_step_pct=0.98,
        return_aux=False
    ):
        pred_rgb = pred_rgb.to(self.torch_dtype)
        text_embeddings = text_embeddings.to(self.torch_dtype)
        text_embeddings_vd = text_embeddings_vd.to(self.torch_dtype)
        camera_condition = camera_condition.to(self.torch_dtype)
        im_features = im_features.to(self.torch_dtype)

        # condition_label = camera_condition
        condition_label = im_features

        b = pred_rgb.shape[0]

        # interp to 512x512 to be fed into vae.
        # _t = time.time()
        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
        # torch.cuda.synchronize(); print(f'[TIME] guiding: interp {time.time() - _t:.4f}s')

        # timestep ~ U(0.02, 0.98) to avoid very high/low noise level
        min_step = int(self.num_train_timesteps * min_step_pct)
        max_step = int(self.num_train_timesteps * max_step_pct)
        t = torch.randint(min_step, max_step + 1, [b], dtype=torch.long, device=self.device)

        # encode image into latents with vae, requires grad!
        # _t = time.time()
        latents = self.encode_imgs(pred_rgb_512)
        # torch.cuda.synchronize(); print(f'[TIME] guiding: vae enc {time.time() - _t:.4f}s')

        # predict the noise residual with unet, NO grad!
        # _t = time.time()
        with torch.no_grad():
            # add noise
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler.add_noise(latents, noise, t)
            # pred noise
            latent_model_input = torch.cat([latents_noisy] * 2)

            # disable unet class embedding here
            cls_embedding = self.unet.class_embedding
            self.unet.class_embedding = None

            cross_attention_kwargs = None
            noise_pred_pretrain = self.unet(
                latent_model_input,
                torch.cat([t, t]),
                encoder_hidden_states=text_embeddings_vd,
                class_labels=None,
                cross_attention_kwargs=cross_attention_kwargs
            ).sample

            self.unet.class_embedding = cls_embedding

            # use view-independent text embeddings in LoRA
            _, text_embeddings_cond = text_embeddings.chunk(2)

            noise_pred_est = self.unet_lora(
                latent_model_input,
                torch.cat([t, t]),
                encoder_hidden_states=torch.cat([text_embeddings_cond] * 2),
                class_labels=torch.cat(
                    [
                        condition_label.reshape(b, -1),
                        torch.zeros_like(condition_label.reshape(b, -1)),
                    ],
                    dim=0,
                ),
                cross_attention_kwargs={"scale": 1.0},
            ).sample

        noise_pred_pretrain_uncond, noise_pred_pretrain_text = noise_pred_pretrain.chunk(2)

        noise_pred_pretrain = noise_pred_pretrain_uncond + guidance_scale * (
            noise_pred_pretrain_text - noise_pred_pretrain_uncond
        )

        assert self.scheduler.config.prediction_type == "epsilon"
        if self.scheduler_lora.config.prediction_type == "v_prediction":
            alphas_cumprod = self.scheduler_lora.alphas_cumprod.to(
                device=latents_noisy.device, dtype=latents_noisy.dtype
            )
            alpha_t = alphas_cumprod[t] ** 0.5
            sigma_t = (1 - alphas_cumprod[t]) ** 0.5

            noise_pred_est = latent_model_input * torch.cat([sigma_t] * 2, dim=0).reshape(
                -1, 1, 1, 1
            ) + noise_pred_est * torch.cat([alpha_t] * 2, dim=0).reshape(-1, 1, 1, 1)

        noise_pred_est_uncond, noise_pred_est_camera = noise_pred_est.chunk(2)

        noise_pred_est = noise_pred_est_uncond + guidance_scale_lora * (
            noise_pred_est_camera - noise_pred_est_uncond
        )

        # w(t), sigma_t^2
        w = (1 - self.alphas[t])
        # w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
        grad = loss_weight * w[:, None, None, None] * (noise_pred_pretrain - noise_pred_est)

        grad = torch.nan_to_num(grad)

        targets = (latents - grad).detach()
        loss_vsd = 0.5 * F.mse_loss(latents.float(), targets, reduction='sum') / latents.shape[0]

        loss_lora = self.train_lora(latents, text_embeddings, condition_label)

        loss = {
            'loss_vsd': loss_vsd,
            'loss_lora': loss_lora
        }

        if return_aux:
            aux = {'grad': grad, 't': t, 'w': w}
            return loss, aux
        else:
            return loss



if __name__ == '__main__':
    import argparse
    import matplotlib.pyplot as plt

    parser = argparse.ArgumentParser()
    parser.add_argument('prompt', type=str)
    parser.add_argument('--negative', default='', type=str)
    parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1'], help="stable diffusion version")
    parser.add_argument('--hf_key', type=str, default=None, help="hugging face Stable diffusion model key")
    parser.add_argument('-H', type=int, default=512)
    parser.add_argument('-W', type=int, default=512)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--steps', type=int, default=50)
    opt = parser.parse_args()

    seed_everything(opt.seed)

    device = torch.device('cuda')

    sd = StableDiffusion_VSD(device, opt.sd_version, opt.hf_key)

    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)

    # visualize image (save before show, so the figure is not blank)
    plt.imshow(imgs[0])
    plt.savefig(f'{opt.prompt}.png')
    plt.show()
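Note (illustration, not part of the commit): a hedged sketch of how the two returned losses are typically consumed in a training loop. The rendered image, text embeddings (random placeholders shaped [uncond; cond], with (77, 1024) matching SD 2.1's text encoder), camera condition, and image features are all stand-ins for the caller's tensors; the optimizer split is an assumption, not something this file prescribes.

# Sketch only: one guidance step. loss_vsd flows into the 3D representation
# (via pred_rgb); loss_lora only updates the LoRA layers, since latents are
# detached inside train_lora.
import torch

sd = StableDiffusion_VSD(torch.device('cuda'), sd_version='2.1')
opt_lora = torch.optim.AdamW(sd.lora_layers.parameters(), lr=1e-4)

pred_rgb = torch.rand(1, 3, 256, 256, device=sd.device, requires_grad=True)
text_emb = torch.randn(2, 77, 1024, device=sd.device)     # [uncond, cond] placeholder
text_emb_vd = torch.randn(2, 77, 1024, device=sd.device)  # view-dependent placeholder
cam_cond = torch.zeros(1, 12, device=sd.device)
im_feat = torch.zeros(1, 768, device=sd.device)           # hypothetical feature size

loss = sd.train_step(text_emb, text_emb_vd, pred_rgb, cam_cond, im_feat)
(loss['loss_vsd'] + loss['loss_lora']).backward()
opt_lora.step(); opt_lora.zero_grad()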
video3d/discriminator_architecture.py
ADDED
@@ -0,0 +1,83 @@
import torch.nn as nn
import torch
from math import log2
import torch.nn.functional as F
from torch import autograd


class DCDiscriminator(nn.Module):
    ''' DC Discriminator class.

    Args:
        in_dim (int): input dimension
        n_feat (int): features of final hidden layer
        img_size (int): input image size
    '''
    def __init__(self, in_dim=1, out_dim=1, n_feat=512, img_size=256, last_bias=False):
        super().__init__()

        self.in_dim = in_dim
        self.out_dim = out_dim
        n_layers = int(log2(img_size) - 2)
        self.blocks = nn.ModuleList(
            [nn.Conv2d(
                in_dim,
                int(n_feat / (2 ** (n_layers - 1))),
                4, 2, 1, bias=False)] + [nn.Conv2d(
                    int(n_feat / (2 ** (n_layers - i))),
                    int(n_feat / (2 ** (n_layers - 1 - i))),
                    4, 2, 1, bias=False) for i in range(1, n_layers)])

        self.conv_out = nn.Conv2d(n_feat, out_dim, 4, 1, 0, bias=last_bias)
        self.actvn = nn.LeakyReLU(0.2, inplace=True)

    def forward(self, x):
        batch_size = x.shape[0]
        if x.shape[1] != self.in_dim:
            import ipdb; ipdb.set_trace()
            x = x[:, :self.in_dim]
        for layer in self.blocks:
            x = self.actvn(layer(x))

        out = self.conv_out(x)
        out = out.reshape(batch_size, self.out_dim)
        return out


# class ADADiscriminator(DCDiscriminator):
#     def __init__(self, aug, aug_p, **kwargs):
#         super().__init__(**kwargs)
#         self.aug = build_from_config(aug)
#         self.aug.p.copy_(torch.tensor(aug_p, dtype=torch.float32))
#         self.resolution = kwargs['img_size']

#     def get_resolution(self):
#         return self.resolution

#     def forward(self, x, **kwargs):
#         x = self.aug(x)
#         return super().forward(x, **kwargs)


# class ADADiscriminatorView(ADADiscriminator):
#     def __init__(self, out_dim_position, out_dim_latent, **kwargs):
#         self.out_dim_position = out_dim_position
#         self.out_dim_latent = out_dim_latent

#         super().__init__(**kwargs)

def bce_loss_target(d_out, target):
    targets = d_out.new_full(size=d_out.size(), fill_value=target)
    loss = F.binary_cross_entropy_with_logits(d_out, targets)
    return loss.mean()

def compute_grad2(d_out, x_in):
    batch_size = x_in.size(0)
    grad_dout = autograd.grad(
        outputs=d_out.sum(), inputs=x_in,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    grad_dout2 = grad_dout.pow(2)
    assert (grad_dout2.size() == x_in.size())
    reg = grad_dout2.reshape(batch_size, -1).sum(1)
    return reg.mean()
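Note (illustration, not part of the commit): the usual pairing of these pieces is a logistic GAN loss on the discriminator output plus an R1 gradient penalty, which is exactly what `compute_grad2` measures. A short sketch with the default `in_dim=1`, `img_size=256` shapes; the penalty weight 5.0 is an arbitrary choice.

# Sketch: discriminator loss on a batch of real 1-channel 256x256 inputs.
import torch

disc = DCDiscriminator(in_dim=1, out_dim=1, img_size=256)
real = torch.rand(4, 1, 256, 256, requires_grad=True)  # input grads needed for R1
d_real = disc(real)
loss_real = bce_loss_target(d_real, 1.0)   # real samples -> label 1
r1 = compute_grad2(d_real, real)           # E[||grad_x D(x)||^2]
loss_d = loss_real + 5.0 * r1
loss_d.backward()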
video3d/flow/__init__.py
ADDED
File without changes
video3d/flow/flow.py
ADDED
@@ -0,0 +1,51 @@
import sys
sys.path.append('/scratch/shared/beegfs/szwu/projects/video3d/RAFT')
from core.raft import RAFT

from .utils import InputPadder
import torch


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


class FlowModel():
    def __init__(self, model, device):
        args = AttrDict({'model': model, 'small': False, 'mixed_precision': False, 'alternate_corr': False})
        self.model = self.load_model(args, device)
        self.device = device

    @staticmethod
    def load_model(args, device):
        model = torch.nn.DataParallel(RAFT(args))
        model.load_state_dict(torch.load(args.model))

        model = model.module
        model.to(device)
        model.eval()
        return model

    def preprocess_image(self, image):
        # image = image[:, :, ::-1].copy()
        image = torch.from_numpy(image).permute(2, 0, 1).float()
        image = image.to(self.device)
        image = image[None]
        # size = [540, 960]
        # image = torch.nn.functional.interpolate(image, size=size, mode='bilinear', align_corners=False)
        padder = InputPadder(image.shape)
        return padder.pad(image)[0], padder

    def compute_flow(self, frame, next_frame, iters=20):
        frame, padder = self.preprocess_image(frame)
        next_frame, padder = self.preprocess_image(next_frame)
        _, flow = self.model(frame, next_frame, iters=iters, test_mode=True)
        return padder.unpad(flow)[0].permute(1, 2, 0).cpu().numpy()
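Note (illustration, not part of the commit): a usage sketch. The RAFT checkpoint path is a placeholder and the frames are dummies; `compute_flow` expects HxWx3 numpy frames and returns an HxWx2 flow field.

# Sketch: optical flow between two consecutive RGB frames with RAFT.
import numpy as np
import torch

fm = FlowModel('raft-things.pth', torch.device('cuda'))   # hypothetical weights path
frame0 = np.zeros((480, 640, 3), dtype=np.float32)
frame1 = np.zeros((480, 640, 3), dtype=np.float32)
with torch.no_grad():
    flow = fm.compute_flow(frame0, frame1, iters=20)
print(flow.shape)  # (480, 640, 2): per-pixel (dx, dy) displacement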
video3d/flow/utils.py
ADDED
@@ -0,0 +1,23 @@
# Taken from RAFT

import torch.nn.functional as F


class InputPadder:
    """ Pads images such that dimensions are divisible by 8 """
    def __init__(self, dims, mode='sintel'):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8
        pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8
        if mode == 'sintel':
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
        else:
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]

    def pad(self, *inputs):
        return [F.pad(x, self._pad, mode='replicate') for x in inputs]

    def unpad(self, x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
        return x[..., c[0]:c[1], c[2]:c[3]]
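Note (illustration, not part of the commit): the padder contract is pad to the next multiple of 8, run the network, then crop back. A quick round-trip sketch:

# Sketch: InputPadder round-trip on an odd-sized tensor.
import torch

x = torch.randn(1, 3, 437, 701)
padder = InputPadder(x.shape)          # 'sintel' mode: symmetric padding
x_pad, = padder.pad(x)                 # -> (1, 3, 440, 704)
assert padder.unpad(x_pad).shape == x.shape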
video3d/geometry/dlmesh.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch

from ..render import mesh
from ..render import render
from ..render import regularizer

###############################################################################
#  Geometry interface
###############################################################################

class DLMesh(torch.nn.Module):
    def __init__(self, initial_guess, FLAGS):
        super(DLMesh, self).__init__()

        self.FLAGS = FLAGS

        self.initial_guess = initial_guess
        self.mesh = initial_guess.clone()
        print("Base mesh has %d triangles and %d vertices." % (self.mesh.t_pos_idx.shape[0], self.mesh.v_pos.shape[0]))

        self.mesh.v_pos = torch.nn.Parameter(self.mesh.v_pos, requires_grad=True)
        self.register_parameter('vertex_pos', self.mesh.v_pos)

    @torch.no_grad()
    def getAABB(self):
        return mesh.aabb(self.mesh)

    def getMesh(self, material):
        self.mesh.material = material

        imesh = mesh.Mesh(base=self.mesh)
        # Compute normals and tangent space
        imesh = mesh.auto_normals(imesh)
        imesh = mesh.compute_tangents(imesh)
        return imesh

    def render(self, glctx, target, lgt, opt_material, bsdf=None):
        opt_mesh = self.getMesh(opt_material)
        return render.render_mesh(glctx, opt_mesh, target['mvp'], target['campos'], lgt, target['resolution'], spp=target['spp'],
                                  num_layers=self.FLAGS.layers, msaa=True, background=target['background'], bsdf=bsdf)

    def tick(self, glctx, target, lgt, opt_material, loss_fn, iteration):

        # ==============================================================================================
        #  Render optimizable object with identical conditions
        # ==============================================================================================
        buffers = self.render(glctx, target, lgt, opt_material)

        # ==============================================================================================
        #  Compute loss
        # ==============================================================================================
        t_iter = iteration / self.FLAGS.iter

        # Image-space loss, split into a coverage component and a color component
        color_ref = target['img']
        img_loss = torch.nn.functional.mse_loss(buffers['shaded'][..., 3:], color_ref[..., 3:])
        img_loss += loss_fn(buffers['shaded'][..., 0:3] * color_ref[..., 3:], color_ref[..., 0:3] * color_ref[..., 3:])

        reg_loss = torch.tensor([0], dtype=torch.float32, device="cuda")

        # Compute regularizer.
        if self.FLAGS.laplace == "absolute":
            reg_loss += regularizer.laplace_regularizer_const(self.mesh.v_pos, self.mesh.t_pos_idx) * self.FLAGS.laplace_scale * (1 - t_iter)
        elif self.FLAGS.laplace == "relative":
            reg_loss += regularizer.laplace_regularizer_const(self.mesh.v_pos - self.initial_guess.v_pos, self.mesh.t_pos_idx) * self.FLAGS.laplace_scale * (1 - t_iter)

        # Albedo (k_d) smoothness regularizer
        reg_loss += torch.mean(buffers['kd_grad'][..., :-1] * buffers['kd_grad'][..., -1:]) * 0.03 * min(1.0, iteration / 500)

        # Visibility regularizer
        reg_loss += torch.mean(buffers['occlusion'][..., :-1] * buffers['occlusion'][..., -1:]) * 0.001 * min(1.0, iteration / 500)

        # Light white balance regularizer
        reg_loss = reg_loss + lgt.regularizer() * 0.005

        return img_loss, reg_loss
video3d/geometry/dmtet.py
ADDED
@@ -0,0 +1,361 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import numpy as np
import torch

from ..render import mesh
from ..render import render
from ..networks import MLPWithPositionalEncoding, MLPWithPositionalEncoding_Style

###############################################################################
# Marching tetrahedrons implementation (differentiable), adapted from
# https://github.com/NVIDIAGameWorks/kaolin/blob/master/kaolin/ops/conversions/tetmesh.py
#
# Note this only supports batch size = 1.
###############################################################################

class DMTet:
    def __init__(self):
        self.triangle_table = torch.tensor([
            [-1, -1, -1, -1, -1, -1],
            [ 1,  0,  2, -1, -1, -1],
            [ 4,  0,  3, -1, -1, -1],
            [ 1,  4,  2,  1,  3,  4],
            [ 3,  1,  5, -1, -1, -1],
            [ 2,  3,  0,  2,  5,  3],
            [ 1,  4,  0,  1,  5,  4],
            [ 4,  2,  5, -1, -1, -1],
            [ 4,  5,  2, -1, -1, -1],
            [ 4,  1,  0,  4,  5,  1],
            [ 3,  2,  0,  3,  5,  2],
            [ 1,  3,  5, -1, -1, -1],
            [ 4,  1,  2,  4,  3,  1],
            [ 3,  0,  4, -1, -1, -1],
            [ 2,  0,  1, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1]
        ], dtype=torch.long, device='cuda')

        self.num_triangles_table = torch.tensor([0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0], dtype=torch.long, device='cuda')
        self.base_tet_edges = torch.tensor([0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3], dtype=torch.long, device='cuda')

    ###############################################################################
    # Utility functions
    ###############################################################################

    def sort_edges(self, edges_ex2):
        with torch.no_grad():
            order = (edges_ex2[:, 0] > edges_ex2[:, 1]).long()
            order = order.unsqueeze(dim=1)

            a = torch.gather(input=edges_ex2, index=order, dim=1)
            b = torch.gather(input=edges_ex2, index=1 - order, dim=1)

        return torch.stack([a, b], -1)

    def map_uv(self, faces, face_gidx, max_idx):
        N = int(np.ceil(np.sqrt((max_idx + 1) // 2)))
        tex_y, tex_x = torch.meshgrid(
            torch.linspace(0, 1 - (1 / N), N, dtype=torch.float32, device="cuda"),
            torch.linspace(0, 1 - (1 / N), N, dtype=torch.float32, device="cuda"),
            indexing='ij'
        )

        pad = 0.9 / N

        uvs = torch.stack([
            tex_x,       tex_y,
            tex_x + pad, tex_y,
            tex_x + pad, tex_y + pad,
            tex_x,       tex_y + pad
        ], dim=-1).view(-1, 2)

        def _idx(tet_idx, N):
            x = tet_idx % N
            y = torch.div(tet_idx, N, rounding_mode='trunc')
            return y * N + x

        tet_idx = _idx(torch.div(face_gidx, 2, rounding_mode='trunc'), N)
        tri_idx = face_gidx % 2

        uv_idx = torch.stack((
            tet_idx * 4, tet_idx * 4 + tri_idx + 1, tet_idx * 4 + tri_idx + 2
        ), dim=-1).view(-1, 3)

        return uvs, uv_idx

    ###############################################################################
    # Marching tets implementation
    ###############################################################################

    def __call__(self, pos_nx3, sdf_n, tet_fx4):
        with torch.no_grad():
            occ_n = sdf_n > 0
            occ_fx4 = occ_n[tet_fx4.reshape(-1)].reshape(-1, 4)
            occ_sum = torch.sum(occ_fx4, -1)
            valid_tets = (occ_sum > 0) & (occ_sum < 4)
            occ_sum = occ_sum[valid_tets]

            # find all vertices
            all_edges = tet_fx4[valid_tets][:, self.base_tet_edges].reshape(-1, 2)
            all_edges = self.sort_edges(all_edges)
            unique_edges, idx_map = torch.unique(all_edges, dim=0, return_inverse=True)

            unique_edges = unique_edges.long()
            mask_edges = occ_n[unique_edges.reshape(-1)].reshape(-1, 2).sum(-1) == 1
            mapping = torch.ones((unique_edges.shape[0]), dtype=torch.long, device="cuda") * -1
            mapping[mask_edges] = torch.arange(mask_edges.sum(), dtype=torch.long, device="cuda")
            idx_map = mapping[idx_map]  # map edges to verts

            interp_v = unique_edges[mask_edges]
        edges_to_interp = pos_nx3[interp_v.reshape(-1)].reshape(-1, 2, 3)
        edges_to_interp_sdf = sdf_n[interp_v.reshape(-1)].reshape(-1, 2, 1)
        edges_to_interp_sdf[:, -1] *= -1

        denominator = edges_to_interp_sdf.sum(1, keepdim=True)

        edges_to_interp_sdf = torch.flip(edges_to_interp_sdf, [1]) / denominator
        verts = (edges_to_interp * edges_to_interp_sdf).sum(1)

        idx_map = idx_map.reshape(-1, 6)

        v_id = torch.pow(2, torch.arange(4, dtype=torch.long, device="cuda"))
        tetindex = (occ_fx4[valid_tets] * v_id.unsqueeze(0)).sum(-1)
        num_triangles = self.num_triangles_table[tetindex]

        # Generate triangle indices
        faces = torch.cat((
            torch.gather(input=idx_map[num_triangles == 1], dim=1, index=self.triangle_table[tetindex[num_triangles == 1]][:, :3]).reshape(-1, 3),
            torch.gather(input=idx_map[num_triangles == 2], dim=1, index=self.triangle_table[tetindex[num_triangles == 2]][:, :6]).reshape(-1, 3),
        ), dim=0)

        # Get global face index (static, does not depend on topology)
        num_tets = tet_fx4.shape[0]
        tet_gidx = torch.arange(num_tets, dtype=torch.long, device="cuda")[valid_tets]
        face_gidx = torch.cat((
            tet_gidx[num_triangles == 1] * 2,
            torch.stack((tet_gidx[num_triangles == 2] * 2, tet_gidx[num_triangles == 2] * 2 + 1), dim=-1).view(-1)
        ), dim=0)

        uvs, uv_idx = self.map_uv(faces, face_gidx, num_tets * 2)

        return verts, faces, uvs, uv_idx

###############################################################################
# Regularizer
###############################################################################

def sdf_bce_reg_loss(sdf, all_edges):
    sdf_f1x6x2 = sdf[all_edges.reshape(-1)].reshape(-1, 2)
    mask = torch.sign(sdf_f1x6x2[..., 0]) != torch.sign(sdf_f1x6x2[..., 1])
    sdf_f1x6x2 = sdf_f1x6x2[mask]
    sdf_diff = torch.nn.functional.binary_cross_entropy_with_logits(sdf_f1x6x2[..., 0], (sdf_f1x6x2[..., 1] > 0).float()) + \
               torch.nn.functional.binary_cross_entropy_with_logits(sdf_f1x6x2[..., 1], (sdf_f1x6x2[..., 0] > 0).float())
    if torch.isnan(sdf_diff).any():
        import ipdb; ipdb.set_trace()
    return sdf_diff

###############################################################################
# Geometry interface
###############################################################################

class DMTetGeometry(torch.nn.Module):
    def __init__(self, grid_res, scale, sdf_mode, num_layers=None, hidden_size=None, embedder_freq=None, embed_concat_pts=True, init_sdf=None, jitter_grid=0., perturb_sdf_iter=10000, sym_prior_shape=False, dim_of_classes=0, condition_choice='concat'):
        super(DMTetGeometry, self).__init__()

        self.sdf_mode = sdf_mode
        self.grid_res = grid_res
        self.marching_tets = DMTet()
        self.grid_scale = scale
        self.init_sdf = init_sdf
        self.jitter_grid = jitter_grid
        self.perturb_sdf_iter = perturb_sdf_iter
        self.sym_prior_shape = sym_prior_shape
        self.load_tets(self.grid_res, self.grid_scale)

        if sdf_mode == "param":
            sdf = torch.rand_like(self.verts[:, 0]) - 0.1  # Random init.
            self.sdf = torch.nn.Parameter(sdf.clone().detach(), requires_grad=True)
            self.register_parameter('sdf', self.sdf)
            self.deform = torch.nn.Parameter(torch.zeros_like(self.verts), requires_grad=True)
            self.register_parameter('deform', self.deform)
        else:
            embedder_scaler = 2 * np.pi / self.grid_scale * 0.9  # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9

            if dim_of_classes == 0 or (dim_of_classes != 0 and condition_choice == 'concat'):
                self.mlp = MLPWithPositionalEncoding(
                    3,
                    1,
                    num_layers,
                    nf=hidden_size,
                    extra_dim=dim_of_classes,
                    dropout=0,
                    activation=None,
                    n_harmonic_functions=embedder_freq,
                    omega0=embedder_scaler,
                    embed_concat_pts=embed_concat_pts)

            elif condition_choice == 'film' or condition_choice == 'mod':
                self.mlp = MLPWithPositionalEncoding_Style(
                    3,
                    1,
                    num_layers,
                    nf=hidden_size,
                    extra_dim=dim_of_classes,
                    dropout=0,
                    activation=None,
                    n_harmonic_functions=embedder_freq,
                    omega0=embedder_scaler,
                    embed_concat_pts=embed_concat_pts,
                    style_choice=condition_choice)

            else:
                raise NotImplementedError

    def load_tets(self, grid_res=None, scale=None):
        if grid_res is None:
            grid_res = self.grid_res
        else:
            self.grid_res = grid_res
        if scale is None:
            scale = self.grid_scale
        else:
            self.grid_scale = scale
        tets = np.load('./data/tets/{}_tets.npz'.format(grid_res))
        self.verts = torch.tensor(tets['vertices'], dtype=torch.float32, device='cuda') * scale  # verts original scale (-0.5, 0.5)
        self.indices = torch.tensor(tets['indices'], dtype=torch.long, device='cuda')
        self.generate_edges()

    def get_sdf(self, pts=None, perturb_sdf=False, total_iter=0, class_vector=None):
        if self.sdf_mode == 'param':
            sdf = self.sdf
        else:
            if pts is None:
                pts = self.verts
            if self.sym_prior_shape:
                xs, ys, zs = pts.unbind(-1)
                pts = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x
            feat = None
            if class_vector is not None:
                feat = class_vector.unsqueeze(0).repeat(pts.shape[0], 1)
            sdf = self.mlp(pts, feat=feat)

            if self.init_sdf is None:
                pass
            elif type(self.init_sdf) in [float, int]:
                sdf = sdf + self.init_sdf
            elif self.init_sdf == 'sphere':
                init_radius = self.grid_scale * 0.25
                init_sdf = init_radius - pts.norm(dim=-1, keepdim=True)  # init sdf is a sphere centered at origin
                sdf = sdf + init_sdf
            elif self.init_sdf == 'ellipsoid':
                rxy = self.grid_scale * 0.15
                xs, ys, zs = pts.unbind(-1)[:3]
                init_sdf = rxy - torch.stack([xs, ys, zs / 2], -1).norm(dim=-1, keepdim=True)  # init sdf is approximately an ellipsoid centered at origin
                sdf = sdf + init_sdf
            else:
                raise NotImplementedError

        if perturb_sdf:
            sdf = sdf + torch.randn_like(sdf) * 0.1 * max(0, 1 - total_iter / self.perturb_sdf_iter)
        return sdf

    def get_sdf_gradient(self, class_vector=None):
        assert self.sdf_mode == 'mlp', "Only MLP supports gradient computation."
        num_samples = 5000
        sample_points = (torch.rand(num_samples, 3, device=self.verts.device) - 0.5) * self.grid_scale
        mesh_verts = self.mesh_verts.detach() + (torch.rand_like(self.mesh_verts) - 0.5) * 0.1 * self.grid_scale
        rand_idx = torch.randperm(len(mesh_verts), device=mesh_verts.device)[:5000]
        mesh_verts = mesh_verts[rand_idx]
        sample_points = torch.cat([sample_points, mesh_verts], 0)
        sample_points.requires_grad = True
        y = self.get_sdf(pts=sample_points, perturb_sdf=False, class_vector=class_vector)
        d_output = torch.ones_like(y, requires_grad=False, device=y.device)
        try:
            gradients = torch.autograd.grad(
                outputs=[y],
                inputs=sample_points,
                grad_outputs=d_output,
                create_graph=True,
                retain_graph=True,
                only_inputs=True)[0]
        except RuntimeError:  # For validation, we have disabled gradient calculation.
            return torch.zeros_like(sample_points)
        return gradients

    def get_sdf_reg_loss(self, class_vector=None):
        reg_loss = {"sdf_bce_reg_loss": sdf_bce_reg_loss(self.current_sdf, self.all_edges).mean()}
        if self.sdf_mode == 'mlp':
            reg_loss["sdf_gradient_reg_loss"] = ((self.get_sdf_gradient(class_vector=class_vector).norm(dim=-1) - 1) ** 2).mean()
            reg_loss['sdf_inflate_reg_loss'] = -self.current_sdf.mean()
        return reg_loss

    def generate_edges(self):
        with torch.no_grad():
            edges = torch.tensor([0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3], dtype=torch.long, device="cuda")
            all_edges = self.indices[:, edges].reshape(-1, 2)
            all_edges_sorted = torch.sort(all_edges, dim=1)[0]
            self.all_edges = torch.unique(all_edges_sorted, dim=0)

    @torch.no_grad()
    def getAABB(self):
        return torch.min(self.verts, dim=0).values, torch.max(self.verts, dim=0).values

    def getMesh(self, material=None, perturb_sdf=False, total_iter=0, jitter_grid=True, class_vector=None):
        # Run DM tet to get a base mesh
        v_deformed = self.verts

        # if self.FLAGS.deform_grid:
        #     v_deformed = self.verts + 2 / (self.grid_res * 2) * torch.tanh(self.deform)
        # else:
        #     v_deformed = self.verts
        if jitter_grid and self.jitter_grid > 0:
            jitter = (torch.rand(1, device=v_deformed.device) * 2 - 1) * self.jitter_grid * self.grid_scale
            v_deformed = v_deformed + jitter

        self.current_sdf = self.get_sdf(v_deformed, perturb_sdf=perturb_sdf, total_iter=total_iter, class_vector=class_vector)
        verts, faces, uvs, uv_idx = self.marching_tets(v_deformed, self.current_sdf, self.indices)
        self.mesh_verts = verts
        return mesh.make_mesh(verts[None], faces[None], uvs[None], uv_idx[None], material)

    def render(self, glctx, target, lgt, opt_material, bsdf=None):
        opt_mesh = self.getMesh(opt_material)
        return render.render_mesh(glctx, opt_mesh, target['mvp'], target['campos'], lgt, target['resolution'], spp=target['spp'], msaa=True, background=target['background'], bsdf=bsdf)

    def tick(self, glctx, target, lgt, opt_material, loss_fn, iteration):
        # ==============================================================================================
        #  Render optimizable object with identical conditions
        # ==============================================================================================
        buffers = self.render(glctx, target, lgt, opt_material)

        # ==============================================================================================
        #  Compute loss
        # ==============================================================================================
        t_iter = iteration / 20000

        # Image-space loss, split into a coverage component and a color component
        color_ref = target['img']
        img_loss = torch.nn.functional.mse_loss(buffers['shaded'][..., 3:], color_ref[..., 3:])
        img_loss = img_loss + loss_fn(buffers['shaded'][..., 0:3] * color_ref[..., 3:], color_ref[..., 0:3] * color_ref[..., 3:])

        # SDF regularizer
        # sdf_weight = self.sdf_regularizer - (self.sdf_regularizer - 0.01) * min(1.0, 4.0 * t_iter)  # Dropoff to 0.01
        reg_loss = sum(self.get_sdf_reg_loss().values())  # dict.values() must be called

        # Albedo (k_d) smoothness regularizer
        reg_loss += torch.mean(buffers['kd_grad'][..., :-1] * buffers['kd_grad'][..., -1:]) * 0.03 * min(1.0, iteration / 500)

        # Visibility regularizer
        reg_loss += torch.mean(buffers['occlusion'][..., :-1] * buffers['occlusion'][..., -1:]) * 0.001 * min(1.0, iteration / 500)

        # Light white balance regularizer
        reg_loss = reg_loss + lgt.regularizer() * 0.005

        return img_loss, reg_loss
|
video3d/model.py
ADDED
@@ -0,0 +1,1526 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from multiprocessing.spawn import prepare
|
2 |
+
from turtle import forward
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import torchvision.models as models
|
7 |
+
import nvdiffrast.torch as dr
|
8 |
+
import numpy as np
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
import os
|
11 |
+
import os.path as osp
|
12 |
+
|
13 |
+
from video3d.render.regularizer import get_edge_length, normal_consistency
|
14 |
+
from . import networks
|
15 |
+
from .renderer import *
|
16 |
+
from .utils import misc, meters, flow_viz, arap, custom_loss
|
17 |
+
from .dataloaders import get_sequence_loader, get_image_loader
|
18 |
+
from .cub_dataloaders import get_cub_loader
|
19 |
+
from .utils.skinning_v4 import estimate_bones, skinning
|
20 |
+
import lpips
|
21 |
+
from einops import rearrange
|
22 |
+
|
23 |
+
from .geometry.dmtet import DMTetGeometry
|
24 |
+
from .geometry.dlmesh import DLMesh
|
25 |
+
|
26 |
+
from .render import renderutils as ru
|
27 |
+
from .render import material
|
28 |
+
from .render import mlptexture
|
29 |
+
from .render import util
|
30 |
+
from .render import mesh
|
31 |
+
from .render import light
|
32 |
+
from .render import render
|
33 |
+
|
34 |
+
EPS = 1e-7
|
35 |
+
|
36 |
+
|
37 |
+
def get_optimizer(model, lr=0.0001, betas=(0.9, 0.999), weight_decay=0):
|
38 |
+
return torch.optim.Adam(
|
39 |
+
filter(lambda p: p.requires_grad, model.parameters()),
|
40 |
+
lr=lr, betas=betas, weight_decay=weight_decay)
|
41 |
+
|
42 |
+
|
43 |
+
def set_requires_grad(model, requires_grad):
|
44 |
+
if model is not None:
|
45 |
+
for param in model.parameters():
|
46 |
+
param.requires_grad = requires_grad
|
47 |
+
|
48 |
+
|
49 |
+
def forward_to_matrix(vec_forward, up=[0,1,0]):
|
50 |
+
up = torch.FloatTensor(up).to(vec_forward.device)
|
51 |
+
# vec_forward = nn.functional.normalize(vec_forward, p=2, dim=-1) # x right, y up, z forward
|
52 |
+
vec_right = up.expand_as(vec_forward).cross(vec_forward, dim=-1)
|
53 |
+
vec_right = nn.functional.normalize(vec_right, p=2, dim=-1)
|
54 |
+
vec_up = vec_forward.cross(vec_right, dim=-1)
|
55 |
+
vec_up = nn.functional.normalize(vec_up, p=2, dim=-1)
|
56 |
+
rot_mat = torch.stack([vec_right, vec_up, vec_forward], -2)
|
57 |
+
return rot_mat
|
58 |
+
|
59 |
+
|
60 |
+
def sample_pose_hypothesis_from_quad_prediction(poses_raw, total_iter, batch_size, num_frames, pose_xflip_recon=False, input_image_xflip_flag=None, rot_temp_scalar=1., num_hypos=4, naive_probs_iter=2000, best_pose_start_iter=6000, random_sample=True):
|
61 |
+
rots_pred = poses_raw[..., :num_hypos*4].view(-1, num_hypos, 4)
|
62 |
+
    rots_logits = rots_pred[..., 0]  # N x K hypothesis scores (lower logit = higher probability below)
|
63 |
+
temp = 1 / np.clip(total_iter / 1000 / rot_temp_scalar, 1., 100.)
|
64 |
+
|
65 |
+
rots_probs = torch.nn.functional.softmax(-rots_logits / temp, dim=1) # N x K
|
66 |
+
# naive_probs = torch.FloatTensor([10] + [1] * (num_hypos - 1)).to(rots_logits.device)
|
67 |
+
naive_probs = torch.ones(num_hypos).to(rots_logits.device)
|
68 |
+
naive_probs = naive_probs / naive_probs.sum()
|
69 |
+
naive_probs_weight = np.clip(1 - (total_iter - naive_probs_iter) / 2000, 0, 1)
|
70 |
+
rots_probs = naive_probs.view(1, num_hypos) * naive_probs_weight + rots_probs * (1 - naive_probs_weight)
|
71 |
+
|
72 |
+
rots_pred = rots_pred[..., 1:4]
|
73 |
+
trans_pred = poses_raw[..., -3:]
|
74 |
+
best_rot_idx = torch.argmax(rots_probs, dim=1) # N
|
75 |
+
if random_sample:
|
76 |
+
# rand_rot_idx = torch.randint(0, 4, (batch_size * num_frames,), device=poses_raw.device) # N
|
77 |
+
rand_rot_idx = torch.randperm(batch_size * num_frames, device=poses_raw.device) % num_hypos # N
|
78 |
+
# rand_rot_idx = torch.randperm(batch_size, device=poses_raw.device)[:,None].repeat(1, num_frames).view(-1) % 4 # N
|
79 |
+
best_flag = (torch.randperm(batch_size * num_frames, device=poses_raw.device) / (batch_size * num_frames) < np.clip((total_iter - best_pose_start_iter)/2000, 0, 0.8)).long()
|
80 |
+
rand_flag = 1 - best_flag
|
81 |
+
# best_flag = torch.zeros_like(best_rot_idx)
|
82 |
+
rot_idx = best_rot_idx * best_flag + rand_rot_idx * (1 - best_flag)
|
83 |
+
else:
|
84 |
+
rand_flag = torch.zeros_like(best_rot_idx)
|
85 |
+
rot_idx = best_rot_idx
|
86 |
+
rot_pred = torch.gather(rots_pred, 1, rot_idx[:, None, None].expand(-1, 1, 3))[:, 0] # Nx3
|
87 |
+
pose_raw = torch.cat([rot_pred, trans_pred], -1)
|
88 |
+
rot_prob = torch.gather(rots_probs, 1, rot_idx[:, None].expand(-1, 1))[:, 0] # N
|
89 |
+
rot_logit = torch.gather(rots_logits, 1, rot_idx[:, None].expand(-1, 1))[:, 0] # N
|
90 |
+
|
91 |
+
if pose_xflip_recon:
|
92 |
+
raise NotImplementedError
|
93 |
+
rot_mat = forward_to_matrix(pose_raw[:, :3], up=[0, 1, 0])
|
94 |
+
pose = torch.cat([rot_mat.view(batch_size * num_frames, -1), pose_raw[:, 3:]], -1)
|
95 |
+
return pose_raw, pose, rot_idx, rot_prob, rot_logit, rots_probs, rand_flag
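

# --- Hedged sketch of the hypothesis-weighting schedule above (illustrative
# helper with example numbers; not part of training). Note the logits act as
# energies: softmax(-logits / temp) gives lower logits higher probability, the
# temperature decays as training progresses, and early on the distribution is
# blended towards uniform.
def _demo_hypothesis_probs(rots_logits, total_iter, rot_temp_scalar=1., naive_probs_iter=2000):
    num_hypos = rots_logits.shape[1]
    temp = 1 / np.clip(total_iter / 1000 / rot_temp_scalar, 1., 100.)
    rots_probs = torch.nn.functional.softmax(-rots_logits / temp, dim=1)
    naive_probs = torch.full((1, num_hypos), 1. / num_hypos)
    naive_probs_weight = np.clip(1 - (total_iter - naive_probs_iter) / 2000, 0, 1)
    return naive_probs * naive_probs_weight + rots_probs * (1 - naive_probs_weight)

# Example: _demo_hypothesis_probs(torch.FloatTensor([[0., 1., 2., 3.]]), total_iter=500)
# is uniform (0.25 each), while total_iter=100000 concentrates mass on index 0.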
|
96 |
+
|
97 |
+
|
98 |
+
class PriorPredictor(nn.Module):
|
99 |
+
def __init__(self, cfgs):
|
100 |
+
super().__init__()
|
101 |
+
dmtet_grid = cfgs.get('dmtet_grid', 64)
|
102 |
+
grid_scale = cfgs.get('grid_scale', 5)
|
103 |
+
prior_sdf_mode = cfgs.get('prior_sdf_mode', 'mlp')
|
104 |
+
num_layers_shape = cfgs.get('num_layers_shape', 5)
|
105 |
+
hidden_size = cfgs.get('hidden_size', 64)
|
106 |
+
embedder_freq_shape = cfgs.get('embedder_freq_shape', 8)
|
107 |
+
embed_concat_pts = cfgs.get('embed_concat_pts', True)
|
108 |
+
init_sdf = cfgs.get('init_sdf', None)
|
109 |
+
jitter_grid = cfgs.get('jitter_grid', 0.)
|
110 |
+
perturb_sdf_iter = cfgs.get('perturb_sdf_iter', 10000)
|
111 |
+
sym_prior_shape = cfgs.get('sym_prior_shape', False)
|
112 |
+
self.netShape = DMTetGeometry(dmtet_grid, grid_scale, prior_sdf_mode, num_layers=num_layers_shape, hidden_size=hidden_size, embedder_freq=embedder_freq_shape, embed_concat_pts=embed_concat_pts, init_sdf=init_sdf, jitter_grid=jitter_grid, perturb_sdf_iter=perturb_sdf_iter, sym_prior_shape=sym_prior_shape)
|
113 |
+
|
114 |
+
mlp_hidden_size = cfgs.get('hidden_size', 64)
|
115 |
+
tet_bbox = self.netShape.getAABB()
|
116 |
+
self.render_dino_mode = cfgs.get('render_dino_mode', None)
|
117 |
+
num_layers_dino = cfgs.get("num_layers_dino", 5)
|
118 |
+
dino_feature_recon_dim = cfgs.get('dino_feature_recon_dim', 64)
|
119 |
+
sym_dino = cfgs.get("sym_dino", False)
|
120 |
+
dino_min = torch.zeros(dino_feature_recon_dim) + cfgs.get('dino_min', 0.)
|
121 |
+
dino_max = torch.zeros(dino_feature_recon_dim) + cfgs.get('dino_max', 1.)
|
122 |
+
min_max = torch.stack((dino_min, dino_max), dim=0)
|
123 |
+
if self.render_dino_mode is None:
|
124 |
+
            self.netDINO = None  # no DINO rendering head in this mode
|
125 |
+
elif self.render_dino_mode == 'feature_mlpnv':
|
126 |
+
self.netDINO = mlptexture.MLPTexture3D(tet_bbox, channels=dino_feature_recon_dim, internal_dims=mlp_hidden_size, hidden=num_layers_dino-1, feat_dim=0, min_max=min_max, bsdf=None, perturb_normal=False, symmetrize=sym_dino)
|
127 |
+
elif self.render_dino_mode == 'feature_mlp':
|
128 |
+
embedder_scaler = 2 * np.pi / grid_scale * 0.9 # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9
|
129 |
+
embed_concat_pts = cfgs.get('embed_concat_pts', True)
|
130 |
+
self.netDINO = networks.MLPTextureSimple(
|
131 |
+
3, # x, y, z coordinates
|
132 |
+
dino_feature_recon_dim,
|
133 |
+
num_layers_dino,
|
134 |
+
nf=mlp_hidden_size,
|
135 |
+
dropout=0,
|
136 |
+
activation="sigmoid",
|
137 |
+
min_max=min_max,
|
138 |
+
n_harmonic_functions=cfgs.get('embedder_freq_dino', 8),
|
139 |
+
omega0=embedder_scaler,
|
140 |
+
extra_dim=0,
|
141 |
+
embed_concat_pts=embed_concat_pts,
|
142 |
+
perturb_normal=False,
|
143 |
+
symmetrize=sym_dino
|
144 |
+
)
|
145 |
+
elif self.render_dino_mode == 'cluster':
|
146 |
+
num_layers_dino = cfgs.get("num_layers_dino", 5)
|
147 |
+
dino_cluster_dim = cfgs.get('dino_cluster_dim', 64)
|
148 |
+
self.netDINO = mlptexture.MLPTexture3D(tet_bbox, channels=dino_cluster_dim, internal_dims=mlp_hidden_size, hidden=num_layers_dino-1, feat_dim=0, min_max=None, bsdf=None, perturb_normal=False, symmetrize=sym_dino)
|
149 |
+
else:
|
150 |
+
raise NotImplementedError
|
151 |
+
|
152 |
+
def forward(self, perturb_sdf=False, total_iter=None, is_training=True):
|
153 |
+
prior_shape = self.netShape.getMesh(perturb_sdf=perturb_sdf, total_iter=total_iter, jitter_grid=is_training)
|
154 |
+
return prior_shape, self.netDINO
|
155 |
+
|
156 |
+
|
157 |
+
class InstancePredictor(nn.Module):
|
158 |
+
def __init__(self, cfgs, tet_bbox=None):
|
159 |
+
super().__init__()
|
160 |
+
self.cfgs = cfgs
|
161 |
+
self.grid_scale = cfgs.get('grid_scale', 5)
|
162 |
+
|
163 |
+
self.enable_encoder = cfgs.get('enable_encoder', False)
|
164 |
+
if self.enable_encoder:
|
165 |
+
encoder_latent_dim = cfgs.get('latent_dim', 256)
|
166 |
+
encoder_pretrained = cfgs.get('encoder_pretrained', False)
|
167 |
+
encoder_frozen = cfgs.get('encoder_frozen', False)
|
168 |
+
encoder_arch = cfgs.get('encoder_arch', 'simple')
|
169 |
+
in_image_size = cfgs.get('in_image_size', 256)
|
170 |
+
self.dino_feature_input = cfgs.get('dino_feature_input', False)
|
171 |
+
dino_feature_dim = cfgs.get('dino_feature_dim', 64)
|
172 |
+
if encoder_arch == 'simple':
|
173 |
+
if self.dino_feature_input:
|
174 |
+
self.netEncoder = networks.EncoderWithDINO(cin_rgb=3, cin_dino=dino_feature_dim, cout=encoder_latent_dim, in_size=in_image_size, zdim=None, nf=64, activation=None)
|
175 |
+
else:
|
176 |
+
self.netEncoder = networks.Encoder(cin=3, cout=encoder_latent_dim, in_size=in_image_size, zdim=None, nf=64, activation=None)
|
177 |
+
elif encoder_arch == 'vgg':
|
178 |
+
self.netEncoder = networks.VGGEncoder(cout=encoder_latent_dim, pretrained=encoder_pretrained)
|
179 |
+
elif encoder_arch == 'resnet':
|
180 |
+
self.netEncoder = networks.ResnetEncoder(cout=encoder_latent_dim, pretrained=encoder_pretrained)
|
181 |
+
elif encoder_arch == 'vit':
|
182 |
+
which_vit = cfgs.get('which_vit', 'dino_vits8')
|
183 |
+
vit_final_layer_type = cfgs.get('vit_final_layer_type', 'conv')
|
184 |
+
self.netEncoder = networks.ViTEncoder(cout=encoder_latent_dim, which_vit=which_vit, pretrained=encoder_pretrained, frozen=encoder_frozen, in_size=in_image_size, final_layer_type=vit_final_layer_type)
|
185 |
+
else:
|
186 |
+
raise NotImplementedError
|
187 |
+
else:
|
188 |
+
encoder_latent_dim = 0
|
189 |
+
|
190 |
+
mlp_hidden_size = cfgs.get('hidden_size', 64)
|
191 |
+
|
192 |
+
bsdf = cfgs.get("bsdf", 'diffuse')
|
193 |
+
num_layers_tex = cfgs.get("num_layers_tex", 5)
|
194 |
+
feat_dim = cfgs.get("latent_dim", 64) if self.enable_encoder else 0
|
195 |
+
perturb_normal = cfgs.get("perturb_normal", False)
|
196 |
+
sym_texture = cfgs.get("sym_texture", False)
|
197 |
+
kd_min = torch.FloatTensor(cfgs.get('kd_min', [0., 0., 0., 0.]))
|
198 |
+
kd_max = torch.FloatTensor(cfgs.get('kd_max', [1., 1., 1., 1.]))
|
199 |
+
ks_min = torch.FloatTensor(cfgs.get('ks_min', [0., 0., 0.]))
|
200 |
+
ks_max = torch.FloatTensor(cfgs.get('ks_max', [0., 0., 0.]))
|
201 |
+
nrm_min = torch.FloatTensor(cfgs.get('nrm_min', [-1., -1., 0.]))
|
202 |
+
nrm_max = torch.FloatTensor(cfgs.get('nrm_max', [1., 1., 1.]))
|
203 |
+
mlp_min = torch.cat((kd_min[0:3], ks_min, nrm_min), dim=0)
|
204 |
+
mlp_max = torch.cat((kd_max[0:3], ks_max, nrm_max), dim=0)
|
205 |
+
min_max = torch.stack((mlp_min, mlp_max), dim=0)
|
206 |
+
out_chn = 9
|
207 |
+
# TODO: if the tet verts are deforming, we need to recompute tet_bbox
|
208 |
+
texture_mode = cfgs.get("texture_mode", 'mlp')
|
209 |
+
if texture_mode == 'mlpnv':
|
210 |
+
self.netTexture = mlptexture.MLPTexture3D(tet_bbox, channels=out_chn, internal_dims=mlp_hidden_size, hidden=num_layers_tex-1, feat_dim=feat_dim, min_max=min_max, bsdf=bsdf, perturb_normal=perturb_normal, symmetrize=sym_texture)
|
211 |
+
elif texture_mode == 'mlp':
|
212 |
+
embedder_scaler = 2 * np.pi / self.grid_scale * 0.9 # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9
|
213 |
+
embed_concat_pts = cfgs.get('embed_concat_pts', True)
|
214 |
+
self.netTexture = networks.MLPTextureSimple(
|
215 |
+
3, # x, y, z coordinates
|
216 |
+
out_chn,
|
217 |
+
num_layers_tex,
|
218 |
+
nf=mlp_hidden_size,
|
219 |
+
dropout=0,
|
220 |
+
activation="sigmoid",
|
221 |
+
min_max=min_max,
|
222 |
+
n_harmonic_functions=cfgs.get('embedder_freq_tex', 10),
|
223 |
+
omega0=embedder_scaler,
|
224 |
+
extra_dim=feat_dim,
|
225 |
+
embed_concat_pts=embed_concat_pts,
|
226 |
+
perturb_normal=perturb_normal,
|
227 |
+
symmetrize=sym_texture
|
228 |
+
)
|
229 |
+
|
230 |
+
self.rot_rep = cfgs.get('rot_rep', 'euler_angle')
|
231 |
+
self.enable_pose = cfgs.get('enable_pose', False)
|
232 |
+
if self.enable_pose:
|
233 |
+
cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.)
|
234 |
+
fov = cfgs.get('crop_fov_approx', 25)
|
235 |
+
half_range = np.tan(fov /2 /180 * np.pi) * cam_pos_z_offset # 2.22
|
236 |
+
self.max_trans_xy_range = half_range * cfgs.get('max_trans_xy_range_ratio', 1.)
|
237 |
+
            self.max_trans_z_range = half_range * cfgs.get('max_trans_z_range_ratio', 1.)
            # rotation ranges are referenced by the 'euler_angle' branch of forward_pose;
            # the 180-degree defaults below are assumed defaults, overridable via cfgs
            self.max_rot_x_range = cfgs.get('max_rot_x_range', 180)
            self.max_rot_y_range = cfgs.get('max_rot_y_range', 180)
            self.max_rot_z_range = cfgs.get('max_rot_z_range', 180)
|
238 |
+
self.lookat_init = cfgs.get('lookat_init', None)
|
239 |
+
self.lookat_zeroy = cfgs.get('lookat_zeroy', False)
|
240 |
+
self.rot_temp_scalar = cfgs.get('rot_temp_scalar', 1.)
|
241 |
+
self.naive_probs_iter = cfgs.get('naive_probs_iter', 2000)
|
242 |
+
self.best_pose_start_iter = cfgs.get('best_pose_start_iter', 6000)
|
243 |
+
|
244 |
+
if self.rot_rep == 'euler_angle':
|
245 |
+
pose_cout = 6
|
246 |
+
elif self.rot_rep == 'quaternion':
|
247 |
+
pose_cout = 7
|
248 |
+
elif self.rot_rep == 'lookat':
|
249 |
+
pose_cout = 6
|
250 |
+
elif self.rot_rep == 'quadlookat':
|
251 |
+
self.num_pose_hypos = 4
|
252 |
+
pose_cout = (3 + 1) * self.num_pose_hypos + 3 # 4 forward vectors for 4 quadrants, 4 quadrant classification logits, 3 for translation
|
253 |
+
self.orthant_signs = torch.FloatTensor([[1,1,1], [-1,1,1], [-1,1,-1], [1,1,-1]])
|
254 |
+
elif self.rot_rep == 'octlookat':
|
255 |
+
self.num_pose_hypos = 8
|
256 |
+
                pose_cout = (3 + 1) * self.num_pose_hypos + 3  # 8 forward vectors for 8 octants, 8 octant classification logits, 3 for translation
|
257 |
+
self.orthant_signs = torch.stack(torch.meshgrid([torch.arange(1, -2, -2)] *3), -1).view(-1, 3) # 8x3
|
258 |
+
else:
|
259 |
+
raise NotImplementedError
|
260 |
+
|
261 |
+
self.pose_arch = cfgs.get('pose_arch', 'mlp')
|
262 |
+
if self.pose_arch == 'mlp':
|
263 |
+
num_layers_pose = cfgs.get('num_layers_pose', 5)
|
264 |
+
self.netPose = networks.MLP(
|
265 |
+
encoder_latent_dim,
|
266 |
+
pose_cout,
|
267 |
+
num_layers_pose,
|
268 |
+
nf=mlp_hidden_size,
|
269 |
+
dropout=0,
|
270 |
+
activation=None
|
271 |
+
)
|
272 |
+
elif self.pose_arch == 'encoder':
|
273 |
+
if self.dino_feature_input:
|
274 |
+
dino_feature_dim = cfgs.get('dino_feature_dim', 64)
|
275 |
+
self.netPose = networks.EncoderWithDINO(cin_rgb=3, cin_dino=dino_feature_dim, cout=pose_cout, in_size=in_image_size, zdim=None, nf=64, activation=None)
|
276 |
+
else:
|
277 |
+
self.netPose = networks.Encoder(cin=3, cout=pose_cout, in_size=in_image_size, zdim=None, nf=64, activation=None)
|
278 |
+
elif self.pose_arch in ['encoder_dino_patch_out', 'encoder_dino_patch_key']:
|
279 |
+
                which_vit = cfgs.get('which_vit', 'dino_vits8')
                if which_vit == 'dino_vits8':
|
280 |
+
dino_feat_dim = 384
|
281 |
+
elif which_vit == 'dinov2_vits14':
|
282 |
+
dino_feat_dim = 384
|
283 |
+
elif which_vit == 'dino_vitb8':
|
284 |
+
dino_feat_dim = 768
|
285 |
+
self.netPose = networks.Encoder32(cin=dino_feat_dim, cout=pose_cout, nf=256, activation=None)
|
286 |
+
elif self.pose_arch == 'vit':
|
287 |
+
encoder_pretrained = cfgs.get('encoder_pretrained', False)
|
288 |
+
encoder_frozen = cfgs.get('encoder_frozen', False)
|
289 |
+
which_vit = cfgs.get('which_vit', 'dino_vits8')
|
290 |
+
vit_final_layer_type = cfgs.get('vit_final_layer_type', 'conv')
|
291 |
+
self.netPose = networks.ViTEncoder(cout=encoder_latent_dim, which_vit=which_vit, pretrained=encoder_pretrained, frozen=encoder_frozen, in_size=in_image_size, final_layer_type=vit_final_layer_type)
|
292 |
+
else:
|
293 |
+
raise NotImplementedError
|
294 |
+
|
295 |
+
self.enable_deform = cfgs.get('enable_deform', False)
|
296 |
+
if self.enable_deform:
|
297 |
+
embedder_scaler = 2 * np.pi / self.grid_scale * 0.9 # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9
|
298 |
+
embed_concat_pts = cfgs.get('embed_concat_pts', True)
|
299 |
+
num_layers_deform = cfgs.get('num_layers_deform', 5)
|
300 |
+
self.deform_epochs = np.arange(*cfgs.get('deform_epochs', [0, 0]))
|
301 |
+
sym_deform = cfgs.get("sym_deform", False)
|
302 |
+
self.netDeform = networks.MLPWithPositionalEncoding(
|
303 |
+
3, # x, y, z coordinates
|
304 |
+
3, # dx, dy, dz deformation
|
305 |
+
num_layers_deform,
|
306 |
+
nf=mlp_hidden_size,
|
307 |
+
dropout=0,
|
308 |
+
activation=None,
|
309 |
+
n_harmonic_functions=cfgs.get('embedder_freq_deform', 10),
|
310 |
+
omega0=embedder_scaler,
|
311 |
+
extra_dim=encoder_latent_dim,
|
312 |
+
embed_concat_pts=embed_concat_pts,
|
313 |
+
symmetrize=sym_deform
|
314 |
+
)
|
315 |
+
|
316 |
+
self.enable_articulation = cfgs.get('enable_articulation', False)
|
317 |
+
if self.enable_articulation:
|
318 |
+
self.num_body_bones = cfgs.get('num_body_bones', 4)
|
319 |
+
self.articulation_multiplier = cfgs.get('articulation_multiplier', 1)
|
320 |
+
self.static_root_bones = cfgs.get('static_root_bones', False)
|
321 |
+
self.skinning_temperature = cfgs.get('skinning_temperature', 1)
|
322 |
+
self.articulation_epochs = np.arange(*cfgs.get('articulation_epochs', [0, 0]))
|
323 |
+
self.num_legs = cfgs.get('num_legs', 0)
|
324 |
+
self.num_leg_bones = cfgs.get('num_leg_bones', 0)
|
325 |
+
self.body_bones_type = cfgs.get('body_bones_type', 'z_minmax')
|
326 |
+
self.perturb_articulation_epochs = np.arange(*cfgs.get('perturb_articulation_epochs', [0, 0]))
|
327 |
+
self.num_bones = self.num_body_bones + self.num_legs * self.num_leg_bones
|
328 |
+
self.constrain_legs = cfgs.get('constrain_legs', False)
|
329 |
+
self.attach_legs_to_body_epochs = np.arange(*cfgs.get('attach_legs_to_body_epochs', [0, 0]))
|
330 |
+
self.max_arti_angle = cfgs.get('max_arti_angle', 60)
|
331 |
+
|
332 |
+
num_layers_arti = cfgs.get('num_layers_arti', 5)
|
333 |
+
which_vit = cfgs.get('which_vit', 'dino_vits8')
|
334 |
+
if which_vit == 'dino_vits8':
|
335 |
+
dino_feat_dim = 384
|
336 |
+
elif which_vit == 'dino_vitb8':
|
337 |
+
dino_feat_dim = 768
|
338 |
+
self.articulation_arch = cfgs.get('articulation_arch', 'mlp')
|
339 |
+
self.articulation_feature_mode = cfgs.get('articulation_feature_mode', 'sample')
|
340 |
+
embedder_freq_arti = cfgs.get('embedder_freq_arti', 8)
|
341 |
+
if self.articulation_feature_mode == 'global':
|
342 |
+
feat_dim = encoder_latent_dim
|
343 |
+
elif self.articulation_feature_mode == 'sample':
|
344 |
+
feat_dim = dino_feat_dim
|
345 |
+
elif self.articulation_feature_mode == 'sample+global':
|
346 |
+
feat_dim = encoder_latent_dim + dino_feat_dim
|
347 |
+
            elif self.articulation_feature_mode == 'attention':
                feat_dim = dino_feat_dim  # assumed: attention-pooled features share the ViT patch feature dim
|
348 |
+
arti_feat_attn_zdim = cfgs.get('arti_feat_attn_zdim', 128)
|
349 |
+
pos_dim = 1 + 2 + 3*2
|
350 |
+
self.netFeatureAttn = networks.FeatureAttention(which_vit, pos_dim, embedder_freq_arti, arti_feat_attn_zdim, img_size=in_image_size)
|
351 |
+
embedder_scaler = np.pi * 0.9 # originally (-1, 1) rescale to (-pi, pi) * 0.9
|
352 |
+
self.netArticulation = networks.ArticulationNetwork(self.articulation_arch, feat_dim, 1+2+3*2, num_layers_arti, mlp_hidden_size, n_harmonic_functions=embedder_freq_arti, omega0=embedder_scaler)
|
353 |
+
self.kinematic_tree_epoch = -1
|
354 |
+
|
355 |
+
self.enable_lighting = cfgs.get('enable_lighting', False)
|
356 |
+
if self.enable_lighting:
|
357 |
+
num_layers_light = cfgs.get('num_layers_light', 5)
|
358 |
+
amb_diff_min = torch.FloatTensor(cfgs.get('amb_diff_min', [0., 0.]))
|
359 |
+
amb_diff_max = torch.FloatTensor(cfgs.get('amb_diff_max', [1., 1.]))
|
360 |
+
intensity_min_max = torch.stack((amb_diff_min, amb_diff_max), dim=0)
|
361 |
+
self.netLight = light.DirectionalLight(encoder_latent_dim, num_layers_light, mlp_hidden_size, intensity_min_max=intensity_min_max)
|
362 |
+
|
363 |
+
self.cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.)
|
364 |
+
self.crop_fov_approx = cfgs.get("crop_fov_approx", 25)
|
365 |
+
|
366 |
+
def forward_encoder(self, images, dino_features=None):
|
367 |
+
images_in = images.view(-1, *images.shape[2:]) * 2 - 1 # rescale to (-1, 1)
|
368 |
+
        feat_key = patch_out = patch_key = None  # only the ViT encoder produces key/patch features
|
369 |
+
if self.dino_feature_input and self.cfgs.get('encoder_arch', 'simple') != 'vit':
|
370 |
+
dino_features_in = dino_features.view(-1, *dino_features.shape[2:]) * 2 - 1 # rescale to (-1, 1)
|
371 |
+
feat_out = self.netEncoder(images_in, dino_features_in) # Shape: (B, latent_dim)
|
372 |
+
elif self.cfgs.get('encoder_arch', 'simple') == 'vit':
|
373 |
+
feat_out, feat_key, patch_out, patch_key = self.netEncoder(images_in, return_patches=True)
|
374 |
+
else:
|
375 |
+
feat_out = self.netEncoder(images_in) # Shape: (B, latent_dim)
|
376 |
+
return feat_out, feat_key, patch_out, patch_key
|
377 |
+
|
378 |
+
def forward_pose(self, images, feat, patch_out, patch_key, dino_features):
|
379 |
+
if self.pose_arch == 'mlp':
|
380 |
+
pose = self.netPose(feat)
|
381 |
+
elif self.pose_arch == 'encoder':
|
382 |
+
images_in = images.view(-1, *images.shape[2:]) * 2 - 1 # rescale to (-1, 1)
|
383 |
+
if self.dino_feature_input:
|
384 |
+
dino_features_in = dino_features.view(-1, *dino_features.shape[2:]) * 2 - 1 # rescale to (-1, 1)
|
385 |
+
pose = self.netPose(images_in, dino_features_in) # Shape: (B, latent_dim)
|
386 |
+
else:
|
387 |
+
pose = self.netPose(images_in) # Shape: (B, latent_dim)
|
388 |
+
elif self.pose_arch == 'vit':
|
389 |
+
images_in = images.view(-1, *images.shape[2:]) * 2 - 1 # rescale to (-1, 1)
|
390 |
+
pose = self.netPose(images_in)
|
391 |
+
elif self.pose_arch == 'encoder_dino_patch_out':
|
392 |
+
pose = self.netPose(patch_out) # Shape: (B, latent_dim)
|
393 |
+
elif self.pose_arch == 'encoder_dino_patch_key':
|
394 |
+
pose = self.netPose(patch_key) # Shape: (B, latent_dim)
|
395 |
+
else:
|
396 |
+
raise NotImplementedError
|
397 |
+
trans_pred = pose[...,-3:].tanh() * torch.FloatTensor([self.max_trans_xy_range, self.max_trans_xy_range, self.max_trans_z_range]).to(pose.device)
|
398 |
+
if self.rot_rep == 'euler_angle':
|
399 |
+
multiplier = 1.
|
400 |
+
            if self.cfgs.get('gradually_expand_yaw', False):
|
401 |
+
# multiplier += (min(iteration, 20000) // 500) * 0.25
|
402 |
+
                multiplier *= 1.2 ** (min(iteration, 20000) // 500)  # 1.2 ** 40 ≈ 1470; assumes an external 'iteration' counter is in scope
|
403 |
+
rot_pred = torch.cat([pose[...,:1], pose[...,1:2]*multiplier, pose[...,2:3]], -1).tanh()
|
404 |
+
rot_pred = rot_pred * torch.FloatTensor([self.max_rot_x_range, self.max_rot_y_range, self.max_rot_z_range]).to(pose.device) /180 * np.pi
|
405 |
+
|
406 |
+
elif self.rot_rep == 'quaternion':
|
407 |
+
quat_init = torch.FloatTensor([0.01,0,0,0]).to(pose.device)
|
408 |
+
rot_pred = pose[...,:4] + quat_init
|
409 |
+
rot_pred = nn.functional.normalize(rot_pred, p=2, dim=-1)
|
410 |
+
# rot_pred = torch.cat([rot_pred[...,:1].abs(), rot_pred[...,1:]], -1) # make real part non-negative
|
411 |
+
rot_pred = rot_pred * rot_pred[...,:1].sign() # make real part non-negative
|
412 |
+
|
413 |
+
elif self.rot_rep == 'lookat':
|
414 |
+
vec_forward_raw = pose[...,:3]
|
415 |
+
if self.lookat_init is not None:
|
416 |
+
vec_forward_raw = vec_forward_raw + torch.FloatTensor(self.lookat_init).to(pose.device)
|
417 |
+
if self.lookat_zeroy:
|
418 |
+
vec_forward_raw = vec_forward_raw * torch.FloatTensor([1,0,1]).to(pose.device)
|
419 |
+
vec_forward_raw = nn.functional.normalize(vec_forward_raw, p=2, dim=-1) # x right, y up, z forward
|
420 |
+
rot_pred = vec_forward_raw
|
421 |
+
|
422 |
+
elif self.rot_rep in ['quadlookat', 'octlookat']:
|
423 |
+
            rots_pred = pose[..., :self.num_pose_hypos*4].view(-1, self.num_pose_hypos, 4)  # (N, K, 4)
|
424 |
+
rots_logits = rots_pred[..., :1]
|
425 |
+
vec_forward_raw = rots_pred[..., 1:4]
|
426 |
+
xs, ys, zs = vec_forward_raw.unbind(-1)
|
427 |
+
margin = 0.
|
428 |
+
xs = nn.functional.softplus(xs, beta=np.log(2)/(0.5+margin)) - margin # initialize to 0.5
|
429 |
+
if self.rot_rep == 'octlookat':
|
430 |
+
ys = nn.functional.softplus(ys, beta=np.log(2)/(0.5+margin)) - margin # initialize to 0.5
|
431 |
+
if self.lookat_zeroy:
|
432 |
+
ys = ys * 0
|
433 |
+
zs = nn.functional.softplus(zs, beta=2*np.log(2)) # initialize to 0.5
|
434 |
+
vec_forward_raw = torch.stack([xs, ys, zs], -1)
|
435 |
+
vec_forward_raw = vec_forward_raw * self.orthant_signs.to(pose.device)
|
436 |
+
vec_forward_raw = nn.functional.normalize(vec_forward_raw, p=2, dim=-1) # x right, y up, z forward
|
437 |
+
rot_pred = torch.cat([rots_logits, vec_forward_raw], -1).view(-1, self.num_pose_hypos*4)
|
438 |
+
|
439 |
+
else:
|
440 |
+
raise NotImplementedError
|
441 |
+
|
442 |
+
pose = torch.cat([rot_pred, trans_pred], -1)
|
443 |
+
return pose
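
    # --- Hedged sketch (hypothetical @staticmethod, unused by the model): the
    # quaternion branch above exploits that q and -q encode the same rotation, so
    # flipping the sign to make the real part non-negative canonicalises the
    # prediction without changing the rotation it represents.
    @staticmethod
    def _demo_canonical_quaternion(q_raw):
        quat_init = torch.FloatTensor([0.01, 0, 0, 0]).to(q_raw.device)
        q = nn.functional.normalize(q_raw + quat_init, p=2, dim=-1)
        return q * q[..., :1].sign()  # same rotation, canonical sign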
|
444 |
+
|
445 |
+
def forward_deformation(self, shape, feat=None):
|
446 |
+
original_verts = shape.v_pos
|
447 |
+
num_verts = original_verts.shape[1]
|
448 |
+
        deform_feat = None
        if feat is not None:
|
449 |
+
deform_feat = feat[:, None, :].repeat(1, num_verts, 1) # Shape: (B, num_verts, latent_dim)
|
450 |
+
original_verts = original_verts.repeat(len(feat),1,1)
|
451 |
+
deformation = self.netDeform(original_verts, deform_feat) * 0.1 # Shape: (B, num_verts, 3)
|
452 |
+
shape = shape.deform(deformation)
|
453 |
+
return shape, deformation
|
454 |
+
|
455 |
+
def forward_articulation(self, shape, feat, patch_feat, mvp, w2c, batch_size, num_frames, epoch):
|
456 |
+
"""
|
457 |
+
        Forward propagation of articulation. For each bone, the network takes: 1) the 3D location of the bone; 2) the feature of the patch onto
|
458 |
+
        which the bone is projected; and 3) an encoding of the bone's index, and predicts the bone's rotation (represented as Euler angles).
|
459 |
+
|
460 |
+
Args:
|
461 |
+
shape: a Mesh object, whose v_pos has batch size BxF or 1.
|
462 |
+
feat: the feature of the patches. Shape: (BxF, feat_dim, num_patches_per_axis, num_patches_per_axis)
|
463 |
+
            mvp: the model-view-projection matrix. Shape: (BxF, 4, 4)
            w2c: the world-to-camera matrix. Shape: (BxF, 4, 4)
|
464 |
+
|
465 |
+
Returns:
|
466 |
+
shape: a Mesh object, whose v_pos has batch size BxF (collapsed).
|
467 |
+
articulation_angles: the predicted bone rotations. Shape: (B, F, num_bones, 3)
|
468 |
+
aux: a dictionary containing auxiliary information.
|
469 |
+
"""
|
470 |
+
verts = shape.v_pos
|
471 |
+
if len(verts) == 1:
|
472 |
+
verts = verts[None]
|
473 |
+
else:
|
474 |
+
verts = verts.view(batch_size, num_frames, *verts.shape[1:])
|
475 |
+
|
476 |
+
if self.kinematic_tree_epoch != epoch:
|
477 |
+
# if (epoch == self.articulation_epochs[0]) and (self.kinematic_tree_epoch != epoch):
|
478 |
+
# if (epoch in [self.articulation_epochs[0], self.articulation_epochs[0]+2, self.articulation_epochs[0]+4]) and (self.kinematic_tree_epoch != epoch):
|
479 |
+
attach_legs_to_body = epoch in self.attach_legs_to_body_epochs
|
480 |
+
bones, self.kinematic_tree, self.bone_aux = estimate_bones(verts.detach(), self.num_body_bones, n_legs=self.num_legs, n_leg_bones=self.num_leg_bones, body_bones_type=self.body_bones_type, compute_kinematic_chain=True, attach_legs_to_body=attach_legs_to_body)
|
481 |
+
self.kinematic_tree_epoch = epoch
|
482 |
+
else:
|
483 |
+
bones = estimate_bones(verts.detach(), self.num_body_bones, n_legs=self.num_legs, n_leg_bones=self.num_leg_bones, body_bones_type=self.body_bones_type, compute_kinematic_chain=False, aux=self.bone_aux)
|
484 |
+
|
485 |
+
bones_pos = bones # Shape: (B, F, K, 2, 3)
|
486 |
+
if batch_size > bones_pos.shape[0] or num_frames > bones_pos.shape[1]:
|
487 |
+
assert bones_pos.shape[0] == 1 and bones_pos.shape[1] == 1, "If there is a mismatch, then there must be only one canonical mesh."
|
488 |
+
bones_pos = bones_pos.repeat(batch_size, num_frames, 1, 1, 1)
|
489 |
+
num_bones = bones_pos.shape[2]
|
490 |
+
bones_pos = bones_pos.view(batch_size*num_frames, num_bones, 2, 3) # NxKx2x3
|
491 |
+
bones_mid_pos = bones_pos.mean(2) # NxKx3
|
492 |
+
bones_idx = torch.arange(num_bones).to(bones_pos.device)
|
493 |
+
|
494 |
+
bones_mid_pos_world4 = torch.cat([bones_mid_pos, torch.ones_like(bones_mid_pos[..., :1])], -1) # NxKx4
|
495 |
+
bones_mid_pos_clip4 = bones_mid_pos_world4 @ mvp.transpose(-1, -2)
|
496 |
+
bones_mid_pos_uv = bones_mid_pos_clip4[..., :2] / bones_mid_pos_clip4[..., 3:4]
|
497 |
+
bones_mid_pos_uv = bones_mid_pos_uv.detach()
|
498 |
+
|
499 |
+
bones_pos_world4 = torch.cat([bones_pos, torch.ones_like(bones_pos[..., :1])], -1) # NxKx2x4
|
500 |
+
bones_pos_cam4 = bones_pos_world4 @ w2c[:,None].transpose(-1, -2)
|
501 |
+
bones_pos_cam3 = bones_pos_cam4[..., :3] / bones_pos_cam4[..., 3:4]
|
502 |
+
bones_pos_cam3 = bones_pos_cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(bones_pos_cam3.device).view(1, 1, 1, 3)
|
503 |
+
bones_pos_in = bones_pos_cam3.view(batch_size*num_frames, num_bones, 2*3) / self.grid_scale * 2 # (-1, 1), NxKx(2*3)
|
504 |
+
|
505 |
+
bones_idx_in = ((bones_idx[None, :, None] + 0.5) / num_bones * 2 - 1).repeat(batch_size * num_frames, 1, 1) # (-1, 1)
|
506 |
+
bones_pos_in = torch.cat([bones_mid_pos_uv, bones_pos_in, bones_idx_in], -1).detach()
|
507 |
+
|
508 |
+
if self.articulation_feature_mode == 'global':
|
509 |
+
bones_patch_features = feat[:, None].repeat(1, num_bones, 1) # (BxF, K, feat_dim)
|
510 |
+
elif self.articulation_feature_mode == 'sample':
|
511 |
+
bones_patch_features = F.grid_sample(patch_feat, bones_mid_pos_uv.view(batch_size * num_frames, 1, -1, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # (BxF, K, feat_dim)
|
512 |
+
elif self.articulation_feature_mode == 'sample+global':
|
513 |
+
bones_patch_features = F.grid_sample(patch_feat, bones_mid_pos_uv.view(batch_size * num_frames, 1, -1, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # (BxF, K, feat_dim)
|
514 |
+
bones_patch_features = torch.cat([feat[:, None].repeat(1, num_bones, 1), bones_patch_features], -1)
|
515 |
+
elif self.articulation_feature_mode == 'attention':
|
516 |
+
bones_patch_features = self.netFeatureAttn(bones_pos_in, patch_feat)
|
517 |
+
else:
|
518 |
+
raise NotImplementedError
|
519 |
+
|
520 |
+
articulation_angles = self.netArticulation(bones_patch_features, bones_pos_in).view(batch_size, num_frames, num_bones, 3) * self.articulation_multiplier
|
521 |
+
|
522 |
+
if self.static_root_bones:
|
523 |
+
root_bones = [self.num_body_bones // 2 - 1, self.num_body_bones - 1]
|
524 |
+
tmp_mask = torch.ones_like(articulation_angles)
|
525 |
+
tmp_mask[:, :, root_bones] = 0
|
526 |
+
articulation_angles = articulation_angles * tmp_mask
|
527 |
+
|
528 |
+
articulation_angles = articulation_angles.tanh()
|
529 |
+
|
530 |
+
if self.constrain_legs:
|
531 |
+
leg_bones_posx = [self.num_body_bones + i for i in range(self.num_leg_bones * self.num_legs // 2)]
|
532 |
+
leg_bones_negx = [self.num_body_bones + self.num_leg_bones * self.num_legs // 2 + i for i in range(self.num_leg_bones * self.num_legs // 2)]
|
533 |
+
|
534 |
+
tmp_mask = torch.zeros_like(articulation_angles)
|
535 |
+
tmp_mask[:, :, leg_bones_posx + leg_bones_negx, 2] = 1
|
536 |
+
articulation_angles = tmp_mask * (articulation_angles * 0.3) + (1 - tmp_mask) * articulation_angles # no twist
|
537 |
+
|
538 |
+
tmp_mask = torch.zeros_like(articulation_angles)
|
539 |
+
tmp_mask[:, :, leg_bones_posx + leg_bones_negx, 1] = 1
|
540 |
+
            articulation_angles = tmp_mask * (articulation_angles * 0.3) + (1 - tmp_mask) * articulation_angles  # (-0.3, 0.3), limit side bending
|
541 |
+
|
542 |
+
if epoch in self.perturb_articulation_epochs:
|
543 |
+
articulation_angles = articulation_angles + torch.randn_like(articulation_angles) * 0.1
|
544 |
+
articulation_angles = articulation_angles * self.max_arti_angle / 180 * np.pi
|
545 |
+
|
546 |
+
verts_articulated, aux = skinning(verts, bones, self.kinematic_tree, articulation_angles,
|
547 |
+
output_posed_bones=True, temperature=self.skinning_temperature)
|
548 |
+
verts_articulated = verts_articulated.view(batch_size*num_frames, *verts_articulated.shape[2:])
|
549 |
+
v_tex = shape.v_tex
|
550 |
+
if len(v_tex) != len(verts_articulated):
|
551 |
+
v_tex = v_tex.repeat(len(verts_articulated), 1, 1)
|
552 |
+
shape = mesh.make_mesh(
|
553 |
+
verts_articulated,
|
554 |
+
shape.t_pos_idx,
|
555 |
+
v_tex,
|
556 |
+
shape.t_tex_idx,
|
557 |
+
shape.material)
|
558 |
+
return shape, articulation_angles, aux
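
    # --- Hedged illustration (hypothetical @staticmethod, unused): the bone-index
    # conditioning above maps bone k of K to the centre of its bin in (-1, 1),
    # i.e. (k + 0.5) / K * 2 - 1; e.g. K=4 yields [-0.75, -0.25, 0.25, 0.75].
    @staticmethod
    def _demo_bone_index_encoding(num_bones=4):
        bones_idx = torch.arange(num_bones)
        return (bones_idx + 0.5) / num_bones * 2 - 1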
|
559 |
+
|
560 |
+
def get_camera_extrinsics_from_pose(self, pose, znear=0.1, zfar=1000.):
|
561 |
+
N = len(pose)
|
562 |
+
cam_pos_offset = torch.FloatTensor([0, 0, -self.cam_pos_z_offset]).to(pose.device)
|
563 |
+
pose_R = pose[:, :9].view(N, 3, 3).transpose(2, 1)
|
564 |
+
pose_T = pose[:, -3:] + cam_pos_offset[None, None, :]
|
565 |
+
pose_T = pose_T.view(N, 3, 1)
|
566 |
+
pose_RT = torch.cat([pose_R, pose_T], axis=2) # Nx3x4
|
567 |
+
w2c = torch.cat([pose_RT, torch.FloatTensor([0, 0, 0, 1]).repeat(N, 1, 1).to(pose.device)], axis=1) # Nx4x4
|
568 |
+
# We assume the images are perfect square.
|
569 |
+
proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, znear, zfar)[None].to(pose.device)
|
570 |
+
mvp = torch.matmul(proj, w2c)
|
571 |
+
campos = -torch.matmul(pose_R.transpose(2, 1), pose_T).view(N, 3)
|
572 |
+
return mvp, w2c, campos
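
    # --- Hedged sanity-check sketch (hypothetical @staticmethod mirroring the math
    # above rather than calling it): with an identity object rotation and zero
    # translation, the recovered camera position is (0, 0, cam_pos_z_offset), i.e.
    # the camera observes the object from the +z axis at the offset distance.
    @staticmethod
    def _demo_camera_convention(cam_pos_z_offset=10.):
        pose = torch.cat([torch.eye(3).view(1, 9), torch.zeros(1, 3)], dim=1)
        pose_R = pose[:, :9].view(1, 3, 3).transpose(2, 1)
        pose_T = (pose[:, -3:] + torch.FloatTensor([[0, 0, -cam_pos_z_offset]])).view(1, 3, 1)
        campos = -torch.matmul(pose_R.transpose(2, 1), pose_T).view(1, 3)
        assert torch.allclose(campos, torch.FloatTensor([[0, 0, cam_pos_z_offset]]))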
|
573 |
+
|
574 |
+
def forward(self, images=None, prior_shape=None, epoch=None, dino_features=None, dino_clusters=None, total_iter=None, is_training=True):
|
575 |
+
batch_size, num_frames = images.shape[:2]
|
576 |
+
if self.enable_encoder:
|
577 |
+
feat_out, feat_key, patch_out, patch_key = self.forward_encoder(images, dino_features)
|
578 |
+
else:
|
579 |
+
feat_out = feat_key = patch_out = patch_key = None
|
580 |
+
shape = prior_shape
|
581 |
+
texture = self.netTexture
|
582 |
+
|
583 |
+
multi_hypothesis_aux = {}
|
584 |
+
if self.enable_pose:
|
585 |
+
poses_raw = self.forward_pose(images, feat_out, patch_out, patch_key, dino_features)
|
586 |
+
pose_raw, pose, rot_idx, rot_prob, rot_logit, rots_probs, rand_pose_flag = sample_pose_hypothesis_from_quad_prediction(poses_raw, total_iter, batch_size, num_frames, rot_temp_scalar=self.rot_temp_scalar, num_hypos=self.num_pose_hypos, naive_probs_iter=self.naive_probs_iter, best_pose_start_iter=self.best_pose_start_iter, random_sample=is_training)
|
587 |
+
multi_hypothesis_aux['rot_idx'] = rot_idx
|
588 |
+
multi_hypothesis_aux['rot_prob'] = rot_prob
|
589 |
+
multi_hypothesis_aux['rot_logit'] = rot_logit
|
590 |
+
multi_hypothesis_aux['rots_probs'] = rots_probs
|
591 |
+
multi_hypothesis_aux['rand_pose_flag'] = rand_pose_flag
|
592 |
+
else:
|
593 |
+
raise NotImplementedError
|
594 |
+
mvp, w2c, campos = self.get_camera_extrinsics_from_pose(pose)
|
595 |
+
|
596 |
+
deformation = None
|
597 |
+
if self.enable_deform and epoch in self.deform_epochs:
|
598 |
+
shape, deformation = self.forward_deformation(shape, feat_key)
|
599 |
+
|
600 |
+
arti_params, articulation_aux = None, {}
|
601 |
+
if self.enable_articulation and epoch in self.articulation_epochs:
|
602 |
+
shape, arti_params, articulation_aux = self.forward_articulation(shape, feat_key, patch_key, mvp, w2c, batch_size, num_frames, epoch)
|
603 |
+
|
604 |
+
if self.enable_lighting:
|
605 |
+
light = self.netLight
|
606 |
+
else:
|
607 |
+
light = None
|
608 |
+
|
609 |
+
aux = articulation_aux
|
610 |
+
aux.update(multi_hypothesis_aux)
|
611 |
+
|
612 |
+
return shape, pose_raw, pose, mvp, w2c, campos, texture, feat_out, deformation, arti_params, light, aux
|
613 |
+
|
614 |
+
|
615 |
+
class Unsup3D:
|
616 |
+
def __init__(self, cfgs):
|
617 |
+
self.cfgs = cfgs
|
618 |
+
self.device = cfgs.get('device', 'cpu')
|
619 |
+
self.in_image_size = cfgs.get('in_image_size', 128)
|
620 |
+
self.out_image_size = cfgs.get('out_image_size', 128)
|
621 |
+
|
622 |
+
self.num_epochs = cfgs.get('num_epochs', 10)
|
623 |
+
self.lr = cfgs.get('lr', 1e-4)
|
624 |
+
self.use_scheduler = cfgs.get('use_scheduler', False)
|
625 |
+
if self.use_scheduler:
|
626 |
+
scheduler_milestone = cfgs.get('scheduler_milestone', [1,2,3,4,5])
|
627 |
+
scheduler_gamma = cfgs.get('scheduler_gamma', 0.5)
|
628 |
+
self.make_scheduler = lambda optim: torch.optim.lr_scheduler.MultiStepLR(optim, milestones=scheduler_milestone, gamma=scheduler_gamma)
|
629 |
+
|
630 |
+
self.cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.)
|
631 |
+
self.full_size_h = cfgs.get('full_size_h', 1080)
|
632 |
+
self.full_size_w = cfgs.get('full_size_w', 1920)
|
633 |
+
# self.fov_w = cfgs.get('fov_w', 60)
|
634 |
+
# self.fov_h = np.arctan(np.tan(self.fov_w /2 /180*np.pi) / self.full_size_w * self.full_size_h) *2 /np.pi*180 # 36
|
635 |
+
self.crop_fov_approx = cfgs.get("crop_fov_approx", 25)
|
636 |
+
self.mesh_regularization_mode = cfgs.get('mesh_regularization_mode', 'seq')
|
637 |
+
|
638 |
+
self.enable_prior = cfgs.get('enable_prior', False)
|
639 |
+
if self.enable_prior:
|
640 |
+
self.netPrior = PriorPredictor(self.cfgs)
|
641 |
+
self.prior_lr = cfgs.get('prior_lr', self.lr)
|
642 |
+
self.prior_weight_decay = cfgs.get('prior_weight_decay', 0.)
|
643 |
+
self.prior_only_epochs = cfgs.get('prior_only_epochs', 0)
|
644 |
+
self.netInstance = InstancePredictor(self.cfgs, tet_bbox=self.netPrior.netShape.getAABB())
|
645 |
+
self.perturb_sdf = cfgs.get('perturb_sdf', False)
|
646 |
+
self.blur_mask = cfgs.get('blur_mask', False)
|
647 |
+
self.blur_mask_iter = cfgs.get('blur_mask_iter', 1)
|
648 |
+
|
649 |
+
self.seqshape_epochs = np.arange(*cfgs.get('seqshape_epochs', [0, self.num_epochs]))
|
650 |
+
self.avg_texture_epochs = np.arange(*cfgs.get('avg_texture_epochs', [0, 0]))
|
651 |
+
self.swap_texture_epochs = np.arange(*cfgs.get('swap_texture_epochs', [0, 0]))
|
652 |
+
self.swap_priorshape_epochs = np.arange(*cfgs.get('swap_priorshape_epochs', [0, 0]))
|
653 |
+
self.avg_seqshape_epochs = np.arange(*cfgs.get('avg_seqshape_epochs', [0, 0]))
|
654 |
+
self.swap_seqshape_epochs = np.arange(*cfgs.get('swap_seqshape_epochs', [0, 0]))
|
655 |
+
self.pose_epochs = np.arange(*cfgs.get('pose_epochs', [0, 0]))
|
656 |
+
self.pose_iters = cfgs.get('pose_iters', 0)
|
657 |
+
self.deform_type = cfgs.get('deform_type', None)
|
658 |
+
self.mesh_reg_decay_epoch = cfgs.get('mesh_reg_decay_epoch', 0)
|
659 |
+
self.sdf_reg_decay_start_iter = cfgs.get('sdf_reg_decay_start_iter', 0)
|
660 |
+
self.mesh_reg_decay_rate = cfgs.get('mesh_reg_decay_rate', 1)
|
661 |
+
self.texture_epochs = np.arange(*cfgs.get('texture_epochs', [0, self.num_epochs]))
|
662 |
+
self.zflip_epochs = np.arange(*cfgs.get('zflip_epochs', [0, self.num_epochs]))
|
663 |
+
self.lookat_zflip_loss_epochs = np.arange(*cfgs.get('lookat_zflip_loss_epochs', [0, self.num_epochs]))
|
664 |
+
self.lookat_zflip_no_other_losses = cfgs.get('lookat_zflip_no_other_losses', False)
|
665 |
+
self.flow_loss_epochs = np.arange(*cfgs.get('flow_loss_epochs', [0, self.num_epochs]))
|
666 |
+
self.sdf_inflate_reg_loss_epochs = np.arange(*cfgs.get('sdf_inflate_reg_loss_epochs', [0, self.num_epochs]))
|
667 |
+
self.arti_reg_loss_epochs = np.arange(*cfgs.get('arti_reg_loss_epochs', [0, self.num_epochs]))
|
668 |
+
self.background_mode = cfgs.get('background_mode', 'background')
|
669 |
+
self.shape_prior_type = cfgs.get('shape_prior_type', 'deform')
|
670 |
+
self.backward_prior = cfgs.get('backward_prior', True)
|
671 |
+
self.resume_prior_optim = cfgs.get('resume_prior_optim', True)
|
672 |
+
self.dmtet_grid_smaller_epoch = cfgs.get('dmtet_grid_smaller_epoch', 0)
|
673 |
+
self.dmtet_grid_smaller = cfgs.get('dmtet_grid_smaller', 128)
|
674 |
+
self.dmtet_grid = cfgs.get('dmtet_grid', 256)
|
675 |
+
self.pose_xflip_recon_epochs = np.arange(*cfgs.get('pose_xflip_recon_epochs', [0, 0]))
|
676 |
+
self.rot_rand_quad_epochs = np.arange(*cfgs.get('rot_rand_quad_epochs', [0, 0]))
|
677 |
+
self.rot_all_quad_epochs = np.arange(*cfgs.get('rot_all_quad_epochs', [0, 0]))
|
678 |
+
|
679 |
+
## perceptual loss
|
680 |
+
if cfgs.get('perceptual_loss_weight', 0.) > 0:
|
681 |
+
self.perceptual_loss_use_lin = cfgs.get('perceptual_loss_use_lin', True)
|
682 |
+
self.perceptual_loss = lpips.LPIPS(net='vgg', lpips=self.perceptual_loss_use_lin)
|
683 |
+
|
684 |
+
self.glctx = dr.RasterizeGLContext()
|
685 |
+
self.render_flow = self.cfgs.get('flow_loss_weight', 0.) > 0.
|
686 |
+
self.extra_renders = cfgs.get('extra_renders', [])
|
687 |
+
self.renderer_spp = cfgs.get('renderer_spp', 1)
|
688 |
+
self.dino_feature_recon_dim = cfgs.get('dino_feature_recon_dim', 64)
|
689 |
+
|
690 |
+
self.total_loss = 0.
|
691 |
+
self.all_scores = torch.Tensor()
|
692 |
+
self.checkpoint_dir = cfgs.get('checkpoint_dir', 'results')
|
693 |
+
|
694 |
+
@staticmethod
|
695 |
+
def get_data_loaders(cfgs, dataset, in_image_size=256, out_image_size=256, batch_size=64, num_workers=4, run_train=False, run_test=False, train_data_dir=None, val_data_dir=None, test_data_dir=None):
|
696 |
+
train_loader = val_loader = test_loader = None
|
697 |
+
color_jitter_train = cfgs.get('color_jitter_train', None)
|
698 |
+
color_jitter_val = cfgs.get('color_jitter_val', None)
|
699 |
+
random_flip_train = cfgs.get('random_flip_train', False)
|
700 |
+
|
701 |
+
## video dataset
|
702 |
+
if dataset == 'video':
|
703 |
+
data_loader_mode = cfgs.get('data_loader_mode', 'n_frame')
|
704 |
+
skip_beginning = cfgs.get('skip_beginning', 4)
|
705 |
+
skip_end = cfgs.get('skip_end', 4)
|
706 |
+
num_sample_frames = cfgs.get('num_sample_frames', 2)
|
707 |
+
min_seq_len = cfgs.get('min_seq_len', 10)
|
708 |
+
max_seq_len = cfgs.get('max_seq_len', 10)
|
709 |
+
debug_seq = cfgs.get('debug_seq', False)
|
710 |
+
random_sample_train_frames = cfgs.get('random_sample_train_frames', False)
|
711 |
+
shuffle_train_seqs = cfgs.get('shuffle_train_seqs', False)
|
712 |
+
random_sample_val_frames = cfgs.get('random_sample_val_frames', False)
|
713 |
+
load_background = cfgs.get('background_mode', 'none') == 'background'
|
714 |
+
rgb_suffix = cfgs.get('rgb_suffix', '.png')
|
715 |
+
load_dino_feature = cfgs.get('load_dino_feature', False)
|
716 |
+
load_dino_cluster = cfgs.get('load_dino_cluster', False)
|
717 |
+
dino_feature_dim = cfgs.get('dino_feature_dim', 64)
|
718 |
+
get_loader = lambda **kwargs: get_sequence_loader(
|
719 |
+
mode=data_loader_mode,
|
720 |
+
batch_size=batch_size,
|
721 |
+
num_workers=num_workers,
|
722 |
+
in_image_size=in_image_size,
|
723 |
+
out_image_size=out_image_size,
|
724 |
+
debug_seq=debug_seq,
|
725 |
+
skip_beginning=skip_beginning,
|
726 |
+
skip_end=skip_end,
|
727 |
+
num_sample_frames=num_sample_frames,
|
728 |
+
min_seq_len=min_seq_len,
|
729 |
+
max_seq_len=max_seq_len,
|
730 |
+
load_background=load_background,
|
731 |
+
rgb_suffix=rgb_suffix,
|
732 |
+
load_dino_feature=load_dino_feature,
|
733 |
+
load_dino_cluster=load_dino_cluster,
|
734 |
+
dino_feature_dim=dino_feature_dim,
|
735 |
+
**kwargs)
|
736 |
+
|
737 |
+
if run_train:
|
738 |
+
assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
|
739 |
+
print(f"Loading training data from {train_data_dir}")
|
740 |
+
train_loader = get_loader(data_dir=train_data_dir, is_validation=False, random_sample=random_sample_train_frames, shuffle=shuffle_train_seqs, dense_sample=True, color_jitter=color_jitter_train, random_flip=random_flip_train)
|
741 |
+
|
742 |
+
if val_data_dir is not None:
|
743 |
+
assert osp.isdir(val_data_dir), f"Validation data directory does not exist: {val_data_dir}"
|
744 |
+
print(f"Loading validation data from {val_data_dir}")
|
745 |
+
val_loader = get_loader(data_dir=val_data_dir, is_validation=True, random_sample=random_sample_val_frames, shuffle=False, dense_sample=False, color_jitter=color_jitter_val, random_flip=False)
|
746 |
+
if run_test:
|
747 |
+
assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
|
748 |
+
print(f"Loading testing data from {test_data_dir}")
|
749 |
+
test_loader = get_loader(data_dir=test_data_dir, is_validation=True, dense_sample=False, color_jitter=None, random_flip=False)
|
750 |
+
|
751 |
+
## CUB dataset
|
752 |
+
elif dataset == 'cub':
|
753 |
+
get_loader = lambda **kwargs: get_cub_loader(
|
754 |
+
batch_size=batch_size,
|
755 |
+
num_workers=num_workers,
|
756 |
+
image_size=in_image_size,
|
757 |
+
**kwargs)
|
758 |
+
|
759 |
+
if run_train:
|
760 |
+
assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
|
761 |
+
print(f"Loading training data from {train_data_dir}")
|
762 |
+
train_loader = get_loader(data_dir=train_data_dir, split='train', is_validation=False)
|
763 |
+
val_loader = get_loader(data_dir=val_data_dir, split='val', is_validation=True)
|
764 |
+
|
765 |
+
if run_test:
|
766 |
+
assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
|
767 |
+
print(f"Loading testing data from {test_data_dir}")
|
768 |
+
test_loader = get_loader(data_dir=test_data_dir, split='test', is_validation=True)
|
769 |
+
|
770 |
+
## other datasets
|
771 |
+
else:
|
772 |
+
get_loader = lambda **kwargs: get_image_loader(
|
773 |
+
batch_size=batch_size,
|
774 |
+
num_workers=num_workers,
|
775 |
+
image_size=in_image_size,
|
776 |
+
**kwargs)
|
777 |
+
|
778 |
+
if run_train:
|
779 |
+
assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
|
780 |
+
print(f"Loading training data from {train_data_dir}")
|
781 |
+
train_loader = get_loader(data_dir=train_data_dir, is_validation=False, color_jitter=color_jitter_train)
|
782 |
+
|
783 |
+
if val_data_dir is not None:
|
784 |
+
assert osp.isdir(val_data_dir), f"Validation data directory does not exist: {val_data_dir}"
|
785 |
+
print(f"Loading validation data from {val_data_dir}")
|
786 |
+
val_loader = get_loader(data_dir=val_data_dir, is_validation=True, color_jitter=color_jitter_val)
|
787 |
+
|
788 |
+
if run_test:
|
789 |
+
assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
|
790 |
+
print(f"Loading testing data from {test_data_dir}")
|
791 |
+
test_loader = get_loader(data_dir=test_data_dir, is_validation=True, color_jitter=None)
|
792 |
+
|
793 |
+
return train_loader, val_loader, test_loader
|
794 |
+
|
795 |
+
def load_model_state(self, cp):
|
796 |
+
self.netInstance.load_state_dict(cp["netInstance"])
|
797 |
+
if self.enable_prior:
|
798 |
+
self.netPrior.load_state_dict(cp["netPrior"])
|
799 |
+
|
800 |
+
def load_optimizer_state(self, cp):
|
801 |
+
self.optimizerInstance.load_state_dict(cp["optimizerInstance"])
|
802 |
+
if self.use_scheduler:
|
803 |
+
if 'schedulerInstance' in cp:
|
804 |
+
self.schedulerInstance.load_state_dict(cp["schedulerInstance"])
|
805 |
+
if self.enable_prior and self.resume_prior_optim:
|
806 |
+
self.optimizerPrior.load_state_dict(cp["optimizerPrior"])
|
807 |
+
if self.use_scheduler:
|
808 |
+
if 'schedulerPrior' in cp:
|
809 |
+
self.schedulerPrior.load_state_dict(cp["schedulerPrior"])
|
810 |
+
|
811 |
+
def get_model_state(self):
|
812 |
+
state = {"netInstance": self.netInstance.state_dict()}
|
813 |
+
if self.enable_prior:
|
814 |
+
state["netPrior"] = self.netPrior.state_dict()
|
815 |
+
return state
|
816 |
+
|
817 |
+
def get_optimizer_state(self):
|
818 |
+
state = {"optimizerInstance": self.optimizerInstance.state_dict()}
|
819 |
+
if self.use_scheduler:
|
820 |
+
state["schedulerInstance"] = self.schedulerInstance.state_dict()
|
821 |
+
if self.enable_prior:
|
822 |
+
state["optimizerPrior"] = self.optimizerPrior.state_dict()
|
823 |
+
if self.use_scheduler:
|
824 |
+
state["schedulerPrior"] = self.schedulerPrior.state_dict()
|
825 |
+
return state
|
826 |
+
|
827 |
+
def to(self, device):
|
828 |
+
self.device = device
|
829 |
+
self.netInstance.to(device)
|
830 |
+
if self.enable_prior:
|
831 |
+
self.netPrior.to(device)
|
832 |
+
if hasattr(self, 'perceptual_loss'):
|
833 |
+
self.perceptual_loss.to(device)
|
834 |
+
|
835 |
+
def set_train(self):
|
836 |
+
self.netInstance.train()
|
837 |
+
if self.enable_prior:
|
838 |
+
self.netPrior.train()
|
839 |
+
|
840 |
+
def set_eval(self):
|
841 |
+
self.netInstance.eval()
|
842 |
+
if self.enable_prior:
|
843 |
+
self.netPrior.eval()
|
844 |
+
|
845 |
+
def reset_optimizers(self):
|
846 |
+
print("Resetting optimizers...")
|
847 |
+
self.optimizerInstance = get_optimizer(self.netInstance, self.lr)
|
848 |
+
if self.use_scheduler:
|
849 |
+
self.schedulerInstance = self.make_scheduler(self.optimizerInstance)
|
850 |
+
if self.enable_prior:
|
851 |
+
self.optimizerPrior = get_optimizer(self.netPrior, lr=self.prior_lr, weight_decay=self.prior_weight_decay)
|
852 |
+
if self.use_scheduler:
|
853 |
+
self.schedulerPrior = self.make_scheduler(self.optimizerPrior)
|
854 |
+
|
855 |
+
def backward(self):
|
856 |
+
self.optimizerInstance.zero_grad()
|
857 |
+
if self.backward_prior:
|
858 |
+
self.optimizerPrior.zero_grad()
|
859 |
+
self.total_loss.backward()
|
860 |
+
self.optimizerInstance.step()
|
861 |
+
if self.backward_prior:
|
862 |
+
self.optimizerPrior.step()
|
863 |
+
self.total_loss = 0.
|
864 |
+
|
865 |
+
def scheduler_step(self):
|
866 |
+
if self.use_scheduler:
|
867 |
+
self.schedulerInstance.step()
|
868 |
+
if self.enable_prior:
|
869 |
+
self.schedulerPrior.step()
|
870 |
+
|
871 |
+
def zflip_pose(self, pose):
|
872 |
+
        if self.netInstance.rot_rep == 'lookat':
|
873 |
+
vec_forward = pose[:,:,6:9]
|
874 |
+
vec_forward = vec_forward * torch.FloatTensor([1,1,-1]).view(1,1,3).to(vec_forward.device)
|
875 |
+
up = torch.FloatTensor([0,1,0]).to(pose.device).view(1,1,3)
|
876 |
+
vec_right = up.expand_as(vec_forward).cross(vec_forward, dim=-1)
|
877 |
+
vec_right = nn.functional.normalize(vec_right, p=2, dim=-1)
|
878 |
+
vec_up = vec_forward.cross(vec_right, dim=-1)
|
879 |
+
vec_up = nn.functional.normalize(vec_up, p=2, dim=-1)
|
880 |
+
rot_mat = torch.stack([vec_right, vec_up, vec_forward], 2)
|
881 |
+
rot_pred = rot_mat.reshape(*pose.shape[:-1], -1)
|
882 |
+
pose_zflip = torch.cat([rot_pred, pose[:,:,9:]], -1)
|
883 |
+
else:
|
884 |
+
raise NotImplementedError
|
885 |
+
return pose_zflip
|
886 |
+
|
887 |
+
def render(self, shape, texture, mvp, w2c, campos, resolution, background='none', im_features=None, light=None, prior_shape=None, render_flow=True, dino_pred=None, render_mode='diffuse', two_sided_shading=True, num_frames=None, spp=1):
|
888 |
+
h, w = resolution
|
889 |
+
N = len(mvp)
|
890 |
+
if background in ['none', 'black']:
|
891 |
+
bg_image = torch.zeros((N, h, w, 3), device=mvp.device)
|
892 |
+
elif background == 'white':
|
893 |
+
bg_image = torch.ones((N, h, w, 3), device=mvp.device)
|
894 |
+
elif background == 'checkerboard':
|
895 |
+
            bg_image = torch.tensor(util.checkerboard((h, w), 8), dtype=torch.float32, device=self.device).repeat(N, 1, 1, 1)  # NxHxWxC
|
896 |
+
else:
|
897 |
+
raise NotImplementedError
|
898 |
+
|
899 |
+
frame_rendered = render.render_mesh(
|
900 |
+
self.glctx,
|
901 |
+
shape,
|
902 |
+
mtx_in=mvp,
|
903 |
+
w2c=w2c,
|
904 |
+
view_pos=campos,
|
905 |
+
material=texture,
|
906 |
+
lgt=light,
|
907 |
+
resolution=resolution,
|
908 |
+
spp=spp,
|
909 |
+
msaa=True,
|
910 |
+
background=bg_image,
|
911 |
+
bsdf=render_mode,
|
912 |
+
feat=im_features,
|
913 |
+
prior_mesh=prior_shape,
|
914 |
+
two_sided_shading=two_sided_shading,
|
915 |
+
render_flow=render_flow,
|
916 |
+
dino_pred=dino_pred,
|
917 |
+
num_frames=num_frames)
|
918 |
+
shaded = frame_rendered['shaded'].permute(0, 3, 1, 2)
|
919 |
+
image_pred = shaded[:, :3, :, :]
|
920 |
+
mask_pred = shaded[:, 3, :, :]
|
921 |
+
albedo = frame_rendered['kd'].permute(0, 3, 1, 2)[:, :3, :, :]
|
922 |
+
if 'shading' in frame_rendered:
|
923 |
+
shading = frame_rendered['shading'].permute(0, 3, 1, 2)[:, :1, :, :]
|
924 |
+
else:
|
925 |
+
shading = None
|
926 |
+
if render_flow:
|
927 |
+
flow_pred = frame_rendered['flow']
|
928 |
+
flow_pred = flow_pred.permute(0, 3, 1, 2)[:, :2, :, :]
|
929 |
+
else:
|
930 |
+
flow_pred = None
|
931 |
+
if dino_pred is not None:
|
932 |
+
dino_feat_im_pred = frame_rendered['dino_feat_im_pred']
|
933 |
+
dino_feat_im_pred = dino_feat_im_pred.permute(0, 3, 1, 2)[:, :-1]
|
934 |
+
else:
|
935 |
+
dino_feat_im_pred = None
|
936 |
+
|
937 |
+
return image_pred, mask_pred, flow_pred, dino_feat_im_pred, albedo, shading
|
938 |
+
|
939 |
+
def compute_reconstruction_losses(self, image_pred, image_gt, mask_pred, mask_gt, mask_dt, mask_valid, flow_pred, flow_gt, dino_feat_im_gt, dino_feat_im_pred, background_mode='none', reduce=False):
|
940 |
+
losses = {}
|
941 |
+
batch_size, num_frames, _, h, w = image_pred.shape # BxFxCxHxW
|
942 |
+
|
943 |
+
# image_loss = (image_pred - image_gt) ** 2
|
944 |
+
image_loss = (image_pred - image_gt).abs()
|
945 |
+
|
946 |
+
## silhouette loss
|
947 |
+
mask_pred_valid = mask_pred * mask_valid
|
948 |
+
# mask_pred_valid = mask_pred
|
949 |
+
# losses["silhouette_loss"] = ((mask_pred - mask_gt) ** 2).mean()
|
950 |
+
# mask_loss_mask = (image_loss.mean(2).detach() > 0.05).float()
|
951 |
+
mask_loss = (mask_pred_valid - mask_gt) ** 2
|
952 |
+
# mask_loss = nn.functional.mse_loss(mask_pred, mask_gt)
|
953 |
+
# num_mask_pixels = mask_loss_mask.reshape(batch_size*num_frames, -1).sum(1).clamp(min=1)
|
954 |
+
# losses["silhouette_loss"] = (mask_loss.reshape(batch_size*num_frames, -1).sum(1) / num_mask_pixels).mean()
|
955 |
+
losses['silhouette_loss'] = mask_loss.view(batch_size, num_frames, -1).mean(2)
|
956 |
+
losses['silhouette_dt_loss'] = (mask_pred * mask_dt[:,:,1]).view(batch_size, num_frames, -1).mean(2)
|
957 |
+
losses['silhouette_inv_dt_loss'] = ((1-mask_pred) * mask_dt[:,:,0]).view(batch_size, num_frames, -1).mean(2)
|
958 |
+
|
959 |
+
mask_pred_binary = (mask_pred_valid > 0.).float().detach()
|
960 |
+
mask_both_binary = (mask_pred_binary * mask_gt).view(batch_size*num_frames, 1, *mask_pred.shape[2:])
|
961 |
+
mask_both_binary = (nn.functional.avg_pool2d(mask_both_binary, 3, stride=1, padding=1).view(batch_size, num_frames, *mask_pred.shape[2:]) > 0.99).float().detach() # erode by 1 pixel
|
962 |
+
|
963 |
+
## reconstruction loss
|
964 |
+
# image_loss_mask = (mask_pred*mask_gt).unsqueeze(2).expand_as(image_gt)
|
965 |
+
# image_loss = image_loss * image_loss_mask
|
966 |
+
# num_mask_pixels = image_loss_mask.reshape(batch_size*num_frames, -1).sum(1).clamp(min=1)
|
967 |
+
# losses["rgb_loss"] = (image_loss.reshape(batch_size*num_frames, -1).sum(1) / num_mask_pixels).mean()
|
968 |
+
if background_mode in ['background', 'input']:
|
969 |
+
pass
|
970 |
+
else:
|
971 |
+
image_loss = image_loss * mask_both_binary.unsqueeze(2)
|
972 |
+
losses['rgb_loss'] = image_loss.reshape(batch_size, num_frames, -1).mean(2)
|
973 |
+
|
974 |
+
if self.cfgs.get('perceptual_loss_weight', 0.) > 0:
|
975 |
+
if background_mode in ['background', 'input']:
|
976 |
+
perc_image_pred = image_pred
|
977 |
+
perc_image_gt = image_gt
|
978 |
+
else:
|
979 |
+
perc_image_pred = image_pred * mask_pred_binary.unsqueeze(2) + 0.5 * (1-mask_pred_binary.unsqueeze(2))
|
980 |
+
perc_image_gt = image_gt * mask_pred_binary.unsqueeze(2) + 0.5 * (1-mask_pred_binary.unsqueeze(2))
|
981 |
+
losses['perceptual_loss'] = self.perceptual_loss(perc_image_pred.view(-1, *image_pred.shape[2:]) *2-1, perc_image_gt.view(-1, *image_gt.shape[2:]) *2-1).view(batch_size, num_frames)
|
982 |
+
|
983 |
+
## flow loss - between first and second frame
|
984 |
+
if flow_pred is not None:
|
985 |
+
flow_loss = (flow_pred - flow_gt).abs()
|
986 |
+
flow_loss_mask = mask_both_binary[:,:-1].unsqueeze(2).expand_as(flow_gt).detach()
|
987 |
+
|
988 |
+
## ignore frames where GT flow is too large (likely inaccurate)
|
989 |
+
large_flow = (flow_gt.abs() > 0.5).float() * flow_loss_mask
|
990 |
+
large_flow = (large_flow.view(batch_size, num_frames-1, -1).sum(2) > 0).float()
|
991 |
+
self.large_flow = large_flow
|
992 |
+
|
993 |
+
flow_loss = flow_loss * flow_loss_mask * (1 - large_flow[:,:,None,None,None])
|
994 |
+
num_mask_pixels = flow_loss_mask.reshape(batch_size, num_frames-1, -1).sum(2).clamp(min=1)
|
995 |
+
losses['flow_loss'] = (flow_loss.reshape(batch_size, num_frames-1, -1).sum(2) / num_mask_pixels)
|
996 |
+
# losses["flow_loss"] = flow_loss.mean()
|
997 |
+
|
998 |
+
if dino_feat_im_pred is not None:
|
999 |
+
dino_feat_loss = (dino_feat_im_pred - dino_feat_im_gt) ** 2
|
1000 |
+
dino_feat_loss = dino_feat_loss * mask_both_binary.unsqueeze(2)
|
1001 |
+
losses['dino_feat_im_loss'] = dino_feat_loss.reshape(batch_size, num_frames, -1).mean(2)
|
1002 |
+
|
1003 |
+
if reduce:
|
1004 |
+
            for k, v in losses.items():
|
1005 |
+
losses[k] = v.mean()
|
1006 |
+
return losses
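
    # --- Hedged sketch of the avg_pool2d erosion trick used above (hypothetical
    # @staticmethod, unused): averaging a binary mask over a 3x3 window and
    # thresholding at 0.99 keeps only pixels whose entire neighbourhood is
    # foreground, i.e. erodes the mask by one pixel.
    @staticmethod
    def _demo_erode_mask(mask):
        # mask: (N, 1, H, W) binary float tensor
        return (nn.functional.avg_pool2d(mask, 3, stride=1, padding=1) > 0.99).float()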
|
1007 |
+
|
1008 |
+
def compute_pose_xflip_reg_loss(self, input_image, dino_feat_im, pose_raw, input_image_xflip_flag=None):
|
1009 |
+
image_xflip = input_image.flip(4)
|
1010 |
+
if dino_feat_im is not None:
|
1011 |
+
dino_feat_im_xflip = dino_feat_im.flip(4)
|
1012 |
+
else:
|
1013 |
+
dino_feat_im_xflip = None
|
1014 |
+
        feat_xflip, _, patch_out_xflip, patch_key_xflip = self.netInstance.forward_encoder(image_xflip, dino_feat_im_xflip)
|
1015 |
+
batch_size, num_frames = input_image.shape[:2]
|
1016 |
+
        pose_xflip_raw = self.netInstance.forward_pose(image_xflip, feat_xflip, patch_out_xflip, patch_key_xflip, dino_feat_im_xflip)
|
1017 |
+
|
1018 |
+
if input_image_xflip_flag is not None:
|
1019 |
+
pose_xflip_raw_xflip = pose_xflip_raw * torch.FloatTensor([-1,1,1,-1,1,1]).to(pose_raw.device) # forward x, trans x
|
1020 |
+
pose_xflip_raw = pose_xflip_raw * (1 - input_image_xflip_flag.view(batch_size * num_frames, 1)) + pose_xflip_raw_xflip * input_image_xflip_flag.view(batch_size * num_frames, 1)
|
1021 |
+
|
1022 |
+
rot_rep = self.netInstance.rot_rep
|
1023 |
+
        if rot_rep == 'euler_angle' or rot_rep == 'soft_class':
|
1024 |
+
pose_xflip_xflip = pose_xflip * torch.FloatTensor([1,-1,-1,-1,1,1]).to(pose_xflip.device) # rot y+z, trans x
|
1025 |
+
pose_xflip_reg_loss = ((pose_xflip_xflip - pose) ** 2.).mean()
|
1026 |
+
elif rot_rep == 'quaternion':
|
1027 |
+
rot_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose[...,:4]), convention='XYZ')
|
1028 |
+
pose_euler = torch.cat([rot_euler, pose[...,4:]], -1)
|
1029 |
+
rot_xflip_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose_xflip[...,:4]), convention='XYZ')
|
1030 |
+
pose_xflip_euler = torch.cat([rot_xflip_euler, pose_xflip[...,4:]], -1)
|
1031 |
+
pose_xflip_euler_xflip = pose_xflip_euler * torch.FloatTensor([1,-1,-1,-1,1,1]).to(pose_xflip.device) # rot y+z, trans x
|
1032 |
+
pose_xflip_reg_loss = ((pose_xflip_euler_xflip - pose_euler) ** 2.).mean()
|
1033 |
+
elif rot_rep == 'lookat':
|
1034 |
+
pose_xflip_raw_xflip = pose_xflip_raw * torch.FloatTensor([-1,1,1,-1,1,1]).to(pose_raw.device) # forward x, trans x
|
1035 |
+
pose_xflip_reg_loss = ((pose_xflip_raw_xflip - pose_raw)[...,0] ** 2.) # compute x only
|
1036 |
+
# if epoch >= self.nolookat_zflip_loss_epochs and self.lookat_zflip_no_other_losses:
|
1037 |
+
# pose_xflip_reg_loss = pose_xflip_reg_loss.mean(1) * is_pose_1_better
|
1038 |
+
pose_xflip_reg_loss = pose_xflip_reg_loss.mean()
|
1039 |
+
return pose_xflip_reg_loss, pose_xflip_raw
|
1040 |
+
|
1041 |
+
    def compute_edge_length_reg_loss(self, mesh, prior_mesh):
        prior_edge_lengths = get_edge_length(prior_mesh.v_pos, prior_mesh.t_pos_idx)
        max_length = prior_edge_lengths.max().detach() * 1.1
        edge_lengths = get_edge_length(mesh.v_pos, mesh.t_pos_idx)
        mesh_edge_length_loss = ((edge_lengths - max_length).clamp(min=0) ** 2).mean()
        return mesh_edge_length_loss, edge_lengths

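    # A minimal sketch (an assumption, not the repo's implementation) of the
    # `get_edge_length` helper used above: stack the three edges of every
    # triangle and measure endpoint distances. Edge deduplication is ignored.
    @staticmethod
    def _edge_length_sketch(v_pos, t_pos_idx):
        # v_pos: (V, 3) vertex positions; t_pos_idx: (F, 3) triangle indices
        edges = torch.cat([t_pos_idx[:, [0, 1]], t_pos_idx[:, [1, 2]], t_pos_idx[:, [2, 0]]], 0)  # (3F, 2)
        return (v_pos[edges[:, 0]] - v_pos[edges[:, 1]]).norm(dim=-1)  # (3F,)
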
    def compute_regularizers(self, mesh, prior_mesh, input_image, dino_feat_im, pose_raw, input_image_xflip_flag=None, arti_params=None, deformation=None):
        losses = {}
        aux = {}

        if self.enable_prior:
            losses.update(self.netPrior.netShape.get_sdf_reg_loss())

        if self.cfgs.get('pose_xflip_reg_loss_weight', 0.) > 0:
            losses["pose_xflip_reg_loss"], aux['pose_xflip_raw'] = self.compute_pose_xflip_reg_loss(input_image, dino_feat_im, pose_raw, input_image_xflip_flag)

        b, f = input_image.shape[:2]
        if b >= 2:
            vec_forward = pose_raw[..., :3]
            losses['pose_entropy_loss'] = (vec_forward[:b // 2] * vec_forward[b // 2:(b // 2) * 2]).sum(-1).mean()
        else:
            losses['pose_entropy_loss'] = 0.

        losses['mesh_normal_consistency_loss'] = normal_consistency(mesh.v_pos, mesh.t_pos_idx)
        losses['mesh_edge_length_loss'], aux['edge_lengths'] = self.compute_edge_length_reg_loss(mesh, prior_mesh)
        if arti_params is not None:
            losses['arti_reg_loss'] = (arti_params ** 2).mean()

        if deformation is not None:
            losses['deformation_reg_loss'] = (deformation ** 2).mean()
            # losses['deformation_reg_loss'] = deformation.abs().mean()

        return losses, aux

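    # Minimal sketch of the viewpoint-diversity term built in
    # compute_regularizers above: dotting the forward vectors of the two batch
    # halves is negative when predicted viewpoints disagree, so minimising it
    # discourages all poses from collapsing onto a single direction.
    @staticmethod
    def _pose_entropy_sketch(vec_forward):
        # vec_forward: (B, ..., 3) forward vectors; assumes B >= 2
        b = vec_forward.shape[0]
        return (vec_forward[:b // 2] * vec_forward[b // 2:(b // 2) * 2]).sum(-1).mean()
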
    def forward(self, batch, epoch, iter, is_train=True, viz_logger=None, total_iter=None, save_results=False, save_dir=None, which_data='', logger_prefix='', is_training=True):
        batch = [x.to(self.device) if x is not None else None for x in batch]
        input_image, mask_gt, mask_dt, mask_valid, flow_gt, bbox, bg_image, dino_feat_im, dino_cluster_im, seq_idx, frame_idx = batch
        batch_size, num_frames, _, h0, w0 = input_image.shape  # BxFxCxHxW
        h = w = self.out_image_size

        def collapseF(x):
            return None if x is None else x.view(batch_size * num_frames, *x.shape[2:])

        def expandF(x):
            return None if x is None else x.view(batch_size, num_frames, *x.shape[1:])

        if flow_gt.dim() == 2:  # dummy tensor for not loading flow
            flow_gt = None
        if dino_feat_im.dim() == 2:  # dummy tensor for not loading dino features
            dino_feat_im = None
            dino_feat_im_gt = None
        else:
            dino_feat_im_gt = expandF(torch.nn.functional.interpolate(collapseF(dino_feat_im), size=[h, w], mode="bilinear"))[:, :, :self.dino_feature_recon_dim]
        if dino_cluster_im.dim() == 2:  # dummy tensor for not loading dino clusters
            dino_cluster_im = None
            dino_cluster_im_gt = None
        else:
            dino_cluster_im_gt = expandF(torch.nn.functional.interpolate(collapseF(dino_cluster_im), size=[h, w], mode="nearest"))

        seq_idx = seq_idx.squeeze(1)
        # seq_idx = seq_idx * 0  # single sequence model
        frame_id, crop_x0, crop_y0, crop_w, crop_h, full_w, full_h, sharpness = bbox.unbind(2)  # BxFx7
        bbox = torch.stack([crop_x0, crop_y0, crop_w, crop_h], 2)
        mask_gt = (mask_gt[:, :, 0, :, :] > 0.9).float()  # BxFxHxW
        mask_dt = mask_dt / self.in_image_size

        if which_data != 'video':
            flow_gt = None

        aux_viz = {}

        ## GT
        image_gt = input_image
        if self.out_image_size != self.in_image_size:
            image_gt = expandF(torch.nn.functional.interpolate(collapseF(image_gt), size=[h, w], mode='bilinear'))
            if flow_gt is not None:
                flow_gt = torch.nn.functional.interpolate(flow_gt.view(batch_size * (num_frames - 1), 2, h0, w0), size=[h, w], mode="bilinear").view(batch_size, num_frames - 1, 2, h, w)

        self.train_pose_only = False
        if epoch in self.pose_epochs:
            if (total_iter // self.pose_iters) % 2 == 0:
                self.train_pose_only = True

        ## flip input and pose
        if epoch in self.pose_xflip_recon_epochs:
            input_image_xflip = input_image.flip(-1)
            input_image_xflip_flag = torch.randint(0, 2, (batch_size, num_frames), device=input_image.device)
            input_image = input_image * (1 - input_image_xflip_flag[:, :, None, None, None]) + input_image_xflip * input_image_xflip_flag[:, :, None, None, None]
        else:
            input_image_xflip_flag = None

        ## 1st pose hypothesis with original predictions

        # ==============================================================================================
        # Predict prior mesh.
        # ==============================================================================================
        if self.enable_prior:
            if epoch < self.dmtet_grid_smaller_epoch:
                if self.netPrior.netShape.grid_res != self.dmtet_grid_smaller:
                    self.netPrior.netShape.load_tets(self.dmtet_grid_smaller)
            else:
                if self.netPrior.netShape.grid_res != self.dmtet_grid:
                    self.netPrior.netShape.load_tets(self.dmtet_grid)

            perturb_sdf = self.perturb_sdf if is_train else False
            prior_shape, dino_pred = self.netPrior(perturb_sdf=perturb_sdf, total_iter=total_iter, is_training=is_training)
        else:
            prior_shape = None
            raise NotImplementedError

        shape, pose_raw, pose, mvp, w2c, campos, texture, im_features, deformation, arti_params, light, forward_aux = self.netInstance(input_image, prior_shape, epoch, dino_feat_im, dino_cluster_im, total_iter, is_training=is_training)  # frame dim collapsed N=(B*F)
        rot_logit = forward_aux['rot_logit']
        rot_idx = forward_aux['rot_idx']
        rot_prob = forward_aux['rot_prob']
        aux_viz.update(forward_aux)

        if self.train_pose_only:
            safe_detach = lambda x: x.detach() if x is not None else None
            prior_shape = safe_detach(prior_shape)
            shape = safe_detach(shape)
            im_features = safe_detach(im_features)
            arti_params = safe_detach(arti_params)
            deformation = safe_detach(deformation)
            set_requires_grad(texture, False)
            set_requires_grad(light, False)
            set_requires_grad(dino_pred, False)
        else:
            set_requires_grad(texture, True)
            set_requires_grad(light, True)
            set_requires_grad(dino_pred, True)

        render_flow = self.render_flow and num_frames > 1
        image_pred, mask_pred, flow_pred, dino_feat_im_pred, albedo, shading = self.render(shape, texture, mvp, w2c, campos, (h, w), background=self.background_mode, im_features=im_features, light=light, prior_shape=prior_shape, render_flow=render_flow, dino_pred=dino_pred, num_frames=num_frames, spp=self.renderer_spp)
        image_pred, mask_pred, flow_pred, dino_feat_im_pred = map(expandF, (image_pred, mask_pred, flow_pred, dino_feat_im_pred))
        if flow_pred is not None:
            flow_pred = flow_pred[:, :-1]  # Bx(F-1)x2xHxW

        if self.blur_mask:
            sigma = max(0.5, 3 * (1 - total_iter / self.blur_mask_iter))
            if sigma > 0.5:
                mask_gt = util.blur_image(mask_gt, kernel_size=9, sigma=sigma, mode='gaussian')
            # mask_pred = util.blur_image(mask_pred, kernel_size=7, mode='average')

        losses = self.compute_reconstruction_losses(image_pred, image_gt, mask_pred, mask_gt, mask_dt, mask_valid, flow_pred, flow_gt, dino_feat_im_gt, dino_feat_im_pred, background_mode=self.background_mode, reduce=False)

        ## TODO: assume flow loss is not used
        logit_loss_target = torch.zeros_like(expandF(rot_logit))
        final_losses = {}
        for name, loss in losses.items():
            loss_weight_logit = self.cfgs.get(f"{name}_weight", 0.)
            # if (name in ['flow_loss'] and epoch not in self.flow_loss_epochs) or (name in ['rgb_loss', 'perceptual_loss'] and epoch not in self.texture_epochs):
            # if name in ['flow_loss', 'rgb_loss', 'perceptual_loss']:
            #     loss_weight_logit = 0.
            if name in ['sdf_bce_reg_loss', 'sdf_gradient_reg_loss', 'sdf_inflate_reg_loss']:
                if total_iter >= self.sdf_reg_decay_start_iter:
                    decay_rate = max(0, 1 - (total_iter - self.sdf_reg_decay_start_iter) / 10000)
                    loss_weight_logit = max(loss_weight_logit * decay_rate, self.cfgs.get(f"{name}_min_weight", 0.))
            if name in ['dino_feat_im_loss']:
                loss_weight_logit = loss_weight_logit * self.cfgs.get("logit_loss_dino_feat_im_loss_multiplier", 1.)
            if loss_weight_logit > 0:
                logit_loss_target += loss * loss_weight_logit

            if self.netInstance.rot_rep in ['quadlookat', 'octlookat']:
                loss = loss * rot_prob.detach().view(batch_size, num_frames)[:, :loss.shape[1]] * self.netInstance.num_pose_hypos
                if name == 'flow_loss' and num_frames > 1:
                    ri = rot_idx.view(batch_size, num_frames)
                    same_rot_idx = (ri[:, 1:] == ri[:, :-1]).float()
                    loss = loss * same_rot_idx
            final_losses[name] = loss.mean()
        final_losses['logit_loss'] = ((expandF(rot_logit) - logit_loss_target.detach()) ** 2.).mean()

        ## regularizers
        regularizers, aux = self.compute_regularizers(shape, prior_shape, input_image, dino_feat_im, pose_raw, input_image_xflip_flag, arti_params, deformation)
        final_losses.update(regularizers)
        aux_viz.update(aux)

        total_loss = 0
        for name, loss in final_losses.items():
            loss_weight = self.cfgs.get(f"{name}_weight", 0.)
            if loss_weight <= 0:
                continue

            if self.train_pose_only:
                if name not in ['silhouette_loss', 'silhouette_dt_loss', 'silhouette_inv_dt_loss', 'flow_loss', 'pose_xflip_reg_loss', 'lookat_zflip_loss', 'dino_feat_im_loss']:
                    continue
            if epoch not in self.flow_loss_epochs:
                if name in ['flow_loss']:
                    continue
            if epoch not in self.texture_epochs:
                if name in ['rgb_loss', 'perceptual_loss']:
                    continue
            if epoch not in self.lookat_zflip_loss_epochs:
                if name in ['lookat_zflip_loss']:
                    continue
            if name in ['mesh_laplacian_smoothing_loss', 'mesh_normal_consistency_loss']:
                if total_iter < self.cfgs.get('mesh_reg_start_iter', 0):
                    continue
                if epoch >= self.mesh_reg_decay_epoch:
                    decay_rate = self.mesh_reg_decay_rate ** (epoch - self.mesh_reg_decay_epoch)
                    loss_weight = max(loss_weight * decay_rate, self.cfgs.get(f"{name}_min_weight", 0.))
            if epoch not in self.sdf_inflate_reg_loss_epochs:
                if name in ['sdf_inflate_reg_loss']:
                    continue
            if epoch not in self.arti_reg_loss_epochs:
                if name in ['arti_reg_loss']:
                    continue
            if name in ['sdf_bce_reg_loss', 'sdf_gradient_reg_loss', 'sdf_inflate_reg_loss']:
                if total_iter >= self.sdf_reg_decay_start_iter:
                    decay_rate = max(0, 1 - (total_iter - self.sdf_reg_decay_start_iter) / 10000)
                    loss_weight = max(loss_weight * decay_rate, self.cfgs.get(f"{name}_min_weight", 0.))

            total_loss += loss * loss_weight

        self.total_loss += total_loss  # reset to 0 in backward step

        if torch.isnan(self.total_loss):
            print("NaN in loss...")
            import ipdb; ipdb.set_trace()

        final_losses['logit_loss_target'] = logit_loss_target.mean()

        metrics = {'loss': total_loss, **final_losses}

        ## log visuals
        if viz_logger is not None:
            b0 = max(min(batch_size, 16 // num_frames), 1)
            viz_logger.add_image(logger_prefix + 'image/image_gt', misc.image_grid(image_gt.detach().cpu()[:b0, :].reshape(-1, *input_image.shape[2:]).clamp(0, 1)), total_iter)
            viz_logger.add_image(logger_prefix + 'image/image_pred', misc.image_grid(image_pred.detach().cpu()[:b0, :].reshape(-1, *image_pred.shape[2:]).clamp(0, 1)), total_iter)
            # viz_logger.add_image(logger_prefix + 'image/flow_loss_mask', misc.image_grid(flow_loss_mask[:b0, :, :1].reshape(-1, 1, *flow_loss_mask.shape[3:]).repeat(1, 3, 1, 1).clamp(0, 1)), total_iter)
            viz_logger.add_image(logger_prefix + 'image/mask_gt', misc.image_grid(mask_gt.detach().cpu()[:b0, :].reshape(-1, *mask_gt.shape[2:]).unsqueeze(1).repeat(1, 3, 1, 1).clamp(0, 1)), total_iter)
            viz_logger.add_image(logger_prefix + 'image/mask_pred', misc.image_grid(mask_pred.detach().cpu()[:b0, :].reshape(-1, *mask_pred.shape[2:]).unsqueeze(1).repeat(1, 3, 1, 1).clamp(0, 1)), total_iter)

            if self.render_flow and flow_gt is not None:
                flow_gt = flow_gt.detach().cpu()
                flow_gt_viz = torch.cat([flow_gt[:b0], torch.zeros_like(flow_gt[:b0, :, :1])], 2) + 0.5  # -0.5~1.5
                flow_gt_viz = torch.nn.functional.pad(flow_gt_viz, pad=[0, 0, 0, 0, 0, 0, 0, 1])

                ## draw marker on large flow frames
                large_flow_marker_mask = torch.zeros_like(flow_gt_viz)
                large_flow_marker_mask[:, :, :, :8, :8] = 1.
                large_flow = torch.cat([self.large_flow, self.large_flow[:, :1] * 0.], 1).detach().cpu()[:b0]
                large_flow_marker_mask = large_flow_marker_mask * large_flow[:, :, None, None, None]
                red = torch.FloatTensor([1, 0, 0])[None, None, :, None, None]
                flow_gt_viz = large_flow_marker_mask * red + (1 - large_flow_marker_mask) * flow_gt_viz

                viz_logger.add_image(logger_prefix + 'image/flow_gt', misc.image_grid(flow_gt_viz.reshape(-1, *flow_gt_viz.shape[2:])), total_iter)

            if self.render_flow and flow_pred is not None:
                flow_pred = flow_pred.detach().cpu()
                flow_pred_viz = torch.cat([flow_pred[:b0], torch.zeros_like(flow_pred[:b0, :, :1])], 2) + 0.5  # -0.5~1.5
                flow_pred_viz = torch.nn.functional.pad(flow_pred_viz, pad=[0, 0, 0, 0, 0, 0, 0, 1])
                viz_logger.add_image(logger_prefix + 'image/flow_pred', misc.image_grid(flow_pred_viz.reshape(-1, *flow_pred_viz.shape[2:])), total_iter)

            if light is not None:
                param_names = ['dir_x', 'dir_y', 'dir_z', 'int_ambient', 'int_diffuse']
                for name, param in zip(param_names, light.light_params.unbind(-1)):
                    viz_logger.add_histogram(logger_prefix + 'light/' + name, param, total_iter)
                viz_logger.add_image(
                    logger_prefix + 'image/albedo',
                    misc.image_grid(expandF(albedo)[:b0, ...].view(-1, *albedo.shape[1:])),
                    total_iter)
                viz_logger.add_image(
                    logger_prefix + 'image/shading',
                    misc.image_grid(expandF(shading)[:b0, ...].view(-1, *shading.shape[1:]).repeat(1, 3, 1, 1) / 2.),
                    total_iter)

            viz_logger.add_histogram(logger_prefix + 'sdf', self.netPrior.netShape.get_sdf(perturb_sdf=False), total_iter)
            viz_logger.add_histogram(logger_prefix + 'coordinates', shape.v_pos, total_iter)
            if arti_params is not None:
                viz_logger.add_histogram(logger_prefix + 'arti_params', arti_params, total_iter)
            viz_logger.add_histogram(logger_prefix + 'edge_lengths', aux_viz['edge_lengths'], total_iter)

            if deformation is not None:
                viz_logger.add_histogram(logger_prefix + 'deformation', deformation, total_iter)

            rot_rep = self.netInstance.rot_rep
            if rot_rep == 'euler_angle' or rot_rep == 'soft_calss':
                for i, name in enumerate(['rot_x', 'rot_y', 'rot_z', 'trans_x', 'trans_y', 'trans_z']):
                    viz_logger.add_histogram(logger_prefix + 'pose/' + name, pose[..., i], total_iter)
            elif rot_rep == 'quaternion':
                for i, name in enumerate(['qt_0', 'qt_1', 'qt_2', 'qt_3', 'trans_x', 'trans_y', 'trans_z']):
                    viz_logger.add_histogram(logger_prefix + 'pose/' + name, pose[..., i], total_iter)
                rot_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose.detach().cpu()[..., :4]), convention='XYZ')
                for i, name in enumerate(['rot_x', 'rot_y', 'rot_z']):
                    viz_logger.add_histogram(logger_prefix + 'pose/' + name, rot_euler[..., i], total_iter)
            elif rot_rep in ['lookat', 'quadlookat', 'octlookat']:
                for i, name in enumerate(['fwd_x', 'fwd_y', 'fwd_z']):
                    viz_logger.add_histogram(logger_prefix + 'pose/' + name, pose_raw[..., i], total_iter)
                for i, name in enumerate(['trans_x', 'trans_y', 'trans_z']):
                    viz_logger.add_histogram(logger_prefix + 'pose/' + name, pose_raw[..., -3 + i], total_iter)

            if rot_rep in ['quadlookat', 'octlookat']:
                for i, rp in enumerate(forward_aux['rots_probs'].unbind(-1)):
                    viz_logger.add_histogram(logger_prefix + 'pose/rot_prob_%d' % i, rp, total_iter)

            if 'pose_xflip_raw' in aux_viz:
                pose_xflip_raw = aux_viz['pose_xflip_raw']
                if rot_rep == 'euler_angle' or rot_rep == 'soft_calss':
                    for i, name in enumerate(['rot_x', 'rot_y', 'rot_z', 'trans_x', 'trans_y', 'trans_z']):
                        viz_logger.add_histogram(logger_prefix + 'pose_xflip/' + name, pose_xflip[..., i], total_iter)
                elif rot_rep == 'quaternion':
                    for i, name in enumerate(['qt_0', 'qt_1', 'qt_2', 'qt_3', 'trans_x', 'trans_y', 'trans_z']):
                        viz_logger.add_histogram(logger_prefix + 'pose_xflip/' + name, pose_xflip[..., i], total_iter)
                    rot_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose_xflip.detach().cpu()[..., :4]), convention='XYZ')
                    for i, name in enumerate(['rot_x', 'rot_y', 'rot_z']):
                        viz_logger.add_histogram(logger_prefix + 'pose_xflip/' + name, rot_euler[..., i], total_iter)
                elif rot_rep in ['lookat', 'quadlookat', 'octlookat']:
                    for i, name in enumerate(['fwd_x', 'fwd_y', 'fwd_z']):
                        viz_logger.add_histogram(logger_prefix + 'pose_xflip/' + name, pose_xflip_raw[..., i], total_iter)
                    for i, name in enumerate(['trans_x', 'trans_y', 'trans_z']):
                        viz_logger.add_histogram(logger_prefix + 'pose_xflip/' + name, pose_xflip_raw[..., -3 + i], total_iter)

            if dino_feat_im_gt is not None:
                dino_feat_im_gt_first3 = dino_feat_im_gt[:, :, :3]
                viz_logger.add_image(logger_prefix + 'image/dino_feat_im_gt', misc.image_grid(dino_feat_im_gt_first3.detach().cpu()[:b0, :].reshape(-1, *dino_feat_im_gt_first3.shape[2:]).clamp(0, 1)), total_iter)

            if dino_cluster_im_gt is not None:
                viz_logger.add_image(logger_prefix + 'image/dino_cluster_im_gt', misc.image_grid(dino_cluster_im_gt.detach().cpu()[:b0, :].reshape(-1, *dino_cluster_im_gt.shape[2:]).clamp(0, 1)), total_iter)

            if dino_feat_im_pred is not None:
                dino_feat_im_pred_first3 = dino_feat_im_pred[:, :, :3]
                viz_logger.add_image(logger_prefix + 'image/dino_feat_im_pred', misc.image_grid(dino_feat_im_pred_first3.detach().cpu()[:b0, :].reshape(-1, *dino_feat_im_pred_first3.shape[2:]).clamp(0, 1)), total_iter)

            for which_shape, modes in self.extra_renders.items():
                # This is wrong
                # if which_shape == "prior":
                #     shape_to_render = prior_shape.extend(im_features.shape[0])
                #     needed_im_features = None
                if which_shape == "instance":
                    shape_to_render = shape
                    needed_im_features = im_features
                else:
                    raise NotImplementedError

                for mode in modes:
                    rendered, _, _, _, _, _ = self.render(shape_to_render, texture, mvp, w2c, campos, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, render_mode=mode, render_flow=False, dino_pred=None)
                    if 'kd' in mode:
                        rendered = util.rgb_to_srgb(rendered)
                    rendered = rendered.detach().cpu()

                    if 'posed_bones' in aux_viz:
                        rendered_bone_image = self.render_bones(mvp, aux_viz['posed_bones'], (h, w))
                        rendered_bone_image_mask = (rendered_bone_image < 1).any(1, keepdim=True).float()
                        # viz_logger.add_image(logger_prefix + 'image/articulation_bones', misc.image_grid(self.render_bones(mvp, aux_viz['posed_bones'])), total_iter)
                        rendered = rendered_bone_image_mask * 0.8 * rendered_bone_image + (1 - rendered_bone_image_mask * 0.8) * rendered

                    if rot_rep in ['quadlookat', 'octlookat']:
                        rand_pose_flag = forward_aux['rand_pose_flag'].detach().cpu()
                        rand_pose_marker_mask = torch.zeros_like(rendered)
                        rand_pose_marker_mask[:, :, :16, :16] = 1.
                        rand_pose_marker_mask = rand_pose_marker_mask * rand_pose_flag[:, None, None, None]
                        red = torch.FloatTensor([1, 0, 0])[None, :, None, None]
                        rendered = rand_pose_marker_mask * red + (1 - rand_pose_marker_mask) * rendered

                    viz_logger.add_image(
                        logger_prefix + f'image/{which_shape}_{mode}',
                        misc.image_grid(expandF(rendered)[:b0, ...].view(-1, *rendered.shape[1:])),
                        total_iter)

                    viz_logger.add_video(
                        logger_prefix + f'animation/{which_shape}_{mode}',
                        self.render_rotation_frames(shape_to_render, texture, light, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, num_frames=15, render_mode=mode, b=1).detach().cpu().unsqueeze(0),
                        total_iter,
                        fps=2)

            viz_logger.add_video(
                logger_prefix + 'animation/prior_image_rotation',
                self.render_rotation_frames(prior_shape, texture, light, (h, w), background=self.background_mode, im_features=im_features, num_frames=15, b=1).detach().cpu().unsqueeze(0).clamp(0, 1),
                total_iter,
                fps=2)

            viz_logger.add_video(
                logger_prefix + 'animation/prior_normal_rotation',
                self.render_rotation_frames(prior_shape, texture, light, (h, w), background=self.background_mode, im_features=im_features, num_frames=15, render_mode='geo_normal', b=1).detach().cpu().unsqueeze(0),
                total_iter,
                fps=2)

        if save_results:
            b0 = self.cfgs.get('num_saved_from_each_batch', batch_size * num_frames)
            fnames = [f'{total_iter:07d}_{fid:10d}' for fid in collapseF(frame_id.int())][:b0]

            misc.save_images(save_dir, collapseF(image_gt)[:b0].clamp(0, 1).detach().cpu().numpy(), suffix='image_gt', fnames=fnames)
            misc.save_images(save_dir, collapseF(image_pred)[:b0].clamp(0, 1).detach().cpu().numpy(), suffix='image_pred', fnames=fnames)
            misc.save_images(save_dir, collapseF(mask_gt)[:b0].unsqueeze(1).repeat(1, 3, 1, 1).clamp(0, 1).detach().cpu().numpy(), suffix='mask_gt', fnames=fnames)
            misc.save_images(save_dir, collapseF(mask_pred)[:b0].unsqueeze(1).repeat(1, 3, 1, 1).clamp(0, 1).detach().cpu().numpy(), suffix='mask_pred', fnames=fnames)
            # tmp_shape = shape.first_n(b0).clone()
            # tmp_shape.material = texture
            # feat = im_features[:b0] if im_features is not None else None
            # misc.save_obj(save_dir, tmp_shape, save_material=False, feat=feat, suffix="mesh", fnames=fnames)  # Save the first mesh.
            # if self.render_flow and flow_gt is not None:
            #     flow_gt_viz = torch.cat([flow_gt, torch.zeros_like(flow_gt[:, :, :1])], 2) + 0.5  # -0.5~1.5
            #     flow_gt_viz = flow_gt_viz.view(-1, *flow_gt_viz.shape[2:])
            #     misc.save_images(save_dir, flow_gt_viz[:b0].clamp(0, 1).detach().cpu().numpy(), suffix='flow_gt', fnames=fnames)
            # if flow_pred is not None:
            #     flow_pred_viz = torch.cat([flow_pred, torch.zeros_like(flow_pred[:, :, :1])], 2) + 0.5  # -0.5~1.5
            #     flow_pred_viz = flow_pred_viz.view(-1, *flow_pred_viz.shape[2:])
            #     misc.save_images(save_dir, flow_pred_viz[:b0].clamp(0, 1).detach().cpu().numpy(), suffix='flow_pred', fnames=fnames)

            misc.save_txt(save_dir, pose[:b0].detach().cpu().numpy(), suffix='pose', fnames=fnames)

        return metrics

    def save_scores(self, path):
        header = 'mask_mse, \
                  mask_iou, \
                  image_mse, \
                  flow_mse'
        mean = self.all_scores.mean(0)
        std = self.all_scores.std(0)
        header = header + '\nMean: ' + ',\t'.join(['%.8f' % x for x in mean])
        header = header + '\nStd: ' + ',\t'.join(['%.8f' % x for x in std])
        misc.save_scores(path, self.all_scores, header=header)
        print(header)

    def render_rotation_frames(self, mesh, texture, light, resolution, background='none', im_features=None, prior_shape=None, num_frames=36, render_mode='diffuse', b=None):
        frames = []
        if b is None:
            b = len(mesh)
        else:
            mesh = mesh.first_n(b)
        feat = im_features[:b] if im_features is not None else None

        delta_angle = np.pi / num_frames * 2
        delta_rot_matrix = torch.FloatTensor([
            [np.cos(delta_angle), 0, np.sin(delta_angle), 0],
            [0, 1, 0, 0],
            [-np.sin(delta_angle), 0, np.cos(delta_angle), 0],
            [0, 0, 0, 1],
        ]).to(self.device).repeat(b, 1, 1)

        w2c = torch.FloatTensor(np.diag([1., 1., 1., 1]))
        w2c[:3, 3] = torch.FloatTensor([0, 0, -self.cam_pos_z_offset * 1.1])
        w2c = w2c.repeat(b, 1, 1).to(self.device)
        proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(b, 1, 1).to(self.device)
        mvp = torch.bmm(proj, w2c)
        campos = -w2c[:, :3, 3]

        def rotate_pose(mvp, campos):
            mvp = torch.matmul(mvp, delta_rot_matrix)
            campos = torch.matmul(delta_rot_matrix[:, :3, :3].transpose(2, 1), campos[:, :, None])[:, :, 0]
            return mvp, campos

        for _ in range(num_frames):
            image_pred, _, _, _, _, _ = self.render(mesh, texture, mvp, w2c, campos, resolution, background=background, im_features=feat, light=light, prior_shape=prior_shape, render_flow=False, dino_pred=None, render_mode=render_mode, two_sided_shading=False)
            frames += [misc.image_grid(image_pred)]
            mvp, campos = rotate_pose(mvp, campos)
        return torch.stack(frames, dim=0)  # Shape: (T, C, H, W)

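    # Quick sanity sketch for the turntable above: the per-frame step is
    # 2*pi / num_frames about the y axis, so composing it num_frames times
    # sweeps a full turn and returns to the identity (up to float error).
    @staticmethod
    def _full_turn_sketch(num_frames=15):
        a = np.pi / num_frames * 2  # per-frame azimuth step
        R = torch.FloatTensor([
            [np.cos(a), 0, np.sin(a)],
            [0, 1, 0],
            [-np.sin(a), 0, np.cos(a)],
        ])
        return torch.linalg.matrix_power(R, num_frames)  # ~ 3x3 identity
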
    def render_bones(self, mvp, bones_pred, size=(256, 256)):
        bone_world4 = torch.concat([bones_pred, torch.ones_like(bones_pred[..., :1]).to(bones_pred.device)], dim=-1)
        b, f, num_bones = bone_world4.shape[:3]
        bones_clip4 = (bone_world4.view(b, f, num_bones * 2, 1, 4) @ mvp.transpose(-1, -2).reshape(b, f, 1, 4, 4)).view(b, f, num_bones, 2, 4)
        bones_uv = bones_clip4[..., :2] / bones_clip4[..., 3:4]  # b, f, num_bones, 2, 2
        dpi = 32
        fx, fy = size[1] // dpi, size[0] // dpi

        rendered = []
        for b_idx in range(b):
            for f_idx in range(f):
                frame_bones_uv = bones_uv[b_idx, f_idx].cpu().numpy()
                fig = plt.figure(figsize=(fx, fy), dpi=dpi, frameon=False)
                ax = plt.Axes(fig, [0., 0., 1., 1.])
                ax.set_axis_off()
                for bone in frame_bones_uv:
                    ax.plot(bone[:, 0], bone[:, 1], marker='o', linewidth=8, markersize=20)
                ax.set_xlim(-1, 1)
                ax.set_ylim(-1, 1)
                ax.invert_yaxis()
                # Convert to image
                fig.add_axes(ax)
                fig.canvas.draw_idle()
                image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
                w, h = fig.canvas.get_width_height()
                image = image.reshape(h, w, 3)  # fixed: in-place `resize` fails on a buffer-backed array
                rendered += [image / 255.]
        return torch.from_numpy(np.stack(rendered, 0).transpose(0, 3, 1, 2))

    def render_deformation_frames(self, mesh, texture, batch_size, num_frames, resolution, background='none', im_features=None, render_mode='diffuse', b=None):
        # frames = []
        # if b is None:
        #     b = batch_size
        # im_features = im_features[]
        # mesh = mesh.first_n(num_frames * b)
        # for i in range(b):
        #     tmp_mesh = mesh.get_m_to_n(i*num_frames:(i+1)*num_frames)
        pass
video3d/model_ddp.py
ADDED
The diff for this file is too large to render.
See raw diff
video3d/networks.py
ADDED
@@ -0,0 +1,1724 @@
import math
import os
from typing import Union, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
from siren_pytorch import SirenNet

import video3d.utils.misc as misc
from video3d.triplane_texture.lift_architecture import Lift_Encoder
from video3d.triplane_texture.triplane_transformer import Triplane_Transformer


EPS = 1e-7


def get_activation(name, inplace=True, lrelu_param=0.2):
    if name == 'tanh':
        return nn.Tanh()
    elif name == 'sigmoid':
        return nn.Sigmoid()
    elif name == 'relu':
        return nn.ReLU(inplace=inplace)
    elif name == 'lrelu':
        return nn.LeakyReLU(lrelu_param, inplace=inplace)
    else:
        raise NotImplementedError


class MLPWithPositionalEncoding(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 symmetrize=False):
        super().__init__()
        self.extra_dim = extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)
        self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation)
        self.symmetrize = symmetrize

    def forward(self, x, feat=None):
        assert (feat is None and self.extra_dim == 0) or feat.shape[-1] == self.extra_dim
        if self.symmetrize:
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.relu(self.in_layer(x_in))

        if feat is not None:
            # if len(feat.shape) == 1:
            #     for _ in range(len(x_in.shape) - 1):
            #         feat = feat.unsqueeze(0)
            #     feat = feat.repeat(*x_in.shape[:-1], 1)
            x_in = torch.concat([x_in, feat], dim=-1)

        return self.mlp(x_in)

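# HarmonicEmbedding is defined elsewhere in this repo; the class above only
# relies on its output dimension being cin * 2 * n_harmonic_functions. A
# minimal sketch consistent with that interface (the exact frequency schedule
# is an assumption) maps x -> [sin(w_i * x), cos(w_i * x)] over frequencies
# w_i = omega0 * 2**i:
class _HarmonicEmbeddingSketch(nn.Module):
    def __init__(self, n_harmonic_functions=10, omega0=1):
        super().__init__()
        self.register_buffer('frequencies', omega0 * (2.0 ** torch.arange(n_harmonic_functions)))

    def forward(self, x):
        # x: (..., cin) -> (..., cin * 2 * n_harmonic_functions)
        embed = (x[..., None] * self.frequencies).reshape(*x.shape[:-1], -1)
        return torch.cat([embed.sin(), embed.cos()], dim=-1)

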
class MLPWithPositionalEncoding_Style(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 symmetrize=False,
                 style_choice='film'):
        super().__init__()
        self.extra_dim = extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)

        if extra_dim == 0:
            self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation)
        else:
            if style_choice == 'film':
                self.mlp = MLP_FiLM(nf, cout, num_layers, nf, dropout, activation)
                self.style_mlp = MLP(extra_dim, nf * 2, 2, nf, dropout, None)
            elif style_choice == 'mod':
                self.mlp = MLP_Mod(nf, cout, num_layers, nf, dropout, activation)
                self.style_mlp = MLP(extra_dim, nf, 2, nf, dropout, None)
            else:
                raise NotImplementedError

        self.style_choice = style_choice
        self.symmetrize = symmetrize

    def forward(self, x, feat=None):
        assert (feat is None and self.extra_dim == 0) or feat.shape[-1] == self.extra_dim
        if self.symmetrize:
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.relu(self.in_layer(x_in))

        if feat is not None:
            style = self.style_mlp(feat)
            if self.style_choice == 'film':
                style = style.reshape(style.shape[:-1] + (-1, 2))
            out = self.mlp(x_in, style)
        else:
            out = self.mlp(x_in)

        return out


class MLP_FiLM(nn.Module):
    def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None):
        # default no dropout
        super().__init__()
        assert num_layers >= 1
        self.num_layers = num_layers
        if num_layers == 1:
            self.network = Linear_FiLM(cin, cout, bias=False)
        else:
            self.relu = nn.ReLU(inplace=True)
            for i in range(num_layers):
                if i == 0:
                    setattr(self, f'linear_{i}', Linear_FiLM(cin, nf, bias=False))
                elif i == (num_layers - 1):
                    setattr(self, f'linear_{i}', Linear_FiLM(nf, cout, bias=False))
                else:
                    setattr(self, f'linear_{i}', Linear_FiLM(nf, nf, bias=False))

    def forward(self, input, style):
        if self.num_layers == 1:
            out = self.network(input, style)
        else:
            x = input
            for i in range(self.num_layers):
                linear_layer = getattr(self, f'linear_{i}')
                x = linear_layer(x, style)
                if i < (self.num_layers - 1):
                    x = self.relu(x)  # no activation after the last layer
            out = x
        return out


class MLP_Mod(nn.Module):
    def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None):
        # default no dropout
        super().__init__()
        assert num_layers >= 1
        self.num_layers = num_layers
        if num_layers == 1:
            self.network = Linear_Mod(cin, cout, bias=False)
        else:
            self.relu = nn.ReLU(inplace=True)
            for i in range(num_layers):
                if i == 0:
                    setattr(self, f'linear_{i}', Linear_Mod(cin, nf, bias=False))
                elif i == (num_layers - 1):
                    setattr(self, f'linear_{i}', Linear_Mod(nf, cout, bias=False))
                else:
                    setattr(self, f'linear_{i}', Linear_Mod(nf, nf, bias=False))

    def forward(self, input, style):
        if self.num_layers == 1:
            out = self.network(input, style)
        else:
            x = input
            for i in range(self.num_layers):
                linear_layer = getattr(self, f'linear_{i}')
                x = linear_layer(x, style)
                if i < (self.num_layers - 1):
                    x = self.relu(x)  # no activation after the last layer
            out = x
        return out


class Linear_FiLM(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input, style):
        # if input is [..., D], style should be [..., D, 2]
        x = input * style[..., 0] + style[..., 1]
        return torch.nn.functional.linear(x, self.weight, self.bias)

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

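# Usage sketch for Linear_FiLM above (shapes are illustrative, not from the
# repo): the style tensor carries a per-feature scale and shift in its last
# dim of size 2, applied to the input before the linear map.
def _film_usage_sketch():
    layer = Linear_FiLM(in_features=8, out_features=4, bias=False)
    x = torch.randn(2, 8)
    style = torch.randn(2, 8, 2)  # style[..., 0] = scale, style[..., 1] = shift
    return layer(x, style)  # shape (2, 4)

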
class Linear_Mod(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input, style):
        # weight: [out_features, in_features]
        # style: [..., in_features]
        if len(style.shape) > 1:
            style = style.reshape(-1, style.shape[-1])
            style = style[0]  # NOTE: only the first style vector in the batch is used

        weight = self.weight * style.unsqueeze(0)
        decoefs = ((weight * weight).sum(dim=-1, keepdim=True) + 1e-5).sqrt()
        weight = weight / decoefs

        return torch.nn.functional.linear(input, weight, self.bias)

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

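# The weight modulation in Linear_Mod mirrors the StyleGAN2 "mod/demod" idea:
# scale each input channel of the weight by the style, then renormalize every
# output row to roughly unit norm so activation magnitudes stay stable. A
# minimal restatement of that arithmetic:
def _mod_demod_sketch(weight, style, eps=1e-5):
    # weight: (out_features, in_features); style: (in_features,)
    w = weight * style.unsqueeze(0)                        # modulate input channels
    demod = (w.pow(2).sum(dim=-1, keepdim=True) + eps).sqrt()
    return w / demod                                       # demodulated weight

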
class MLPTextureSimple(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 min_max=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 perturb_normal=False,
                 symmetrize=False,
                 texture_act='relu',
                 linear_bias=False):
        super().__init__()
        self.extra_dim = extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)

        if texture_act == 'sin':
            print('using siren network for texture mlp here')
            self.mlp = SirenNet(
                dim_in=(nf + extra_dim),
                dim_hidden=nf,
                dim_out=cout,
                num_layers=num_layers,
                final_activation=get_activation(activation),
                w0_initial=30,
                use_bias=linear_bias,
                dropout=dropout
            )
        else:
            self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation, inner_act=texture_act, linear_bias=linear_bias)
        self.perturb_normal = perturb_normal
        self.symmetrize = symmetrize
        if min_max is not None:
            self.register_buffer('min_max', min_max)
        else:
            self.min_max = None
        self.bsdf = None

    def sample(self, x, feat=None):
        assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] == self.extra_dim)
        b, h, w, c = x.shape

        if self.symmetrize:
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        x = x.view(-1, c)
        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.in_layer(x_in)
        if feat is not None:
            feat = feat[:, None, None].expand(b, h, w, -1).reshape(b * h * w, -1)
            x_in = torch.concat([x_in, feat], dim=-1)
        out = self.mlp(self.relu(x_in))
        if self.min_max is not None:
            out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :]
        return out.view(b, h, w, -1)

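# The min_max buffer in MLPTextureSimple linearly rescales the MLP output
# (assumed to lie in [0, 1] after the final activation) into a per-channel
# [min, max] range, e.g. to keep albedo within plausible bounds. A sketch of
# just that mapping:
def _min_max_rescale_sketch(out, min_max):
    # out: (N, C) in [0, 1]; min_max: (2, C) with rows (min, max)
    return out * (min_max[1][None, :] - min_max[0][None, :]) + min_max[0][None, :]

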
class MLPTextureTriplane(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 min_max=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 perturb_normal=False,
                 symmetrize=False,
                 texture_act='relu',
                 linear_bias=False,
                 cam_pos_z_offset=10.,
                 grid_scale=7):
        super().__init__()
        self.extra_dim = extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)

        self.feat_net = Triplane_Transformer(
            emb_dim=256,
            num_layers=8,
            triplane_dim=80,
            triplane_scale=grid_scale
        )
        self.extra_dim -= extra_dim
        self.extra_dim += (self.feat_net.triplane_dim * 3)  # net effect: extra_dim = triplane_dim * 3

        if texture_act == 'sin':
            print('using siren network for texture mlp here')
            self.mlp = SirenNet(
                dim_in=(nf + self.extra_dim),
                dim_hidden=nf,
                dim_out=cout,
                num_layers=num_layers,
                final_activation=get_activation(activation),
                w0_initial=30,
                use_bias=linear_bias,
                dropout=dropout
            )
        else:
            self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation, inner_act=texture_act, linear_bias=linear_bias)
        self.perturb_normal = perturb_normal
        self.symmetrize = symmetrize
        if min_max is not None:
            self.register_buffer('min_max', min_max)
        else:
            self.min_max = None
        self.bsdf = None

    def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None):
        # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] == self.extra_dim)
        b, h, w, c = x.shape

        if self.symmetrize:
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        if isinstance(feat_map, dict):
            feat_map = feat_map["im_features_map"]

        feat_map = feat_map.permute(0, 2, 3, 1)
        _, ph, pw, _ = feat_map.shape
        feat_map = feat_map.reshape(feat_map.shape[0], ph * pw, feat_map.shape[-1])
        pts_feat = self.feat_net(feat_map, x.reshape(b, -1, 3))
        pts_c = pts_feat.shape[-1]
        pts_feat = pts_feat.reshape(-1, pts_c)

        x = x.view(-1, c)
        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.in_layer(x_in)

        x_in = torch.concat([x_in, pts_feat], dim=-1)

        out = self.mlp(self.relu(x_in))
        if self.min_max is not None:
            out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :]
        return out.view(b, h, w, -1)

class LocalFeatureBlock(nn.Module):
    def __init__(self, local_feat_dim, input_dim=384, output_dim=384, upscale_num=3):
        super().__init__()
        self.local_feat_dim = local_feat_dim
        self.conv_list = nn.ModuleList([])
        self.upscale_list = nn.ModuleList([])

        for i in range(upscale_num):
            if i == 0:
                self.conv_list.append(nn.Conv2d(input_dim, 4 * local_feat_dim, 3, stride=1, padding=1, dilation=1))
            else:
                self.conv_list.append(nn.Conv2d(local_feat_dim, 4 * local_feat_dim, 3, stride=1, padding=1, dilation=1))
            self.upscale_list.append(nn.PixelShuffle(2))

        self.conv_head = nn.Conv2d(local_feat_dim, output_dim, 3, stride=1, padding=1, dilation=1)

    def forward(self, x):
        for idx, conv in enumerate(self.conv_list):
            x = conv(x)
            x = self.upscale_list[idx](x)

        out = self.conv_head(x)
        return out

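# LocalFeatureBlock leans on the conv -> PixelShuffle(2) pattern: a conv that
# emits 4*C channels followed by PixelShuffle(2) trades those channels for a
# 2x spatial upscale, leaving C channels at twice the resolution. A toy check:
def _pixel_shuffle_sketch():
    x = torch.randn(1, 16, 8, 8)
    up = nn.Sequential(nn.Conv2d(16, 4 * 16, 3, padding=1), nn.PixelShuffle(2))
    return up(x).shape  # torch.Size([1, 16, 16, 16])

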
class MLPTextureLocal(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 min_max=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 perturb_normal=False,
                 symmetrize=False,
                 texture_way=None,
                 larger_tex_dim=False,
                 cam_pos_z_offset=10.,
                 grid_scale=7.):
        super().__init__()
        self.extra_dim = extra_dim
        self.cam_pos_z_offset = cam_pos_z_offset
        self.grid_scale = grid_scale

        local_feat_dim = 64

        assert texture_way is not None
        self.texture_way = texture_way
        if 'local' in texture_way and 'global' in texture_way:
            # self.extra_dim = extra_dim + local_feat_dim
            self.extra_dim = extra_dim
        elif 'local' in texture_way and 'global' not in texture_way:
            # self.extra_dim = local_feat_dim
            self.extra_dim = extra_dim
        elif 'local' not in texture_way and 'global' in texture_way:
            self.extra_dim = extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        # self.local_feature_block = LocalFeatureBlock(local_feat_dim=local_feat_dim, input_dim=384, output_dim=256)
        self.local_feature_block = nn.Linear(384, nf, bias=False)

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)
        self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation)
        self.perturb_normal = perturb_normal
        self.symmetrize = symmetrize
        if min_max is not None:
            self.register_buffer('min_max', min_max)
        else:
            self.min_max = None
        self.bsdf = None

    def get_uv_depth(self, xyz, mvp):
        # xyz: [b, k, 3]
        # mvp: [b, 4, 4]
        cam4 = torch.matmul(torch.nn.functional.pad(xyz, pad=(0, 1), mode='constant', value=1.0), torch.transpose(mvp, 1, 2))
        cam3 = cam4[..., :3] / cam4[..., 3:4]
        cam_uv = cam3[..., :2]
        # cam_uv = cam_uv.detach()
        cam_depth = cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(xyz.device).view(1, 1, 3)
        cam_depth = cam_depth / self.grid_scale * 2
        cam_depth = cam_depth[..., 2:3]
        # cam_depth = cam_depth.detach()
        return cam_uv, cam_depth

    def proj_sample_deform(self, xyz, feat_map, mvp, w2c, img_h, img_w):
        # here the xyz are the deformed points,
        # and we do not apply any symmetry here
        b, k, c = xyz.shape
        THRESHOLD = 1e-4
        if isinstance(feat_map, torch.Tensor):
            coordinates = xyz
            # use pre-symmetry points to sample features and record depth
            cam_uv, cam_depth = self.get_uv_depth(coordinates, mvp)
            cam_uv = cam_uv.detach()
            cam_depth = cam_depth.detach()

            # get local features
            feature = F.grid_sample(feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]

            self.input_depth = cam_depth.reshape(b, 256, 256, 1)  # [B, 256, 256, 1]
            self.input_pts = coordinates.detach()

        elif isinstance(feat_map, dict):
            original_mvp = feat_map['original_mvp']
            local_feat_map = feat_map['im_features_map']
            original_depth = self.input_depth[0:b]

            coordinates = xyz
            cam_uv, cam_depth = self.get_uv_depth(coordinates, original_mvp)
            cam_uv = cam_uv.detach()
            cam_depth = cam_depth.detach()

            project_feature = F.grid_sample(local_feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]
            project_depth = F.grid_sample(original_depth.permute(0, 3, 1, 2), cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, 1]

            use_mask = cam_depth <= project_depth + THRESHOLD
            feature = project_feature * use_mask.repeat(1, 1, project_feature.shape[-1])

        ret_feature = self.local_feature_block(feature.reshape(b * k, -1))  # the linear layer has no bias, so zero-valued (occluded) features map to zero
        return ret_feature

    def proj_sample(self, xyz, feat_map, mvp, w2c, img_h, img_w, xyz_before_sym=None):
        # the new version, without upsampling of the input feature map
        # feat_map: [B, C, H, W]
        b, k, c = xyz.shape
        if isinstance(feat_map, torch.Tensor):
            if xyz_before_sym is None:
                coordinates = xyz
            else:
                coordinates = xyz_before_sym
            # use pre-symmetry points to sample features and record depth
            cam_uv, cam_depth = self.get_uv_depth(coordinates, mvp)
            cam_uv = cam_uv.detach()
            cam_depth = cam_depth.detach()

            # get local features
            feature = F.grid_sample(feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]

            self.input_depth = cam_depth.reshape(b, 256, 256, 1)  # [B, 256, 256, 1]
            self.input_pts = coordinates.detach()

        elif isinstance(feat_map, dict):
            original_mvp = feat_map['original_mvp']
            local_feat_map = feat_map['im_features_map']
            THRESHOLD = 1e-4
            original_depth = self.input_depth[0:b]
            # if b == 1:
            #     from pdb import set_trace; set_trace()
            #     tmp_mask = xyz[0].reshape(256, 256, 3).sum(dim=-1) != 0
            #     tmp_mask = tmp_mask.cpu().numpy()
            #     tmp_mask = tmp_mask * 255
            #     src_dp = self.input_depth[0, :, :, 0].cpu().numpy()
            #     input_pts = self.input_pts[0].cpu().numpy()
            #     input_mask = self.input_pts[0].reshape(256, 256, 3).sum(dim=-1) != 0
            #     input_mask = input_mask.int().cpu().numpy()
            #     input_mask = input_mask * 255
            #     np.save('./tmp_save/src_dp.npy', src_dp)
            #     np.save('./tmp_save/input_pts.npy', input_pts)
            #     import cv2
            #     cv2.imwrite('./tmp_save/input_mask.png', input_mask)
            #     cv2.imwrite('./tmp_save/mask.png', tmp_mask)
            #     test_pts_pos = xyz[0].cpu().numpy()
            #     np.save('./tmp_save/test_pts_pos.npy', test_pts_pos)
            #     test_pts_raw = xyz_before_sym[0].cpu().numpy()
            #     np.save('./tmp_save/test_pts_raw.npy', test_pts_raw)
            #     mvp_now = mvp[0].detach().cpu().numpy()
            #     mvp_original = original_mvp[0].detach().cpu().numpy()
            #     np.save('./tmp_save/mvp_now.npy', mvp_now)
            #     np.save('./tmp_save/mvp_original.npy', mvp_original)
            if xyz_before_sym is None:
                # just check the projected depth of xyz
                coordinates = xyz
                cam_uv, cam_depth = self.get_uv_depth(coordinates, original_mvp)
                cam_uv = cam_uv.detach()
                cam_depth = cam_depth.detach()

                project_feature = F.grid_sample(local_feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]
                project_depth = F.grid_sample(original_depth.permute(0, 3, 1, 2), cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, 1]

                use_mask = cam_depth <= project_depth + THRESHOLD
                feature = project_feature * use_mask.repeat(1, 1, project_feature.shape[-1])
            else:
                # needs double-checking: symmetry is still applied here, even when both points are visible in the input view
                coords_inp = xyz
                x_check, y_check, z_check = xyz.unbind(-1)
                xyz_check = torch.stack([-1 * x_check, y_check, z_check], -1)
                coords_rev = xyz_check  # negate x directly to get the points on the other side

                uv_inp, dp_inp = self.get_uv_depth(coords_inp, original_mvp)
                uv_rev, dp_rev = self.get_uv_depth(coords_rev, original_mvp)
                uv_inp = uv_inp.detach()
                uv_rev = uv_rev.detach()
                dp_inp = dp_inp.detach()
                dp_rev = dp_rev.detach()

                proj_feat_inp = F.grid_sample(local_feat_map, uv_inp.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]
                proj_feat_rev = F.grid_sample(local_feat_map, uv_rev.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, c]

                proj_dp_inp = F.grid_sample(original_depth.permute(0, 3, 1, 2), uv_inp.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, 1]
                proj_dp_rev = F.grid_sample(original_depth.permute(0, 3, 1, 2), uv_rev.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # [b, k, 1]

                use_mask_inp = dp_inp <= proj_dp_inp + THRESHOLD
                use_mask_rev = dp_rev <= proj_dp_rev + THRESHOLD

                # points visible from both sides are averaged
                # (cast to float first: assigning 0.5 into an int tensor would truncate to 0)
                use_mask_inp = use_mask_inp.float()
                use_mask_rev = use_mask_rev.float()
                both_vis = (use_mask_inp == 1) & (use_mask_rev == 1)
                use_mask_inp[both_vis] = 0.5
                use_mask_rev[both_vis] = 0.5

                feature = proj_feat_inp * use_mask_inp.repeat(1, 1, proj_feat_inp.shape[-1]) + proj_feat_rev * use_mask_rev.repeat(1, 1, proj_feat_rev.shape[-1])
        else:
            raise NotImplementedError

        ret_feature = self.local_feature_block(feature.reshape(b * k, -1))  # the linear layer has no bias, so zero-valued (occluded) features map to zero
        return ret_feature

    def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None):
        # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] <= self.extra_dim)
        b, h, w, c = x.shape

        xyz_before_sym = None
        if self.symmetrize:
            xyz_before_sym = x.reshape(b, -1, c)
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        mvp = mvp.detach()  # [b, 4, 4]
        w2c = w2c.detach()  # [b, 4, 4]

        pts_xyz = x.reshape(b, -1, c)
        deform_xyz = deform_xyz.reshape(b, -1, c)

        if 'global' in self.texture_way and 'local' in self.texture_way:
            global_feat = feat[:, None, None].expand(b, h, w, -1).reshape(b * h * w, -1)
            # local_feat = self.proj_sample(pts_xyz, feat_map, mvp, w2c, h, w, xyz_before_sym=xyz_before_sym)
            local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w)
            # feature_rep = torch.concat([global_feat, local_feat], dim=-1)
            feature_rep = global_feat + local_feat
        elif 'global' not in self.texture_way and 'local' in self.texture_way:
            # local_feat = self.proj_sample(pts_xyz, feat_map, mvp, w2c, h, w, xyz_before_sym=xyz_before_sym)
            local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w)
            feature_rep = local_feat
        elif 'global' in self.texture_way and 'local' not in self.texture_way:
            global_feat = feat[:, None, None].expand(b, h, w, -1).reshape(b * h * w, -1)
            feature_rep = global_feat
        else:
            global_feat = feat[:, None, None].expand(b, h, w, -1).reshape(b * h * w, -1)
            feature_rep = global_feat

        x = x.view(-1, c)

        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.in_layer(x_in)

        # if feat is not None:
        #     feat = feat[:, None, None].expand(b, h, w, -1).reshape(b*h*w, -1)
        #     x_in = torch.concat([x_in, feat], dim=-1)

        x_in = torch.concat([x_in, feature_rep], dim=-1)

        out = self.mlp(self.relu(x_in))
        if self.min_max is not None:
            out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :]
        return out.view(b, h, w, -1)

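# --------------------------------------------------------------------------
# Minimal sketch of the projection used by get_uv_depth above (illustrative,
# not part of the pipeline): points are lifted to homogeneous coordinates,
# multiplied by the transposed MVP matrix, and perspective-divided. The xy of
# the result are the NDC uv coordinates used by F.grid_sample; z, shifted by
# the camera offset and rescaled by the grid scale, serves as a comparable
# depth value. Default offset/scale below mirror the constructor defaults.
def _uv_depth_projection_sketch(xyz, mvp, cam_pos_z_offset=10., grid_scale=7.):
    # xyz: [b, k, 3], mvp: [b, 4, 4]
    xyz_h = torch.nn.functional.pad(xyz, pad=(0, 1), value=1.0)  # [b, k, 4]
    cam4 = xyz_h @ mvp.transpose(1, 2)
    cam3 = cam4[..., :3] / cam4[..., 3:4]                        # perspective divide
    uv = cam3[..., :2]                                           # in [-1, 1] on screen
    depth = (cam3[..., 2:3] + cam_pos_z_offset) / grid_scale * 2
    return uv, depth
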
class LiftTexture(nn.Module):
    def __init__(self,
                 cin,
                 cout,
                 num_layers,
                 nf=256,
                 dropout=0,
                 activation=None,
                 min_max=None,
                 n_harmonic_functions=10,
                 omega0=1,
                 extra_dim=0,
                 embed_concat_pts=True,
                 perturb_normal=False,
                 symmetrize=False,
                 texture_way=None,
                 cam_pos_z_offset=10.,
                 grid_scale=7.,
                 local_feat_dim=128,
                 grid_size=32,
                 optim_latent=False):
        super().__init__()
        self.extra_dim = extra_dim
        self.cam_pos_z_offset = cam_pos_z_offset
        self.grid_scale = grid_scale

        assert texture_way is not None
        self.extra_dim = local_feat_dim + extra_dim

        if n_harmonic_functions > 0:
            self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0)
            dim_in = cin * 2 * n_harmonic_functions
            self.embed_concat_pts = embed_concat_pts
            if embed_concat_pts:
                dim_in += cin
        else:
            self.embedder = None
            dim_in = cin

        self.encoder = Lift_Encoder(
            cin=384,
            feat_dim=local_feat_dim,
            grid_scale=grid_scale / 2,  # the dmtet is initialized in (-0.5, 0.5)
            grid_size=grid_size,
            optim_latent=optim_latent,
            with_z_feature=True,
            cam_pos_z_offset=cam_pos_z_offset
        )

        self.in_layer = nn.Linear(dim_in, nf)
        self.relu = nn.ReLU(inplace=True)
        self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation)
        self.perturb_normal = perturb_normal
        self.symmetrize = symmetrize
        if min_max is not None:
            self.register_buffer('min_max', min_max)
        else:
            self.min_max = None
        self.bsdf = None

    def get_uv_depth(self, xyz, mvp):
        # xyz: [b, k, 3]
        # mvp: [b, 4, 4]
        cam4 = torch.matmul(torch.nn.functional.pad(xyz, pad=(0, 1), mode='constant', value=1.0), torch.transpose(mvp, 1, 2))
        cam3 = cam4[..., :3] / cam4[..., 3:4]
        cam_uv = cam3[..., :2]
        # cam_uv = cam_uv.detach()
        cam_depth = cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(xyz.device).view(1, 1, 3)
        cam_depth = cam_depth / self.grid_scale * 2
        cam_depth = cam_depth[..., 2:3]
        # cam_depth = cam_depth.detach()
        return cam_uv, cam_depth

    def proj_sample_deform(self, xyz, feat_map, mvp, w2c, img_h, img_w):
        # here the xyz are the deformed points,
        # and we do not apply any symmetry here
        if isinstance(feat_map, torch.Tensor):
            feature = self.encoder(feat_map, mvp, xyz, inference="unproject")
        elif isinstance(feat_map, dict):
            feature = self.encoder(feat_map['im_features_map'], mvp, xyz, inference="sample")
        C = feature.shape[-1]
        feature = feature.reshape(-1, C)
        return feature

    def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None):
        # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] <= self.extra_dim)
        b, h, w, c = x.shape

        xyz_before_sym = None
        if self.symmetrize:
            xyz_before_sym = x.reshape(b, -1, c)
            xs, ys, zs = x.unbind(-1)
            x = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        mvp = mvp.detach()  # [b, 4, 4]
        w2c = w2c.detach()  # [b, 4, 4]

        pts_xyz = x.reshape(b, -1, c)
        deform_xyz = deform_xyz.reshape(b, -1, c)

        global_feat = feat[:, None, None].expand(b, h, w, -1).reshape(b * h * w, -1)
        local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w)
        feature_rep = torch.concat([global_feat, local_feat], dim=-1)
        x = x.view(-1, c)

        if self.embedder is not None:
            x_in = self.embedder(x)
            if self.embed_concat_pts:
                x_in = torch.cat([x, x_in], -1)
        else:
            x_in = x

        x_in = self.in_layer(x_in)

        # if feat is not None:
        #     feat = feat[:, None, None].expand(b, h, w, -1).reshape(b*h*w, -1)
        #     x_in = torch.concat([x_in, feat], dim=-1)

        x_in = torch.concat([x_in, feature_rep], dim=-1)

        out = self.mlp(self.relu(x_in))
        if self.min_max is not None:
            out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :]
        return out.view(b, h, w, -1)

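# --------------------------------------------------------------------------
# Hedged sketch of the depth-based visibility test used by
# MLPTextureLocal.proj_sample / proj_sample_deform above (illustrative): a
# point counts as visible in the input view when its reprojected depth does
# not exceed the depth rendered from that view by more than a small
# threshold; features of occluded points are zeroed out.
def _visibility_mask_sketch(point_depth, rendered_depth, features, threshold=1e-4):
    # point_depth, rendered_depth: [b, k, 1]; features: [b, k, c]
    visible = point_depth <= rendered_depth + threshold
    return features * visible.to(features.dtype)
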
class HarmonicEmbedding(nn.Module):
    def __init__(self, n_harmonic_functions=10, omega0=1):
        """
        Positional Embedding implementation (adapted from PyTorch3D).
        Given an input tensor `x` of shape [minibatch, ..., dim],
        the harmonic embedding layer converts each feature
        in `x` into a series of harmonic features `embedding`
        as follows:
            embedding[..., i*dim:(i+1)*dim] = [
                sin(x[..., i]),
                sin(2*x[..., i]),
                sin(4*x[..., i]),
                ...
                sin(2**(self.n_harmonic_functions-1) * x[..., i]),
                cos(x[..., i]),
                cos(2*x[..., i]),
                cos(4*x[..., i]),
                ...
                cos(2**(self.n_harmonic_functions-1) * x[..., i])
            ]
        Note that `x` is also premultiplied by `omega0` before
        evaluating the harmonic functions.
        """
        super().__init__()
        self.frequencies = omega0 * (2.0 ** torch.arange(n_harmonic_functions))

    def forward(self, x):
        """
        Args:
            x: tensor of shape [..., dim]
        Returns:
            embedding: a harmonic embedding of `x`
                of shape [..., n_harmonic_functions * dim * 2]
        """
        embed = (x[..., None] * self.frequencies.to(x.device)).view(*x.shape[:-1], -1)
        return torch.cat((embed.sin(), embed.cos()), dim=-1)

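# --------------------------------------------------------------------------
# Usage sketch (illustrative): with n_harmonic_functions=10, the embedding
# maps a [..., 3] coordinate tensor to [..., 3 * 2 * 10] features (sin and
# cos at 10 octaves per input dimension).
def _harmonic_embedding_sketch():
    embedder = HarmonicEmbedding(n_harmonic_functions=10, omega0=1)
    x = torch.rand(4, 3)          # e.g. 4 points in 3D
    emb = embedder(x)
    assert emb.shape == (4, 3 * 2 * 10)
    return emb
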
class VGGEncoder(nn.Module):
    def __init__(self, cout, pretrained=False):
        super().__init__()
        if pretrained:
            raise NotImplementedError
        vgg = models.vgg16()
        self.vgg_encoder = nn.Sequential(vgg.features, vgg.avgpool)
        self.linear1 = nn.Linear(25088, 4096)
        self.linear2 = nn.Linear(4096, cout)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        batch_size, _, _, _ = x.shape
        out = self.relu(self.linear1(self.vgg_encoder(x).view(batch_size, -1)))
        return self.linear2(out)

class ResnetEncoder(nn.Module):
    def __init__(self, cout, pretrained=False):
        super().__init__()
        # keep all layers up to (but excluding) the final fc layer; children()
        # must be unpacked, since nn.Sequential does not accept a list
        self.resnet = nn.Sequential(*list(models.resnet18(weights="DEFAULT" if pretrained else None).children())[:-1])
        self.final_linear = nn.Linear(512, cout)

    def forward(self, x):
        # the conv stack ends in [B, 512, 1, 1]; flatten before the linear layer
        return self.final_linear(self.resnet(x).flatten(1))

class Encoder(nn.Module):
    def __init__(self, cin, cout, in_size=128, zdim=None, nf=64, activation=None):
        super().__init__()
        network = [
            nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 128x128 -> 64x64
            nn.GroupNorm(16, nf),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False),  # 64x64 -> 32x32
            nn.GroupNorm(16*2, nf*2),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False),  # 32x32 -> 16x16
            nn.GroupNorm(16*4, nf*4),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf*4, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 16x16 -> 8x8
            # nn.GroupNorm(16*8, nf*8),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        add_downsample = int(np.log2(in_size//128))
        if add_downsample > 0:
            for _ in range(add_downsample):
                network += [
                    nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # extra downsampling for inputs larger than 128
                    # nn.GroupNorm(16*8, nf*8),
                    # nn.ReLU(inplace=True),
                    nn.LeakyReLU(0.2, inplace=True),
                ]

        network += [
            nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 8x8 -> 4x4
            nn.LeakyReLU(0.2, inplace=True),
        ]

        if zdim is None:
            network += [
                nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
            ]
        else:
            network += [
                nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
                # nn.ReLU(inplace=True),
                nn.LeakyReLU(0.2, inplace=True),
                nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False),
            ]

        if activation is not None:
            network += [get_activation(activation)]
        self.network = nn.Sequential(*network)

    def forward(self, input):
        return self.network(input).reshape(input.size(0), -1)

class EncoderWithDINO(nn.Module):
    def __init__(self, cin_rgb, cin_dino, cout, in_size=128, zdim=None, nf=64, activation=None):
        super().__init__()
        network_rgb_in = [
            nn.Conv2d(cin_rgb, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 128x128 -> 64x64
            nn.GroupNorm(16, nf),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False),  # 64x64 -> 32x32
            nn.GroupNorm(16*2, nf*2),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False),  # 32x32 -> 16x16
            nn.GroupNorm(16*4, nf*4),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        self.network_rgb_in = nn.Sequential(*network_rgb_in)
        network_dino_in = [
            nn.Conv2d(cin_dino, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 128x128 -> 64x64
            nn.GroupNorm(16, nf),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False),  # 64x64 -> 32x32
            nn.GroupNorm(16*2, nf*2),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False),  # 32x32 -> 16x16
            nn.GroupNorm(16*4, nf*4),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        self.network_dino_in = nn.Sequential(*network_dino_in)

        network_fusion = [
            nn.Conv2d(nf*4*2, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 16x16 -> 8x8
            # nn.GroupNorm(16*8, nf*8),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        add_downsample = int(np.log2(in_size//128))
        if add_downsample > 0:
            for _ in range(add_downsample):
                network_fusion += [
                    nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # extra downsampling for inputs larger than 128
                    # nn.GroupNorm(16*8, nf*8),
                    # nn.ReLU(inplace=True),
                    nn.LeakyReLU(0.2, inplace=True),
                ]

        network_fusion += [
            nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 8x8 -> 4x4
            nn.LeakyReLU(0.2, inplace=True),
        ]

        if zdim is None:
            network_fusion += [
                nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
            ]
        else:
            network_fusion += [
                nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
                # nn.ReLU(inplace=True),
                nn.LeakyReLU(0.2, inplace=True),
                nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False),
            ]

        if activation is not None:
            network_fusion += [get_activation(activation)]
        self.network_fusion = nn.Sequential(*network_fusion)

    def forward(self, rgb_image, dino_image):
        rgb_feat = self.network_rgb_in(rgb_image)
        dino_feat = self.network_dino_in(dino_image)
        out = self.network_fusion(torch.cat([rgb_feat, dino_feat], dim=1))
        return out.reshape(rgb_image.size(0), -1)

class Encoder32(nn.Module):
    def __init__(self, cin, cout, nf=256, activation=None):
        super().__init__()
        network = [
            nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 32x32 -> 16x16
            nn.GroupNorm(nf//4, nf),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 16x16 -> 8x8
            nn.GroupNorm(nf//4, nf),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 8x8 -> 4x4
            nn.GroupNorm(nf//4, nf),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, cout, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
        ]
        if activation is not None:
            network += [get_activation(activation)]
        self.network = nn.Sequential(*network)

    def forward(self, input):
        return self.network(input).reshape(input.size(0), -1)

class MLP(nn.Module):
    def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, inner_act='relu', linear_bias=False):
        super().__init__()
        assert num_layers >= 1
        layer_act = get_activation(inner_act)
        if num_layers == 1:
            network = [nn.Linear(cin, cout, bias=linear_bias)]
        else:
            # network = [nn.Linear(cin, nf, bias=False)]
            # for _ in range(num_layers-2):
            #     network += [
            #         nn.ReLU(inplace=True),
            #         nn.Linear(nf, nf, bias=False)]
            #     if dropout:
            #         network += [nn.Dropout(dropout)]
            # network += [
            #     nn.ReLU(inplace=True),
            #     nn.Linear(nf, cout, bias=False)]
            network = [nn.Linear(cin, nf, bias=linear_bias)]
            for _ in range(num_layers - 2):
                network += [
                    layer_act,
                    nn.Linear(nf, nf, bias=linear_bias)]
                if dropout:
                    network += [nn.Dropout(dropout)]
            network += [
                layer_act,
                nn.Linear(nf, cout, bias=linear_bias)]
        if activation is not None:
            network += [get_activation(activation)]
        self.network = nn.Sequential(*network)

    def forward(self, input):
        return self.network(input)

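# --------------------------------------------------------------------------
# Construction sketch (illustrative): num_layers counts the Linear layers, so
# num_layers=3 yields Linear(cin, nf) -> act -> Linear(nf, nf) -> act ->
# Linear(nf, cout), optionally followed by the output activation.
def _mlp_layer_count_sketch():
    net = MLP(cin=64, cout=3, num_layers=3, nf=256, activation='sigmoid')
    x = torch.randn(8, 64)
    return net(x)                 # [8, 3], values in (0, 1) due to the sigmoid
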
class Embedding(nn.Module):
    def __init__(self, cin, cout, zdim=128, nf=64, activation=None):
        super().__init__()
        network = [
            nn.Linear(cin, nf, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(nf, zdim, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(zdim, cout, bias=False)]
        if activation is not None:
            network += [get_activation(activation)]
        self.network = nn.Sequential(*network)

    def forward(self, input):
        return self.network(input.reshape(input.size(0), -1)).reshape(input.size(0), -1)

class PerceptualLoss(nn.Module):
    def __init__(self, requires_grad=False):
        super(PerceptualLoss, self).__init__()
        mean_rgb = torch.FloatTensor([0.485, 0.456, 0.406])
        std_rgb = torch.FloatTensor([0.229, 0.224, 0.225])
        self.register_buffer('mean_rgb', mean_rgb)
        self.register_buffer('std_rgb', std_rgb)

        vgg_pretrained_features = torchvision.models.vgg16(pretrained=True).features
        self.slice1 = nn.Sequential()
        self.slice2 = nn.Sequential()
        self.slice3 = nn.Sequential()
        self.slice4 = nn.Sequential()
        for x in range(4):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(4, 9):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(9, 16):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(16, 23):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def normalize(self, x):
        out = x / 2 + 0.5
        out = (out - self.mean_rgb.view(1, 3, 1, 1)) / self.std_rgb.view(1, 3, 1, 1)
        return out

    def __call__(self, im1, im2, mask=None, conf_sigma=None):
        im = torch.cat([im1, im2], 0)
        im = self.normalize(im)  # normalize input

        ## compute features
        feats = []
        f = self.slice1(im)
        feats += [torch.chunk(f, 2, dim=0)]
        f = self.slice2(f)
        feats += [torch.chunk(f, 2, dim=0)]
        f = self.slice3(f)
        feats += [torch.chunk(f, 2, dim=0)]
        f = self.slice4(f)
        feats += [torch.chunk(f, 2, dim=0)]

        losses = []
        for f1, f2 in feats[2:3]:  # use relu3_3 features only
            loss = (f1 - f2)**2
            if conf_sigma is not None:
                loss = loss / (2 * conf_sigma**2 + EPS) + (conf_sigma + EPS).log()
            if mask is not None:
                b, c, h, w = loss.shape
                _, _, hm, wm = mask.shape
                sh, sw = hm // h, wm // w
                mask0 = nn.functional.avg_pool2d(mask, kernel_size=(sh, sw), stride=(sh, sw)).expand_as(loss)
                loss = (loss * mask0).sum() / mask0.sum()
            else:
                loss = loss.mean()
            losses += [loss]
        return sum(losses)

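# --------------------------------------------------------------------------
# Usage sketch (illustrative): the loss expects image pairs in [-1, 1] (they
# are remapped to the VGG normalization internally) and compares relu3_3
# features only; an optional mask is average-pooled down to the feature
# resolution before weighting.
def _perceptual_loss_sketch():
    loss_fn = PerceptualLoss(requires_grad=False)
    im1 = torch.rand(2, 3, 128, 128) * 2 - 1
    im2 = torch.rand(2, 3, 128, 128) * 2 - 1
    return loss_fn(im1, im2)      # scalar loss over relu3_3 features
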
## from: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)

        self.norm_layer = norm_layer
        if norm_layer is not None:
            self.bn1 = norm_layer(planes)
            self.bn2 = norm_layer(planes)

        if inplanes != planes:
            if norm_layer is not None:
                self.downsample = nn.Sequential(
                    conv1x1(inplanes, planes, stride),
                    norm_layer(planes),
                )
            else:
                # without a norm layer, the shortcut is a plain 1x1 projection
                self.downsample = nn.Sequential(conv1x1(inplanes, planes, stride))
        else:
            self.downsample = None
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        if self.norm_layer is not None:
            out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        if self.norm_layer is not None:
            out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class ResEncoder(nn.Module):
    def __init__(self, cin, cout, in_size=128, zdim=None, nf=64, activation=None):
        super().__init__()
        network = [
            nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False),  # 128x128 -> 64x64
            # nn.GroupNorm(16, nf),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False),  # 64x64 -> 32x32
            # nn.GroupNorm(16*2, nf*2),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            BasicBlock(nf*2, nf*2, norm_layer=None),
            BasicBlock(nf*2, nf*2, norm_layer=None),
            nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False),  # 32x32 -> 16x16
            # nn.GroupNorm(16*4, nf*4),
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            BasicBlock(nf*4, nf*4, norm_layer=None),
            BasicBlock(nf*4, nf*4, norm_layer=None),
            nn.Conv2d(nf*4, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 16x16 -> 8x8
            # nn.ReLU(inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            BasicBlock(nf*8, nf*8, norm_layer=None),
            BasicBlock(nf*8, nf*8, norm_layer=None),
        ]

        add_downsample = int(np.log2(in_size//64))
        if add_downsample > 0:
            for _ in range(add_downsample):
                network += [
                    nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False),  # 8x8 -> 4x4
                    # nn.ReLU(inplace=True),
                    nn.LeakyReLU(0.2, inplace=True),
                    BasicBlock(nf*8, nf*8, norm_layer=None),
                    BasicBlock(nf*8, nf*8, norm_layer=None),
                ]

        if zdim is None:
            network += [
                nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
            ]
        else:
            network += [
                nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False),  # 4x4 -> 1x1
                # nn.ReLU(inplace=True),
                nn.LeakyReLU(0.2, inplace=True),
                nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False),
            ]

        if activation is not None:
            network += [get_activation(activation)]
        self.network = nn.Sequential(*network)

    def forward(self, input):
        return self.network(input).reshape(input.size(0), -1)

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn

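# --------------------------------------------------------------------------
# Shape sketch (illustrative): the qkv projection produces [B, N, 3*C], which
# is reshaped to [3, B, heads, N, C/heads]; the attention weights are
# [B, heads, N, N], and the module returns both the output tokens and the
# weights.
def _attention_shape_sketch():
    attn = Attention(dim=384, num_heads=8)
    tokens = torch.randn(2, 197, 384)          # e.g. 196 patches + 1 class token
    out, weights = attn(tokens)
    return out.shape, weights.shape            # [2, 197, 384], [2, 8, 197, 197]
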
class ViTEncoder(nn.Module):
    def __init__(self, cout, which_vit='dino_vits8', pretrained=False, frozen=False, in_size=256, final_layer_type='none', root='/root'):
        super().__init__()
        if misc.is_main_process():
            force_reload = not os.path.exists(os.path.join(root, ".cache/torch/hub/checkpoints/"))
        else:
            force_reload = False
        if "dinov2" in which_vit:
            self.ViT = torch.hub.load('facebookresearch/dinov2:main', which_vit, pretrained=pretrained, force_reload=force_reload)
        else:
            self.ViT = torch.hub.load('facebookresearch/dino:main', which_vit, pretrained=pretrained, force_reload=force_reload)

        if frozen:
            for p in self.ViT.parameters():
                p.requires_grad = False
        if which_vit == 'dino_vits8':
            self.vit_feat_dim = 384
            self.patch_size = 8
        elif which_vit == 'dinov2_vits14':
            self.vit_feat_dim = 384
            self.patch_size = 14
        elif which_vit == 'dino_vitb8':
            self.vit_feat_dim = 768
            self.patch_size = 8

        self._feats = []
        self.hook_handlers = []

        if final_layer_type == 'none':
            pass
        elif final_layer_type == 'conv':
            self.final_layer_patch_out = Encoder32(self.vit_feat_dim, cout, nf=256, activation=None)
            self.final_layer_patch_key = Encoder32(self.vit_feat_dim, cout, nf=256, activation=None)
        elif final_layer_type == 'attention':
            raise NotImplementedError
            self.final_layer = Attention(
                dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
            self.fc = nn.Linear(self.vit_feat_dim, cout)
        else:
            raise NotImplementedError
        self.final_layer_type = final_layer_type

    def _get_hook(self, facet: str):
        """
        generate a hook method for a specific block and facet.
        """
        if facet in ['attn', 'token']:
            def _hook(model, input, output):
                self._feats.append(output)
            return _hook

        if facet == 'query':
            facet_idx = 0
        elif facet == 'key':
            facet_idx = 1
        elif facet == 'value':
            facet_idx = 2
        else:
            raise TypeError(f"{facet} is not a supported facet.")

        def _inner_hook(module, input, output):
            input = input[0]
            B, N, C = input.shape
            qkv = module.qkv(input).reshape(B, N, 3, module.num_heads, C // module.num_heads).permute(2, 0, 3, 1, 4)
            self._feats.append(qkv[facet_idx])  # B x h x t x d
        return _inner_hook

    def _register_hooks(self, layers: List[int], facet: str) -> None:
        """
        register hooks to extract features.
        :param layers: layers from which to extract features.
        :param facet: facet to extract. One of the following options: ['key' | 'query' | 'value' | 'token' | 'attn']
        """
        for block_idx, block in enumerate(self.ViT.blocks):
            if block_idx in layers:
                if facet == 'token':
                    self.hook_handlers.append(block.register_forward_hook(self._get_hook(facet)))
                elif facet == 'attn':
                    self.hook_handlers.append(block.attn.attn_drop.register_forward_hook(self._get_hook(facet)))
                elif facet in ['key', 'query', 'value']:
                    self.hook_handlers.append(block.attn.register_forward_hook(self._get_hook(facet)))
                else:
                    raise TypeError(f"{facet} is not a supported facet.")

    def _unregister_hooks(self) -> None:
        """
        unregisters the hooks. should be called after feature extraction.
        """
        for handle in self.hook_handlers:
            handle.remove()
        self.hook_handlers = []

    def forward(self, x, return_patches=False):
        b, c, h, w = x.shape
        self._feats = []
        self._register_hooks([11], 'key')
        # self._register_hooks([11], 'token')
        x = self.ViT.prepare_tokens(x)
        # x = self.ViT.prepare_tokens_with_masks(x)

        for blk in self.ViT.blocks:
            x = blk(x)
        out = self.ViT.norm(x)
        self._unregister_hooks()

        ph, pw = h // self.patch_size, w // self.patch_size
        patch_out = out[:, 1:]  # the first token is the class token
        patch_out = patch_out.reshape(b, ph, pw, self.vit_feat_dim).permute(0, 3, 1, 2)

        patch_key = self._feats[0][:, :, 1:]  # B, num_heads, num_patches, dim
        patch_key = patch_key.permute(0, 1, 3, 2).reshape(b, self.vit_feat_dim, ph, pw)

        if self.final_layer_type == 'none':
            global_feat_out = out[:, 0].reshape(b, -1)  # class token
            global_feat_key = self._feats[0][:, :, 0].reshape(b, -1)  # class token
        elif self.final_layer_type == 'conv':
            global_feat_out = self.final_layer_patch_out(patch_out).view(b, -1)
            global_feat_key = self.final_layer_patch_key(patch_key).view(b, -1)
        elif self.final_layer_type == 'attention':
            raise NotImplementedError
        else:
            raise NotImplementedError
        if not return_patches:
            patch_out = patch_key = None
        return global_feat_out, global_feat_key, patch_out, patch_key

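# --------------------------------------------------------------------------
# Hedged sketch of the hook mechanism used by ViTEncoder above (illustrative):
# a forward hook registered on block 11's attention module recomputes qkv
# from the block input and stores the 'key' facet, so key features are
# captured without modifying the backbone. Shapes assume H and W divisible by
# the patch size.
def _vit_key_extraction_sketch(encoder, images):
    # encoder: a ViTEncoder instance; images: [B, 3, H, W]
    global_out, global_key, patch_out, patch_key = encoder(images, return_patches=True)
    # patch_key: [B, vit_feat_dim, H // patch_size, W // patch_size]
    return patch_key
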
class ArticulationNetwork(nn.Module):
    def __init__(self, net_type, feat_dim, pos_dim, num_layers, nf, n_harmonic_functions=0, omega0=1, activation=None, enable_articulation_idadd=False):
        super().__init__()
        if n_harmonic_functions > 0:
            self.posenc = HarmonicEmbedding(n_harmonic_functions=n_harmonic_functions, omega0=omega0)
            pos_dim = pos_dim * (n_harmonic_functions * 2 + 1)
        else:
            self.posenc = None
            pos_dim = 4
        cout = 3

        if net_type == 'mlp':
            self.network = MLP(
                feat_dim + pos_dim,  # feature + bone xyz position and index
                cout,                # the rotation of each bone is represented by its Euler angles ψ, θ, and φ
                num_layers,
                nf=nf,
                dropout=0,
                activation=activation
            )
        elif net_type == 'attention':
            self.in_layer = nn.Sequential(
                nn.Linear(feat_dim + pos_dim, nf),
                nn.GELU(),
                nn.LayerNorm(nf),
            )
            self.blocks = nn.ModuleList([
                Block(
                    dim=nf, num_heads=8, mlp_ratio=2., qkv_bias=False, qk_scale=None,
                    drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm)
                for i in range(num_layers)])
            out_layer = [nn.Linear(nf, cout)]
            if activation:
                out_layer += [get_activation(activation)]
            self.out_layer = nn.Sequential(*out_layer)
        else:
            raise NotImplementedError
        self.net_type = net_type
        self.enable_articulation_idadd = enable_articulation_idadd

    def forward(self, x, pos):
        pos_inp = pos
        if self.posenc is not None:
            pos = torch.cat([pos, self.posenc(pos)], dim=-1)
        x = torch.cat([x, pos], dim=-1)
        if self.enable_articulation_idadd:
            articulation_id = pos_inp[..., -1:]
            x = x + articulation_id
        if self.net_type == 'mlp':
            out = self.network(x)
        elif self.net_type == 'attention':
            x = self.in_layer(x)
            for blk in self.blocks:
                x = blk(x)
            out = self.out_layer(x)
        else:
            raise NotImplementedError
        return out

## Attention block from ViT (https://github.com/facebookresearch/dino/blob/main/vision_transformer.py)
## (identical to the Attention class defined above; this re-definition shadows it)
class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # relies on a module-level drop_path helper, as in the DINO vision_transformer.py this block is adapted from
        return drop_path(x, self.drop_prob, self.training)

class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x, return_attention=False):
        y, attn = self.attn(self.norm1(x))
        if return_attention:
            return attn
        x = x + self.drop_path(y)
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

class FeatureAttention(nn.Module):
    def __init__(self, vit_type, pos_dim, embedder_freq=0, zdim=128, img_size=256, activation=None):
        super().__init__()
        self.zdim = zdim
        if embedder_freq > 0:
            self.posenc = HarmonicEmbedding(n_harmonic_functions=embedder_freq, omega0=1)
            pos_dim = pos_dim * (embedder_freq * 2 + 1)
        else:
            self.posenc = None
        self.pos_dim = pos_dim

        if vit_type == 'dino_vits8':
            self.vit_feat_dim = 384
            patch_size = 8
        elif vit_type == 'dinov2_vits14':
            self.vit_feat_dim = 384
            patch_size = 14
        elif vit_type == 'dino_vitb8':
            self.vit_feat_dim = 768
            patch_size = 8
        else:
            raise NotImplementedError
        self.num_patches_per_dim = img_size // patch_size

        self.kv = nn.Sequential(
            nn.Linear(self.vit_feat_dim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, zdim*2),
        )

        self.q = nn.Sequential(
            nn.Linear(pos_dim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, zdim),
        )

        final_mlp = [
            nn.Linear(zdim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, self.vit_feat_dim)
        ]
        if activation is not None:
            final_mlp += [get_activation(activation)]
        self.final_ln = nn.Sequential(*final_mlp)

    def forward(self, x, feat):
        _, vit_feat_dim, ph, pw = feat.shape
        assert ph == pw and ph == self.num_patches_per_dim and vit_feat_dim == self.vit_feat_dim

        if self.posenc is not None:
            x = torch.cat([x, self.posenc(x)], dim=-1)
        bxf, n_query, c = x.shape
        assert c == self.pos_dim

        query = self.q(x)
        feat_in = feat.view(bxf, vit_feat_dim, ph*pw).permute(0, 2, 1)  # N, K, C
        k, v = self.kv(feat_in).chunk(2, dim=-1)
        attn = torch.einsum('bnd,bpd->bnp', query, k).softmax(dim=-1)
        out = torch.einsum('bnp,bpd->bnd', attn, v)
        out = self.final_ln(out)
        return out
video3d/render/light.py
ADDED
@@ -0,0 +1,191 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import os
import numpy as np
import torch
import torch.nn.functional as F
import nvdiffrast.torch as dr

from . import util
from . import renderutils as ru
from ..networks import MLP

######################################################################################
# Utility functions
######################################################################################

class cubemap_mip(torch.autograd.Function):
    @staticmethod
    def forward(ctx, cubemap):
        return util.avg_pool_nhwc(cubemap, (2,2))

    @staticmethod
    def backward(ctx, dout):
        res = dout.shape[1] * 2
        out = torch.zeros(6, res, res, dout.shape[-1], dtype=torch.float32, device="cuda")
        for s in range(6):
            gy, gx = torch.meshgrid(torch.linspace(-1.0 + 1.0 / res, 1.0 - 1.0 / res, res, device="cuda"),
                                    torch.linspace(-1.0 + 1.0 / res, 1.0 - 1.0 / res, res, device="cuda"),
                                    indexing='ij')
            v = util.safe_normalize(util.cube_to_dir(s, gx, gy))
            out[s, ...] = dr.texture(dout[None, ...] * 0.25, v[None, ...].contiguous(), filter_mode='linear', boundary_mode='cube')
        return out

######################################################################################
# Split-sum environment map light source with automatic mipmap generation
######################################################################################

class EnvironmentLight(torch.nn.Module):
    LIGHT_MIN_RES = 16

    MIN_ROUGHNESS = 0.08
    MAX_ROUGHNESS = 0.5

    def __init__(self, base):
        super(EnvironmentLight, self).__init__()
        self.mtx = None
        self.base = torch.nn.Parameter(base.clone().detach(), requires_grad=True)
        self.register_parameter('env_base', self.base)

    def xfm(self, mtx):
        self.mtx = mtx

    def clone(self):
        return EnvironmentLight(self.base.clone().detach())

    def clamp_(self, min=None, max=None):
        self.base.clamp_(min, max)

    def get_mip(self, roughness):
        return torch.where(roughness < self.MAX_ROUGHNESS
                           , (torch.clamp(roughness, self.MIN_ROUGHNESS, self.MAX_ROUGHNESS) - self.MIN_ROUGHNESS) / (self.MAX_ROUGHNESS - self.MIN_ROUGHNESS) * (len(self.specular) - 2)
                           , (torch.clamp(roughness, self.MAX_ROUGHNESS, 1.0) - self.MAX_ROUGHNESS) / (1.0 - self.MAX_ROUGHNESS) + len(self.specular) - 2)

    def build_mips(self, cutoff=0.99):
        self.specular = [self.base]
        while self.specular[-1].shape[1] > self.LIGHT_MIN_RES:
            self.specular += [cubemap_mip.apply(self.specular[-1])]

        self.diffuse = ru.diffuse_cubemap(self.specular[-1])

        for idx in range(len(self.specular) - 1):
            roughness = (idx / (len(self.specular) - 2)) * (self.MAX_ROUGHNESS - self.MIN_ROUGHNESS) + self.MIN_ROUGHNESS
            self.specular[idx] = ru.specular_cubemap(self.specular[idx], roughness, cutoff)
        self.specular[-1] = ru.specular_cubemap(self.specular[-1], 1.0, cutoff)

    def regularizer(self):
        white = (self.base[..., 0:1] + self.base[..., 1:2] + self.base[..., 2:3]) / 3.0
        return torch.mean(torch.abs(self.base - white))

    def shade(self, gb_pos, gb_normal, kd, ks, view_pos, specular=True):
        wo = util.safe_normalize(view_pos - gb_pos)

        if specular:
            roughness = ks[..., 1:2]  # y component
            metallic = ks[..., 2:3]   # z component
            spec_col = (1.0 - metallic) * 0.04 + kd * metallic
            diff_col = kd * (1.0 - metallic)
        else:
            diff_col = kd

        reflvec = util.safe_normalize(util.reflect(wo, gb_normal))
        nrmvec = gb_normal
        if self.mtx is not None:  # Rotate lookup
            mtx = torch.as_tensor(self.mtx, dtype=torch.float32, device='cuda')
            reflvec = ru.xfm_vectors(reflvec.view(reflvec.shape[0], reflvec.shape[1] * reflvec.shape[2], reflvec.shape[3]), mtx).view(*reflvec.shape)
            nrmvec = ru.xfm_vectors(nrmvec.view(nrmvec.shape[0], nrmvec.shape[1] * nrmvec.shape[2], nrmvec.shape[3]), mtx).view(*nrmvec.shape)

        # Diffuse lookup
        diffuse = dr.texture(self.diffuse[None, ...], nrmvec.contiguous(), filter_mode='linear', boundary_mode='cube')
        shaded_col = diffuse * diff_col

        if specular:
            # Lookup FG term from lookup texture
            NdotV = torch.clamp(util.dot(wo, gb_normal), min=1e-4)
            fg_uv = torch.cat((NdotV, roughness), dim=-1)
            if not hasattr(self, '_FG_LUT'):
                self._FG_LUT = torch.as_tensor(np.fromfile('data/irrmaps/bsdf_256_256.bin', dtype=np.float32).reshape(1, 256, 256, 2), dtype=torch.float32, device='cuda')
            fg_lookup = dr.texture(self._FG_LUT, fg_uv, filter_mode='linear', boundary_mode='clamp')

            # Roughness adjusted specular env lookup
            miplevel = self.get_mip(roughness)
            spec = dr.texture(self.specular[0][None, ...], reflvec.contiguous(), mip=list(m[None, ...] for m in self.specular[1:]), mip_level_bias=miplevel[..., 0], filter_mode='linear-mipmap-linear', boundary_mode='cube')

            # Compute aggregate lighting
            reflectance = spec_col * fg_lookup[..., 0:1] + fg_lookup[..., 1:2]
            shaded_col += spec * reflectance

        return shaded_col * (1.0 - ks[..., 0:1])  # Modulate by hemisphere visibility

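# --------------------------------------------------------------------------
# Hedged sketch of the roughness-to-mip mapping in EnvironmentLight.get_mip
# above (illustrative, covering only the roughness < MAX_ROUGHNESS branch):
# roughness in [MIN_ROUGHNESS, MAX_ROUGHNESS] is mapped linearly onto the
# pre-filtered specular mip levels 0 .. n_mips - 2.
def _mip_selection_sketch(roughness, n_mips, min_r=0.08, max_r=0.5):
    r = torch.clamp(roughness, min_r, max_r)
    return (r - min_r) / (max_r - min_r) * (n_mips - 2)
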
######################################################################################
|
128 |
+
# Load and store
|
129 |
+
######################################################################################
|
130 |
+
|
131 |
+
# Load from latlong .HDR file
|
132 |
+
def _load_env_hdr(fn, scale=1.0):
|
133 |
+
latlong_img = torch.tensor(util.load_image(fn), dtype=torch.float32, device='cuda')*scale
|
134 |
+
cubemap = util.latlong_to_cubemap(latlong_img, [512, 512])
|
135 |
+
|
136 |
+
l = EnvironmentLight(cubemap)
|
137 |
+
l.build_mips()
|
138 |
+
|
139 |
+
return l
|
140 |
+
|
141 |
+
def load_env(fn, scale=1.0):
|
142 |
+
if os.path.splitext(fn)[1].lower() == ".hdr":
|
143 |
+
return _load_env_hdr(fn, scale)
|
144 |
+
else:
|
145 |
+
assert False, "Unknown envlight extension %s" % os.path.splitext(fn)[1]
|
146 |
+
|
147 |
+
def save_env_map(fn, light):
|
148 |
+
assert isinstance(light, EnvironmentLight), "Can only save EnvironmentLight currently"
|
149 |
+
if isinstance(light, EnvironmentLight):
|
150 |
+
color = util.cubemap_to_latlong(light.base, [512, 1024])
|
151 |
+
util.save_image_raw(fn, color.detach().cpu().numpy())
|
152 |
+
|
153 |
+
######################################################################################
|
154 |
+
# Create trainable env map with random initialization
|
155 |
+
######################################################################################
|
156 |
+
|
157 |
+
def create_trainable_env_rnd(base_res, scale=0.5, bias=0.25):
|
158 |
+
base = torch.rand(6, base_res, base_res, 3, dtype=torch.float32, device='cuda') * scale + bias
|
159 |
+
return EnvironmentLight(base)
|
160 |
+
|
161 |
+
|
162 |
+
######################################################################################
|
163 |
+
# Directional light source
|
164 |
+
######################################################################################
|
165 |
+
|
166 |
+
class DirectionalLight(torch.nn.Module):
|
167 |
+
def __init__(self, mlp_in, mlp_layers, mlp_hidden_size, intensity_min_max=None):
|
168 |
+
super(DirectionalLight, self).__init__()
|
169 |
+
self.mlp = MLP(mlp_in, 4, mlp_layers, nf=mlp_hidden_size, activation='sigmoid')
|
170 |
+
if intensity_min_max is not None:
|
171 |
+
self.register_buffer('intensity_min_max', intensity_min_max)
|
172 |
+
else:
|
173 |
+
self.intensity_min_max = None
|
174 |
+
|
175 |
+
def forward(self, feat):
|
176 |
+
# print('----------------- forward light !!! -----------------')
|
177 |
+
out = self.mlp(feat)
|
178 |
+
light_dir = F.normalize(torch.cat([out[..., 0:1] *2-1, torch.ones_like(out[..., :1]) * 0.5, out[..., 1:2] *2-1], dim=-1), dim=-1) # upper hemisphere
|
179 |
+
if self.intensity_min_max is not None:
|
180 |
+
int = out[..., 2:] * (self.intensity_min_max[1][None, :] - self.intensity_min_max[0][None, :]) + self.intensity_min_max[0][None, :]
|
181 |
+
self.light_params = torch.cat([light_dir, int], -1)
|
182 |
+
return self.light_params
|
183 |
+
|
184 |
+
def shade(self, feat, kd, normal):
|
185 |
+
light_params = self.forward(feat)
|
186 |
+
light_dir = light_params[..., :3][:, None, None, :]
|
187 |
+
int_amb = light_params[..., 3:4][:, None, None, :]
|
188 |
+
int_diff = light_params[..., 4:5][:, None, None, :]
|
189 |
+
shading = (int_amb + int_diff * torch.clamp(util.dot(light_dir, normal), min=0.0))
|
190 |
+
shaded = shading * kd
|
191 |
+
return shaded, shading
|
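For orientation, `DirectionalLight` predicts a 5-channel light code per image (a unit direction constrained to the upper hemisphere, plus ambient and diffuse intensities), which `shade` applies as a Lambertian term. Below is a minimal standalone sketch of that shading math with made-up tensors; it mirrors `shade` but does not call the repo's API:

import torch
import torch.nn.functional as F

# Standalone sketch of DirectionalLight's shading math (illustrative values,
# not the repo's API): light code = [direction (3), ambient (1), diffuse (1)].
B, H, W = 2, 64, 64
light_dir = F.normalize(torch.tensor([[0.3, 0.5, -0.2], [0.0, 0.5, 0.8]]), dim=-1)
int_amb = torch.tensor([[0.3], [0.4]])
int_diff = torch.tensor([[0.7], [0.6]])
kd = torch.rand(B, H, W, 3)                              # albedo, NHWC layout
normal = F.normalize(torch.randn(B, H, W, 3), dim=-1)    # unit normals
dot = (light_dir[:, None, None, :] * normal).sum(-1, keepdim=True)
shading = int_amb[:, None, None, :] + int_diff[:, None, None, :] * dot.clamp(min=0.0)
shaded = shading * kd                                    # same Lambertian term as shade()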
video3d/render/material.py
ADDED
@@ -0,0 +1,282 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import os
import numpy as np
import torch
import nvdiffrast.torch as dr
import cv2

from video3d.render.render import render_uv

from . import util
from . import texture
from . import mlptexture
from ..utils import misc

######################################################################################
# Wrapper to make materials behave like a python dict, but register textures as
# torch.nn.Module parameters.
######################################################################################
class Material(torch.nn.Module):
    def __init__(self, mat_dict):
        super(Material, self).__init__()
        self.mat_keys = set()
        for key in mat_dict.keys():
            self.mat_keys.add(key)
            self[key] = mat_dict[key]

    def __contains__(self, key):
        return hasattr(self, key)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, val):
        self.mat_keys.add(key)
        setattr(self, key, val)

    def __delitem__(self, key):
        self.mat_keys.remove(key)
        delattr(self, key)

    def keys(self):
        return self.mat_keys

######################################################################################
# .mtl material format loading / storing
######################################################################################
@torch.no_grad()
def load_mtl(fn, clear_ks=True):
    import re
    mtl_path = os.path.dirname(fn)

    # Read file
    with open(fn, 'r') as f:
        lines = f.readlines()

    # Parse materials
    materials = []
    for line in lines:
        split_line = re.split(' +|\t+|\n+', line.strip())
        prefix = split_line[0].lower()
        data = split_line[1:]
        if 'newmtl' in prefix:
            material = Material({'name' : data[0]})
            materials += [material]
        elif materials:
            if 'bsdf' in prefix or 'map_kd' in prefix or 'map_ks' in prefix or 'bump' in prefix:
                material[prefix] = data[0]
            else:
                material[prefix] = torch.tensor(tuple(float(d) for d in data), dtype=torch.float32, device='cuda')

    # Convert everything to textures. Our code expects 'kd' and 'ks' to be texture maps. So replace constants with 1x1 maps
    for mat in materials:
        if not 'bsdf' in mat:
            mat['bsdf'] = 'pbr'

        if 'map_kd' in mat:
            mat['kd'] = texture.load_texture2D(os.path.join(mtl_path, mat['map_kd']))
        else:
            mat['kd'] = texture.Texture2D(mat['kd'])

        if 'map_ks' in mat:
            mat['ks'] = texture.load_texture2D(os.path.join(mtl_path, mat['map_ks']), channels=3)
        else:
            mat['ks'] = texture.Texture2D(mat['ks'])

        if 'bump' in mat:
            mat['normal'] = texture.load_texture2D(os.path.join(mtl_path, mat['bump']), lambda_fn=lambda x: x * 2 - 1, channels=3)

        # Convert Kd from sRGB to linear RGB
        mat['kd'] = texture.srgb_to_rgb(mat['kd'])

        if clear_ks:
            # Override ORM occlusion (red) channel by zeros. We hijack this channel
            for mip in mat['ks'].getMips():
                mip[..., 0] = 0.0

    return materials

@torch.no_grad()
def save_mtl(fn, material, mesh=None, feat=None, resolution=[256, 256], prior_shape=None):
    folder = os.path.dirname(fn)
    file = os.path.basename(fn)
    prefix = '_'.join(file.split('_')[:-1]) + '_'
    with open(fn, "w") as f:
        f.write('newmtl defaultMat\n')
        if material is not None:
            f.write('bsdf %s\n' % material['bsdf'])
            if 'kd_ks_normal' in material.keys():
                assert mesh is not None
                glctx = dr.RasterizeGLContext()
                mask, kd, ks, normal = render_uv(glctx, mesh, resolution, material['kd_ks_normal'], feat=feat, prior_shape=prior_shape)

                hole_mask = 1. - mask
                hole_mask = hole_mask.int()[0]
                def uv_padding(image):
                    uv_padding_size = 4
                    inpaint_image = (
                        cv2.inpaint(
                            (image.detach().cpu().numpy() * 255).astype(np.uint8),
                            (hole_mask.detach().cpu().numpy() * 255).astype(np.uint8),
                            uv_padding_size,
                            cv2.INPAINT_TELEA,
                        )
                        / 255.0
                    )
                    return torch.from_numpy(inpaint_image).to(image)

                kd = uv_padding(kd[0])[None]

                batch_size = kd.shape[0]
                f.write(f'map_Kd {prefix}texture_kd.png\n')
                misc.save_images(folder, kd.permute(0,3,1,2).detach().cpu().numpy(), fnames=[prefix + "texture_kd"] * batch_size)
                f.write(f'map_Ks {prefix}texture_ks.png\n')
                misc.save_images(folder, ks.permute(0,3,1,2).detach().cpu().numpy(), fnames=[prefix + "texture_ks"] * batch_size)
                # disable normal
                # f.write(f'bump {prefix}texture_n.png\n')
                # misc.save_images(folder, normal.permute(0,3,1,2).detach().cpu().numpy(), fnames=[prefix + "texture_n"] * batch_size)
            if 'kd' in material.keys():
                f.write('map_Kd texture_kd.png\n')
                texture.save_texture2D(os.path.join(folder, 'texture_kd.png'), texture.rgb_to_srgb(material['kd']))
            if 'ks' in material.keys():
                f.write('map_Ks texture_ks.png\n')
                texture.save_texture2D(os.path.join(folder, 'texture_ks.png'), material['ks'])
            if 'normal' in material.keys():
                f.write('bump texture_n.png\n')
                texture.save_texture2D(os.path.join(folder, 'texture_n.png'), material['normal'], lambda_fn=lambda x:(util.safe_normalize(x)+1)*0.5)
        else:
            f.write('Kd 1 1 1\n')
            f.write('Ks 0 0 0\n')
            f.write('Ka 0 0 0\n')
            f.write('Tf 1 1 1\n')
            f.write('Ni 1\n')
            f.write('Ns 0\n')

######################################################################################
# Merge multiple materials into a single uber-material
######################################################################################

def _upscale_replicate(x, full_res):
    x = x.permute(0, 3, 1, 2)
    x = torch.nn.functional.pad(x, (0, full_res[1] - x.shape[3], 0, full_res[0] - x.shape[2]), 'replicate')
    return x.permute(0, 2, 3, 1).contiguous()

def merge_materials(materials, texcoords, tfaces, mfaces):
    assert len(materials) > 0
    for mat in materials:
        assert mat['bsdf'] == materials[0]['bsdf'], "All materials must have the same BSDF (uber shader)"
        assert ('normal' in mat) is ('normal' in materials[0]), "All materials must have either normal map enabled or disabled"

    uber_material = Material({
        'name' : 'uber_material',
        'bsdf' : materials[0]['bsdf'],
    })

    textures = ['kd', 'ks', 'normal']

    # Find maximum texture resolution across all materials and textures
    max_res = None
    for mat in materials:
        for tex in textures:
            tex_res = np.array(mat[tex].getRes()) if tex in mat else np.array([1, 1])
            max_res = np.maximum(max_res, tex_res) if max_res is not None else tex_res

    # Compute size of compound texture and round up to nearest PoT
    full_res = 2**np.ceil(np.log2(max_res * np.array([1, len(materials)]))).astype(int)

    # Normalize texture resolution across all materials & combine into a single large texture
    for tex in textures:
        if tex in materials[0]:
            tex_data = torch.cat(tuple(util.scale_img_nhwc(mat[tex].data, tuple(max_res)) for mat in materials), dim=2) # Lay out all textures horizontally, NHWC so dim2 is x
            tex_data = _upscale_replicate(tex_data, full_res)
            uber_material[tex] = texture.Texture2D(tex_data)

    # Compute scaling values for used / unused texture area
    s_coeff = [full_res[0] / max_res[0], full_res[1] / max_res[1]]

    # Recompute texture coordinates to coincide with new composite texture
    new_tverts = {}
    new_tverts_data = []
    for fi in range(len(tfaces)):
        matIdx = mfaces[fi]
        for vi in range(3):
            ti = tfaces[fi][vi]
            if not (ti in new_tverts):
                new_tverts[ti] = {}
            if not (matIdx in new_tverts[ti]): # create new vertex
                new_tverts_data.append([(matIdx + texcoords[ti][0]) / s_coeff[1], texcoords[ti][1] / s_coeff[0]]) # Offset texture coordinate (x direction) by material id & scale to local space. Note, texcoords are (u,v) but texture is stored (w,h) so the indexes swap here
                new_tverts[ti][matIdx] = len(new_tverts_data) - 1
            tfaces[fi][vi] = new_tverts[ti][matIdx] # reindex vertex

    return uber_material, new_tverts_data, tfaces

######################################################################################
# Utility functions for material
######################################################################################

def initial_guess_material(cfgs, mlp=False, init_mat=None, tet_bbox=None):
    kd_min = torch.tensor(cfgs.get('kd_min', [0., 0., 0., 0.]), dtype=torch.float32)
    kd_max = torch.tensor(cfgs.get('kd_max', [1., 1., 1., 1.]), dtype=torch.float32)
    ks_min = torch.tensor(cfgs.get('ks_min', [0., 0., 0.]), dtype=torch.float32)
    ks_max = torch.tensor(cfgs.get('ks_max', [0., 0., 0.]), dtype=torch.float32)
    nrm_min = torch.tensor(cfgs.get('nrm_min', [-1., -1., 0.]), dtype=torch.float32)
    nrm_max = torch.tensor(cfgs.get('nrm_max', [1., 1., 1.]), dtype=torch.float32)
    if mlp:
        num_layers = cfgs.get("num_layers_tex", 5)
        nf = cfgs.get("hidden_size", 128)
        enable_encoder = cfgs.get("enable_encoder", False)
        feat_dim = cfgs.get("latent_dim", 64) if enable_encoder else 0

        mlp_min = torch.cat((kd_min[0:3], ks_min, nrm_min), dim=0)
        mlp_max = torch.cat((kd_max[0:3], ks_max, nrm_max), dim=0)
        min_max = torch.stack((mlp_min, mlp_max), dim=0)
        out_chn = 9
        mlp_map_opt = mlptexture.MLPTexture3D(tet_bbox, channels=out_chn, internal_dims=nf, hidden=num_layers-1, feat_dim=feat_dim, min_max=min_max)
        mat = Material({'kd_ks_normal' : mlp_map_opt})
    else:
        # Setup Kd (albedo) and Ks (x, roughness, metalness) textures
        if cfgs.random_textures or init_mat is None:
            num_channels = 4 if cfgs.layers > 1 else 3
            kd_init = torch.rand(size=cfgs.texture_res + [num_channels]) * (kd_max - kd_min)[None, None, 0:num_channels] + kd_min[None, None, 0:num_channels]
            kd_map_opt = texture.create_trainable(kd_init, cfgs.texture_res, not cfgs.custom_mip, [kd_min, kd_max])

            ksR = np.random.uniform(size=cfgs.texture_res + [1], low=0.0, high=0.01)
            ksG = np.random.uniform(size=cfgs.texture_res + [1], low=ks_min[1].cpu(), high=ks_max[1].cpu())
            ksB = np.random.uniform(size=cfgs.texture_res + [1], low=ks_min[2].cpu(), high=ks_max[2].cpu())

            ks_map_opt = texture.create_trainable(np.concatenate((ksR, ksG, ksB), axis=2), cfgs.texture_res, not cfgs.custom_mip, [ks_min, ks_max])
        else:
            kd_map_opt = texture.create_trainable(init_mat['kd'], cfgs.texture_res, not cfgs.custom_mip, [kd_min, kd_max])
            ks_map_opt = texture.create_trainable(init_mat['ks'], cfgs.texture_res, not cfgs.custom_mip, [ks_min, ks_max])

        # Setup normal map
        if cfgs.random_textures or init_mat is None or 'normal' not in init_mat:
            normal_map_opt = texture.create_trainable(np.array([0, 0, 1]), cfgs.texture_res, not cfgs.custom_mip, [nrm_min, nrm_max])
        else:
            normal_map_opt = texture.create_trainable(init_mat['normal'], cfgs.texture_res, not cfgs.custom_mip, [nrm_min, nrm_max])

        mat = Material({
            'kd'     : kd_map_opt,
            'ks'     : ks_map_opt,
            'normal' : normal_map_opt
        })

    if init_mat is not None:
        mat['bsdf'] = init_mat['bsdf']
    elif "bsdf" in cfgs:
        mat['bsdf'] = cfgs["bsdf"]
    else:
        mat['bsdf'] = 'pbr'

    if not cfgs.get("perturb_normal", False):
        mat['no_perturbed_nrm'] = True

    return mat
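The `Material` wrapper exists so that textures stored under dict-style keys are still registered on the `torch.nn.Module` (and thus picked up by optimizers, `.cuda()`, and state dicts). A small usage sketch, assuming only the `Material` class defined above:

import torch

mat = Material({'name': 'demo', 'bsdf': 'pbr'})
mat['kd'] = torch.nn.Parameter(torch.rand(1, 8, 8, 3))  # registered like any module attribute
print('kd' in mat, len(list(mat.parameters())))         # True 1
del mat['name']
print(sorted(mat.keys()))                               # ['bsdf', 'kd']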
video3d/render/mesh.py
ADDED
@@ -0,0 +1,377 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

from difflib import unified_diff
import os
import numpy as np
import torch

from . import obj
from . import util

#########################################################################################
# Base mesh class
#
# Minibatch in mesh is supported, as long as each mesh shares the same edge connectivity.
#########################################################################################
class Mesh:
    def __init__(self,
                 v_pos=None,
                 t_pos_idx=None,
                 v_nrm=None,
                 t_nrm_idx=None,
                 v_tex=None,
                 t_tex_idx=None,
                 v_tng=None,
                 t_tng_idx=None,
                 material=None,
                 base=None):
        self.v_pos = v_pos
        self.v_nrm = v_nrm
        self.v_tex = v_tex
        self.v_tng = v_tng
        self.t_pos_idx = t_pos_idx
        self.t_nrm_idx = t_nrm_idx
        self.t_tex_idx = t_tex_idx
        self.t_tng_idx = t_tng_idx
        self.material = material

        if base is not None:
            self.copy_none(base)

    def __len__(self):
        return len(self.v_pos)

    def copy_none(self, other):
        if self.v_pos is None:
            self.v_pos = other.v_pos
        if self.t_pos_idx is None:
            self.t_pos_idx = other.t_pos_idx
        if self.v_nrm is None:
            self.v_nrm = other.v_nrm
        if self.t_nrm_idx is None:
            self.t_nrm_idx = other.t_nrm_idx
        if self.v_tex is None:
            self.v_tex = other.v_tex
        if self.t_tex_idx is None:
            self.t_tex_idx = other.t_tex_idx
        if self.v_tng is None:
            self.v_tng = other.v_tng
        if self.t_tng_idx is None:
            self.t_tng_idx = other.t_tng_idx
        if self.material is None:
            self.material = other.material

    def clone(self):
        out = Mesh(base=self)
        if out.v_pos is not None:
            out.v_pos = out.v_pos.clone().detach()
        if out.t_pos_idx is not None:
            out.t_pos_idx = out.t_pos_idx.clone().detach()
        if out.v_nrm is not None:
            out.v_nrm = out.v_nrm.clone().detach()
        if out.t_nrm_idx is not None:
            out.t_nrm_idx = out.t_nrm_idx.clone().detach()
        if out.v_tex is not None:
            out.v_tex = out.v_tex.clone().detach()
        if out.t_tex_idx is not None:
            out.t_tex_idx = out.t_tex_idx.clone().detach()
        if out.v_tng is not None:
            out.v_tng = out.v_tng.clone().detach()
        if out.t_tng_idx is not None:
            out.t_tng_idx = out.t_tng_idx.clone().detach()
        return out

    def detach(self):
        return self.clone()

    def extend(self, N: int):
        """
        Create new Mesh class which contains each input mesh N times.

        Args:
            N: number of new copies of each mesh.

        Returns:
            new Mesh object.
        """
        verts = self.v_pos.repeat(N, 1, 1)
        faces = self.t_pos_idx
        uvs = self.v_tex.repeat(N, 1, 1)
        uv_idx = self.t_tex_idx
        mat = self.material

        return make_mesh(verts, faces, uvs, uv_idx, self.material)

    def deform(self, deformation):
        """
        Create new Mesh class obtained by applying the deformation to self.

        Args:
            deformation: tensor with shape (B, V, 3)

        Returns:
            new Mesh object after the deformation.
        """
        assert deformation.shape[1] == self.v_pos.shape[1] and deformation.shape[2] == 3
        verts = self.v_pos + deformation
        return make_mesh(verts, self.t_pos_idx, self.v_tex.repeat(len(verts), 1, 1), self.t_tex_idx, self.material)

    def get_m_to_n(self, m: int, n: int):
        """
        Create new Mesh class containing the m-th (included) through the n-th (not included) meshes in the batch.

        Args:
            m: the index of the starting mesh to be contained.
            n: the index of the first mesh not to be contained.
        """
        verts = self.v_pos[m:n, ...]
        faces = self.t_pos_idx
        uvs = self.v_tex[m:n, ...]
        uv_idx = self.t_tex_idx
        mat = self.material

        return make_mesh(verts, faces, uvs, uv_idx, mat)

    def first_n(self, n: int):
        """
        Create new Mesh class with only the first n meshes in the batch.

        Args:
            n: number of meshes to be contained.

        Returns:
            new Mesh object with the first n meshes.
        """
        return self.get_m_to_n(0, n)

    def get_n(self, n: int):
        """
        Create new Mesh class with only the n-th mesh in the batch.

        Args:
            n: the index of the mesh to be contained.

        Returns:
            new Mesh object with the n-th mesh.
        """
        verts = self.v_pos[n:n+1, ...]
        faces = self.t_pos_idx
        uvs = self.v_tex[n:n+1, ...]
        uv_idx = self.t_tex_idx
        mat = self.material

        return make_mesh(verts, faces, uvs, uv_idx, mat)


######################################################################################
# Mesh loading helper
######################################################################################
def load_mesh(filename, mtl_override=None):
    name, ext = os.path.splitext(filename)
    if ext == ".obj":
        return obj.load_obj(filename, clear_ks=True, mtl_override=mtl_override)
    assert False, "Invalid mesh file extension"

######################################################################################
# Compute AABB
######################################################################################
def aabb(mesh):
    return torch.min(mesh.v_pos, dim=0).values, torch.max(mesh.v_pos, dim=0).values

######################################################################################
# Compute unique edge list from attribute/vertex index list
######################################################################################
def compute_edges(attr_idx, return_inverse=False):
    with torch.no_grad():
        # Create all edges, packed by triangle
        idx = attr_idx[0]
        all_edges = torch.cat((
            torch.stack((idx[:, 0], idx[:, 1]), dim=-1),
            torch.stack((idx[:, 1], idx[:, 2]), dim=-1),
            torch.stack((idx[:, 2], idx[:, 0]), dim=-1),
        ), dim=-1).view(-1, 2)

        # Swap edge order so min index is always first
        order = (all_edges[:, 0] > all_edges[:, 1]).long().unsqueeze(dim=1)
        sorted_edges = torch.cat((
            torch.gather(all_edges, 1, order),
            torch.gather(all_edges, 1, 1 - order)
        ), dim=-1)

        # Eliminate duplicates and return inverse mapping
        return torch.unique(sorted_edges, dim=0, return_inverse=return_inverse)

######################################################################################
# Compute unique edge to face mapping from attribute/vertex index list
######################################################################################
def compute_edge_to_face_mapping(attr_idx, return_inverse=False):
    with torch.no_grad():
        # Get unique edges
        # Create all edges, packed by triangle
        idx = attr_idx[0]
        all_edges = torch.cat((
            torch.stack((idx[:, 0], idx[:, 1]), dim=-1),
            torch.stack((idx[:, 1], idx[:, 2]), dim=-1),
            torch.stack((idx[:, 2], idx[:, 0]), dim=-1),
        ), dim=-1).view(-1, 2)

        # Swap edge order so min index is always first
        order = (all_edges[:, 0] > all_edges[:, 1]).long().unsqueeze(dim=1)
        sorted_edges = torch.cat((
            torch.gather(all_edges, 1, order),
            torch.gather(all_edges, 1, 1 - order)
        ), dim=-1)

        # Eliminate duplicates and return inverse mapping
        unique_edges, idx_map = torch.unique(sorted_edges, dim=0, return_inverse=True)

        tris = torch.arange(idx.shape[0]).repeat_interleave(3).cuda()

        tris_per_edge = torch.zeros((unique_edges.shape[0], 2), dtype=torch.int64).cuda()

        # Compute edge to face table
        mask0 = order[:,0] == 0
        mask1 = order[:,0] == 1
        tris_per_edge[idx_map[mask0], 0] = tris[mask0]
        tris_per_edge[idx_map[mask1], 1] = tris[mask1]

        return tris_per_edge

######################################################################################
# Align base mesh to reference mesh: move & rescale to match bounding boxes.
######################################################################################
def unit_size(mesh):
    with torch.no_grad():
        vmin, vmax = aabb(mesh)
        scale = 2 / torch.max(vmax - vmin).item()
        v_pos = mesh.v_pos - (vmax + vmin) / 2 # Center mesh on origin
        v_pos = v_pos * scale                  # Rescale to unit size

        return Mesh(v_pos, base=mesh)

######################################################################################
# Center & scale mesh for rendering
######################################################################################
def center_by_reference(base_mesh, ref_aabb, scale):
    center = (ref_aabb[0] + ref_aabb[1]) * 0.5
    scale = scale / torch.max(ref_aabb[1] - ref_aabb[0]).item()
    v_pos = (base_mesh.v_pos - center[None, ...]) * scale
    return Mesh(v_pos, base=base_mesh)

######################################################################################
# Simple smooth vertex normal computation
######################################################################################
def auto_normals(imesh):
    batch_size = imesh.v_pos.shape[0]

    i0 = imesh.t_pos_idx[0, :, 0]  # Shape: (F)
    i1 = imesh.t_pos_idx[0, :, 1]  # Shape: (F)
    i2 = imesh.t_pos_idx[0, :, 2]  # Shape: (F)

    v0 = imesh.v_pos[:, i0, :]  # Shape: (B, F, 3)
    v1 = imesh.v_pos[:, i1, :]  # Shape: (B, F, 3)
    v2 = imesh.v_pos[:, i2, :]  # Shape: (B, F, 3)

    face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)  # Shape: (B, F, 3)

    # Splat face normals to vertices
    v_nrm = torch.zeros_like(imesh.v_pos)  # Shape: (B, V, 3)
    v_nrm.scatter_add_(1, i0[None, :, None].repeat(batch_size, 1, 3), face_normals)
    v_nrm.scatter_add_(1, i1[None, :, None].repeat(batch_size, 1, 3), face_normals)
    v_nrm.scatter_add_(1, i2[None, :, None].repeat(batch_size, 1, 3), face_normals)

    # Normalize, replace zero (degenerated) normals with some default value
    v_nrm = torch.where(util.dot(v_nrm, v_nrm) > 1e-20,
                        v_nrm, torch.tensor([0.0, 0.0, 1.0],
                        dtype=torch.float32, device='cuda'))
    v_nrm = util.safe_normalize(v_nrm)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(v_nrm))

    return Mesh(v_nrm=v_nrm, t_nrm_idx=imesh.t_pos_idx, base=imesh)

######################################################################################
# Compute tangent space from texture map coordinates
# Follows http://www.mikktspace.com/ conventions
######################################################################################
def compute_tangents(imesh):
    batch_size = imesh.v_pos.shape[0]

    vn_idx = [None] * 3
    pos = [None] * 3
    tex = [None] * 3
    for i in range(0,3):
        pos[i] = imesh.v_pos[:, imesh.t_pos_idx[0, :, i]]
        tex[i] = imesh.v_tex[:, imesh.t_tex_idx[0, :, i]]
        vn_idx[i] = imesh.t_nrm_idx[..., i:i+1]

    tangents = torch.zeros_like(imesh.v_nrm)
    tansum = torch.zeros_like(imesh.v_nrm)

    # Compute tangent space for each triangle
    uve1 = tex[1] - tex[0]  # Shape: (B, F, 2)
    uve2 = tex[2] - tex[0]  # Shape: (B, F, 2)
    pe1 = pos[1] - pos[0]   # Shape: (B, F, 3)
    pe2 = pos[2] - pos[0]   # Shape: (B, F, 3)

    nom = pe1 * uve2[..., 1:2] - pe2 * uve1[..., 1:2]                          # Shape: (B, F, 3)
    denom = uve1[..., 0:1] * uve2[..., 1:2] - uve1[..., 1:2] * uve2[..., 0:1]  # Shape: (B, F, 1)

    # Avoid division by zero for degenerated texture coordinates
    tang = nom / torch.where(denom > 0.0, torch.clamp(denom, min=1e-6), torch.clamp(denom, max=-1e-6))  # Shape: (B, F, 3)

    # Update all 3 vertices
    for i in range(0,3):
        idx = vn_idx[i].repeat(batch_size, 1, 3)              # Shape: (B, F, 3)
        tangents.scatter_add_(1, idx, tang)                   # tangents[n_i] = tangents[n_i] + tang
        tansum.scatter_add_(1, idx, torch.ones_like(tang))    # tansum[n_i] = tansum[n_i] + 1
    tangents = tangents / tansum

    # Normalize and make sure tangent is perpendicular to normal
    tangents = util.safe_normalize(tangents)
    tangents = util.safe_normalize(tangents - util.dot(tangents, imesh.v_nrm) * imesh.v_nrm)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(tangents))

    return Mesh(v_tng=tangents, t_tng_idx=imesh.t_nrm_idx, base=imesh)

######################################################################################
# Create new Mesh from verts, faces, uvs, and uv_idx. The rest is auto computed.
######################################################################################
def make_mesh(verts, faces, uvs, uv_idx, material):
    """
    Create new Mesh class with given verts, faces, uvs, and uv_idx.

    Args:
        verts: tensor of shape (B, V, 3)
        faces: tensor of shape (1, F, 3)
        uvs: tensor of shape (B, V, 2)
        uv_idx: tensor of shape (1, F, 3)
        material: a Material instance, specifying the material of the mesh.

    Returns:
        new Mesh object.
    """
    assert len(verts.shape) == 3 and len(faces.shape) == 3 and len(uvs.shape) == 3 and len(uv_idx.shape) == 3, "All components must be batched."
    assert faces.shape[0] == 1 and uv_idx.shape[0] == 1, "Every mesh must share the same edge connectivity."
    assert verts.shape[0] == uvs.shape[0], "Batch size must be consistent."
    ret = Mesh(verts, faces, v_tex=uvs, t_tex_idx=uv_idx, material=material)
    ret = auto_normals(ret)
    ret = compute_tangents(ret)
    return ret
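Note the batching convention used throughout `mesh.py`: vertex attributes carry a batch dimension while all meshes share one face list (`t_pos_idx` has shape `(1, F, 3)`). A minimal sketch of the `make_mesh` contract with a single triangle; shapes and values are illustrative, and a CUDA device is required because `auto_normals` allocates its fallback normal on `'cuda'`:

import torch

verts = torch.tensor([[[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]]], device='cuda').repeat(2, 1, 1)  # (B=2, V=3, 3)
faces = torch.tensor([[[0, 1, 2]]], device='cuda')   # (1, F=1, 3): shared connectivity
uvs = torch.tensor([[[0., 0.], [1., 0.], [0., 1.]]], device='cuda').repeat(2, 1, 1)
uv_idx = faces.clone()
m = make_mesh(verts, faces, uvs, uv_idx, material=None)  # auto-computes normals and tangents
print(m.v_nrm.shape, m.v_tng.shape)                       # both torch.Size([2, 3, 3])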
video3d/render/mlptexture.py
ADDED
@@ -0,0 +1,122 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch
import tinycudann as tcnn
import numpy as np

#######################################################################################################################################################
# Small MLP using PyTorch primitives, internal helper class
#######################################################################################################################################################

class _MLP(torch.nn.Module):
    def __init__(self, cfg, loss_scale=1.0):
        super(_MLP, self).__init__()
        self.loss_scale = loss_scale
        net = (torch.nn.Linear(cfg['n_input_dims'], cfg['n_neurons'], bias=False), torch.nn.ReLU())
        for i in range(cfg['n_hidden_layers']-1):
            net = net + (torch.nn.Linear(cfg['n_neurons'], cfg['n_neurons'], bias=False), torch.nn.ReLU())
        net = net + (torch.nn.Linear(cfg['n_neurons'], cfg['n_output_dims'], bias=False),)
        self.net = torch.nn.Sequential(*net).cuda()

        self.net.apply(self._init_weights)

        if self.loss_scale != 1.0:
            self.net.register_full_backward_hook(lambda module, grad_i, grad_o: (grad_i[0] * self.loss_scale, ))

    def forward(self, x):
        return self.net(x.to(torch.float32))

    @staticmethod
    def _init_weights(m):
        if type(m) == torch.nn.Linear:
            torch.nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
            if hasattr(m.bias, 'data'):
                m.bias.data.fill_(0.0)

#######################################################################################################################################################
# Outward visible MLP class
#######################################################################################################################################################

class MLPTexture3D(torch.nn.Module):
    def __init__(self, AABB, channels=3, internal_dims=32, hidden=2, feat_dim=0, min_max=None, bsdf='diffuse', perturb_normal=False, symmetrize=False):
        super(MLPTexture3D, self).__init__()

        self.channels = channels
        self.feat_dim = feat_dim
        self.internal_dims = internal_dims
        self.AABB = AABB
        self.bsdf = bsdf
        self.perturb_normal = perturb_normal
        self.symmetrize = symmetrize
        if min_max is not None:
            self.register_buffer('min_max', min_max)
        else:
            self.min_max = None

        # Setup positional encoding, see https://github.com/NVlabs/tiny-cuda-nn for details.
        desired_resolution = 4096
        base_grid_resolution = 16
        num_levels = 16
        per_level_scale = np.exp(np.log(desired_resolution / base_grid_resolution) / (num_levels-1))

        enc_cfg = {
            "otype": "HashGrid",
            "n_levels": num_levels,
            "n_features_per_level": 2,
            "log2_hashmap_size": 19,
            "base_resolution": base_grid_resolution,
            "per_level_scale" : per_level_scale
        }

        # gradient_scaling = 128.0
        gradient_scaling = 1.0
        self.encoder = tcnn.Encoding(3, enc_cfg)
        self.encoder.register_full_backward_hook(lambda module, grad_i, grad_o: (grad_i[0] / gradient_scaling, ))

        # Setup MLP
        mlp_cfg = {
            "n_input_dims" : internal_dims + feat_dim,
            "n_output_dims" : self.channels,
            "n_hidden_layers" : hidden,
            "n_neurons" : self.internal_dims
        }
        self.linear = torch.nn.Linear(self.encoder.n_output_dims, internal_dims)
        self.net = _MLP(mlp_cfg, gradient_scaling)
        self.relu = torch.nn.ReLU(inplace=True)
        print("Encoder output: %d dims" % (self.encoder.n_output_dims))

    # Sample texture at a given location
    def sample(self, texc, feat=None):
        assert (feat is None and self.feat_dim == 0) or feat.shape[-1] == self.feat_dim

        if self.symmetrize:
            xs, ys, zs = texc.unbind(-1)
            texc = torch.stack([xs.abs(), ys, zs], -1)  # mirror -x to +x

        _texc = (texc.view(-1, 3) - self.AABB[0][None, ...]) / (self.AABB[1][None, ...] - self.AABB[0][None, ...])
        _texc = torch.clamp(_texc, min=0, max=1)

        _, image_h, image_w, _ = texc.shape
        p_enc = self.encoder(_texc.contiguous())
        x_in = self.linear(p_enc.type(texc.dtype))
        if feat is not None:
            feat_in = feat[:, None, None, :].repeat(1, image_h, image_w, 1).view(-1, self.feat_dim)
            x_in = torch.concat([x_in, feat_in], dim=-1)
        out = self.net(self.relu(x_in))

        # Sigmoid limit and scale to the allowed range
        out = torch.sigmoid(out)
        if self.min_max is not None:
            out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :]

        return out.view(*texc.shape[:-1], self.channels) # Remap to [n, h, w, c]

    def cleanup(self):
        tcnn.free_temporary_memory()
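The hash-grid encoding spaces its 16 levels geometrically from `base_resolution = 16` up to the desired 4096, which is exactly what the `per_level_scale` formula above encodes. A standalone arithmetic check:

import numpy as np

desired_resolution, base_grid_resolution, num_levels = 4096, 16, 16
per_level_scale = np.exp(np.log(desired_resolution / base_grid_resolution) / (num_levels - 1))
levels = [base_grid_resolution * per_level_scale ** l for l in range(num_levels)]
print(round(per_level_scale, 4), round(levels[0]), round(levels[-1]))  # 1.4473 16 4096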
video3d/render/obj.py
ADDED
@@ -0,0 +1,288 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import os
import torch
import xatlas
import trimesh
import numpy as np
import cv2
import nvdiffrast.torch as dr
from video3d.render.render import render_uv
from video3d.render.mesh import Mesh
from . import texture
from . import mesh
from . import material

######################################################################################
# Utility functions
######################################################################################

def _find_mat(materials, name):
    for mat in materials:
        if mat['name'] == name:
            return mat
    return materials[0] # Material 0 is the default

######################################################################################
# Create mesh object from objfile
######################################################################################

def load_obj(filename, clear_ks=True, mtl_override=None):
    obj_path = os.path.dirname(filename)

    # Read entire file
    with open(filename, 'r') as f:
        lines = f.readlines()

    # Load materials
    all_materials = [
        {
            'name' : '_default_mat',
            'bsdf' : 'pbr',
            'kd'   : texture.Texture2D(torch.tensor([0.5, 0.5, 0.5], dtype=torch.float32, device='cuda')),
            'ks'   : texture.Texture2D(torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32, device='cuda'))
        }
    ]
    if mtl_override is None:
        for line in lines:
            if len(line.split()) == 0:
                continue
            if line.split()[0] == 'mtllib':
                all_materials += material.load_mtl(os.path.join(obj_path, line.split()[1]), clear_ks) # Read in entire material library
    else:
        all_materials += material.load_mtl(mtl_override)

    # load vertices
    vertices, texcoords, normals = [], [], []
    for line in lines:
        if len(line.split()) == 0:
            continue

        prefix = line.split()[0].lower()
        if prefix == 'v':
            vertices.append([float(v) for v in line.split()[1:]])
        elif prefix == 'vt':
            val = [float(v) for v in line.split()[1:]]
            texcoords.append([val[0], 1.0 - val[1]])
        elif prefix == 'vn':
            normals.append([float(v) for v in line.split()[1:]])

    # load faces
    activeMatIdx = None
    used_materials = []
    faces, tfaces, nfaces, mfaces = [], [], [], []
    for line in lines:
        if len(line.split()) == 0:
            continue

        prefix = line.split()[0].lower()
        if prefix == 'usemtl': # Track used materials
            mat = _find_mat(all_materials, line.split()[1])
            if not mat in used_materials:
                used_materials.append(mat)
            activeMatIdx = used_materials.index(mat)
        elif prefix == 'f': # Parse face
            vs = line.split()[1:]
            nv = len(vs)
            vv = vs[0].split('/')
            v0 = int(vv[0]) - 1
            t0 = int(vv[1]) - 1 if vv[1] != "" else -1
            n0 = int(vv[2]) - 1 if vv[2] != "" else -1
            for i in range(nv - 2): # Triangulate polygons
                vv = vs[i + 1].split('/')
                v1 = int(vv[0]) - 1
                t1 = int(vv[1]) - 1 if vv[1] != "" else -1
                n1 = int(vv[2]) - 1 if vv[2] != "" else -1
                vv = vs[i + 2].split('/')
                v2 = int(vv[0]) - 1
                t2 = int(vv[1]) - 1 if vv[1] != "" else -1
                n2 = int(vv[2]) - 1 if vv[2] != "" else -1
                mfaces.append(activeMatIdx)
                faces.append([v0, v1, v2])
                tfaces.append([t0, t1, t2])
                nfaces.append([n0, n1, n2])
    assert len(tfaces) == len(faces) and len(nfaces) == len(faces)

    # Create an "uber" material by combining all textures into a larger texture
    if len(used_materials) > 1:
        uber_material, texcoords, tfaces = material.merge_materials(used_materials, texcoords, tfaces, mfaces)
    else:
        uber_material = used_materials[0]

    vertices = torch.tensor(vertices, dtype=torch.float32, device='cuda')
    texcoords = torch.tensor(texcoords, dtype=torch.float32, device='cuda') if len(texcoords) > 0 else None
    normals = torch.tensor(normals, dtype=torch.float32, device='cuda') if len(normals) > 0 else None

    faces = torch.tensor(faces, dtype=torch.int64, device='cuda')
    tfaces = torch.tensor(tfaces, dtype=torch.int64, device='cuda') if texcoords is not None else None
    nfaces = torch.tensor(nfaces, dtype=torch.int64, device='cuda') if normals is not None else None

    return mesh.Mesh(vertices, faces, normals, nfaces, texcoords, tfaces, material=uber_material)

######################################################################################
# Save mesh object to objfile
######################################################################################

def write_obj(folder, fname, mesh, idx, save_material=True, feat=None, resolution=[256, 256]):
    obj_file = os.path.join(folder, fname + '.obj')
    print("Writing mesh: ", obj_file)
    with open(obj_file, "w") as f:
        f.write(f"mtllib {fname}.mtl\n")
        f.write("g default\n")

        v_pos = mesh.v_pos[idx].detach().cpu().numpy() if mesh.v_pos is not None else None
        v_nrm = mesh.v_nrm[idx].detach().cpu().numpy() if mesh.v_nrm is not None else None
        v_tex = mesh.v_tex[idx].detach().cpu().numpy() if mesh.v_tex is not None else None

        t_pos_idx = mesh.t_pos_idx[0].detach().cpu().numpy() if mesh.t_pos_idx is not None else None
        t_nrm_idx = mesh.t_nrm_idx[0].detach().cpu().numpy() if mesh.t_nrm_idx is not None else None
        t_tex_idx = mesh.t_tex_idx[0].detach().cpu().numpy() if mesh.t_tex_idx is not None else None

        print("    writing %d vertices" % len(v_pos))
        for v in v_pos:
            f.write('v {} {} {} \n'.format(v[0], v[1], v[2]))

        if v_tex is not None and save_material:
            print("    writing %d texcoords" % len(v_tex))
            assert(len(t_pos_idx) == len(t_tex_idx))
            for v in v_tex:
                f.write('vt {} {} \n'.format(v[0], 1.0 - v[1]))

        if v_nrm is not None:
            print("    writing %d normals" % len(v_nrm))
            assert(len(t_pos_idx) == len(t_nrm_idx))
            for v in v_nrm:
                f.write('vn {} {} {}\n'.format(v[0], v[1], v[2]))

        # faces
        f.write("s 1 \n")
        f.write("g pMesh1\n")
        f.write("usemtl defaultMat\n")

        # Write faces
        print("    writing %d faces" % len(t_pos_idx))
        for i in range(len(t_pos_idx)):
            f.write("f ")
            for j in range(3):
                f.write(' %s/%s/%s' % (str(t_pos_idx[i][j]+1), '' if v_tex is None else str(t_tex_idx[i][j]+1), '' if v_nrm is None else str(t_nrm_idx[i][j]+1)))
            f.write("\n")

    if save_material and mesh.material is not None:
        mtl_file = os.path.join(folder, fname + '.mtl')
        print("Writing material: ", mtl_file)
        material.save_mtl(mtl_file, mesh.material, mesh=mesh.get_n(idx), feat=feat, resolution=resolution)

    print("Done exporting mesh")


def write_textured_obj(folder, fname, mesh, idx, save_material=True, feat=None, resolution=[256, 256], prior_shape=None):
    mesh = mesh.get_n(idx)
    obj_file = os.path.join(folder, fname + '.obj')
    print("Writing mesh: ", obj_file)

    # Create uvs with xatlas
    v_pos = mesh.v_pos.detach().cpu().numpy()
    t_pos_idx = mesh.t_pos_idx.detach().cpu().numpy()

    # v_color = torch.Tensor(v_pos)[None].to("cuda")
    # v_color = mesh.material.sample(v_color, feat)
    # v_color = v_color[0,0,:,:3].detach().cpu()
    # v_color = torch.concat([v_color, torch.ones((v_color.shape[0], 1))], dim=-1)
    # v_color = v_color.numpy() * 255
    # v_color = v_color.astype(np.int32)
    # tmp = trimesh.Trimesh(vertices=v_pos[0], faces=t_pos_idx[0], vertex_colors=v_color)
    # _ = tmp.export("tmp.obj")
    # from pdb import set_trace; set_trace()

    atlas = xatlas.Atlas()
    atlas.add_mesh(
        v_pos[0],
        t_pos_idx[0],
    )
    co = xatlas.ChartOptions()
    po = xatlas.PackOptions()
    # for k, v in xatlas_chart_options.items():
    #     setattr(co, k, v)
    # for k, v in xatlas_pack_options.items():
    #     setattr(po, k, v)
    atlas.generate(co, po)
    vmapping, indices, uvs = atlas.get_mesh(0)
    # vmapping, indices, uvs = xatlas.parametrize(v_pos[0], t_pos_idx[0])

    # Convert to tensors
    indices_int64 = indices.astype(np.uint64, casting='same_kind').view(np.int64)

    uvs = torch.tensor(uvs, dtype=torch.float32, device='cuda')
    faces = torch.tensor(indices_int64, dtype=torch.int64, device='cuda')

    # new_mesh = Mesh(v_tex=uvs, t_tex_idx=faces, base=mesh)
    new_mesh = Mesh(v_tex=uvs[None], t_tex_idx=faces[None], base=mesh)

    # glctx = dr.RasterizeGLContext()
    # mask, kd, ks, normal = render_uv(glctx, new_mesh, resolution, mesh.material, feat=feat)

    # kd_min, kd_max = torch.tensor([ 0.0,  0.0,  0.0,  0.0], dtype=torch.float32, device='cuda'), torch.tensor([ 1.0,  1.0,  1.0,  1.0], dtype=torch.float32, device='cuda')
    # ks_min, ks_max = torch.tensor([ 0.0,  0.0,  0.0]      , dtype=torch.float32, device='cuda'), torch.tensor([ 0.0,  0.0,  0.0]      , dtype=torch.float32, device='cuda')
    # nrm_min, nrm_max = torch.tensor([-1.0, -1.0,  0.0], dtype=torch.float32, device='cuda'), torch.tensor([ 1.0,  1.0,  1.0], dtype=torch.float32, device='cuda')

    new_mesh.material = material.Material({
        'bsdf'   : 'diffuse',
        # 'kd'     : texture.Texture2D(kd, min_max=[kd_min, kd_max]),
        # 'ks'     : texture.Texture2D(ks, min_max=[ks_min, ks_max]),
        # 'normal' : texture.Texture2D(normal, min_max=[nrm_min, nrm_max]),
        'kd_ks_normal': mesh.material
    })

    with open(obj_file, "w") as f:
        f.write(f"mtllib {fname}.mtl\n")
        f.write("g default\n")

        v_pos = new_mesh.v_pos[idx].detach().cpu().numpy() if new_mesh.v_pos is not None else None
        v_nrm = new_mesh.v_nrm[idx].detach().cpu().numpy() if new_mesh.v_nrm is not None else None
        v_tex = new_mesh.v_tex[idx].detach().cpu().numpy() if new_mesh.v_tex is not None else None

        t_pos_idx = new_mesh.t_pos_idx[0].detach().cpu().numpy() if new_mesh.t_pos_idx is not None else None
        t_nrm_idx = new_mesh.t_nrm_idx[0].detach().cpu().numpy() if new_mesh.t_nrm_idx is not None else None
        t_tex_idx = new_mesh.t_tex_idx[0].detach().cpu().numpy() if new_mesh.t_tex_idx is not None else None

        print("    writing %d vertices" % len(v_pos))
        for v in v_pos:
            f.write('v {} {} {} \n'.format(v[0], v[1], v[2]))

        if v_tex is not None and save_material:
            print("    writing %d texcoords" % len(v_tex))
            assert(len(t_pos_idx) == len(t_tex_idx))
            for v in v_tex:
                f.write('vt {} {} \n'.format(v[0], 1.0 - v[1]))

        if v_nrm is not None:
            print("    writing %d normals" % len(v_nrm))
            assert(len(t_pos_idx) == len(t_nrm_idx))
            for v in v_nrm:
                f.write('vn {} {} {}\n'.format(v[0], v[1], v[2]))

        # faces
        f.write("s 1 \n")
        f.write("g pMesh1\n")
        f.write("usemtl defaultMat\n")

        # Write faces
        print("    writing %d faces" % len(t_pos_idx))
        for i in range(len(t_pos_idx)):
            f.write("f ")
            for j in range(3):
                f.write(' %s/%s/%s' % (str(t_pos_idx[i][j]+1), '' if v_tex is None else str(t_tex_idx[i][j]+1), '' if v_nrm is None else str(t_nrm_idx[i][j]+1)))
            f.write("\n")

    mtl_file = os.path.join(folder, fname + '.mtl')
    print("Writing material: ", mtl_file)
    material.save_mtl(mtl_file, new_mesh.material, mesh=new_mesh, feat=feat, resolution=resolution, prior_shape=prior_shape)

    print("Done exporting mesh")
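`load_obj` triangulates each n-gon face as a fan around its first vertex (the `for i in range(nv - 2)` loop above). A standalone sketch of that triangulation pattern, using a hypothetical helper name:

def fan_triangulate(indices):
    # Hypothetical helper mirroring load_obj's n-gon handling:
    # each polygon becomes triangles (v0, v_{i+1}, v_{i+2}).
    return [[indices[0], indices[i + 1], indices[i + 2]] for i in range(len(indices) - 2)]

print(fan_triangulate([0, 1, 2, 3]))  # [[0, 1, 2], [0, 2, 3]]: a quad becomes two triangles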
video3d/render/regularizer.py
ADDED
@@ -0,0 +1,93 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch
import nvdiffrast.torch as dr

from . import util
from . import mesh

######################################################################################
# Computes the image gradient, useful for kd/ks smoothness losses
######################################################################################
def image_grad(buf, std=0.01):
    t, s = torch.meshgrid(torch.linspace(-1.0 + 1.0 / buf.shape[1], 1.0 - 1.0 / buf.shape[1], buf.shape[1], device="cuda"),
                          torch.linspace(-1.0 + 1.0 / buf.shape[2], 1.0 - 1.0 / buf.shape[2], buf.shape[2], device="cuda"),
                          indexing='ij')
    tc = torch.normal(mean=0, std=std, size=(buf.shape[0], buf.shape[1], buf.shape[2], 2), device="cuda") + torch.stack((s, t), dim=-1)[None, ...]
    tap = dr.texture(buf, tc, filter_mode='linear', boundary_mode='clamp')
    return torch.abs(tap[..., :-1] - buf[..., :-1]) * tap[..., -1:] * buf[..., -1:]

######################################################################################
# Computes the average edge length of a mesh.
# Rough estimate of the tessellation of a mesh. Can be used e.g. to clamp gradients
######################################################################################
def avg_edge_length(v_pos, t_pos_idx):
    e_pos_idx = mesh.compute_edges(t_pos_idx)
    edge_len = util.length(v_pos[:, e_pos_idx[:, 0]] - v_pos[:, e_pos_idx[:, 1]])
    return torch.mean(edge_len)

######################################################################################
# Laplacian regularization using umbrella operator (Fujiwara / Desbrun).
# https://mgarland.org/class/geom04/material/smoothing.pdf
######################################################################################
def laplace_regularizer_const(v_pos, t_pos_idx):
    batch_size = v_pos.shape[0]

    term = torch.zeros_like(v_pos)
    norm = torch.zeros_like(v_pos[..., 0:1])

    v0 = v_pos[:, t_pos_idx[0, :, 0], :]
    v1 = v_pos[:, t_pos_idx[0, :, 1], :]
    v2 = v_pos[:, t_pos_idx[0, :, 2], :]

    term.scatter_add_(1, t_pos_idx[..., 0:1].repeat(batch_size, 1, 3), (v1 - v0) + (v2 - v0))
    term.scatter_add_(1, t_pos_idx[..., 1:2].repeat(batch_size, 1, 3), (v0 - v1) + (v2 - v1))
    term.scatter_add_(1, t_pos_idx[..., 2:3].repeat(batch_size, 1, 3), (v0 - v2) + (v1 - v2))

    two = torch.ones_like(v0) * 2.0
    # norm.scatter_add_(1, t_pos_idx[..., 0:1].repeat(batch_size, 1, 3), two)
    # norm.scatter_add_(1, t_pos_idx[..., 1:2].repeat(batch_size, 1, 3), two)
    # norm.scatter_add_(1, t_pos_idx[..., 2:3].repeat(batch_size, 1, 3), two)
    norm.scatter_add_(1, t_pos_idx[..., 0:1].repeat(batch_size, 1, 1), two)
    norm.scatter_add_(1, t_pos_idx[..., 1:2].repeat(batch_size, 1, 1), two)
    norm.scatter_add_(1, t_pos_idx[..., 2:3].repeat(batch_size, 1, 1), two)

    term = term / torch.clamp(norm, min=1.0)

    return torch.mean(term ** 2)

######################################################################################
# Smooth vertex normals
######################################################################################
def normal_consistency(v_pos, t_pos_idx):
    # Compute face normals
    v0 = v_pos[:, t_pos_idx[0, :, 0]]
    v1 = v_pos[:, t_pos_idx[0, :, 1]]
    v2 = v_pos[:, t_pos_idx[0, :, 2]]

    face_normals = util.safe_normalize(torch.cross(v1 - v0, v2 - v0, dim=-1))

    tris_per_edge = mesh.compute_edge_to_face_mapping(t_pos_idx)

    # Fetch normals for both faces sharing an edge
    n0 = face_normals[:, tris_per_edge[:, 0], :]
    n1 = face_normals[:, tris_per_edge[:, 1], :]

    # Compute error metric based on normal difference
    term = torch.clamp(util.dot(n0, n1), min=-1.0, max=1.0)
    term = (1.0 - term) * 0.5

    return torch.mean(torch.abs(term))


def get_edge_length(v_pos, t_pos_idx):
    e_pos_idx = mesh.compute_edges(t_pos_idx)
    edge_len = util.length(v_pos[:, e_pos_idx[:, 0]] - v_pos[:, e_pos_idx[:, 1]])
    return edge_len
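A standalone sketch (plain torch, mirroring `laplace_regularizer_const` above rather than importing the repo) that makes the expected shapes concrete: batched vertices of shape (B, V, 3) and faces of shape (1, F, 3), shared across the batch. The toy tetrahedron is invented for illustration.

import torch

v_pos = torch.tensor([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]).expand(2, 4, 3).contiguous()
t_pos_idx = torch.tensor([[[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]]], dtype=torch.int64)

B = v_pos.shape[0]
term = torch.zeros_like(v_pos)            # sum of neighbor differences per vertex
norm = torch.zeros_like(v_pos[..., 0:1])  # per-vertex neighbor count (2 per incident face)
v0, v1, v2 = (v_pos[:, t_pos_idx[0, :, k], :] for k in range(3))
term.scatter_add_(1, t_pos_idx[..., 0:1].repeat(B, 1, 3), (v1 - v0) + (v2 - v0))
term.scatter_add_(1, t_pos_idx[..., 1:2].repeat(B, 1, 3), (v0 - v1) + (v2 - v1))
term.scatter_add_(1, t_pos_idx[..., 2:3].repeat(B, 1, 3), (v0 - v2) + (v1 - v2))
two = torch.ones_like(v0) * 2.0
for k in range(3):
    norm.scatter_add_(1, t_pos_idx[..., k:k+1].repeat(B, 1, 1), two)
loss = torch.mean((term / torch.clamp(norm, min=1.0)) ** 2)
print(loss)  # umbrella-operator Laplacian loss on a toy tetrahedron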
video3d/render/render.py
ADDED
@@ -0,0 +1,369 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch
import nvdiffrast.torch as dr

from . import util
from . import renderutils as ru
from . import light

# ==============================================================================================
#  Helper functions
# ==============================================================================================
def interpolate(attr, rast, attr_idx, rast_db=None):
    return dr.interpolate(attr.contiguous(), rast, attr_idx, rast_db=rast_db, diff_attrs=None if rast_db is None else 'all')

# ==============================================================================================
#  pixel shader
# ==============================================================================================
def shade(
        gb_pos,
        gb_geometric_normal,
        gb_normal,
        gb_tangent,
        gb_tex_pos,
        gb_texc,
        gb_texc_deriv,
        w2c,
        view_pos,
        lgt,
        material,
        bsdf,
        feat,
        two_sided_shading,
        delta_xy_interp=None,
        dino_pred=None,
        class_vector=None,
        im_features_map=None,
        mvp=None
    ):

    ################################################################################
    # Texture lookups
    ################################################################################
    perturbed_nrm = None
    # Combined texture, used for MLPs because lookups are expensive
    # all_tex_jitter = material.sample(gb_tex_pos + torch.normal(mean=0, std=0.01, size=gb_tex_pos.shape, device="cuda"), feat=feat)
    if material is not None:
        if im_features_map is None:
            all_tex = material.sample(gb_tex_pos, feat=feat)
        else:
            all_tex = material.sample(gb_tex_pos, feat=feat, feat_map=im_features_map, mvp=mvp, w2c=w2c, deform_xyz=gb_pos)
    else:
        all_tex = torch.ones(*gb_pos.shape[:-1], 9, device=gb_pos.device)
    kd, ks, perturbed_nrm = all_tex[..., :3], all_tex[..., 3:6], all_tex[..., 6:9]

    # Compute albedo (kd) gradient, used for material regularizer
    # kd_grad = torch.sum(torch.abs(all_tex_jitter[..., :-6] - all_tex[..., :-6]), dim=-1, keepdim=True) /

    if dino_pred is not None and class_vector is None:
        # DOR: predict the dino value from x, y, z; we would concatenate the label vector.
        # Trained together, with the generated image as supervision for the one-hot vector.
        dino_feat_im_pred = dino_pred.sample(gb_tex_pos)
        # dino_feat_im_pred = dino_pred.sample(gb_tex_pos.detach())
    if dino_pred is not None and class_vector is not None:
        dino_feat_im_pred = dino_pred.sample(gb_tex_pos, feat=class_vector)

    # else:
    #     kd_jitter = material['kd'].sample(gb_texc + torch.normal(mean=0, std=0.005, size=gb_texc.shape, device="cuda"), gb_texc_deriv)
    #     kd = material['kd'].sample(gb_texc, gb_texc_deriv)
    #     ks = material['ks'].sample(gb_texc, gb_texc_deriv)[..., 0:3] # skip alpha
    #     if 'normal' in material:
    #         perturbed_nrm = material['normal'].sample(gb_texc, gb_texc_deriv)
    #     kd_grad = torch.sum(torch.abs(kd_jitter[..., 0:3] - kd[..., 0:3]), dim=-1, keepdim=True) / 3

    # Separate kd into alpha and color, default alpha = 1
    # alpha = kd[..., 3:4] if kd.shape[-1] == 4 else torch.ones_like(kd[..., 0:1])
    # kd = kd[..., 0:3]
    alpha = torch.ones_like(kd[..., 0:1])

    ################################################################################
    # Normal perturbation & normal bend
    ################################################################################
    if material is None or not material.perturb_normal:
        perturbed_nrm = None

    gb_normal = ru.prepare_shading_normal(gb_pos, view_pos, perturbed_nrm, gb_normal, gb_tangent, gb_geometric_normal, two_sided_shading=two_sided_shading, opengl=True, use_python=True)

    # if two_sided_shading:
    #     view_vec = util.safe_normalize(view_pos - gb_pos, -1)
    #     gb_normal = torch.where(torch.sum(gb_geometric_normal * view_vec, -1, keepdim=True) > 0, gb_geometric_normal, -gb_geometric_normal)
    # else:
    #     gb_normal = gb_geometric_normal

    b, h, w, _ = gb_normal.shape
    cam_normal = util.safe_normalize(torch.matmul(gb_normal.view(b, -1, 3), w2c[:,:3,:3].transpose(2,1))).view(b, h, w, 3)

    ################################################################################
    # Evaluate BSDF
    ################################################################################

    assert bsdf is not None or material.bsdf is not None, "Material must specify a BSDF type"
    bsdf = bsdf if bsdf is not None else material.bsdf
    shading = None
    if bsdf == 'pbr':
        if isinstance(lgt, light.EnvironmentLight):
            shaded_col = lgt.shade(gb_pos, gb_normal, kd, ks, view_pos, specular=True)
        else:
            assert False, "Invalid light type"
    elif bsdf == 'diffuse':
        if lgt is None:
            shaded_col = kd
        elif isinstance(lgt, light.EnvironmentLight):
            shaded_col = lgt.shade(gb_pos, gb_normal, kd, ks, view_pos, specular=False)
        # elif isinstance(lgt, light.DirectionalLight):
        #     shaded_col, shading = lgt.shade(feat, kd, cam_normal)
        # else:
        #     assert False, "Invalid light type"
        else:
            shaded_col, shading = lgt.shade(feat, kd, cam_normal)
    elif bsdf == 'normal':
        shaded_col = (gb_normal + 1.0) * 0.5
    elif bsdf == 'geo_normal':
        shaded_col = (gb_geometric_normal + 1.0) * 0.5
    elif bsdf == 'tangent':
        shaded_col = (gb_tangent + 1.0) * 0.5
    elif bsdf == 'kd':
        shaded_col = kd
    elif bsdf == 'ks':
        shaded_col = ks
    else:
        assert False, "Invalid BSDF '%s'" % bsdf

    # Return multiple buffers
    buffers = {
        'kd'     : torch.cat((kd, alpha), dim=-1),
        'shaded' : torch.cat((shaded_col, alpha), dim=-1),
        # 'kd_grad'   : torch.cat((kd_grad, alpha), dim=-1),
        # 'occlusion' : torch.cat((ks[..., :1], alpha), dim=-1),
    }

    if dino_pred is not None:
        buffers['dino_feat_im_pred'] = torch.cat((dino_feat_im_pred, alpha), dim=-1)

    if delta_xy_interp is not None:
        buffers['flow'] = torch.cat((delta_xy_interp, alpha), dim=-1)

    if shading is not None:
        buffers['shading'] = torch.cat((shading, alpha), dim=-1)

    return buffers

# ==============================================================================================
#  Render a depth slice of the mesh (scene), some limitations:
#  - Single light
#  - Single material
# ==============================================================================================
def render_layer(
        rast,
        rast_deriv,
        mesh,
        w2c,
        view_pos,
        material,
        lgt,
        resolution,
        spp,
        msaa,
        bsdf,
        feat,
        prior_mesh,
        two_sided_shading,
        render_flow,
        delta_xy=None,
        dino_pred=None,
        class_vector=None,
        im_features_map=None,
        mvp=None
    ):

    full_res = [resolution[0]*spp, resolution[1]*spp]

    if prior_mesh is None:
        prior_mesh = mesh

    ################################################################################
    # Rasterize
    ################################################################################

    # Scale down to shading resolution when MSAA is enabled, otherwise shade at full resolution
    if spp > 1 and msaa:
        rast_out_s = util.scale_img_nhwc(rast, resolution, mag='nearest', min='nearest')
        rast_out_deriv_s = util.scale_img_nhwc(rast_deriv, resolution, mag='nearest', min='nearest') * spp
    else:
        rast_out_s = rast
        rast_out_deriv_s = rast_deriv

    if render_flow:
        delta_xy_interp, _ = interpolate(delta_xy, rast_out_s, mesh.t_pos_idx[0].int())
    else:
        delta_xy_interp = None

    ################################################################################
    # Interpolate attributes
    ################################################################################

    # Interpolate world space position
    gb_pos, _ = interpolate(mesh.v_pos, rast_out_s, mesh.t_pos_idx[0].int())

    # Compute geometric normals. We need those because of bent normals trick (for bump mapping)
    v0 = mesh.v_pos[:, mesh.t_pos_idx[0, :, 0], :]
    v1 = mesh.v_pos[:, mesh.t_pos_idx[0, :, 1], :]
    v2 = mesh.v_pos[:, mesh.t_pos_idx[0, :, 2], :]
    face_normals = util.safe_normalize(torch.cross(v1 - v0, v2 - v0, dim=-1))
    num_faces = face_normals.shape[1]
    face_normal_indices = (torch.arange(0, num_faces, dtype=torch.int64, device='cuda')[:, None]).repeat(1, 3)
    gb_geometric_normal, _ = interpolate(face_normals, rast_out_s, face_normal_indices.int())

    # Compute tangent space
    assert mesh.v_nrm is not None and mesh.v_tng is not None
    gb_normal, _ = interpolate(mesh.v_nrm, rast_out_s, mesh.t_nrm_idx[0].int())
    gb_tangent, _ = interpolate(mesh.v_tng, rast_out_s, mesh.t_tng_idx[0].int()) # Interpolate tangents

    # Texture coordinate
    assert mesh.v_tex is not None
    gb_texc, gb_texc_deriv = interpolate(mesh.v_tex, rast_out_s, mesh.t_tex_idx[0].int(), rast_db=rast_out_deriv_s)

    ################################################################################
    # Shade
    ################################################################################

    gb_tex_pos, _ = interpolate(prior_mesh.v_pos, rast_out_s, mesh.t_pos_idx[0].int())
    buffers = shade(gb_pos, gb_geometric_normal, gb_normal, gb_tangent, gb_tex_pos, gb_texc, gb_texc_deriv, w2c, view_pos, lgt, material, bsdf, feat=feat, two_sided_shading=two_sided_shading, delta_xy_interp=delta_xy_interp, dino_pred=dino_pred, class_vector=class_vector, im_features_map=im_features_map, mvp=mvp)

    ################################################################################
    # Prepare output
    ################################################################################

    # Scale back up to visibility resolution if using MSAA
    if spp > 1 and msaa:
        for key in buffers.keys():
            buffers[key] = util.scale_img_nhwc(buffers[key], full_res, mag='nearest', min='nearest')

    # Return buffers
    return buffers

# ==============================================================================================
#  Render a depth peeled mesh (scene), some limitations:
#  - Single light
#  - Single material
# ==============================================================================================
def render_mesh(
        ctx,
        mesh,
        mtx_in,
        w2c,
        view_pos,
        material,
        lgt,
        resolution,
        spp               = 1,
        num_layers        = 1,
        msaa              = False,
        background        = None,
        bsdf              = None,
        feat              = None,
        prior_mesh        = None,
        two_sided_shading = True,
        render_flow       = False,
        dino_pred         = None,
        class_vector      = None,
        num_frames        = None,
        im_features_map   = None
    ):

    def prepare_input_vector(x):
        x = torch.tensor(x, dtype=torch.float32, device='cuda') if not torch.is_tensor(x) else x
        return x[:, None, None, :] if len(x.shape) == 2 else x

    def composite_buffer(key, layers, background, antialias):
        accum = background
        for buffers, rast in reversed(layers):
            alpha = (rast[..., -1:] > 0).float() * buffers[key][..., -1:]
            accum = torch.lerp(accum, torch.cat((buffers[key][..., :-1], torch.ones_like(buffers[key][..., -1:])), dim=-1), alpha)
            if antialias:
                accum = dr.antialias(accum.contiguous(), rast, v_pos_clip, mesh.t_pos_idx[0].int())
        return accum

    assert mesh.t_pos_idx.shape[1] > 0, "Got empty training triangle mesh (unrecoverable discontinuity)"
    assert background is None or (background.shape[1] == resolution[0] and background.shape[2] == resolution[1])

    full_res = [resolution[0] * spp, resolution[1] * spp]

    # Convert numpy arrays to torch tensors
    mtx_in = torch.tensor(mtx_in, dtype=torch.float32, device='cuda') if not torch.is_tensor(mtx_in) else mtx_in
    view_pos = prepare_input_vector(view_pos)  # Shape: (B, 1, 1, 3)

    # clip space transform
    v_pos_clip = ru.xfm_points(mesh.v_pos, mtx_in, use_python=True)

    # render flow
    if render_flow:
        v_pos_clip2 = v_pos_clip[..., :2] / v_pos_clip[..., -1:]
        v_pos_clip2 = v_pos_clip2.view(-1, num_frames, *v_pos_clip2.shape[1:])
        delta_xy = v_pos_clip2[:, 1:] - v_pos_clip2[:, :-1]
        delta_xy = torch.cat([delta_xy, torch.zeros_like(delta_xy[:, :1])], dim=1)
        delta_xy = delta_xy.view(-1, *delta_xy.shape[2:])
    else:
        delta_xy = None

    # Render all layers front-to-back
    layers = []
    with dr.DepthPeeler(ctx, v_pos_clip, mesh.t_pos_idx[0].int(), full_res) as peeler:
        for _ in range(num_layers):
            rast, db = peeler.rasterize_next_layer()
            rendered = render_layer(rast, db, mesh, w2c, view_pos, material, lgt, resolution, spp, msaa, bsdf, feat=feat, prior_mesh=prior_mesh, two_sided_shading=two_sided_shading, render_flow=render_flow, delta_xy=delta_xy, dino_pred=dino_pred, class_vector=class_vector, im_features_map=im_features_map, mvp=mtx_in)
            layers += [(rendered, rast)]

    # Setup background
    if background is not None:
        if spp > 1:
            background = util.scale_img_nhwc(background, full_res, mag='nearest', min='nearest')
        background = torch.cat((background, torch.zeros_like(background[..., 0:1])), dim=-1)
    else:
        background = torch.zeros(1, full_res[0], full_res[1], 4, dtype=torch.float32, device='cuda')

    # Composite layers front-to-back
    out_buffers = {}
    for key in layers[0][0].keys():
        antialias = key in ['shaded', 'dino_feat_im_pred', 'flow']
        bg = background if key in ['shaded'] else torch.zeros_like(layers[0][0][key])
        accum = composite_buffer(key, layers, bg, antialias)

        # Downscale to framebuffer resolution. Use avg pooling
        out_buffers[key] = util.avg_pool_nhwc(accum, spp) if spp > 1 else accum

    return out_buffers

# ==============================================================================================
#  Render UVs
# ==============================================================================================
def render_uv(ctx, mesh, resolution, mlp_texture, feat=None, prior_shape=None):

    # clip space transform
    uv_clip = mesh.v_tex * 2.0 - 1.0

    # pad to four component coordinate
    uv_clip4 = torch.cat((uv_clip, torch.zeros_like(uv_clip[...,0:1]), torch.ones_like(uv_clip[...,0:1])), dim = -1)

    # rasterize
    rast, _ = dr.rasterize(ctx, uv_clip4, mesh.t_tex_idx[0].int(), resolution)

    # Interpolate world space position
    if prior_shape is not None:
        gb_pos, _ = interpolate(prior_shape.v_pos, rast, mesh.t_pos_idx[0].int())
    else:
        gb_pos, _ = interpolate(mesh.v_pos, rast, mesh.t_pos_idx[0].int())

    # Sample out textures from MLP
    all_tex = mlp_texture.sample(gb_pos, feat=feat)
    assert all_tex.shape[-1] == 9 or all_tex.shape[-1] == 10, "Combined kd_ks_normal must be 9 or 10 channels"
    perturbed_nrm = all_tex[..., -3:]
    return (rast[..., -1:] > 0).float(), all_tex[..., :-6], all_tex[..., -6:-3], util.safe_normalize(perturbed_nrm)
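A small self-contained sketch of the front-to-back compositing rule used by `composite_buffer` above: iterate the depth-peeled layers back to front and `torch.lerp` each one over the accumulator using its alpha. The two constant-color "layers" here are made up for illustration; the real code gets them from nvdiffrast's depth peeler.

import torch

H = W = 4
background = torch.zeros(1, H, W, 4)
# Two fake depth-peeled rgb+alpha buffers, nearest layer first.
layers = [torch.cat([torch.full((1, H, W, 3), c), torch.full((1, H, W, 1), a)], dim=-1)
          for c, a in [(0.9, 1.0), (0.2, 1.0)]]

accum = background
for buf in reversed(layers):  # composite back-to-front
    alpha = buf[..., -1:]
    rgb_and_one = torch.cat([buf[..., :-1], torch.ones_like(alpha)], dim=-1)
    accum = torch.lerp(accum, rgb_and_one, alpha)

print(accum[0, 0, 0])  # the front layer wins where its alpha is 1 -> [0.9, 0.9, 0.9, 1.0]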
video3d/render/renderutils/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

from .ops import xfm_points, xfm_vectors, image_loss, diffuse_cubemap, specular_cubemap, prepare_shading_normal, lambert, frostbite_diffuse, pbr_specular, pbr_bsdf, _fresnel_shlick, _ndf_ggx, _lambda_ggx, _masking_smith
__all__ = ["xfm_vectors", "xfm_points", "image_loss", "diffuse_cubemap", "specular_cubemap", "prepare_shading_normal", "lambert", "frostbite_diffuse", "pbr_specular", "pbr_bsdf", "_fresnel_shlick", "_ndf_ggx", "_lambda_ggx", "_masking_smith", ]
video3d/render/renderutils/bsdf.py
ADDED
@@ -0,0 +1,151 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import math
import torch

NORMAL_THRESHOLD = 0.1

################################################################################
# Vector utility functions
################################################################################

def _dot(x, y):
    return torch.sum(x*y, -1, keepdim=True)

def _reflect(x, n):
    return 2*_dot(x, n)*n - x

def _safe_normalize(x):
    return torch.nn.functional.normalize(x, dim = -1)

def _bend_normal(view_vec, smooth_nrm, geom_nrm, two_sided_shading):
    # Swap normal direction for backfacing surfaces
    if two_sided_shading:
        smooth_nrm = torch.where(_dot(geom_nrm, view_vec) > 0, smooth_nrm, -smooth_nrm)
        geom_nrm = torch.where(_dot(geom_nrm, view_vec) > 0, geom_nrm, -geom_nrm)

    t = torch.clamp(_dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD, min=0, max=1)
    return torch.lerp(geom_nrm, smooth_nrm, t)


def _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl):
    smooth_bitang = _safe_normalize(torch.cross(smooth_tng, smooth_nrm, dim=-1))
    if opengl:
        shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] - smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
    else:
        shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] + smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
    return _safe_normalize(shading_nrm)

def bsdf_prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl):
    smooth_nrm = _safe_normalize(smooth_nrm)
    smooth_tng = _safe_normalize(smooth_tng)
    view_vec = _safe_normalize(view_pos - pos)
    shading_nrm = _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl)
    return _bend_normal(view_vec, shading_nrm, geom_nrm, two_sided_shading)

################################################################################
# Simple lambertian diffuse BSDF
################################################################################

def bsdf_lambert(nrm, wi):
    return torch.clamp(_dot(nrm, wi), min=0.0) / math.pi

################################################################################
# Frostbite diffuse
################################################################################

def bsdf_frostbite(nrm, wi, wo, linearRoughness):
    wiDotN = _dot(wi, nrm)
    woDotN = _dot(wo, nrm)

    h = _safe_normalize(wo + wi)
    wiDotH = _dot(wi, h)

    energyBias = 0.5 * linearRoughness
    energyFactor = 1.0 - (0.51 / 1.51) * linearRoughness
    f90 = energyBias + 2.0 * wiDotH * wiDotH * linearRoughness
    f0 = 1.0

    wiScatter = bsdf_fresnel_shlick(f0, f90, wiDotN)
    woScatter = bsdf_fresnel_shlick(f0, f90, woDotN)
    res = wiScatter * woScatter * energyFactor
    return torch.where((wiDotN > 0.0) & (woDotN > 0.0), res, torch.zeros_like(res))

################################################################################
# Phong specular, loosely based on mitsuba implementation
################################################################################

def bsdf_phong(nrm, wo, wi, N):
    dp_r = torch.clamp(_dot(_reflect(wo, nrm), wi), min=0.0, max=1.0)
    dp_l = torch.clamp(_dot(nrm, wi), min=0.0, max=1.0)
    return (dp_r ** N) * dp_l * (N + 2) / (2 * math.pi)

################################################################################
# PBR's implementation of GGX specular
################################################################################

specular_epsilon = 1e-4

def bsdf_fresnel_shlick(f0, f90, cosTheta):
    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
    return f0 + (f90 - f0) * (1.0 - _cosTheta) ** 5.0

def bsdf_ndf_ggx(alphaSqr, cosTheta):
    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
    d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1
    return alphaSqr / (d * d * math.pi)

def bsdf_lambda_ggx(alphaSqr, cosTheta):
    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
    cosThetaSqr = _cosTheta * _cosTheta
    tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr
    res = 0.5 * (torch.sqrt(1 + alphaSqr * tanThetaSqr) - 1.0)
    return res

def bsdf_masking_smith_ggx_correlated(alphaSqr, cosThetaI, cosThetaO):
    lambdaI = bsdf_lambda_ggx(alphaSqr, cosThetaI)
    lambdaO = bsdf_lambda_ggx(alphaSqr, cosThetaO)
    return 1 / (1 + lambdaI + lambdaO)

def bsdf_pbr_specular(col, nrm, wo, wi, alpha, min_roughness=0.08):
    _alpha = torch.clamp(alpha, min=min_roughness*min_roughness, max=1.0)
    alphaSqr = _alpha * _alpha

    h = _safe_normalize(wo + wi)
    woDotN = _dot(wo, nrm)
    wiDotN = _dot(wi, nrm)
    woDotH = _dot(wo, h)
    nDotH = _dot(nrm, h)

    D = bsdf_ndf_ggx(alphaSqr, nDotH)
    G = bsdf_masking_smith_ggx_correlated(alphaSqr, woDotN, wiDotN)
    F = bsdf_fresnel_shlick(col, 1, woDotH)

    w = F * D * G * 0.25 / torch.clamp(woDotN, min=specular_epsilon)

    frontfacing = (woDotN > specular_epsilon) & (wiDotN > specular_epsilon)
    return torch.where(frontfacing, w, torch.zeros_like(w))

def bsdf_pbr(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF):
    wo = _safe_normalize(view_pos - pos)
    wi = _safe_normalize(light_pos - pos)

    spec_str = arm[..., 0:1]   # x component
    roughness = arm[..., 1:2]  # y component
    metallic = arm[..., 2:3]   # z component
    ks = (0.04 * (1.0 - metallic) + kd * metallic) * (1 - spec_str)
    kd = kd * (1.0 - metallic)

    if BSDF == 0:
        diffuse = kd * bsdf_lambert(nrm, wi)
    else:
        diffuse = kd * bsdf_frostbite(nrm, wi, wo, roughness)
    specular = bsdf_pbr_specular(ks, nrm, wo, wi, roughness*roughness, min_roughness=min_roughness)
    return diffuse + specular
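A quick numeric sanity check of the Lambert lobe above. It is standalone (re-stating the two-line `bsdf_lambert` math in plain torch rather than importing the package): at 60 degrees incidence the lobe evaluates to cos(60deg)/pi.

import math
import torch

nrm = torch.tensor([[0.0, 0.0, 1.0]])
wi = torch.tensor([[0.0, math.sin(math.pi / 3), math.cos(math.pi / 3)]])  # 60 deg off normal

# Mirrors bsdf_lambert: clamp(dot(nrm, wi), 0) / pi
lambert = torch.clamp((nrm * wi).sum(-1, keepdim=True), min=0.0) / math.pi
print(lambert)  # ~0.1592
assert torch.allclose(lambert, torch.tensor([[0.5 / math.pi]]))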
video3d/render/renderutils/c_src/bsdf.cu
ADDED
@@ -0,0 +1,710 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include "common.h"
#include "bsdf.h"

#define SPECULAR_EPSILON 1e-4f

//------------------------------------------------------------------------
// Lambert functions

__device__ inline float fwdLambert(const vec3f nrm, const vec3f wi)
{
    return max(dot(nrm, wi) / M_PI, 0.0f);
}

__device__ inline void bwdLambert(const vec3f nrm, const vec3f wi, vec3f& d_nrm, vec3f& d_wi, const float d_out)
{
    if (dot(nrm, wi) > 0.0f)
        bwdDot(nrm, wi, d_nrm, d_wi, d_out / M_PI);
}

//------------------------------------------------------------------------
// Fresnel Schlick

__device__ inline float fwdFresnelSchlick(const float f0, const float f90, const float cosTheta)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float scale = powf(1.0f - _cosTheta, 5.0f);
    return f0 * (1.0f - scale) + f90 * scale;
}

__device__ inline void bwdFresnelSchlick(const float f0, const float f90, const float cosTheta, float& d_f0, float& d_f90, float& d_cosTheta, const float d_out)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
    d_f0 += d_out * (1.0 - scale);
    d_f90 += d_out * scale;
    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f);
    }
}

__device__ inline vec3f fwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float scale = powf(1.0f - _cosTheta, 5.0f);
    return f0 * (1.0f - scale) + f90 * scale;
}

__device__ inline void bwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta, vec3f& d_f0, vec3f& d_f90, float& d_cosTheta, const vec3f d_out)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
    d_f0 += d_out * (1.0 - scale);
    d_f90 += d_out * scale;
    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += sum(d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f));
    }
}

//------------------------------------------------------------------------
// Frostbite diffuse

__device__ inline float fwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness)
{
    float wiDotN = dot(wi, nrm);
    float woDotN = dot(wo, nrm);
    if (wiDotN > 0.0f && woDotN > 0.0f)
    {
        vec3f h = safeNormalize(wo + wi);
        float wiDotH = dot(wi, h);

        float energyBias = 0.5f * linearRoughness;
        float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
        float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
        float f0 = 1.f;

        float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
        float woScatter = fwdFresnelSchlick(f0, f90, woDotN);

        return wiScatter * woScatter * energyFactor;
    }
    else return 0.0f;
}

__device__ inline void bwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness, vec3f& d_nrm, vec3f& d_wi, vec3f& d_wo, float &d_linearRoughness, const float d_out)
{
    float wiDotN = dot(wi, nrm);
    float woDotN = dot(wo, nrm);

    if (wiDotN > 0.0f && woDotN > 0.0f)
    {
        vec3f h = safeNormalize(wo + wi);
        float wiDotH = dot(wi, h);

        float energyBias = 0.5f * linearRoughness;
        float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
        float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
        float f0 = 1.f;

        float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
        float woScatter = fwdFresnelSchlick(f0, f90, woDotN);

        // -------------- BWD --------------
        // Backprop: return wiScatter * woScatter * energyFactor;
        float d_wiScatter = d_out * woScatter * energyFactor;
        float d_woScatter = d_out * wiScatter * energyFactor;
        float d_energyFactor = d_out * wiScatter * woScatter;

        // Backprop: float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
        float d_woDotN = 0.0f, d_f0 = 0.0, d_f90 = 0.0f;
        bwdFresnelSchlick(f0, f90, woDotN, d_f0, d_f90, d_woDotN, d_woScatter);

        // Backprop: float wiScatter = fwdFresnelSchlick(fd0, fd90, wiDotN);
        float d_wiDotN = 0.0f;
        bwdFresnelSchlick(f0, f90, wiDotN, d_f0, d_f90, d_wiDotN, d_wiScatter);

        // Backprop: float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
        float d_energyBias = d_f90;
        float d_wiDotH = d_f90 * 4 * wiDotH * linearRoughness;
        d_linearRoughness += d_f90 * 2 * wiDotH * wiDotH;

        // Backprop: float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
        d_linearRoughness -= (0.51f / 1.51f) * d_energyFactor;

        // Backprop: float energyBias = 0.5f * linearRoughness;
        d_linearRoughness += 0.5 * d_energyBias;

        // Backprop: float wiDotH = dot(wi, h);
        vec3f d_h(0);
        bwdDot(wi, h, d_wi, d_h, d_wiDotH);

        // Backprop: vec3f h = safeNormalize(wo + wi);
        vec3f d_wo_wi(0);
        bwdSafeNormalize(wo + wi, d_wo_wi, d_h);
        d_wi += d_wo_wi; d_wo += d_wo_wi;

        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
    }
}

//------------------------------------------------------------------------
// Ndf GGX

__device__ inline float fwdNdfGGX(const float alphaSqr, const float cosTheta)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
    return alphaSqr / (d * d * M_PI);
}

__device__ inline void bwdNdfGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
{
    // Torch only back propagates if clamp doesn't trigger
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float cosThetaSqr = _cosTheta * _cosTheta;
    d_alphaSqr += d_out * (1.0f - (alphaSqr + 1.0f) * cosThetaSqr) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += d_out * -(4.0f * (alphaSqr - 1.0f) * alphaSqr * cosTheta) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
    }
}

//------------------------------------------------------------------------
// Lambda GGX

__device__ inline float fwdLambdaGGX(const float alphaSqr, const float cosTheta)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float cosThetaSqr = _cosTheta * _cosTheta;
    float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
    float res = 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);
    return res;
}

__device__ inline void bwdLambdaGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
{
    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    float cosThetaSqr = _cosTheta * _cosTheta;
    float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
    float res = 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);

    d_alphaSqr += d_out * (0.25 * tanThetaSqr) / sqrtf(alphaSqr * tanThetaSqr + 1.0f);
    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
        d_cosTheta += d_out * -(0.5 * alphaSqr) / (powf(_cosTheta, 3.0f) * sqrtf(alphaSqr / cosThetaSqr - alphaSqr + 1.0f));
}

//------------------------------------------------------------------------
// Masking GGX

__device__ inline float fwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO)
{
    float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
    float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);
    return 1.0f / (1.0f + lambdaI + lambdaO);
}

__device__ inline void bwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO, float& d_alphaSqr, float& d_cosThetaI, float& d_cosThetaO, const float d_out)
{
    // FWD eval
    float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
    float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);

    // BWD eval
    float d_lambdaIO = -d_out / powf(1.0f + lambdaI + lambdaO, 2.0f);
    bwdLambdaGGX(alphaSqr, cosThetaI, d_alphaSqr, d_cosThetaI, d_lambdaIO);
    bwdLambdaGGX(alphaSqr, cosThetaO, d_alphaSqr, d_cosThetaO, d_lambdaIO);
}

//------------------------------------------------------------------------
// GGX specular

__device__ vec3f fwdPbrSpecular(const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness)
{
    float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
    float alphaSqr = _alpha * _alpha;

    vec3f h = safeNormalize(wo + wi);
    float woDotN = dot(wo, nrm);
    float wiDotN = dot(wi, nrm);
    float woDotH = dot(wo, h);
    float nDotH = dot(nrm, h);

    float D = fwdNdfGGX(alphaSqr, nDotH);
    float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
    vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
    vec3f w = F * D * G * 0.25 / woDotN;

    bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
    return frontfacing ? w : 0.0f;
}

__device__ void bwdPbrSpecular(
    const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness,
    vec3f& d_col, vec3f& d_nrm, vec3f& d_wo, vec3f& d_wi, float& d_alpha, const vec3f d_out)
{
    ///////////////////////////////////////////////////////////////////////
    // FWD eval

    float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
    float alphaSqr = _alpha * _alpha;

    vec3f h = safeNormalize(wo + wi);
    float woDotN = dot(wo, nrm);
    float wiDotN = dot(wi, nrm);
    float woDotH = dot(wo, h);
    float nDotH = dot(nrm, h);

    float D = fwdNdfGGX(alphaSqr, nDotH);
    float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
    vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
    vec3f w = F * D * G * 0.25 / woDotN;
    bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);

    if (frontfacing)
    {
        ///////////////////////////////////////////////////////////////////////
        // BWD eval

        vec3f d_F = d_out * D * G * 0.25f / woDotN;
        float d_D = sum(d_out * F * G * 0.25f / woDotN);
        float d_G = sum(d_out * F * D * 0.25f / woDotN);

        float d_woDotN = -sum(d_out * F * D * G * 0.25f / (woDotN * woDotN));

        vec3f d_f90(0);
        float d_woDotH(0), d_wiDotN(0), d_nDotH(0), d_alphaSqr(0);
        bwdFresnelSchlick(col, 1.0f, woDotH, d_col, d_f90, d_woDotH, d_F);
        bwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN, d_alphaSqr, d_woDotN, d_wiDotN, d_G);
        bwdNdfGGX(alphaSqr, nDotH, d_alphaSqr, d_nDotH, d_D);

        vec3f d_h(0);
        bwdDot(nrm, h, d_nrm, d_h, d_nDotH);
        bwdDot(wo, h, d_wo, d_h, d_woDotH);
        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);

        vec3f d_h_unnorm(0);
        bwdSafeNormalize(wo + wi, d_h_unnorm, d_h);
        d_wo += d_h_unnorm;
        d_wi += d_h_unnorm;

        if (alpha > min_roughness * min_roughness)
            d_alpha += d_alphaSqr * 2 * alpha;
    }
}

//------------------------------------------------------------------------
// Full PBR BSDF

__device__ vec3f fwdPbrBSDF(const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF)
{
    vec3f wo = safeNormalize(view_pos - pos);
    vec3f wi = safeNormalize(light_pos - pos);

    float alpha = arm.y * arm.y;
    vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
    vec3f diff_col = kd * (1.0f - arm.z);

    float diff = 0.0f;
    if (BSDF == 0)
        diff = fwdLambert(nrm, wi);
    else
        diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);
    vec3f diffuse = diff_col * diff;
    vec3f specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness);

    return diffuse + specular;
}

__device__ void bwdPbrBSDF(
    const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF,
    vec3f& d_kd, vec3f& d_arm, vec3f& d_pos, vec3f& d_nrm, vec3f& d_view_pos, vec3f& d_light_pos, const vec3f d_out)
{
    ////////////////////////////////////////////////////////////////////////
    // FWD
    vec3f _wi = light_pos - pos;
    vec3f _wo = view_pos - pos;
    vec3f wi = safeNormalize(_wi);
    vec3f wo = safeNormalize(_wo);

    float alpha = arm.y * arm.y;
    vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
    vec3f diff_col = kd * (1.0f - arm.z);
    float diff = 0.0f;
    if (BSDF == 0)
        diff = fwdLambert(nrm, wi);
    else
        diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);

    ////////////////////////////////////////////////////////////////////////
    // BWD

    float d_alpha(0);
    vec3f d_spec_col(0), d_wi(0), d_wo(0);
    bwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness, d_spec_col, d_nrm, d_wo, d_wi, d_alpha, d_out);

    float d_diff = sum(diff_col * d_out);
    if (BSDF == 0)
        bwdLambert(nrm, wi, d_nrm, d_wi, d_diff);
    else
        bwdFrostbiteDiffuse(nrm, wi, wo, arm.y, d_nrm, d_wi, d_wo, d_arm.y, d_diff);

    // Backprop: diff_col = kd * (1.0f - arm.z)
    vec3f d_diff_col = d_out * diff;
    d_kd += d_diff_col * (1.0f - arm.z);
    d_arm.z -= sum(d_diff_col * kd);

    // Backprop: spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x)
    d_kd -= d_spec_col * (arm.x - 1.0f) * arm.z;
    d_arm.x += sum(d_spec_col * (arm.z * (0.04f - kd) - 0.04f));
    d_arm.z -= sum(d_spec_col * (kd - 0.04f) * (arm.x - 1.0f));

    // Backprop: alpha = arm.y * arm.y
    d_arm.y += d_alpha * 2 * arm.y;

    // Backprop: vec3f wi = safeNormalize(light_pos - pos);
    vec3f d__wi(0);
    bwdSafeNormalize(_wi, d__wi, d_wi);
    d_light_pos += d__wi;
    d_pos -= d__wi;

    // Backprop: vec3f wo = safeNormalize(view_pos - pos);
    vec3f d__wo(0);
    bwdSafeNormalize(_wo, d__wo, d_wo);
    d_view_pos += d__wo;
    d_pos -= d__wo;
}

//------------------------------------------------------------------------
// Kernels

__global__ void LambertFwdKernel(LambertKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f nrm = p.nrm.fetch3(px, py, pz);
    vec3f wi = p.wi.fetch3(px, py, pz);

    float res = fwdLambert(nrm, wi);

    p.out.store(px, py, pz, res);
}

__global__ void LambertBwdKernel(LambertKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f nrm = p.nrm.fetch3(px, py, pz);
    vec3f wi = p.wi.fetch3(px, py, pz);
    float d_out = p.out.fetch1(px, py, pz);

    vec3f d_nrm(0), d_wi(0);
    bwdLambert(nrm, wi, d_nrm, d_wi, d_out);

    p.nrm.store_grad(px, py, pz, d_nrm);
    p.wi.store_grad(px, py, pz, d_wi);
}

__global__ void FrostbiteDiffuseFwdKernel(FrostbiteDiffuseKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f nrm = p.nrm.fetch3(px, py, pz);
    vec3f wi = p.wi.fetch3(px, py, pz);
    vec3f wo = p.wo.fetch3(px, py, pz);
    float linearRoughness = p.linearRoughness.fetch1(px, py, pz);

    float res = fwdFrostbiteDiffuse(nrm, wi, wo, linearRoughness);

    p.out.store(px, py, pz, res);
}

__global__ void FrostbiteDiffuseBwdKernel(FrostbiteDiffuseKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f nrm = p.nrm.fetch3(px, py, pz);
    vec3f wi = p.wi.fetch3(px, py, pz);
    vec3f wo = p.wo.fetch3(px, py, pz);
    float linearRoughness = p.linearRoughness.fetch1(px, py, pz);
    float d_out = p.out.fetch1(px, py, pz);

    float d_linearRoughness = 0.0f;
    vec3f d_nrm(0), d_wi(0), d_wo(0);
    bwdFrostbiteDiffuse(nrm, wi, wo, linearRoughness, d_nrm, d_wi, d_wo, d_linearRoughness, d_out);

    p.nrm.store_grad(px, py, pz, d_nrm);
    p.wi.store_grad(px, py, pz, d_wi);
    p.wo.store_grad(px, py, pz, d_wo);
    p.linearRoughness.store_grad(px, py, pz, d_linearRoughness);
}

__global__ void FresnelShlickFwdKernel(FresnelShlickKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f f0 = p.f0.fetch3(px, py, pz);
    vec3f f90 = p.f90.fetch3(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);

    vec3f res = fwdFresnelSchlick(f0, f90, cosTheta);
    p.out.store(px, py, pz, res);
}

__global__ void FresnelShlickBwdKernel(FresnelShlickKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f f0 = p.f0.fetch3(px, py, pz);
    vec3f f90 = p.f90.fetch3(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);
    vec3f d_out = p.out.fetch3(px, py, pz);

    vec3f d_f0(0), d_f90(0);
    float d_cosTheta(0);
    bwdFresnelSchlick(f0, f90, cosTheta, d_f0, d_f90, d_cosTheta, d_out);

    p.f0.store_grad(px, py, pz, d_f0);
    p.f90.store_grad(px, py, pz, d_f90);
    p.cosTheta.store_grad(px, py, pz, d_cosTheta);
}

__global__ void ndfGGXFwdKernel(NdfGGXParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);
    float res = fwdNdfGGX(alphaSqr, cosTheta);

    p.out.store(px, py, pz, res);
}

__global__ void ndfGGXBwdKernel(NdfGGXParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);
    float d_out = p.out.fetch1(px, py, pz);

    float d_alphaSqr(0), d_cosTheta(0);
    bwdNdfGGX(alphaSqr, cosTheta, d_alphaSqr, d_cosTheta, d_out);

    p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
    p.cosTheta.store_grad(px, py, pz, d_cosTheta);
}

__global__ void lambdaGGXFwdKernel(NdfGGXParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);
    float res = fwdLambdaGGX(alphaSqr, cosTheta);

    p.out.store(px, py, pz, res);
}

__global__ void lambdaGGXBwdKernel(NdfGGXParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
    float cosTheta = p.cosTheta.fetch1(px, py, pz);
    float d_out = p.out.fetch1(px, py, pz);

    float d_alphaSqr(0), d_cosTheta(0);
    bwdLambdaGGX(alphaSqr, cosTheta, d_alphaSqr, d_cosTheta, d_out);

    p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
    p.cosTheta.store_grad(px, py, pz, d_cosTheta);
}

__global__ void maskingSmithFwdKernel(MaskingSmithParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
    float cosThetaI = p.cosThetaI.fetch1(px, py, pz);
    float cosThetaO = p.cosThetaO.fetch1(px, py, pz);
|
590 |
+
float res = fwdMaskingSmithGGXCorrelated(alphaSqr, cosThetaI, cosThetaO);
|
591 |
+
|
592 |
+
p.out.store(px, py, pz, res);
|
593 |
+
}
|
594 |
+
|
595 |
+
__global__ void maskingSmithBwdKernel(MaskingSmithParams p)
|
596 |
+
{
|
597 |
+
// Calculate pixel position.
|
598 |
+
unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
|
599 |
+
unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
|
600 |
+
unsigned int pz = blockIdx.z;
|
601 |
+
if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
|
602 |
+
return;
|
603 |
+
|
604 |
+
float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
|
605 |
+
float cosThetaI = p.cosThetaI.fetch1(px, py, pz);
|
606 |
+
float cosThetaO = p.cosThetaO.fetch1(px, py, pz);
|
607 |
+
float d_out = p.out.fetch1(px, py, pz);
|
608 |
+
|
609 |
+
float d_alphaSqr(0), d_cosThetaI(0), d_cosThetaO(0);
|
610 |
+
bwdMaskingSmithGGXCorrelated(alphaSqr, cosThetaI, cosThetaO, d_alphaSqr, d_cosThetaI, d_cosThetaO, d_out);
|
611 |
+
|
612 |
+
p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
|
613 |
+
p.cosThetaI.store_grad(px, py, pz, d_cosThetaI);
|
614 |
+
p.cosThetaO.store_grad(px, py, pz, d_cosThetaO);
|
615 |
+
}
|
616 |
+
|
617 |
+
__global__ void pbrSpecularFwdKernel(PbrSpecular p)
|
618 |
+
{
|
619 |
+
// Calculate pixel position.
|
620 |
+
unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
|
621 |
+
unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
|
622 |
+
unsigned int pz = blockIdx.z;
|
623 |
+
if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
|
624 |
+
return;
|
625 |
+
|
626 |
+
vec3f col = p.col.fetch3(px, py, pz);
|
627 |
+
vec3f nrm = p.nrm.fetch3(px, py, pz);
|
628 |
+
vec3f wo = p.wo.fetch3(px, py, pz);
|
629 |
+
vec3f wi = p.wi.fetch3(px, py, pz);
|
630 |
+
float alpha = p.alpha.fetch1(px, py, pz);
|
631 |
+
|
632 |
+
vec3f res = fwdPbrSpecular(col, nrm, wo, wi, alpha, p.min_roughness);
|
633 |
+
|
634 |
+
p.out.store(px, py, pz, res);
|
635 |
+
}
|
636 |
+
|
637 |
+
__global__ void pbrSpecularBwdKernel(PbrSpecular p)
|
638 |
+
{
|
639 |
+
// Calculate pixel position.
|
640 |
+
unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
|
641 |
+
unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
|
642 |
+
unsigned int pz = blockIdx.z;
|
643 |
+
if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
|
644 |
+
return;
|
645 |
+
|
646 |
+
vec3f col = p.col.fetch3(px, py, pz);
|
647 |
+
vec3f nrm = p.nrm.fetch3(px, py, pz);
|
648 |
+
vec3f wo = p.wo.fetch3(px, py, pz);
|
649 |
+
vec3f wi = p.wi.fetch3(px, py, pz);
|
650 |
+
float alpha = p.alpha.fetch1(px, py, pz);
|
651 |
+
vec3f d_out = p.out.fetch3(px, py, pz);
|
652 |
+
|
653 |
+
float d_alpha(0);
|
654 |
+
vec3f d_col(0), d_nrm(0), d_wo(0), d_wi(0);
|
655 |
+
bwdPbrSpecular(col, nrm, wo, wi, alpha, p.min_roughness, d_col, d_nrm, d_wo, d_wi, d_alpha, d_out);
|
656 |
+
|
657 |
+
p.col.store_grad(px, py, pz, d_col);
|
658 |
+
p.nrm.store_grad(px, py, pz, d_nrm);
|
659 |
+
p.wo.store_grad(px, py, pz, d_wo);
|
660 |
+
p.wi.store_grad(px, py, pz, d_wi);
|
661 |
+
p.alpha.store_grad(px, py, pz, d_alpha);
|
662 |
+
}
|
663 |
+
|
664 |
+
__global__ void pbrBSDFFwdKernel(PbrBSDF p)
|
665 |
+
{
|
666 |
+
// Calculate pixel position.
|
667 |
+
unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
|
668 |
+
unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
|
669 |
+
unsigned int pz = blockIdx.z;
|
670 |
+
if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
|
671 |
+
return;
|
672 |
+
|
673 |
+
vec3f kd = p.kd.fetch3(px, py, pz);
|
674 |
+
vec3f arm = p.arm.fetch3(px, py, pz);
|
675 |
+
vec3f pos = p.pos.fetch3(px, py, pz);
|
676 |
+
vec3f nrm = p.nrm.fetch3(px, py, pz);
|
677 |
+
vec3f view_pos = p.view_pos.fetch3(px, py, pz);
|
678 |
+
vec3f light_pos = p.light_pos.fetch3(px, py, pz);
|
679 |
+
|
680 |
+
vec3f res = fwdPbrBSDF(kd, arm, pos, nrm, view_pos, light_pos, p.min_roughness, p.BSDF);
|
681 |
+
|
682 |
+
p.out.store(px, py, pz, res);
|
683 |
+
}
|
684 |
+
__global__ void pbrBSDFBwdKernel(PbrBSDF p)
|
685 |
+
{
|
686 |
+
// Calculate pixel position.
|
687 |
+
unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
|
688 |
+
unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
|
689 |
+
unsigned int pz = blockIdx.z;
|
690 |
+
if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
|
691 |
+
return;
|
692 |
+
|
693 |
+
vec3f kd = p.kd.fetch3(px, py, pz);
|
694 |
+
vec3f arm = p.arm.fetch3(px, py, pz);
|
695 |
+
vec3f pos = p.pos.fetch3(px, py, pz);
|
696 |
+
vec3f nrm = p.nrm.fetch3(px, py, pz);
|
697 |
+
vec3f view_pos = p.view_pos.fetch3(px, py, pz);
|
698 |
+
vec3f light_pos = p.light_pos.fetch3(px, py, pz);
|
699 |
+
vec3f d_out = p.out.fetch3(px, py, pz);
|
700 |
+
|
701 |
+
vec3f d_kd(0), d_arm(0), d_pos(0), d_nrm(0), d_view_pos(0), d_light_pos(0);
|
702 |
+
bwdPbrBSDF(kd, arm, pos, nrm, view_pos, light_pos, p.min_roughness, p.BSDF, d_kd, d_arm, d_pos, d_nrm, d_view_pos, d_light_pos, d_out);
|
703 |
+
|
704 |
+
p.kd.store_grad(px, py, pz, d_kd);
|
705 |
+
p.arm.store_grad(px, py, pz, d_arm);
|
706 |
+
p.pos.store_grad(px, py, pz, d_pos);
|
707 |
+
p.nrm.store_grad(px, py, pz, d_nrm);
|
708 |
+
p.view_pos.store_grad(px, py, pz, d_view_pos);
|
709 |
+
p.light_pos.store_grad(px, py, pz, d_light_pos);
|
710 |
+
}
|
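The fwd/bwd helpers these kernels dispatch to implement the standard Schlick Fresnel approximation and the GGX normal-distribution function; restated here for reference in the notation of the parameters above (textbook material, not extra code from this repo):

\[ F_{\mathrm{Schlick}}(f_0, f_{90}, \cos\theta) = f_0 + (f_{90} - f_0)\,(1 - \cos\theta)^5 \]
\[ D_{\mathrm{GGX}}(\alpha^2, \cos\theta_h) = \frac{\alpha^2}{\pi \left( \cos^2\theta_h \,(\alpha^2 - 1) + 1 \right)^2} \]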
video3d/render/renderutils/c_src/bsdf.h
ADDED
@@ -0,0 +1,84 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

#include "common.h"

struct LambertKernelParams
{
    Tensor nrm;
    Tensor wi;
    Tensor out;
    dim3 gridSize;
};

struct FrostbiteDiffuseKernelParams
{
    Tensor nrm;
    Tensor wi;
    Tensor wo;
    Tensor linearRoughness;
    Tensor out;
    dim3 gridSize;
};

struct FresnelShlickKernelParams
{
    Tensor f0;
    Tensor f90;
    Tensor cosTheta;
    Tensor out;
    dim3 gridSize;
};

struct NdfGGXParams
{
    Tensor alphaSqr;
    Tensor cosTheta;
    Tensor out;
    dim3 gridSize;
};

struct MaskingSmithParams
{
    Tensor alphaSqr;
    Tensor cosThetaI;
    Tensor cosThetaO;
    Tensor out;
    dim3 gridSize;
};

struct PbrSpecular
{
    Tensor col;
    Tensor nrm;
    Tensor wo;
    Tensor wi;
    Tensor alpha;
    Tensor out;
    dim3 gridSize;
    float min_roughness;
};

struct PbrBSDF
{
    Tensor kd;
    Tensor arm;
    Tensor pos;
    Tensor nrm;
    Tensor view_pos;
    Tensor light_pos;
    Tensor out;
    dim3 gridSize;
    float min_roughness;
    int BSDF;
};
video3d/render/renderutils/c_src/common.cpp
ADDED
@@ -0,0 +1,74 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <cuda_runtime.h>
#include <algorithm>

//------------------------------------------------------------------------
// Block and grid size calculators for kernel launches.

dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims)
{
    int maxThreads = maxWidth * maxHeight;
    if (maxThreads <= 1 || (dims.x * dims.y) <= 1)
        return dim3(1, 1, 1); // Degenerate.

    // Start from max size.
    int bw = maxWidth;
    int bh = maxHeight;

    // Optimizations for weirdly sized buffers.
    if (dims.x < bw)
    {
        // Decrease block width to smallest power of two that covers the buffer width.
        while ((bw >> 1) >= dims.x)
            bw >>= 1;

        // Maximize height.
        bh = maxThreads / bw;
        if (bh > dims.y)
            bh = dims.y;
    }
    else if (dims.y < bh)
    {
        // Halve height and double width until it fits completely inside the buffer vertically.
        while (bh > dims.y)
        {
            bh >>= 1;
            if (bw < dims.x)
                bw <<= 1;
        }
    }

    // Done.
    return dim3(bw, bh, 1);
}

// Returns the size of a block that can be reduced using horizontal SIMD operations (e.g. __shfl_xor_sync).
dim3 getWarpSize(dim3 blockSize)
{
    return dim3(
        std::min(blockSize.x, 32u),
        std::min(std::max(32u / blockSize.x, 1u), std::min(32u, blockSize.y)),
        std::min(std::max(32u / (blockSize.x * blockSize.y), 1u), std::min(32u, blockSize.z))
    );
}

dim3 getLaunchGridSize(dim3 blockSize, dim3 dims)
{
    dim3 gridSize;
    gridSize.x = (dims.x - 1) / blockSize.x + 1;
    gridSize.y = (dims.y - 1) / blockSize.y + 1;
    gridSize.z = (dims.z - 1) / blockSize.z + 1;
    return gridSize;
}

//------------------------------------------------------------------------
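A minimal sketch of how these helpers combine at a launch site; `MyKernel`, `params`, and the 8x8 thread budget are illustrative placeholders, not names from this repo:

// Hypothetical host-side launch using the helpers above.
dim3 dims(width, height, batch);                     // problem extent per axis
dim3 blockSize = getLaunchBlockSize(8, 8, dims);     // at most 8*8 = 64 threads, shrunk to fit small buffers
dim3 gridSize  = getLaunchGridSize(blockSize, dims); // ceil-divide so the grid covers dims
MyKernel<<<gridSize, blockSize>>>(params);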
video3d/render/renderutils/c_src/common.h
ADDED
@@ -0,0 +1,41 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once
#include <cuda.h>
#include <stdint.h>

#include "vec3f.h"
#include "vec4f.h"
#include "tensor.h"

dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims);
dim3 getLaunchGridSize(dim3 blockSize, dim3 dims);

#ifdef __CUDACC__

#ifdef _MSC_VER
#define M_PI 3.14159265358979323846f
#endif

__host__ __device__ static inline dim3 getWarpSize(dim3 blockSize)
{
    return dim3(
        min(blockSize.x, 32u),
        min(max(32u / blockSize.x, 1u), min(32u, blockSize.y)),
        min(max(32u / (blockSize.x * blockSize.y), 1u), min(32u, blockSize.z))
    );
}

__device__ static inline float clamp(float val, float mn, float mx) { return min(max(val, mn), mx); }
#else
dim3 getWarpSize(dim3 blockSize);
#endif
video3d/render/renderutils/c_src/cubemap.cu
ADDED
@@ -0,0 +1,350 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include "common.h"
#include "cubemap.h"
#include <float.h>

// https://cgvr.cs.uni-bremen.de/teaching/cg_literatur/Spherical,%20Cubic,%20and%20Parabolic%20Environment%20Mappings.pdf
__device__ float pixel_area(int x, int y, int N)
{
    if (N > 1)
    {
        int H = N / 2;
        x = abs(x - H);
        y = abs(y - H);
        float dx = atan((float)(x + 1) / (float)H) - atan((float)x / (float)H);
        float dy = atan((float)(y + 1) / (float)H) - atan((float)y / (float)H);
        return dx * dy;
    }
    else
        return 1;
}

__device__ vec3f cube_to_dir(int x, int y, int side, int N)
{
    float fx = 2.0f * (((float)x + 0.5f) / (float)N) - 1.0f;
    float fy = 2.0f * (((float)y + 0.5f) / (float)N) - 1.0f;
    switch (side)
    {
        case 0: return safeNormalize(vec3f(1, -fy, -fx));
        case 1: return safeNormalize(vec3f(-1, -fy, fx));
        case 2: return safeNormalize(vec3f(fx, 1, fy));
        case 3: return safeNormalize(vec3f(fx, -1, -fy));
        case 4: return safeNormalize(vec3f(fx, -fy, 1));
        case 5: return safeNormalize(vec3f(-fx, -fy, -1));
    }
    return vec3f(0, 0, 0); // Unreachable
}

__device__ vec3f dir_to_side(int side, vec3f v)
{
    switch (side)
    {
        case 0: return vec3f(-v.z, -v.y, v.x);
        case 1: return vec3f( v.z, -v.y, -v.x);
        case 2: return vec3f( v.x,  v.z,  v.y);
        case 3: return vec3f( v.x, -v.z, -v.y);
        case 4: return vec3f( v.x, -v.y,  v.z);
        case 5: return vec3f(-v.x, -v.y, -v.z);
    }
    return vec3f(0, 0, 0); // Unreachable
}

__device__ void extents_1d(float x, float z, float theta, float& _min, float& _max)
{
    float l = sqrtf(x * x + z * z);
    float pxr = x + z * tan(theta) * l, pzr = z - x * tan(theta) * l;
    float pxl = x - z * tan(theta) * l, pzl = z + x * tan(theta) * l;
    if (pzl <= 0.00001f)
        _min = pxl > 0.0f ? FLT_MAX : -FLT_MAX;
    else
        _min = pxl / pzl;
    if (pzr <= 0.00001f)
        _max = pxr > 0.0f ? FLT_MAX : -FLT_MAX;
    else
        _max = pxr / pzr;
}

__device__ void dir_extents(int side, int N, vec3f v, float theta, int& _xmin, int& _xmax, int& _ymin, int& _ymax)
{
    vec3f c = dir_to_side(side, v); // remap to (x,y,z) where side is at z = 1

    if (theta < 0.785398f) // PI/4
    {
        float xmin, xmax, ymin, ymax;
        extents_1d(c.x, c.z, theta, xmin, xmax);
        extents_1d(c.y, c.z, theta, ymin, ymax);

        if (xmin > 1.0f || xmax < -1.0f || ymin > 1.0f || ymax < -1.0f)
        {
            _xmin = -1; _xmax = -1; _ymin = -1; _ymax = -1; // Bad aabb
        }
        else
        {
            _xmin = (int)min(max((xmin + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
            _xmax = (int)min(max((xmax + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
            _ymin = (int)min(max((ymin + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
            _ymax = (int)min(max((ymax + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
        }
    }
    else
    {
        // Cone covers the whole face.
        _xmin = 0;
        _xmax = N - 1;
        _ymin = 0;
        _ymax = N - 1;
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Diffuse kernel
__global__ void DiffuseCubemapFwdKernel(DiffuseCubemapKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.cubemap.dims[1];
    vec3f N = cube_to_dir(px, py, pz, Npx);

    vec3f col(0);

    for (int s = 0; s < p.cubemap.dims[0]; ++s)
    {
        for (int y = 0; y < Npx; ++y)
        {
            for (int x = 0; x < Npx; ++x)
            {
                vec3f L = cube_to_dir(x, y, s, Npx);
                float costheta = min(max(dot(N, L), 0.0f), 0.999f);
                float w = costheta * pixel_area(x, y, Npx) / 3.141592f; // pi = area of positive hemisphere
                col += p.cubemap.fetch3(x, y, s) * w;
            }
        }
    }

    p.out.store(px, py, pz, col);
}

__global__ void DiffuseCubemapBwdKernel(DiffuseCubemapKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.cubemap.dims[1];
    vec3f N = cube_to_dir(px, py, pz, Npx);
    vec3f grad = p.out.fetch3(px, py, pz);

    for (int s = 0; s < p.cubemap.dims[0]; ++s)
    {
        for (int y = 0; y < Npx; ++y)
        {
            for (int x = 0; x < Npx; ++x)
            {
                vec3f L = cube_to_dir(x, y, s, Npx);
                float costheta = min(max(dot(N, L), 0.0f), 0.999f);
                float w = costheta * pixel_area(x, y, Npx) / 3.141592f; // pi = area of positive hemisphere
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////
// GGX splitsum kernel

__device__ inline float ndfGGX(const float alphaSqr, const float cosTheta)
{
    float _cosTheta = clamp(cosTheta, 0.0f, 1.0f);
    float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
    return alphaSqr / (d * d * M_PI);
}

__global__ void SpecularBoundsKernel(SpecularBoundsKernelParams p)
{
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.gridSize.x;
    vec3f VNR = cube_to_dir(px, py, pz, Npx);

    const int TILE_SIZE = 16;

    // Brute force entire cubemap and compute bounds for the cone
    for (int s = 0; s < p.gridSize.z; ++s)
    {
        // Assume empty BBox
        int _min_x = p.gridSize.x - 1, _max_x = 0;
        int _min_y = p.gridSize.y - 1, _max_y = 0;

        // For each (16x16) tile
        for (int tx = 0; tx < (p.gridSize.x + TILE_SIZE - 1) / TILE_SIZE; tx++)
        {
            for (int ty = 0; ty < (p.gridSize.y + TILE_SIZE - 1) / TILE_SIZE; ty++)
            {
                // Compute tile extents
                int tsx = tx * TILE_SIZE, tsy = ty * TILE_SIZE;
                int tex = min((tx + 1) * TILE_SIZE, p.gridSize.x), tey = min((ty + 1) * TILE_SIZE, p.gridSize.y);

                // Use some blunt interval arithmetics to cull tiles
                vec3f L0 = cube_to_dir(tsx, tsy, s, Npx), L1 = cube_to_dir(tex, tsy, s, Npx);
                vec3f L2 = cube_to_dir(tsx, tey, s, Npx), L3 = cube_to_dir(tex, tey, s, Npx);

                float minx = min(min(L0.x, L1.x), min(L2.x, L3.x)), maxx = max(max(L0.x, L1.x), max(L2.x, L3.x));
                float miny = min(min(L0.y, L1.y), min(L2.y, L3.y)), maxy = max(max(L0.y, L1.y), max(L2.y, L3.y));
                float minz = min(min(L0.z, L1.z), min(L2.z, L3.z)), maxz = max(max(L0.z, L1.z), max(L2.z, L3.z));

                float maxdp = max(minx * VNR.x, maxx * VNR.x) + max(miny * VNR.y, maxy * VNR.y) + max(minz * VNR.z, maxz * VNR.z);
                if (maxdp >= p.costheta_cutoff)
                {
                    // Test all pixels in tile.
                    for (int y = tsy; y < tey; ++y)
                    {
                        for (int x = tsx; x < tex; ++x)
                        {
                            vec3f L = cube_to_dir(x, y, s, Npx);
                            if (dot(L, VNR) >= p.costheta_cutoff)
                            {
                                _min_x = min(_min_x, x);
                                _max_x = max(_max_x, x);
                                _min_y = min(_min_y, y);
                                _max_y = max(_max_y, y);
                            }
                        }
                    }
                }
            }
        }
        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 0), _min_x);
        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 1), _max_x);
        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 2), _min_y);
        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 3), _max_y);
    }
}

__global__ void SpecularCubemapFwdKernel(SpecularCubemapKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.cubemap.dims[1];
    vec3f VNR = cube_to_dir(px, py, pz, Npx);

    float alpha = p.roughness * p.roughness;
    float alphaSqr = alpha * alpha;

    float wsum = 0.0f;
    vec3f col(0);
    for (int s = 0; s < p.cubemap.dims[0]; ++s)
    {
        int xmin, xmax, ymin, ymax;
        xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
        xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
        ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
        ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));

        if (xmin <= xmax)
        {
            for (int y = ymin; y <= ymax; ++y)
            {
                for (int x = xmin; x <= xmax; ++x)
                {
                    vec3f L = cube_to_dir(x, y, s, Npx);
                    if (dot(L, VNR) >= p.costheta_cutoff)
                    {
                        vec3f H = safeNormalize(L + VNR);

                        float wiDotN = max(dot(L, VNR), 0.0f);
                        float VNRDotH = max(dot(VNR, H), 0.0f);

                        float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;
                        col += p.cubemap.fetch3(x, y, s) * w;
                        wsum += w;
                    }
                }
            }
        }
    }

    p.out.store(p.out._nhwcIndex(pz, py, px, 0), col.x);
    p.out.store(p.out._nhwcIndex(pz, py, px, 1), col.y);
    p.out.store(p.out._nhwcIndex(pz, py, px, 2), col.z);
    p.out.store(p.out._nhwcIndex(pz, py, px, 3), wsum);
}

__global__ void SpecularCubemapBwdKernel(SpecularCubemapKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.cubemap.dims[1];
    vec3f VNR = cube_to_dir(px, py, pz, Npx);

    vec3f grad = p.out.fetch3(px, py, pz);

    float alpha = p.roughness * p.roughness;
    float alphaSqr = alpha * alpha;

    vec3f col(0);
    for (int s = 0; s < p.cubemap.dims[0]; ++s)
    {
        int xmin, xmax, ymin, ymax;
        xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
        xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
        ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
        ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));

        if (xmin <= xmax)
        {
            for (int y = ymin; y <= ymax; ++y)
            {
                for (int x = xmin; x <= xmax; ++x)
                {
                    vec3f L = cube_to_dir(x, y, s, Npx);
                    if (dot(L, VNR) >= p.costheta_cutoff)
                    {
                        vec3f H = safeNormalize(L + VNR);

                        float wiDotN = max(dot(L, VNR), 0.0f);
                        float VNRDotH = max(dot(VNR, H), 0.0f);

                        float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;

                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
                    }
                }
            }
        }
    }
}
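In equation form, the two forward convolutions above compute the following, with Δω(x, y) the per-texel solid angle from pixel_area, r = VNR the view/normal/reflection direction, and h the half vector:

\[ \text{Diffuse:}\quad E(n) = \sum_{\text{texels}} L(l)\,\max(n \cdot l, 0)\,\frac{\Delta\omega}{\pi} \]
\[ \text{Specular:}\quad \text{col} = \sum_{l \cdot r \,\ge\, \cos\theta_{\text{cutoff}}} L(l)\, w(l), \qquad w(l) = (l \cdot r)\, D_{\mathrm{GGX}}(\alpha^2, r \cdot h)\,\frac{\Delta\omega}{4} \]

The running weight sum Σw is stored in the output alpha channel, presumably so the caller can normalize the prefiltered color.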
video3d/render/renderutils/c_src/cubemap.h
ADDED
@@ -0,0 +1,38 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

#include "common.h"

struct DiffuseCubemapKernelParams
{
    Tensor cubemap;
    Tensor out;
    dim3 gridSize;
};

struct SpecularCubemapKernelParams
{
    Tensor cubemap;
    Tensor bounds;
    Tensor out;
    dim3 gridSize;
    float costheta_cutoff;
    float roughness;
};

struct SpecularBoundsKernelParams
{
    float costheta_cutoff;
    Tensor out;
    dim3 gridSize;
};
video3d/render/renderutils/c_src/loss.cu
ADDED
@@ -0,0 +1,210 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <cuda.h>

#include "common.h"
#include "loss.h"

//------------------------------------------------------------------------
// Utils

__device__ inline float bwdAbs(float x) { return x == 0.0f ? 0.0f : x < 0.0f ? -1.0f : 1.0f; }

__device__ float warpSum(float val) {
    for (int i = 1; i < 32; i *= 2)
        val += __shfl_xor_sync(0xFFFFFFFF, val, i);
    return val;
}

//------------------------------------------------------------------------
// Tonemapping

__device__ inline float fwdSRGB(float x)
{
    return x > 0.0031308f ? powf(max(x, 0.0031308f), 1.0f / 2.4f) * 1.055f - 0.055f : 12.92f * max(x, 0.0f);
}

__device__ inline void bwdSRGB(float x, float &d_x, float d_out)
{
    if (x > 0.0031308f)
        d_x += d_out * 0.439583f / powf(x, 0.583333f);
    else if (x > 0.0f)
        d_x += d_out * 12.92f;
}

__device__ inline vec3f fwdTonemapLogSRGB(vec3f x)
{
    return vec3f(fwdSRGB(logf(x.x + 1.0f)), fwdSRGB(logf(x.y + 1.0f)), fwdSRGB(logf(x.z + 1.0f)));
}

__device__ inline void bwdTonemapLogSRGB(vec3f x, vec3f& d_x, vec3f d_out)
{
    if (x.x > 0.0f && x.x < 65535.0f)
    {
        bwdSRGB(logf(x.x + 1.0f), d_x.x, d_out.x);
        d_x.x *= 1 / (x.x + 1.0f);
    }
    if (x.y > 0.0f && x.y < 65535.0f)
    {
        bwdSRGB(logf(x.y + 1.0f), d_x.y, d_out.y);
        d_x.y *= 1 / (x.y + 1.0f);
    }
    if (x.z > 0.0f && x.z < 65535.0f)
    {
        bwdSRGB(logf(x.z + 1.0f), d_x.z, d_out.z);
        d_x.z *= 1 / (x.z + 1.0f);
    }
}

__device__ inline float fwdRELMSE(float img, float target, float eps = 0.1f)
{
    return (img - target) * (img - target) / (img * img + target * target + eps);
}

__device__ inline void bwdRELMSE(float img, float target, float &d_img, float &d_target, float d_out, float eps = 0.1f)
{
    float denom = (target * target + img * img + eps);
    d_img += d_out * 2 * (img - target) * (target * (target + img) + eps) / (denom * denom);
    d_target -= d_out * 2 * (img - target) * (img * (target + img) + eps) / (denom * denom);
}

__device__ inline float fwdSMAPE(float img, float target, float eps = 0.01f)
{
    return abs(img - target) / (img + target + eps);
}

__device__ inline void bwdSMAPE(float img, float target, float& d_img, float& d_target, float d_out, float eps = 0.01f)
{
    float denom = (target + img + eps);
    d_img += d_out * bwdAbs(img - target) * (2 * target + eps) / (denom * denom);
    d_target -= d_out * bwdAbs(img - target) * (2 * img + eps) / (denom * denom);
}

//------------------------------------------------------------------------
// Kernels

__global__ void imgLossFwdKernel(LossKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;

    float floss = 0.0f;
    if (px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z)
    {
        vec3f img = p.img.fetch3(px, py, pz);
        vec3f target = p.target.fetch3(px, py, pz);

        img = vec3f(clamp(img.x, 0.0f, 65535.0f), clamp(img.y, 0.0f, 65535.0f), clamp(img.z, 0.0f, 65535.0f));
        target = vec3f(clamp(target.x, 0.0f, 65535.0f), clamp(target.y, 0.0f, 65535.0f), clamp(target.z, 0.0f, 65535.0f));

        if (p.tonemapper == TONEMAPPER_LOG_SRGB)
        {
            img = fwdTonemapLogSRGB(img);
            target = fwdTonemapLogSRGB(target);
        }

        vec3f vloss(0);
        if (p.loss == LOSS_MSE)
            vloss = (img - target) * (img - target);
        else if (p.loss == LOSS_RELMSE)
            vloss = vec3f(fwdRELMSE(img.x, target.x), fwdRELMSE(img.y, target.y), fwdRELMSE(img.z, target.z));
        else if (p.loss == LOSS_SMAPE)
            vloss = vec3f(fwdSMAPE(img.x, target.x), fwdSMAPE(img.y, target.y), fwdSMAPE(img.z, target.z));
        else
            vloss = vec3f(abs(img.x - target.x), abs(img.y - target.y), abs(img.z - target.z));

        floss = sum(vloss) / 3.0f;
    }

    floss = warpSum(floss);

    dim3 warpSize = getWarpSize(blockDim);
    if (px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z && threadIdx.x % warpSize.x == 0 && threadIdx.y % warpSize.y == 0 && threadIdx.z % warpSize.z == 0)
        p.out.store(px / warpSize.x, py / warpSize.y, pz / warpSize.z, floss);
}

__global__ void imgLossBwdKernel(LossKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;

    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    dim3 warpSize = getWarpSize(blockDim);

    vec3f _img = p.img.fetch3(px, py, pz);
    vec3f _target = p.target.fetch3(px, py, pz);
    float d_out = p.out.fetch1(px / warpSize.x, py / warpSize.y, pz / warpSize.z);

    /////////////////////////////////////////////////////////////////////
    // FWD

    vec3f img = _img, target = _target;
    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
    {
        img = fwdTonemapLogSRGB(img);
        target = fwdTonemapLogSRGB(target);
    }

    /////////////////////////////////////////////////////////////////////
    // BWD

    vec3f d_vloss = vec3f(d_out, d_out, d_out) / 3.0f;

    vec3f d_img(0), d_target(0);
    if (p.loss == LOSS_MSE)
    {
        d_img = vec3f(d_vloss.x * 2 * (img.x - target.x), d_vloss.y * 2 * (img.y - target.y), d_vloss.z * 2 * (img.z - target.z));
        d_target = -d_img;
    }
    else if (p.loss == LOSS_RELMSE)
    {
        bwdRELMSE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
        bwdRELMSE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
        bwdRELMSE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
    }
    else if (p.loss == LOSS_SMAPE)
    {
        bwdSMAPE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
        bwdSMAPE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
        bwdSMAPE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
    }
    else
    {
        d_img = d_vloss * vec3f(bwdAbs(img.x - target.x), bwdAbs(img.y - target.y), bwdAbs(img.z - target.z));
        d_target = -d_img;
    }

    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
    {
        vec3f d__img(0), d__target(0);
        bwdTonemapLogSRGB(_img, d__img, d_img);
        bwdTonemapLogSRGB(_target, d__target, d_target);
        d_img = d__img; d_target = d__target;
    }

    if (_img.x <= 0.0f || _img.x >= 65535.0f) d_img.x = 0;
    if (_img.y <= 0.0f || _img.y >= 65535.0f) d_img.y = 0;
    if (_img.z <= 0.0f || _img.z >= 65535.0f) d_img.z = 0;
    if (_target.x <= 0.0f || _target.x >= 65535.0f) d_target.x = 0;
    if (_target.y <= 0.0f || _target.y >= 65535.0f) d_target.y = 0;
    if (_target.z <= 0.0f || _target.z >= 65535.0f) d_target.z = 0;

    p.img.store_grad(px, py, pz, d_img);
    p.target.store_grad(px, py, pz, d_target);
}
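The two non-L1/MSE losses above, written out (ε defaults to 0.1 and 0.01 respectively; the per-pixel values are averaged over the three channels and then warp-summed into a downsampled output):

\[ \mathrm{relMSE}(I, T) = \frac{(I - T)^2}{I^2 + T^2 + \epsilon}, \qquad \mathrm{SMAPE}(I, T) = \frac{|I - T|}{I + T + \epsilon} \]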
video3d/render/renderutils/c_src/loss.h
ADDED
@@ -0,0 +1,38 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

#include "common.h"

enum TonemapperType
{
    TONEMAPPER_NONE = 0,
    TONEMAPPER_LOG_SRGB = 1
};

enum LossType
{
    LOSS_L1 = 0,
    LOSS_MSE = 1,
    LOSS_RELMSE = 2,
    LOSS_SMAPE = 3
};

struct LossKernelParams
{
    Tensor img;
    Tensor target;
    Tensor out;
    dim3 gridSize;
    TonemapperType tonemapper;
    LossType loss;
};
video3d/render/renderutils/c_src/mesh.cu
ADDED
@@ -0,0 +1,94 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <cuda.h>
#include <stdio.h>

#include "common.h"
#include "mesh.h"

//------------------------------------------------------------------------
// Kernels

__global__ void xfmPointsFwdKernel(XfmKernelParams p)
{
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int pz = blockIdx.z * blockDim.z + threadIdx.z;

    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(pz, threadIdx.x / 4, threadIdx.x % 4, 0));
    __syncthreads();

    if (px >= p.gridSize.x)
        return;

    vec3f pos(
        p.points.fetch(p.points.nhwcIndex(pz, px, 0, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 1, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 2, 0))
    );

    if (p.isPoints)
    {
        p.out.store(p.out.nhwcIndex(pz, px, 0, 0), pos.x * mtx[0][0] + pos.y * mtx[1][0] + pos.z * mtx[2][0] + mtx[3][0]);
        p.out.store(p.out.nhwcIndex(pz, px, 1, 0), pos.x * mtx[0][1] + pos.y * mtx[1][1] + pos.z * mtx[2][1] + mtx[3][1]);
        p.out.store(p.out.nhwcIndex(pz, px, 2, 0), pos.x * mtx[0][2] + pos.y * mtx[1][2] + pos.z * mtx[2][2] + mtx[3][2]);
        p.out.store(p.out.nhwcIndex(pz, px, 3, 0), pos.x * mtx[0][3] + pos.y * mtx[1][3] + pos.z * mtx[2][3] + mtx[3][3]);
    }
    else
    {
        p.out.store(p.out.nhwcIndex(pz, px, 0, 0), pos.x * mtx[0][0] + pos.y * mtx[1][0] + pos.z * mtx[2][0]);
        p.out.store(p.out.nhwcIndex(pz, px, 1, 0), pos.x * mtx[0][1] + pos.y * mtx[1][1] + pos.z * mtx[2][1]);
        p.out.store(p.out.nhwcIndex(pz, px, 2, 0), pos.x * mtx[0][2] + pos.y * mtx[1][2] + pos.z * mtx[2][2]);
    }
}

__global__ void xfmPointsBwdKernel(XfmKernelParams p)
{
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int pz = blockIdx.z * blockDim.z + threadIdx.z;

    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(pz, threadIdx.x / 4, threadIdx.x % 4, 0));
    __syncthreads();

    if (px >= p.gridSize.x)
        return;

    vec3f pos(
        p.points.fetch(p.points.nhwcIndex(pz, px, 0, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 1, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 2, 0))
    );

    vec4f d_out(
        p.out.fetch(p.out.nhwcIndex(pz, px, 0, 0)),
        p.out.fetch(p.out.nhwcIndex(pz, px, 1, 0)),
        p.out.fetch(p.out.nhwcIndex(pz, px, 2, 0)),
        p.out.fetch(p.out.nhwcIndex(pz, px, 3, 0))
    );

    if (p.isPoints)
    {
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out.x * mtx[0][0] + d_out.y * mtx[0][1] + d_out.z * mtx[0][2] + d_out.w * mtx[0][3]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out.x * mtx[1][0] + d_out.y * mtx[1][1] + d_out.z * mtx[1][2] + d_out.w * mtx[1][3]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out.x * mtx[2][0] + d_out.y * mtx[2][1] + d_out.z * mtx[2][2] + d_out.w * mtx[2][3]);
    }
    else
    {
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out.x * mtx[0][0] + d_out.y * mtx[0][1] + d_out.z * mtx[0][2]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out.x * mtx[1][0] + d_out.y * mtx[1][1] + d_out.z * mtx[1][2]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out.x * mtx[2][0] + d_out.y * mtx[2][1] + d_out.z * mtx[2][2]);
    }
}
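In matrix form, the forward kernel uses the row-vector convention: a point transforms as p' = [pₓ p_y p_z 1] M (all four outputs kept so perspective division can happen later), while a vector drops the translation row and the fourth output. The backward kernel accumulates the corresponding multiplication by the transpose:

\[ p' = \begin{bmatrix} p_x & p_y & p_z & 1 \end{bmatrix} M, \qquad \frac{\partial \mathcal{L}}{\partial p} = \frac{\partial \mathcal{L}}{\partial p'}\, M^{\top} \]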
video3d/render/renderutils/c_src/mesh.h
ADDED
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

#include "common.h"

struct XfmKernelParams
{
    bool isPoints;
    Tensor points;
    Tensor matrix;
    Tensor out;
    dim3 gridSize;
};
video3d/render/renderutils/c_src/normal.cu
ADDED
@@ -0,0 +1,182 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include "common.h"
#include "normal.h"

#define NORMAL_THRESHOLD 0.1f

//------------------------------------------------------------------------
// Perturb shading normal by tangent frame

__device__ vec3f fwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, bool opengl)
{
    vec3f _smooth_bitng = cross(smooth_tng, smooth_nrm);
    vec3f smooth_bitng = safeNormalize(_smooth_bitng);
    vec3f _shading_nrm = smooth_tng * perturbed_nrm.x + (opengl ? -1 : 1) * smooth_bitng * perturbed_nrm.y + smooth_nrm * max(perturbed_nrm.z, 0.0f);
    return safeNormalize(_shading_nrm);
}

__device__ void bwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, vec3f &d_perturbed_nrm, vec3f &d_smooth_nrm, vec3f &d_smooth_tng, const vec3f d_out, bool opengl)
{
    ////////////////////////////////////////////////////////////////////////
    // FWD
    vec3f _smooth_bitng = cross(smooth_tng, smooth_nrm);
    vec3f smooth_bitng = safeNormalize(_smooth_bitng);
    vec3f _shading_nrm = smooth_tng * perturbed_nrm.x + (opengl ? -1 : 1) * smooth_bitng * perturbed_nrm.y + smooth_nrm * max(perturbed_nrm.z, 0.0f);

    ////////////////////////////////////////////////////////////////////////
    // BWD
    vec3f d_shading_nrm(0);
    bwdSafeNormalize(_shading_nrm, d_shading_nrm, d_out);

    vec3f d_smooth_bitng(0);

    if (perturbed_nrm.z > 0.0f)
    {
        d_smooth_nrm += d_shading_nrm * perturbed_nrm.z;
        d_perturbed_nrm.z += sum(d_shading_nrm * smooth_nrm);
    }

    d_smooth_bitng += (opengl ? -1 : 1) * d_shading_nrm * perturbed_nrm.y;
    d_perturbed_nrm.y += (opengl ? -1 : 1) * sum(d_shading_nrm * smooth_bitng);

    d_smooth_tng += d_shading_nrm * perturbed_nrm.x;
    d_perturbed_nrm.x += sum(d_shading_nrm * smooth_tng);

    vec3f d__smooth_bitng(0);
    bwdSafeNormalize(_smooth_bitng, d__smooth_bitng, d_smooth_bitng);

    bwdCross(smooth_tng, smooth_nrm, d_smooth_tng, d_smooth_nrm, d__smooth_bitng);
}

//------------------------------------------------------------------------
#define bent_nrm_eps 0.001f

__device__ vec3f fwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm)
{
    float dp = dot(view_vec, smooth_nrm);
    float t = clamp(dp / NORMAL_THRESHOLD, 0.0f, 1.0f);
    return geom_nrm * (1.0f - t) + smooth_nrm * t;
}

__device__ void bwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm, vec3f& d_view_vec, vec3f& d_smooth_nrm, vec3f& d_geom_nrm, const vec3f d_out)
{
    ////////////////////////////////////////////////////////////////////////
    // FWD
    float dp = dot(view_vec, smooth_nrm);
    float t = clamp(dp / NORMAL_THRESHOLD, 0.0f, 1.0f);

    ////////////////////////////////////////////////////////////////////////
    // BWD
    if (dp > NORMAL_THRESHOLD)
        d_smooth_nrm += d_out;
    else
    {
        // geom_nrm * (1.0f - t) + smooth_nrm * t;
        d_geom_nrm += d_out * (1.0f - t);
        d_smooth_nrm += d_out * t;
        float d_t = sum(d_out * (smooth_nrm - geom_nrm));

        float d_dp = dp < 0.0f || dp > NORMAL_THRESHOLD ? 0.0f : d_t / NORMAL_THRESHOLD;

        bwdDot(view_vec, smooth_nrm, d_view_vec, d_smooth_nrm, d_dp);
    }
}

//------------------------------------------------------------------------
// Kernels

__global__ void PrepareShadingNormalFwdKernel(PrepareShadingNormalKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f pos = p.pos.fetch3(px, py, pz);
    vec3f view_pos = p.view_pos.fetch3(px, py, pz);
    vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
    vec3f _smooth_nrm = p.smooth_nrm.fetch3(px, py, pz);
    vec3f _smooth_tng = p.smooth_tng.fetch3(px, py, pz);
    vec3f geom_nrm = p.geom_nrm.fetch3(px, py, pz);

    vec3f smooth_nrm = safeNormalize(_smooth_nrm);
    vec3f smooth_tng = safeNormalize(_smooth_tng);
    vec3f view_vec = safeNormalize(view_pos - pos);
    vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);

    vec3f res;
    if (p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f)
        res = fwdBendNormal(view_vec, -shading_nrm, -geom_nrm);
    else
        res = fwdBendNormal(view_vec, shading_nrm, geom_nrm);

    p.out.store(px, py, pz, res);
}

__global__ void PrepareShadingNormalBwdKernel(PrepareShadingNormalKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    vec3f pos = p.pos.fetch3(px, py, pz);
    vec3f view_pos = p.view_pos.fetch3(px, py, pz);
    vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
    vec3f _smooth_nrm = p.smooth_nrm.fetch3(px, py, pz);
    vec3f _smooth_tng = p.smooth_tng.fetch3(px, py, pz);
    vec3f geom_nrm = p.geom_nrm.fetch3(px, py, pz);
    vec3f d_out = p.out.fetch3(px, py, pz);

    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // FWD

    vec3f smooth_nrm = safeNormalize(_smooth_nrm);
    vec3f smooth_tng = safeNormalize(_smooth_tng);
    vec3f _view_vec = view_pos - pos;
    vec3f view_vec = safeNormalize(view_pos - pos);

    vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);

    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // BWD

    vec3f d_view_vec(0), d_shading_nrm(0), d_geom_nrm(0);
    if (p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f)
    {
        bwdBendNormal(view_vec, -shading_nrm, -geom_nrm, d_view_vec, d_shading_nrm, d_geom_nrm, d_out);
        d_shading_nrm = -d_shading_nrm;
        d_geom_nrm = -d_geom_nrm;
    }
    else
        bwdBendNormal(view_vec, shading_nrm, geom_nrm, d_view_vec, d_shading_nrm, d_geom_nrm, d_out);

    vec3f d_perturbed_nrm(0), d_smooth_nrm(0), d_smooth_tng(0);
    bwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, d_perturbed_nrm, d_smooth_nrm, d_smooth_tng, d_shading_nrm, p.opengl);

    vec3f d__view_vec(0), d__smooth_nrm(0), d__smooth_tng(0);
    bwdSafeNormalize(_view_vec, d__view_vec, d_view_vec);
    bwdSafeNormalize(_smooth_nrm, d__smooth_nrm, d_smooth_nrm);
    bwdSafeNormalize(_smooth_tng, d__smooth_tng, d_smooth_tng);

    p.pos.store_grad(px, py, pz, -d__view_vec);
    p.view_pos.store_grad(px, py, pz, d__view_vec);
    p.perturbed_nrm.store_grad(px, py, pz, d_perturbed_nrm);
    p.smooth_nrm.store_grad(px, py, pz, d__smooth_nrm);
    p.smooth_tng.store_grad(px, py, pz, d__smooth_tng);
    p.geom_nrm.store_grad(px, py, pz, d_geom_nrm);
}
video3d/render/renderutils/c_src/normal.h
ADDED
@@ -0,0 +1,27 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

#include "common.h"
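// Editorial note (comment added for clarity, not in the upstream header): the
// Tensor fields below are NHWC views whose size-1 dimensions broadcast on fetch
// (see tensor.h); gridSize holds the element-wise maximum of the input shapes,
// filled in by update_grid() in torch_bindings.cpp before launch.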
struct PrepareShadingNormalKernelParams
{
    Tensor  pos;
    Tensor  view_pos;
    Tensor  perturbed_nrm;
    Tensor  smooth_nrm;
    Tensor  smooth_tng;
    Tensor  geom_nrm;
    Tensor  out;
    dim3    gridSize;
    bool    two_sided_shading, opengl;
};
video3d/render/renderutils/c_src/tensor.h
ADDED
@@ -0,0 +1,92 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once
#if defined(__CUDACC__) && defined(BFLOAT16)
#include <cuda_bf16.h> // bfloat16 is float32 compatible with less mantissa bits
#endif

//---------------------------------------------------------------------------------
// CUDA-side Tensor class for in/out parameter parsing. Can be float32 or bfloat16

struct Tensor
{
    void* val;
    void* d_val;
    int dims[4], _dims[4];
    int strides[4];
    bool fp16;

#if defined(__CUDA__) && !defined(__CUDA_ARCH__)
    Tensor() : val(nullptr), d_val(nullptr), fp16(true), dims{ 0, 0, 0, 0 }, _dims{ 0, 0, 0, 0 }, strides{ 0, 0, 0, 0 } {}
#endif

#ifdef __CUDACC__
    // Helpers to index and read/write a single element
    __device__ inline int _nhwcIndex(int n, int h, int w, int c) const { return n * strides[0] + h * strides[1] + w * strides[2] + c * strides[3]; }
    __device__ inline int nhwcIndex(int n, int h, int w, int c) const { return (dims[0] == 1 ? 0 : n * strides[0]) + (dims[1] == 1 ? 0 : h * strides[1]) + (dims[2] == 1 ? 0 : w * strides[2]) + (dims[3] == 1 ? 0 : c * strides[3]); }
    __device__ inline int nhwcIndexContinuous(int n, int h, int w, int c) const { return ((n * _dims[1] + h) * _dims[2] + w) * _dims[3] + c; }
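    // Editorial note (comment added for clarity, not in the upstream header):
    // nhwcIndex implements NumPy-style broadcasting by dropping the stride
    // contribution of any dimension of size 1, so e.g. a [1, 1, 1, 3] constant
    // can be fetched at every (n, h, w). By contrast, _nhwcIndex and
    // nhwcIndexContinuous address the full-resolution output/gradient buffers,
    // whose logical shape is kept in _dims.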
#ifdef BFLOAT16
    __device__ inline float fetch(unsigned int idx) const { return fp16 ? __bfloat162float(((__nv_bfloat16*)val)[idx]) : ((float*)val)[idx]; }
    __device__ inline void store(unsigned int idx, float _val) { if (fp16) ((__nv_bfloat16*)val)[idx] = __float2bfloat16(_val); else ((float*)val)[idx] = _val; }
    __device__ inline void store_grad(unsigned int idx, float _val) { if (fp16) ((__nv_bfloat16*)d_val)[idx] = __float2bfloat16(_val); else ((float*)d_val)[idx] = _val; }
#else
    __device__ inline float fetch(unsigned int idx) const { return ((float*)val)[idx]; }
    __device__ inline void store(unsigned int idx, float _val) { ((float*)val)[idx] = _val; }
    __device__ inline void store_grad(unsigned int idx, float _val) { ((float*)d_val)[idx] = _val; }
#endif

    //////////////////////////////////////////////////////////////////////////////////////////
    // Fetch, use broadcasting for tensor dimensions of size 1
    __device__ inline float fetch1(unsigned int x, unsigned int y, unsigned int z) const
    {
        return fetch(nhwcIndex(z, y, x, 0));
    }

    __device__ inline vec3f fetch3(unsigned int x, unsigned int y, unsigned int z) const
    {
        return vec3f(
            fetch(nhwcIndex(z, y, x, 0)),
            fetch(nhwcIndex(z, y, x, 1)),
            fetch(nhwcIndex(z, y, x, 2))
        );
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Store, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside
    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, float _val)
    {
        store(_nhwcIndex(z, y, x, 0), _val);
    }

    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
    {
        store(_nhwcIndex(z, y, x, 0), _val.x);
        store(_nhwcIndex(z, y, x, 1), _val.y);
        store(_nhwcIndex(z, y, x, 2), _val.z);
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Store gradient, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside
    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, float _val)
    {
        store_grad(nhwcIndexContinuous(z, y, x, 0), _val);
    }

    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
    {
        store_grad(nhwcIndexContinuous(z, y, x, 0), _val.x);
        store_grad(nhwcIndexContinuous(z, y, x, 1), _val.y);
        store_grad(nhwcIndexContinuous(z, y, x, 2), _val.z);
    }
#endif

};
video3d/render/renderutils/c_src/torch_bindings.cpp
ADDED
@@ -0,0 +1,1062 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#ifdef _MSC_VER
#pragma warning(push, 0)
#include <torch/extension.h>
#pragma warning(pop)
#else
#include <torch/extension.h>
#endif

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <algorithm>
#include <string>

#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) { cudaError_t err = CUDA_CALL; AT_CUDA_CHECK(cudaGetLastError()); }
#define NVDR_CHECK_GL_ERROR(GL_CALL) { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); }
#define CHECK_TENSOR(X, DIMS, CHANNELS) \
    TORCH_CHECK(X.is_cuda(), #X " must be a cuda tensor") \
    TORCH_CHECK(X.scalar_type() == torch::kFloat || X.scalar_type() == torch::kBFloat16, #X " must be fp32 or bf16") \
    TORCH_CHECK(X.dim() == DIMS, #X " must have " #DIMS " dimensions") \
    TORCH_CHECK(X.size(DIMS - 1) == CHANNELS, #X " must have " #CHANNELS " channels")

#include "common.h"
#include "loss.h"
#include "normal.h"
#include "cubemap.h"
#include "bsdf.h"
#include "mesh.h"

#define BLOCK_X 8
#define BLOCK_Y 8

//------------------------------------------------------------------------
// mesh.cu

void xfmPointsFwdKernel(XfmKernelParams p);
void xfmPointsBwdKernel(XfmKernelParams p);

//------------------------------------------------------------------------
// loss.cu

void imgLossFwdKernel(LossKernelParams p);
void imgLossBwdKernel(LossKernelParams p);

//------------------------------------------------------------------------
// normal.cu

void PrepareShadingNormalFwdKernel(PrepareShadingNormalKernelParams p);
void PrepareShadingNormalBwdKernel(PrepareShadingNormalKernelParams p);

//------------------------------------------------------------------------
// cubemap.cu

void DiffuseCubemapFwdKernel(DiffuseCubemapKernelParams p);
void DiffuseCubemapBwdKernel(DiffuseCubemapKernelParams p);
void SpecularBoundsKernel(SpecularBoundsKernelParams p);
void SpecularCubemapFwdKernel(SpecularCubemapKernelParams p);
void SpecularCubemapBwdKernel(SpecularCubemapKernelParams p);

//------------------------------------------------------------------------
// bsdf.cu

void LambertFwdKernel(LambertKernelParams p);
void LambertBwdKernel(LambertKernelParams p);

void FrostbiteDiffuseFwdKernel(FrostbiteDiffuseKernelParams p);
void FrostbiteDiffuseBwdKernel(FrostbiteDiffuseKernelParams p);

void FresnelShlickFwdKernel(FresnelShlickKernelParams p);
void FresnelShlickBwdKernel(FresnelShlickKernelParams p);

void ndfGGXFwdKernel(NdfGGXParams p);
void ndfGGXBwdKernel(NdfGGXParams p);

void lambdaGGXFwdKernel(NdfGGXParams p);
void lambdaGGXBwdKernel(NdfGGXParams p);

void maskingSmithFwdKernel(MaskingSmithParams p);
void maskingSmithBwdKernel(MaskingSmithParams p);

void pbrSpecularFwdKernel(PbrSpecular p);
void pbrSpecularBwdKernel(PbrSpecular p);

void pbrBSDFFwdKernel(PbrBSDF p);
void pbrBSDFBwdKernel(PbrBSDF p);

//------------------------------------------------------------------------
// Tensor helpers

void update_grid(dim3 &gridSize, torch::Tensor x)
{
    gridSize.x = std::max(gridSize.x, (uint32_t)x.size(2));
    gridSize.y = std::max(gridSize.y, (uint32_t)x.size(1));
    gridSize.z = std::max(gridSize.z, (uint32_t)x.size(0));
}

template<typename... Ts>
void update_grid(dim3& gridSize, torch::Tensor x, Ts&&... vs)
{
    gridSize.x = std::max(gridSize.x, (uint32_t)x.size(2));
    gridSize.y = std::max(gridSize.y, (uint32_t)x.size(1));
    gridSize.z = std::max(gridSize.z, (uint32_t)x.size(0));
    update_grid(gridSize, std::forward<Ts>(vs)...);
}

Tensor make_cuda_tensor(torch::Tensor val)
{
    Tensor res;
    for (int i = 0; i < val.dim(); ++i)
    {
        res.dims[i] = val.size(i);
        res.strides[i] = val.stride(i);
    }
    res.fp16 = val.scalar_type() == torch::kBFloat16;
    res.val = res.fp16 ? (void*)val.data_ptr<torch::BFloat16>() : (void*)val.data_ptr<float>();
    res.d_val = nullptr;
    return res;
}

Tensor make_cuda_tensor(torch::Tensor val, dim3 outDims, torch::Tensor* grad = nullptr)
{
    Tensor res;
    for (int i = 0; i < val.dim(); ++i)
    {
        res.dims[i] = val.size(i);
        res.strides[i] = val.stride(i);
    }
    if (val.dim() == 4)
        res._dims[0] = outDims.z, res._dims[1] = outDims.y, res._dims[2] = outDims.x, res._dims[3] = val.size(3);
    else
        res._dims[0] = outDims.z, res._dims[1] = outDims.x, res._dims[2] = val.size(2), res._dims[3] = 1; // Add a trailing one for indexing math to work out

    res.fp16 = val.scalar_type() == torch::kBFloat16;
    res.val = res.fp16 ? (void*)val.data_ptr<torch::BFloat16>() : (void*)val.data_ptr<float>();
    res.d_val = nullptr;
    if (grad != nullptr)
    {
        if (val.dim() == 4)
            *grad = torch::empty({ outDims.z, outDims.y, outDims.x, val.size(3) }, torch::TensorOptions().dtype(res.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA));
        else // 3
            *grad = torch::empty({ outDims.z, outDims.x, val.size(2) }, torch::TensorOptions().dtype(res.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA));

        res.d_val = res.fp16 ? (void*)grad->data_ptr<torch::BFloat16>() : (void*)grad->data_ptr<float>();
    }
    return res;
}
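// Editorial note (comment added for clarity, not in the upstream source): every
// binding below follows the same template: CHECK_TENSOR-validate the NHWC inputs,
// broadcast their shapes into p.gridSize via update_grid(), allocate the output
// (fwd) or per-input gradient buffers (bwd, via the make_cuda_tensor overload
// taking a grad pointer), then launch the matching kernel with cudaLaunchKernel
// on the current PyTorch CUDA stream.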
//------------------------------------------------------------------------
// prepare_shading_normal

torch::Tensor prepare_shading_normal_fwd(torch::Tensor pos, torch::Tensor view_pos, torch::Tensor perturbed_nrm, torch::Tensor smooth_nrm, torch::Tensor smooth_tng, torch::Tensor geom_nrm, bool two_sided_shading, bool opengl, bool fp16)
{
    CHECK_TENSOR(pos, 4, 3);
    CHECK_TENSOR(view_pos, 4, 3);
    CHECK_TENSOR(perturbed_nrm, 4, 3);
    CHECK_TENSOR(smooth_nrm, 4, 3);
    CHECK_TENSOR(smooth_tng, 4, 3);
    CHECK_TENSOR(geom_nrm, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PrepareShadingNormalKernelParams p;
    p.two_sided_shading = two_sided_shading;
    p.opengl = opengl;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 3 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    p.pos = make_cuda_tensor(pos, p.gridSize);
    p.view_pos = make_cuda_tensor(view_pos, p.gridSize);
    p.perturbed_nrm = make_cuda_tensor(perturbed_nrm, p.gridSize);
    p.smooth_nrm = make_cuda_tensor(smooth_nrm, p.gridSize);
    p.smooth_tng = make_cuda_tensor(smooth_tng, p.gridSize);
    p.geom_nrm = make_cuda_tensor(geom_nrm, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)PrepareShadingNormalFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> prepare_shading_normal_bwd(torch::Tensor pos, torch::Tensor view_pos, torch::Tensor perturbed_nrm, torch::Tensor smooth_nrm, torch::Tensor smooth_tng, torch::Tensor geom_nrm, torch::Tensor grad, bool two_sided_shading, bool opengl)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PrepareShadingNormalKernelParams p;
    p.two_sided_shading = two_sided_shading;
    p.opengl = opengl;
    update_grid(p.gridSize, pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    torch::Tensor pos_grad, view_pos_grad, perturbed_nrm_grad, smooth_nrm_grad, smooth_tng_grad, geom_nrm_grad;
    p.pos = make_cuda_tensor(pos, p.gridSize, &pos_grad);
    p.view_pos = make_cuda_tensor(view_pos, p.gridSize, &view_pos_grad);
    p.perturbed_nrm = make_cuda_tensor(perturbed_nrm, p.gridSize, &perturbed_nrm_grad);
    p.smooth_nrm = make_cuda_tensor(smooth_nrm, p.gridSize, &smooth_nrm_grad);
    p.smooth_tng = make_cuda_tensor(smooth_tng, p.gridSize, &smooth_tng_grad);
    p.geom_nrm = make_cuda_tensor(geom_nrm, p.gridSize, &geom_nrm_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)PrepareShadingNormalBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(pos_grad, view_pos_grad, perturbed_nrm_grad, smooth_nrm_grad, smooth_tng_grad, geom_nrm_grad);
}
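// Illustrative sketch (editorial comment, hypothetical tensor shapes): calling the
// forward binding directly from C++, e.g. in a unit test, could look roughly like
//
//   auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
//   torch::Tensor pos      = torch::rand({1, 512, 512, 3}, opts);
//   torch::Tensor view_pos = torch::rand({1, 1, 1, 3}, opts);   // size-1 dims broadcast
//   // ... perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm likewise [N, H, W, 3] ...
//   torch::Tensor nrm = prepare_shading_normal_fwd(pos, view_pos, perturbed_nrm,
//       smooth_nrm, smooth_tng, geom_nrm,
//       /*two_sided_shading=*/true, /*opengl=*/true, /*fp16=*/false);
//
// In normal use the Python wrappers in renderutils/ops.py drive these bindings
// through torch.autograd.Function so gradients flow via the *_bwd entry points.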
//------------------------------------------------------------------------
// lambert

torch::Tensor lambert_fwd(torch::Tensor nrm, torch::Tensor wi, bool fp16)
{
    CHECK_TENSOR(nrm, 4, 3);
    CHECK_TENSOR(wi, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    LambertKernelParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, nrm, wi);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 1 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.nrm = make_cuda_tensor(nrm, p.gridSize);
    p.wi = make_cuda_tensor(wi, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)LambertFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor> lambert_bwd(torch::Tensor nrm, torch::Tensor wi, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    LambertKernelParams p;
    update_grid(p.gridSize, nrm, wi);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor nrm_grad, wi_grad;
    p.nrm = make_cuda_tensor(nrm, p.gridSize, &nrm_grad);
    p.wi = make_cuda_tensor(wi, p.gridSize, &wi_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)LambertBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor>(nrm_grad, wi_grad);
}

//------------------------------------------------------------------------
// frostbite diffuse

torch::Tensor frostbite_fwd(torch::Tensor nrm, torch::Tensor wi, torch::Tensor wo, torch::Tensor linearRoughness, bool fp16)
{
    CHECK_TENSOR(nrm, 4, 3);
    CHECK_TENSOR(wi, 4, 3);
    CHECK_TENSOR(wo, 4, 3);
    CHECK_TENSOR(linearRoughness, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    FrostbiteDiffuseKernelParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, nrm, wi, wo, linearRoughness);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 1 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.nrm = make_cuda_tensor(nrm, p.gridSize);
    p.wi = make_cuda_tensor(wi, p.gridSize);
    p.wo = make_cuda_tensor(wo, p.gridSize);
    p.linearRoughness = make_cuda_tensor(linearRoughness, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)FrostbiteDiffuseFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> frostbite_bwd(torch::Tensor nrm, torch::Tensor wi, torch::Tensor wo, torch::Tensor linearRoughness, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    FrostbiteDiffuseKernelParams p;
    update_grid(p.gridSize, nrm, wi, wo, linearRoughness);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor nrm_grad, wi_grad, wo_grad, linearRoughness_grad;
    p.nrm = make_cuda_tensor(nrm, p.gridSize, &nrm_grad);
    p.wi = make_cuda_tensor(wi, p.gridSize, &wi_grad);
    p.wo = make_cuda_tensor(wo, p.gridSize, &wo_grad);
    p.linearRoughness = make_cuda_tensor(linearRoughness, p.gridSize, &linearRoughness_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)FrostbiteDiffuseBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(nrm_grad, wi_grad, wo_grad, linearRoughness_grad);
}

//------------------------------------------------------------------------
// fresnel_shlick

torch::Tensor fresnel_shlick_fwd(torch::Tensor f0, torch::Tensor f90, torch::Tensor cosTheta, bool fp16)
{
    CHECK_TENSOR(f0, 4, 3);
    CHECK_TENSOR(f90, 4, 3);
    CHECK_TENSOR(cosTheta, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    FresnelShlickKernelParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, f0, f90, cosTheta);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 3 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.f0 = make_cuda_tensor(f0, p.gridSize);
    p.f90 = make_cuda_tensor(f90, p.gridSize);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)FresnelShlickFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> fresnel_shlick_bwd(torch::Tensor f0, torch::Tensor f90, torch::Tensor cosTheta, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    FresnelShlickKernelParams p;
    update_grid(p.gridSize, f0, f90, cosTheta);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor f0_grad, f90_grad, cosT_grad;
    p.f0 = make_cuda_tensor(f0, p.gridSize, &f0_grad);
    p.f90 = make_cuda_tensor(f90, p.gridSize, &f90_grad);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize, &cosT_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)FresnelShlickBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(f0_grad, f90_grad, cosT_grad);
}

//------------------------------------------------------------------------
// ndf_ggx

torch::Tensor ndf_ggx_fwd(torch::Tensor alphaSqr, torch::Tensor cosTheta, bool fp16)
{
    CHECK_TENSOR(alphaSqr, 4, 1);
    CHECK_TENSOR(cosTheta, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    NdfGGXParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, alphaSqr, cosTheta);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 1 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)ndfGGXFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor> ndf_ggx_bwd(torch::Tensor alphaSqr, torch::Tensor cosTheta, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    NdfGGXParams p;
    update_grid(p.gridSize, alphaSqr, cosTheta);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor alphaSqr_grad, cosTheta_grad;
    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize, &alphaSqr_grad);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize, &cosTheta_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)ndfGGXBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor>(alphaSqr_grad, cosTheta_grad);
}

//------------------------------------------------------------------------
// lambda_ggx

torch::Tensor lambda_ggx_fwd(torch::Tensor alphaSqr, torch::Tensor cosTheta, bool fp16)
{
    CHECK_TENSOR(alphaSqr, 4, 1);
    CHECK_TENSOR(cosTheta, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    NdfGGXParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, alphaSqr, cosTheta);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 1 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)lambdaGGXFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor> lambda_ggx_bwd(torch::Tensor alphaSqr, torch::Tensor cosTheta, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    NdfGGXParams p;
    update_grid(p.gridSize, alphaSqr, cosTheta);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor alphaSqr_grad, cosTheta_grad;
    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize, &alphaSqr_grad);
    p.cosTheta = make_cuda_tensor(cosTheta, p.gridSize, &cosTheta_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)lambdaGGXBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor>(alphaSqr_grad, cosTheta_grad);
}

//------------------------------------------------------------------------
// masking_smith

torch::Tensor masking_smith_fwd(torch::Tensor alphaSqr, torch::Tensor cosThetaI, torch::Tensor cosThetaO, bool fp16)
{
    CHECK_TENSOR(alphaSqr, 4, 1);
    CHECK_TENSOR(cosThetaI, 4, 1);
    CHECK_TENSOR(cosThetaO, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    MaskingSmithParams p;
    p.out.fp16 = fp16;
    update_grid(p.gridSize, alphaSqr, cosThetaI, cosThetaO);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 1 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize);
    p.cosThetaI = make_cuda_tensor(cosThetaI, p.gridSize);
    p.cosThetaO = make_cuda_tensor(cosThetaO, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)maskingSmithFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> masking_smith_bwd(torch::Tensor alphaSqr, torch::Tensor cosThetaI, torch::Tensor cosThetaO, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    MaskingSmithParams p;
    update_grid(p.gridSize, alphaSqr, cosThetaI, cosThetaO);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor alphaSqr_grad, cosThetaI_grad, cosThetaO_grad;
    p.alphaSqr = make_cuda_tensor(alphaSqr, p.gridSize, &alphaSqr_grad);
    p.cosThetaI = make_cuda_tensor(cosThetaI, p.gridSize, &cosThetaI_grad);
    p.cosThetaO = make_cuda_tensor(cosThetaO, p.gridSize, &cosThetaO_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)maskingSmithBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(alphaSqr_grad, cosThetaI_grad, cosThetaO_grad);
}

//------------------------------------------------------------------------
// pbr_specular

torch::Tensor pbr_specular_fwd(torch::Tensor col, torch::Tensor nrm, torch::Tensor wo, torch::Tensor wi, torch::Tensor alpha, float min_roughness, bool fp16)
{
    CHECK_TENSOR(col, 4, 3);
    CHECK_TENSOR(nrm, 4, 3);
    CHECK_TENSOR(wo, 4, 3);
    CHECK_TENSOR(wi, 4, 3);
    CHECK_TENSOR(alpha, 4, 1);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PbrSpecular p;
    p.out.fp16 = fp16;
    p.min_roughness = min_roughness;
    update_grid(p.gridSize, col, nrm, wo, wi, alpha);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 3 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.col = make_cuda_tensor(col, p.gridSize);
    p.nrm = make_cuda_tensor(nrm, p.gridSize);
    p.wo = make_cuda_tensor(wo, p.gridSize);
    p.wi = make_cuda_tensor(wi, p.gridSize);
    p.alpha = make_cuda_tensor(alpha, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)pbrSpecularFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> pbr_specular_bwd(torch::Tensor col, torch::Tensor nrm, torch::Tensor wo, torch::Tensor wi, torch::Tensor alpha, float min_roughness, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PbrSpecular p;
    update_grid(p.gridSize, col, nrm, wo, wi, alpha);
    p.min_roughness = min_roughness;

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor col_grad, nrm_grad, wo_grad, wi_grad, alpha_grad;
    p.col = make_cuda_tensor(col, p.gridSize, &col_grad);
    p.nrm = make_cuda_tensor(nrm, p.gridSize, &nrm_grad);
    p.wo = make_cuda_tensor(wo, p.gridSize, &wo_grad);
    p.wi = make_cuda_tensor(wi, p.gridSize, &wi_grad);
    p.alpha = make_cuda_tensor(alpha, p.gridSize, &alpha_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)pbrSpecularBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(col_grad, nrm_grad, wo_grad, wi_grad, alpha_grad);
}

//------------------------------------------------------------------------
// pbr_bsdf

torch::Tensor pbr_bsdf_fwd(torch::Tensor kd, torch::Tensor arm, torch::Tensor pos, torch::Tensor nrm, torch::Tensor view_pos, torch::Tensor light_pos, float min_roughness, int BSDF, bool fp16)
{
    CHECK_TENSOR(kd, 4, 3);
    CHECK_TENSOR(arm, 4, 3);
    CHECK_TENSOR(pos, 4, 3);
    CHECK_TENSOR(nrm, 4, 3);
    CHECK_TENSOR(view_pos, 4, 3);
    CHECK_TENSOR(light_pos, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PbrBSDF p;
    p.out.fp16 = fp16;
    p.min_roughness = min_roughness;
    p.BSDF = BSDF;
    update_grid(p.gridSize, kd, arm, pos, nrm, view_pos, light_pos);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 3 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    p.kd = make_cuda_tensor(kd, p.gridSize);
    p.arm = make_cuda_tensor(arm, p.gridSize);
    p.pos = make_cuda_tensor(pos, p.gridSize);
    p.nrm = make_cuda_tensor(nrm, p.gridSize);
    p.view_pos = make_cuda_tensor(view_pos, p.gridSize);
    p.light_pos = make_cuda_tensor(light_pos, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)pbrBSDFFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> pbr_bsdf_bwd(torch::Tensor kd, torch::Tensor arm, torch::Tensor pos, torch::Tensor nrm, torch::Tensor view_pos, torch::Tensor light_pos, float min_roughness, int BSDF, torch::Tensor grad)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    PbrBSDF p;
    update_grid(p.gridSize, kd, arm, pos, nrm, view_pos, light_pos);
    p.min_roughness = min_roughness;
    p.BSDF = BSDF;

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor kd_grad, arm_grad, pos_grad, nrm_grad, view_pos_grad, light_pos_grad;
    p.kd = make_cuda_tensor(kd, p.gridSize, &kd_grad);
    p.arm = make_cuda_tensor(arm, p.gridSize, &arm_grad);
    p.pos = make_cuda_tensor(pos, p.gridSize, &pos_grad);
    p.nrm = make_cuda_tensor(nrm, p.gridSize, &nrm_grad);
    p.view_pos = make_cuda_tensor(view_pos, p.gridSize, &view_pos_grad);
    p.light_pos = make_cuda_tensor(light_pos, p.gridSize, &light_pos_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)pbrBSDFBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(kd_grad, arm_grad, pos_grad, nrm_grad, view_pos_grad, light_pos_grad);
}
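// Editorial note (comment added for clarity, not in the upstream source):
// pbr_bsdf_* fuses the full material evaluation in one kernel; the integer BSDF
// argument selects the shading-model variant and min_roughness bounds the
// roughness channel of `arm`, mirroring the per-term bindings above (the actual
// evaluation lives in bsdf.cu). As with all bwd bindings here, gradients come
// back at full output resolution and any broadcast input dimensions are reduced
// outside with torch.sum (see the note in tensor.h).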
//------------------------------------------------------------------------
// filter_cubemap

torch::Tensor diffuse_cubemap_fwd(torch::Tensor cubemap)
{
    CHECK_TENSOR(cubemap, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    DiffuseCubemapKernelParams p;
    update_grid(p.gridSize, cubemap);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 3 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    p.cubemap = make_cuda_tensor(cubemap, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)DiffuseCubemapFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

torch::Tensor diffuse_cubemap_bwd(torch::Tensor cubemap, torch::Tensor grad)
{
    CHECK_TENSOR(cubemap, 4, 3);
    CHECK_TENSOR(grad, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    DiffuseCubemapKernelParams p;
    update_grid(p.gridSize, cubemap);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    torch::Tensor cubemap_grad;
    p.cubemap = make_cuda_tensor(cubemap, p.gridSize);
    p.out = make_cuda_tensor(grad, p.gridSize);

    cubemap_grad = torch::zeros({ p.gridSize.z, p.gridSize.y, p.gridSize.x, cubemap.size(3) }, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));
    p.cubemap.d_val = (void*)cubemap_grad.data_ptr<float>();

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)DiffuseCubemapBwdKernel, gridSize, blockSize, args, 0, stream));

    return cubemap_grad;
}

torch::Tensor specular_bounds(int resolution, float costheta_cutoff)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    SpecularBoundsKernelParams p;
    p.costheta_cutoff = costheta_cutoff;
    p.gridSize = dim3(resolution, resolution, 6);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::zeros({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 6*4 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)SpecularBoundsKernel, gridSize, blockSize, args, 0, stream));

    return out;
}
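// Editorial note (comment added for clarity, not in the upstream source; an
// inference from the 6*4-channel output and from SpecularCubemapKernelParams
// taking the result as `bounds`): specular_bounds precomputes, per texel of a
// `resolution`-sized cubemap, a conservative bound on each of the 6 faces (hence
// 6*4 channels) covering directions within acos(costheta_cutoff) of the texel
// direction, which the specular filtering kernels below use to restrict their
// gather loops to the glossy cone.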
torch::Tensor specular_cubemap_fwd(torch::Tensor cubemap, torch::Tensor bounds, float roughness, float costheta_cutoff)
{
    CHECK_TENSOR(cubemap, 4, 3);
    CHECK_TENSOR(bounds, 4, 6*4);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    SpecularCubemapKernelParams p;
    p.roughness = roughness;
    p.costheta_cutoff = costheta_cutoff;
    update_grid(p.gridSize, cubemap);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ p.gridSize.z, p.gridSize.y, p.gridSize.x, 4 }, opts);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    p.cubemap = make_cuda_tensor(cubemap, p.gridSize);
    p.bounds = make_cuda_tensor(bounds, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)SpecularCubemapFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

torch::Tensor specular_cubemap_bwd(torch::Tensor cubemap, torch::Tensor bounds, torch::Tensor grad, float roughness, float costheta_cutoff)
{
    CHECK_TENSOR(cubemap, 4, 3);
    CHECK_TENSOR(bounds, 4, 6*4);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    SpecularCubemapKernelParams p;
    p.roughness = roughness;
    p.costheta_cutoff = costheta_cutoff;
    update_grid(p.gridSize, cubemap);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Setup tensors
    torch::Tensor cubemap_grad;
    p.cubemap = make_cuda_tensor(cubemap, p.gridSize);
    p.bounds = make_cuda_tensor(bounds, p.gridSize);
    p.out = make_cuda_tensor(grad, p.gridSize);

    cubemap_grad = torch::zeros({ p.gridSize.z, p.gridSize.y, p.gridSize.x, cubemap.size(3) }, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));
    p.cubemap.d_val = (void*)cubemap_grad.data_ptr<float>();

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)SpecularCubemapBwdKernel, gridSize, blockSize, args, 0, stream));

    return cubemap_grad;
}

//------------------------------------------------------------------------
// loss function

LossType strToLoss(std::string str)
{
    if (str == "mse")
        return LOSS_MSE;
    else if (str == "relmse")
        return LOSS_RELMSE;
    else if (str == "smape")
        return LOSS_SMAPE;
    else
        return LOSS_L1;
}

torch::Tensor image_loss_fwd(torch::Tensor img, torch::Tensor target, std::string loss, std::string tonemapper, bool fp16)
{
    CHECK_TENSOR(img, 4, 3);
    CHECK_TENSOR(target, 4, 3);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    LossKernelParams p;
    p.out.fp16 = fp16;
    p.loss = strToLoss(loss);
    p.tonemapper = tonemapper == "log_srgb" ? TONEMAPPER_LOG_SRGB : TONEMAPPER_NONE;
    update_grid(p.gridSize, img, target);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 warpSize = getWarpSize(blockSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({ (p.gridSize.z - 1) / warpSize.z + 1, (p.gridSize.y - 1) / warpSize.y + 1, (p.gridSize.x - 1) / warpSize.x + 1, 1 }, opts);

    p.img = make_cuda_tensor(img, p.gridSize);
    p.target = make_cuda_tensor(target, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)imgLossFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}
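// Editorial note (comment added for clarity, not in the upstream source):
// image_loss_fwd reduces per warp rather than per pixel, which is why `out` is
// allocated at gridSize divided by getWarpSize(blockSize), rounded up, with a
// single channel; the caller is expected to finish the reduction (e.g. a
// torch.sum or mean over the returned tensor) on the Python side.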
std::tuple<torch::Tensor, torch::Tensor> image_loss_bwd(torch::Tensor img, torch::Tensor target, torch::Tensor grad, std::string loss, std::string tonemapper)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    LossKernelParams p;
    p.loss = strToLoss(loss);
    p.tonemapper = tonemapper == "log_srgb" ? TONEMAPPER_LOG_SRGB : TONEMAPPER_NONE;
    update_grid(p.gridSize, img, target);

    // Choose launch parameters.
    dim3 blockSize = getLaunchBlockSize(BLOCK_X, BLOCK_Y, p.gridSize);
    dim3 warpSize = getWarpSize(blockSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor img_grad, target_grad;
    p.img = make_cuda_tensor(img, p.gridSize, &img_grad);
    p.target = make_cuda_tensor(target, p.gridSize, &target_grad);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)imgLossBwdKernel, gridSize, blockSize, args, 0, stream));

    return std::tuple<torch::Tensor, torch::Tensor>(img_grad, target_grad);
}

//------------------------------------------------------------------------
// transform function

torch::Tensor xfm_fwd(torch::Tensor points, torch::Tensor matrix, bool isPoints, bool fp16)
{
    CHECK_TENSOR(points, 3, 3);
    CHECK_TENSOR(matrix, 3, 4);

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    XfmKernelParams p;
    p.out.fp16 = fp16;
    p.isPoints = isPoints;
    p.gridSize.x = points.size(1);
    p.gridSize.y = 1;
    p.gridSize.z = std::max(matrix.size(0), points.size(0));

    // Choose launch parameters.
    dim3 blockSize(BLOCK_X * BLOCK_Y, 1, 1);
    dim3 warpSize = getWarpSize(blockSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(p.out.fp16 ? torch::kBFloat16 : torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = isPoints ? torch::empty({ matrix.size(0), points.size(1), 4 }, opts) : torch::empty({ matrix.size(0), points.size(1), 3 }, opts);

    p.points = make_cuda_tensor(points, p.gridSize);
    p.matrix = make_cuda_tensor(matrix, p.gridSize);
    p.out = make_cuda_tensor(out, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)xfmPointsFwdKernel, gridSize, blockSize, args, 0, stream));

    return out;
}

torch::Tensor xfm_bwd(torch::Tensor points, torch::Tensor matrix, torch::Tensor grad, bool isPoints)
{
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Extract input parameters.
    XfmKernelParams p;
    p.isPoints = isPoints;
    p.gridSize.x = points.size(1);
    p.gridSize.y = 1;
    p.gridSize.z = std::max(matrix.size(0), points.size(0));

    // Choose launch parameters.
    dim3 blockSize(BLOCK_X * BLOCK_Y, 1, 1);
    dim3 warpSize = getWarpSize(blockSize);
    dim3 gridSize = getLaunchGridSize(blockSize, p.gridSize);

    torch::Tensor points_grad;
    p.points = make_cuda_tensor(points, p.gridSize, &points_grad);
    p.matrix = make_cuda_tensor(matrix, p.gridSize);
    p.out = make_cuda_tensor(grad, p.gridSize);

    // Launch CUDA kernel.
    void* args[] = { &p };
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((const void*)xfmPointsBwdKernel, gridSize, blockSize, args, 0, stream));

    return points_grad;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
1035 |
+
m.def("prepare_shading_normal_fwd", &prepare_shading_normal_fwd, "prepare_shading_normal_fwd");
|
1036 |
+
m.def("prepare_shading_normal_bwd", &prepare_shading_normal_bwd, "prepare_shading_normal_bwd");
|
1037 |
+
m.def("lambert_fwd", &lambert_fwd, "lambert_fwd");
|
1038 |
+
m.def("lambert_bwd", &lambert_bwd, "lambert_bwd");
|
1039 |
+
m.def("frostbite_fwd", &frostbite_fwd, "frostbite_fwd");
|
1040 |
+
m.def("frostbite_bwd", &frostbite_bwd, "frostbite_bwd");
|
1041 |
+
m.def("fresnel_shlick_fwd", &fresnel_shlick_fwd, "fresnel_shlick_fwd");
|
1042 |
+
m.def("fresnel_shlick_bwd", &fresnel_shlick_bwd, "fresnel_shlick_bwd");
|
1043 |
+
m.def("ndf_ggx_fwd", &ndf_ggx_fwd, "ndf_ggx_fwd");
|
1044 |
+
m.def("ndf_ggx_bwd", &ndf_ggx_bwd, "ndf_ggx_bwd");
|
1045 |
+
m.def("lambda_ggx_fwd", &lambda_ggx_fwd, "lambda_ggx_fwd");
|
1046 |
+
m.def("lambda_ggx_bwd", &lambda_ggx_bwd, "lambda_ggx_bwd");
|
1047 |
+
m.def("masking_smith_fwd", &masking_smith_fwd, "masking_smith_fwd");
|
1048 |
+
m.def("masking_smith_bwd", &masking_smith_bwd, "masking_smith_bwd");
|
1049 |
+
m.def("pbr_specular_fwd", &pbr_specular_fwd, "pbr_specular_fwd");
|
1050 |
+
m.def("pbr_specular_bwd", &pbr_specular_bwd, "pbr_specular_bwd");
|
1051 |
+
m.def("pbr_bsdf_fwd", &pbr_bsdf_fwd, "pbr_bsdf_fwd");
|
1052 |
+
m.def("pbr_bsdf_bwd", &pbr_bsdf_bwd, "pbr_bsdf_bwd");
|
1053 |
+
m.def("diffuse_cubemap_fwd", &diffuse_cubemap_fwd, "diffuse_cubemap_fwd");
|
1054 |
+
m.def("diffuse_cubemap_bwd", &diffuse_cubemap_bwd, "diffuse_cubemap_bwd");
|
1055 |
+
m.def("specular_bounds", &specular_bounds, "specular_bounds");
|
1056 |
+
m.def("specular_cubemap_fwd", &specular_cubemap_fwd, "specular_cubemap_fwd");
|
1057 |
+
m.def("specular_cubemap_bwd", &specular_cubemap_bwd, "specular_cubemap_bwd");
|
1058 |
+
m.def("image_loss_fwd", &image_loss_fwd, "image_loss_fwd");
|
1059 |
+
m.def("image_loss_bwd", &image_loss_bwd, "image_loss_bwd");
|
1060 |
+
m.def("xfm_fwd", &xfm_fwd, "xfm_fwd");
|
1061 |
+
m.def("xfm_bwd", &xfm_bwd, "xfm_bwd");
|
1062 |
+
}
|
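Note: in `image_loss_fwd` above, the reduced output holds one element per warp-sized tile, so the `(n - 1) / w + 1` expressions are integer ceil divisions. A minimal Python sketch of that sizing rule (illustrative only, not part of the bindings):

    def num_tiles(n, w):
        # integer ceil division: smallest t with t * w >= n
        return (n - 1) // w + 1

    assert num_tiles(7, 4) == 2 and num_tiles(8, 4) == 2 and num_tiles(9, 4) == 3

The Python wrapper `image_loss` in `ops.py` below sums these per-tile partial results and divides by the pixel count to obtain the scalar loss.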
video3d/render/renderutils/c_src/vec3f.h
ADDED
@@ -0,0 +1,109 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

struct vec3f
{
    float x, y, z;

#ifdef __CUDACC__
    __device__ vec3f() { }
    __device__ vec3f(float v) { x = v; y = v; z = v; }
    __device__ vec3f(float _x, float _y, float _z) { x = _x; y = _y; z = _z; }
    __device__ vec3f(float3 v) { x = v.x; y = v.y; z = v.z; }

    __device__ inline vec3f& operator+=(const vec3f& b) { x += b.x; y += b.y; z += b.z; return *this; }
    __device__ inline vec3f& operator-=(const vec3f& b) { x -= b.x; y -= b.y; z -= b.z; return *this; }
    __device__ inline vec3f& operator*=(const vec3f& b) { x *= b.x; y *= b.y; z *= b.z; return *this; }
    __device__ inline vec3f& operator/=(const vec3f& b) { x /= b.x; y /= b.y; z /= b.z; return *this; }
#endif
};

#ifdef __CUDACC__
__device__ static inline vec3f operator+(const vec3f& a, const vec3f& b) { return vec3f(a.x + b.x, a.y + b.y, a.z + b.z); }
__device__ static inline vec3f operator-(const vec3f& a, const vec3f& b) { return vec3f(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ static inline vec3f operator*(const vec3f& a, const vec3f& b) { return vec3f(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ static inline vec3f operator/(const vec3f& a, const vec3f& b) { return vec3f(a.x / b.x, a.y / b.y, a.z / b.z); }
__device__ static inline vec3f operator-(const vec3f& a) { return vec3f(-a.x, -a.y, -a.z); }

__device__ static inline float sum(vec3f a)
{
    return a.x + a.y + a.z;
}

__device__ static inline vec3f cross(vec3f a, vec3f b)
{
    vec3f out;
    out.x = a.y * b.z - a.z * b.y;
    out.y = a.z * b.x - a.x * b.z;
    out.z = a.x * b.y - a.y * b.x;
    return out;
}

__device__ static inline void bwdCross(vec3f a, vec3f b, vec3f &d_a, vec3f &d_b, vec3f d_out)
{
    d_a.x += d_out.z * b.y - d_out.y * b.z;
    d_a.y += d_out.x * b.z - d_out.z * b.x;
    d_a.z += d_out.y * b.x - d_out.x * b.y;

    d_b.x += d_out.y * a.z - d_out.z * a.y;
    d_b.y += d_out.z * a.x - d_out.x * a.z;
    d_b.z += d_out.x * a.y - d_out.y * a.x;
}

__device__ static inline float dot(vec3f a, vec3f b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

__device__ static inline void bwdDot(vec3f a, vec3f b, vec3f& d_a, vec3f& d_b, float d_out)
{
    d_a.x += d_out * b.x; d_a.y += d_out * b.y; d_a.z += d_out * b.z;
    d_b.x += d_out * a.x; d_b.y += d_out * a.y; d_b.z += d_out * a.z;
}

__device__ static inline vec3f reflect(vec3f x, vec3f n)
{
    return n * 2.0f * dot(n, x) - x;
}

__device__ static inline void bwdReflect(vec3f x, vec3f n, vec3f& d_x, vec3f& d_n, const vec3f d_out)
{
    d_x.x += d_out.x * (2 * n.x * n.x - 1) + d_out.y * (2 * n.x * n.y) + d_out.z * (2 * n.x * n.z);
    d_x.y += d_out.x * (2 * n.x * n.y) + d_out.y * (2 * n.y * n.y - 1) + d_out.z * (2 * n.y * n.z);
    d_x.z += d_out.x * (2 * n.x * n.z) + d_out.y * (2 * n.y * n.z) + d_out.z * (2 * n.z * n.z - 1);

    d_n.x += d_out.x * (2 * (2 * n.x * x.x + n.y * x.y + n.z * x.z)) + d_out.y * (2 * n.y * x.x) + d_out.z * (2 * n.z * x.x);
    d_n.y += d_out.x * (2 * n.x * x.y) + d_out.y * (2 * (n.x * x.x + 2 * n.y * x.y + n.z * x.z)) + d_out.z * (2 * n.z * x.y);
    d_n.z += d_out.x * (2 * n.x * x.z) + d_out.y * (2 * n.y * x.z) + d_out.z * (2 * (n.x * x.x + n.y * x.y + 2 * n.z * x.z));
}

__device__ static inline vec3f safeNormalize(vec3f v)
{
    float l = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
    return l > 0.0f ? (v / l) : vec3f(0.0f);
}

__device__ static inline void bwdSafeNormalize(const vec3f v, vec3f& d_v, const vec3f d_out)
{
    float l = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
    if (l > 0.0f)
    {
        float fac = 1.0 / powf(v.x * v.x + v.y * v.y + v.z * v.z, 1.5f);
        d_v.x += (d_out.x * (v.y * v.y + v.z * v.z) - d_out.y * (v.x * v.y) - d_out.z * (v.x * v.z)) * fac;
        d_v.y += (d_out.y * (v.x * v.x + v.z * v.z) - d_out.x * (v.y * v.x) - d_out.z * (v.y * v.z)) * fac;
        d_v.z += (d_out.z * (v.x * v.x + v.y * v.y) - d_out.x * (v.z * v.x) - d_out.y * (v.z * v.y)) * fac;
    }
}

#endif
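For reference, the accumulation in `bwdSafeNormalize` is the transposed Jacobian of the normalization map $n(v) = v / \lVert v\rVert$ applied to the incoming gradient, matching `fac` $= \lVert v\rVert^{-3}$ above:

    \frac{\partial}{\partial v_j}\!\left(\frac{v_i}{\lVert v\rVert}\right)
        = \frac{\delta_{ij}\,\lVert v\rVert^{2} - v_i v_j}{\lVert v\rVert^{3}},
    \qquad
    \bar v_i \mathrel{+}= \sum_j
        \frac{\delta_{ij}\,\lVert v\rVert^{2} - v_i v_j}{\lVert v\rVert^{3}}\;\bar n_j .

At $v = 0$ the guard skips the update, consistent with `safeNormalize` returning the zero vector there.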
video3d/render/renderutils/c_src/vec4f.h
ADDED
@@ -0,0 +1,25 @@
/*
 * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

struct vec4f
{
    float x, y, z, w;

#ifdef __CUDACC__
    __device__ vec4f() { }
    __device__ vec4f(float v) { x = v; y = v; z = v; w = v; }
    __device__ vec4f(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
    __device__ vec4f(float4 v) { x = v.x; y = v.y; z = v.z; w = v.w; }
#endif
};
video3d/render/renderutils/loss.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch

#----------------------------------------------------------------------------
# HDR image losses
#----------------------------------------------------------------------------

def _tonemap_srgb(f):
    return torch.where(f > 0.0031308, torch.pow(torch.clamp(f, min=0.0031308), 1.0/2.4)*1.055 - 0.055, 12.92*f)

def _SMAPE(img, target, eps=0.01):
    nom = torch.abs(img - target)
    denom = torch.abs(img) + torch.abs(target) + eps  # use the eps argument instead of a hard-coded constant
    return torch.mean(nom / denom)

def _RELMSE(img, target, eps=0.1):
    nom = (img - target) * (img - target)
    denom = img * img + target * target + eps  # use the eps argument instead of a hard-coded constant
    return torch.mean(nom / denom)

def image_loss_fn(img, target, loss, tonemapper):
    if tonemapper == 'log_srgb':
        img = _tonemap_srgb(torch.log(torch.clamp(img, min=0, max=65535) + 1))
        target = _tonemap_srgb(torch.log(torch.clamp(target, min=0, max=65535) + 1))

    if loss == 'mse':
        return torch.nn.functional.mse_loss(img, target)
    elif loss == 'smape':
        return _SMAPE(img, target)
    elif loss == 'relmse':
        return _RELMSE(img, target)
    else:
        return torch.nn.functional.l1_loss(img, target)
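A minimal usage sketch of the reference path above (shapes are illustrative; assumes the repo root is on PYTHONPATH):

    import torch
    from video3d.render.renderutils.loss import image_loss_fn

    img    = torch.rand(1, 64, 64, 3, requires_grad=True)
    target = torch.rand(1, 64, 64, 3)

    # Tonemap with log + sRGB, then compare with SMAPE; any of
    # 'l1' / 'mse' / 'smape' / 'relmse' selects the loss branch.
    loss = image_loss_fn(img, target, loss='smape', tonemapper='log_srgb')
    loss.backward()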
video3d/render/renderutils/ops.py
ADDED
@@ -0,0 +1,554 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import numpy as np
import os
import sys
import torch
import torch.utils.cpp_extension

from .bsdf import *
from .loss import *

#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.

_cached_plugin = None
def _get_plugin():
    # Return cached plugin if already loaded.
    global _cached_plugin
    if _cached_plugin is not None:
        return _cached_plugin

    # Make sure we can find the necessary compiler and library binaries.
    if os.name == 'nt':
        def find_cl_path():
            import glob
            for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
                paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
                if paths:
                    return paths[0]

        # If cl.exe is not on path, try to find it.
        if os.system("where cl.exe >nul 2>nul") != 0:
            cl_path = find_cl_path()
            if cl_path is None:
                raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
            os.environ['PATH'] += ';' + cl_path

    # Compiler options.
    opts = ['-DNVDR_TORCH']

    # Linker options.
    if os.name == 'posix':
        ldflags = ['-lcuda', '-lnvrtc']
    elif os.name == 'nt':
        ldflags = ['cuda.lib', 'advapi32.lib', 'nvrtc.lib']

    # List of sources.
    source_files = [
        'c_src/mesh.cu',
        'c_src/loss.cu',
        'c_src/bsdf.cu',
        'c_src/normal.cu',
        'c_src/cubemap.cu',
        'c_src/common.cpp',
        'c_src/torch_bindings.cpp'
    ]

    # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
    os.environ['TORCH_CUDA_ARCH_LIST'] = ''

    # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
    try:
        lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory('renderutils_plugin', False), 'lock')
        if os.path.exists(lock_fn):
            print("Warning: Lock file exists in build directory: '%s'" % lock_fn)
    except:
        pass

    # Compile and load.
    source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
    torch.utils.cpp_extension.load(name='renderutils_plugin', sources=source_paths, extra_cflags=opts,
                                   extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=True)

    # Import, cache, and return the compiled module.
    import renderutils_plugin
    _cached_plugin = renderutils_plugin
    return _cached_plugin
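A hypothetical warm-up sketch (not part of the repo): since `_get_plugin()` JIT-compiles the `c_src/*` sources on first use and caches the resulting module, it can be called once at startup so the first training step does not pay the build cost. `_get_plugin` is module-internal, so treat this strictly as a sketch; the public wrappers below call it lazily in any case.

    import video3d.render.renderutils.ops as ops

    plugin = ops._get_plugin()                 # one-time nvcc build, then cached
    assert hasattr(plugin, 'image_loss_fwd')   # bound in c_src/torch_bindings.cpp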

#----------------------------------------------------------------------------
# Internal kernels, just used for testing functionality

class _fresnel_shlick_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, f0, f90, cosTheta):
        out = _get_plugin().fresnel_shlick_fwd(f0, f90, cosTheta, False)
        ctx.save_for_backward(f0, f90, cosTheta)
        return out

    @staticmethod
    def backward(ctx, dout):
        f0, f90, cosTheta = ctx.saved_variables
        return _get_plugin().fresnel_shlick_bwd(f0, f90, cosTheta, dout) + (None,)

def _fresnel_shlick(f0, f90, cosTheta, use_python=False):
    if use_python:
        out = bsdf_fresnel_shlick(f0, f90, cosTheta)
    else:
        out = _fresnel_shlick_func.apply(f0, f90, cosTheta)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of _fresnel_shlick contains inf or NaN"
    return out


class _ndf_ggx_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, alphaSqr, cosTheta):
        out = _get_plugin().ndf_ggx_fwd(alphaSqr, cosTheta, False)
        ctx.save_for_backward(alphaSqr, cosTheta)
        return out

    @staticmethod
    def backward(ctx, dout):
        alphaSqr, cosTheta = ctx.saved_variables
        return _get_plugin().ndf_ggx_bwd(alphaSqr, cosTheta, dout) + (None,)

def _ndf_ggx(alphaSqr, cosTheta, use_python=False):
    if use_python:
        out = bsdf_ndf_ggx(alphaSqr, cosTheta)
    else:
        out = _ndf_ggx_func.apply(alphaSqr, cosTheta)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of _ndf_ggx contains inf or NaN"
    return out

class _lambda_ggx_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, alphaSqr, cosTheta):
        out = _get_plugin().lambda_ggx_fwd(alphaSqr, cosTheta, False)
        ctx.save_for_backward(alphaSqr, cosTheta)
        return out

    @staticmethod
    def backward(ctx, dout):
        alphaSqr, cosTheta = ctx.saved_variables
        return _get_plugin().lambda_ggx_bwd(alphaSqr, cosTheta, dout) + (None,)

def _lambda_ggx(alphaSqr, cosTheta, use_python=False):
    if use_python:
        out = bsdf_lambda_ggx(alphaSqr, cosTheta)
    else:
        out = _lambda_ggx_func.apply(alphaSqr, cosTheta)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of _lambda_ggx contains inf or NaN"
    return out

class _masking_smith_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, alphaSqr, cosThetaI, cosThetaO):
        ctx.save_for_backward(alphaSqr, cosThetaI, cosThetaO)
        out = _get_plugin().masking_smith_fwd(alphaSqr, cosThetaI, cosThetaO, False)
        return out

    @staticmethod
    def backward(ctx, dout):
        alphaSqr, cosThetaI, cosThetaO = ctx.saved_variables
        return _get_plugin().masking_smith_bwd(alphaSqr, cosThetaI, cosThetaO, dout) + (None,)

def _masking_smith(alphaSqr, cosThetaI, cosThetaO, use_python=False):
    if use_python:
        out = bsdf_masking_smith_ggx_correlated(alphaSqr, cosThetaI, cosThetaO)
    else:
        out = _masking_smith_func.apply(alphaSqr, cosThetaI, cosThetaO)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of _masking_smith contains inf or NaN"
    return out

#----------------------------------------------------------------------------
# Shading normal setup (bump mapping + bent normals)

class _prepare_shading_normal_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl):
        ctx.two_sided_shading, ctx.opengl = two_sided_shading, opengl
        out = _get_plugin().prepare_shading_normal_fwd(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl, False)
        ctx.save_for_backward(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm)
        return out

    @staticmethod
    def backward(ctx, dout):
        pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm = ctx.saved_variables
        return _get_plugin().prepare_shading_normal_bwd(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, dout, ctx.two_sided_shading, ctx.opengl) + (None, None, None)

def prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading=True, opengl=True, use_python=False):
    '''Takes care of all corner cases and produces a final normal used for shading:
        - Constructs tangent space
        - Flips normal direction based on geometric normal for two-sided shading
        - Perturbs shading normal by normal map
        - Bends backfacing normals towards the camera to avoid shading artifacts

        All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent.

    Args:
        pos: World space g-buffer position.
        view_pos: Camera position in world space (typically using broadcasting).
        perturbed_nrm: Tangent-space normal perturbation from normal map lookup.
        smooth_nrm: Interpolated vertex normals.
        smooth_tng: Interpolated vertex tangents.
        geom_nrm: Geometric (face) normals.
        two_sided_shading: Use one/two sided shading
        opengl: Use OpenGL/DirectX normal map conventions
        use_python: Use PyTorch implementation (for validation)
    Returns:
        Final shading normal
    '''

    if perturbed_nrm is None:
        perturbed_nrm = torch.tensor([0, 0, 1], dtype=torch.float32, device='cuda', requires_grad=False)[None, None, None, ...]

    if use_python:
        out = bsdf_prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl)
    else:
        out = _prepare_shading_normal_func.apply(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of prepare_shading_normal contains inf or NaN"
    return out
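An illustrative call with random g-buffers (shapes follow the docstring; CUDA device assumed, and `video3d.render.renderutils` is assumed to re-export these functions the way the bundled tests import `renderutils`):

    import torch
    import video3d.render.renderutils as ru

    B, H, W = 1, 32, 32
    pos      = torch.rand(B, H, W, 3, device='cuda')
    view_pos = torch.rand(1, 1, 1, 3, device='cuda')   # broadcast camera position
    nrm      = torch.rand(B, H, W, 3, device='cuda')   # interpolated vertex normals
    tng      = torch.rand(B, H, W, 3, device='cuda')   # interpolated vertex tangents
    geom     = torch.rand(B, H, W, 3, device='cuda')   # face normals

    # perturbed_nrm=None selects the default (0, 0, 1) tangent-space normal above.
    shading_nrm = ru.prepare_shading_normal(pos, view_pos, None, nrm, tng, geom,
                                            two_sided_shading=True, opengl=True)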

#----------------------------------------------------------------------------
# BSDF functions

class _lambert_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, nrm, wi):
        out = _get_plugin().lambert_fwd(nrm, wi, False)
        ctx.save_for_backward(nrm, wi)
        return out

    @staticmethod
    def backward(ctx, dout):
        nrm, wi = ctx.saved_variables
        return _get_plugin().lambert_bwd(nrm, wi, dout) + (None,)

def lambert(nrm, wi, use_python=False):
    '''Lambertian bsdf.
    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent.

    Args:
        nrm: World space shading normal.
        wi: World space light vector.
        use_python: Use PyTorch implementation (for validation)

    Returns:
        Shaded diffuse value with shape [minibatch_size, height, width, 1]
    '''

    if use_python:
        out = bsdf_lambert(nrm, wi)
    else:
        out = _lambert_func.apply(nrm, wi)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of lambert contains inf or NaN"
    return out

class _frostbite_diffuse_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, nrm, wi, wo, linearRoughness):
        out = _get_plugin().frostbite_fwd(nrm, wi, wo, linearRoughness, False)
        ctx.save_for_backward(nrm, wi, wo, linearRoughness)
        return out

    @staticmethod
    def backward(ctx, dout):
        nrm, wi, wo, linearRoughness = ctx.saved_variables
        return _get_plugin().frostbite_bwd(nrm, wi, wo, linearRoughness, dout) + (None,)

def frostbite_diffuse(nrm, wi, wo, linearRoughness, use_python=False):
    '''Frostbite, normalized Disney Diffuse bsdf.
    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent.

    Args:
        nrm: World space shading normal.
        wi: World space light vector.
        wo: World space camera vector.
        linearRoughness: Material roughness
        use_python: Use PyTorch implementation (for validation)

    Returns:
        Shaded diffuse value with shape [minibatch_size, height, width, 1]
    '''

    if use_python:
        out = bsdf_frostbite(nrm, wi, wo, linearRoughness)
    else:
        out = _frostbite_diffuse_func.apply(nrm, wi, wo, linearRoughness)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of frostbite_diffuse contains inf or NaN"
    return out

class _pbr_specular_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, col, nrm, wo, wi, alpha, min_roughness):
        ctx.save_for_backward(col, nrm, wo, wi, alpha)
        ctx.min_roughness = min_roughness
        out = _get_plugin().pbr_specular_fwd(col, nrm, wo, wi, alpha, min_roughness, False)
        return out

    @staticmethod
    def backward(ctx, dout):
        col, nrm, wo, wi, alpha = ctx.saved_variables
        return _get_plugin().pbr_specular_bwd(col, nrm, wo, wi, alpha, ctx.min_roughness, dout) + (None, None)

def pbr_specular(col, nrm, wo, wi, alpha, min_roughness=0.08, use_python=False):
    '''Physically-based specular bsdf.
    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent unless otherwise noted.

    Args:
        col: Specular lobe color
        nrm: World space shading normal.
        wo: World space camera vector.
        wi: World space light vector
        alpha: Specular roughness parameter with shape [minibatch_size, height, width, 1]
        min_roughness: Scalar roughness clamping threshold
        use_python: Use PyTorch implementation (for validation)

    Returns:
        Shaded specular color
    '''

    if use_python:
        out = bsdf_pbr_specular(col, nrm, wo, wi, alpha, min_roughness=min_roughness)
    else:
        out = _pbr_specular_func.apply(col, nrm, wo, wi, alpha, min_roughness)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of pbr_specular contains inf or NaN"
    return out

class _pbr_bsdf_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF):
        ctx.save_for_backward(kd, arm, pos, nrm, view_pos, light_pos)
        ctx.min_roughness = min_roughness
        ctx.BSDF = BSDF
        out = _get_plugin().pbr_bsdf_fwd(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF, False)
        return out

    @staticmethod
    def backward(ctx, dout):
        kd, arm, pos, nrm, view_pos, light_pos = ctx.saved_variables
        return _get_plugin().pbr_bsdf_bwd(kd, arm, pos, nrm, view_pos, light_pos, ctx.min_roughness, ctx.BSDF, dout) + (None, None, None)

def pbr_bsdf(kd, arm, pos, nrm, view_pos, light_pos, min_roughness=0.08, bsdf="lambert", use_python=False):
    '''Physically-based bsdf, both diffuse & specular lobes
    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent unless otherwise noted.

    Args:
        kd: Diffuse albedo.
        arm: Specular parameters (attenuation, linear roughness, metalness).
        pos: World space position.
        nrm: World space shading normal.
        view_pos: Camera position in world space, typically using broadcasting.
        light_pos: Light position in world space, typically using broadcasting.
        min_roughness: Scalar roughness clamping threshold
        bsdf: Controls diffuse BSDF, can be either 'lambert' or 'frostbite'
        use_python: Use PyTorch implementation (for validation)

    Returns:
        Shaded color.
    '''

    BSDF = 0
    if bsdf == 'frostbite':
        BSDF = 1

    if use_python:
        out = bsdf_pbr(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF)
    else:
        out = _pbr_bsdf_func.apply(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF)

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of pbr_bsdf contains inf or NaN"
    return out
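An illustrative full-BSDF shading call (random CUDA inputs; `arm` packs attenuation, linear roughness, and metalness per the docstring; the same package-level import assumption as above):

    import torch
    import video3d.render.renderutils as ru

    B, H, W = 1, 32, 32
    kd        = torch.rand(B, H, W, 3, device='cuda')
    arm       = torch.rand(B, H, W, 3, device='cuda')
    pos       = torch.rand(B, H, W, 3, device='cuda')
    nrm       = torch.nn.functional.normalize(torch.rand(B, H, W, 3, device='cuda'), dim=-1)
    view_pos  = torch.rand(1, 1, 1, 3, device='cuda')   # broadcast camera
    light_pos = torch.rand(1, 1, 1, 3, device='cuda')   # broadcast light

    col = ru.pbr_bsdf(kd, arm, pos, nrm, view_pos, light_pos, bsdf='frostbite')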

#----------------------------------------------------------------------------
# cubemap filter with filtering across edges

class _diffuse_cubemap_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, cubemap):
        out = _get_plugin().diffuse_cubemap_fwd(cubemap)
        ctx.save_for_backward(cubemap)
        return out

    @staticmethod
    def backward(ctx, dout):
        cubemap, = ctx.saved_variables
        cubemap_grad = _get_plugin().diffuse_cubemap_bwd(cubemap, dout)
        return cubemap_grad, None

def diffuse_cubemap(cubemap, use_python=False):
    if use_python:
        assert False
    else:
        out = _diffuse_cubemap_func.apply(cubemap)
    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of diffuse_cubemap contains inf or NaN"
    return out

class _specular_cubemap(torch.autograd.Function):
    @staticmethod
    def forward(ctx, cubemap, roughness, costheta_cutoff, bounds):
        out = _get_plugin().specular_cubemap_fwd(cubemap, bounds, roughness, costheta_cutoff)
        ctx.save_for_backward(cubemap, bounds)
        ctx.roughness, ctx.theta_cutoff = roughness, costheta_cutoff
        return out

    @staticmethod
    def backward(ctx, dout):
        cubemap, bounds = ctx.saved_variables
        cubemap_grad = _get_plugin().specular_cubemap_bwd(cubemap, bounds, dout, ctx.roughness, ctx.theta_cutoff)
        return cubemap_grad, None, None, None

# Compute the bounds of the GGX NDF lobe to retain "cutoff" percent of the energy
def __ndfBounds(res, roughness, cutoff):
    def ndfGGX(alphaSqr, costheta):
        costheta = np.clip(costheta, 0.0, 1.0)
        d = (costheta * alphaSqr - costheta) * costheta + 1.0
        return alphaSqr / (d * d * np.pi)

    # Sample out cutoff angle
    nSamples = 1000000
    costheta = np.cos(np.linspace(0, np.pi/2.0, nSamples))
    D = np.cumsum(ndfGGX(roughness**4, costheta))
    idx = np.argmax(D >= D[..., -1] * cutoff)

    # Brute force compute lookup table with bounds
    bounds = _get_plugin().specular_bounds(res, costheta[idx])

    return costheta[idx], bounds
__ndfBoundsDict = {}

def specular_cubemap(cubemap, roughness, cutoff=0.99, use_python=False):
    assert cubemap.shape[0] == 6 and cubemap.shape[1] == cubemap.shape[2], "Bad shape for cubemap tensor: %s" % str(cubemap.shape)

    if use_python:
        assert False
    else:
        key = (cubemap.shape[1], roughness, cutoff)
        if key not in __ndfBoundsDict:
            __ndfBoundsDict[key] = __ndfBounds(*key)
        out = _specular_cubemap.apply(cubemap, roughness, *__ndfBoundsDict[key])
    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of specular_cubemap contains inf or NaN"
    return out[..., 0:3] / out[..., 3:]

#----------------------------------------------------------------------------
# Fast image loss function

class _image_loss_func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, img, target, loss, tonemapper):
        ctx.loss, ctx.tonemapper = loss, tonemapper
        ctx.save_for_backward(img, target)
        out = _get_plugin().image_loss_fwd(img, target, loss, tonemapper, False)
        return out

    @staticmethod
    def backward(ctx, dout):
        img, target = ctx.saved_variables
        return _get_plugin().image_loss_bwd(img, target, dout, ctx.loss, ctx.tonemapper) + (None, None, None)

def image_loss(img, target, loss='l1', tonemapper='none', use_python=False):
    '''Compute HDR image loss. Combines tonemapping and loss into a single kernel for better perf.
    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent unless otherwise noted.

    Args:
        img: Input image.
        target: Target (reference) image.
        loss: Type of loss. Valid options are ['l1', 'mse', 'smape', 'relmse']
        tonemapper: Tonemapping operations. Valid options are ['none', 'log_srgb']
        use_python: Use PyTorch implementation (for validation)

    Returns:
        Image space loss (scalar value).
    '''
    if use_python:
        out = image_loss_fn(img, target, loss, tonemapper)
    else:
        out = _image_loss_func.apply(img, target, loss, tonemapper)
        out = torch.sum(out) / (img.shape[0]*img.shape[1]*img.shape[2])

    if torch.is_anomaly_enabled():
        assert torch.all(torch.isfinite(out)), "Output of image_loss contains inf or NaN"
    return out
|
500 |
+
#----------------------------------------------------------------------------
|
501 |
+
# Transform points function
|
502 |
+
|
503 |
+
class _xfm_func(torch.autograd.Function):
|
504 |
+
@staticmethod
|
505 |
+
def forward(ctx, points, matrix, isPoints):
|
506 |
+
ctx.save_for_backward(points, matrix)
|
507 |
+
ctx.isPoints = isPoints
|
508 |
+
return _get_plugin().xfm_fwd(points, matrix, isPoints, False)
|
509 |
+
|
510 |
+
@staticmethod
|
511 |
+
def backward(ctx, dout):
|
512 |
+
points, matrix = ctx.saved_variables
|
513 |
+
return (_get_plugin().xfm_bwd(points, matrix, dout, ctx.isPoints),) + (None, None, None)
|
514 |
+
|
515 |
+
def xfm_points(points, matrix, use_python=False):
|
516 |
+
'''Transform points.
|
517 |
+
Args:
|
518 |
+
points: Tensor containing 3D points with shape [minibatch_size, num_vertices, 3] or [1, num_vertices, 3]
|
519 |
+
matrix: A 4x4 transform matrix with shape [minibatch_size, 4, 4]
|
520 |
+
use_python: Use PyTorch's torch.matmul (for validation)
|
521 |
+
Returns:
|
522 |
+
Transformed points in homogeneous 4D with shape [minibatch_size, num_vertices, 4].
|
523 |
+
'''
|
524 |
+
if use_python:
|
525 |
+
out = torch.matmul(torch.nn.functional.pad(points, pad=(0,1), mode='constant', value=1.0), torch.transpose(matrix, 1, 2))
|
526 |
+
else:
|
527 |
+
out = _xfm_func.apply(points, matrix, True)
|
528 |
+
|
529 |
+
if torch.is_anomaly_enabled():
|
530 |
+
assert torch.all(torch.isfinite(out)), "Output of xfm_points contains inf or NaN"
|
531 |
+
return out
|
532 |
+
|
533 |
+
def xfm_vectors(vectors, matrix, use_python=False):
|
534 |
+
'''Transform vectors.
|
535 |
+
Args:
|
536 |
+
vectors: Tensor containing 3D vectors with shape [minibatch_size, num_vertices, 3] or [1, num_vertices, 3]
|
537 |
+
matrix: A 4x4 transform matrix with shape [minibatch_size, 4, 4]
|
538 |
+
use_python: Use PyTorch's torch.matmul (for validation)
|
539 |
+
|
540 |
+
Returns:
|
541 |
+
Transformed vectors in homogeneous 4D with shape [minibatch_size, num_vertices, 4].
|
542 |
+
'''
|
543 |
+
|
544 |
+
if use_python:
|
545 |
+
out = torch.matmul(torch.nn.functional.pad(vectors, pad=(0,1), mode='constant', value=0.0), torch.transpose(matrix, 1, 2))[..., 0:3].contiguous()
|
546 |
+
else:
|
547 |
+
out = _xfm_func.apply(vectors, matrix, False)
|
548 |
+
|
549 |
+
if torch.is_anomaly_enabled():
|
550 |
+
assert torch.all(torch.isfinite(out)), "Output of xfm_vectors contains inf or NaN"
|
551 |
+
return out
|
552 |
+
|
553 |
+
|
554 |
+
|
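An illustrative transform call (shapes follow the docstrings; the identity matrix is used just to keep the sketch self-checking):

    import torch
    import video3d.render.renderutils as ru

    verts = torch.rand(1, 1000, 3, device='cuda')
    mvp   = torch.eye(4, device='cuda')[None, ...]        # [1, 4, 4]

    clip = ru.xfm_points(verts, mvp)                      # -> [1, 1000, 4] homogeneous
    ref  = ru.xfm_points(verts, mvp, use_python=True)     # matmul reference path
    # with the identity transform, clip[..., :3] matches verts up to dtype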
video3d/render/renderutils/tests/test_bsdf.py
ADDED
@@ -0,0 +1,296 @@
1 |
+
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
4 |
+
# property and proprietary rights in and to this material, related
|
5 |
+
# documentation and any modifications thereto. Any use, reproduction,
|
6 |
+
# disclosure or distribution of this material and related documentation
|
7 |
+
# without an express license agreement from NVIDIA CORPORATION or
|
8 |
+
# its affiliates is strictly prohibited.
|
9 |
+
|
10 |
+
import torch
|
11 |
+
|
12 |
+
import os
|
13 |
+
import sys
|
14 |
+
sys.path.insert(0, os.path.join(sys.path[0], '../..'))
|
15 |
+
import renderutils as ru
|
16 |
+
|
17 |
+
RES = 4
|
18 |
+
DTYPE = torch.float32
|
19 |
+
|
20 |
+
def relative_loss(name, ref, cuda):
|
21 |
+
ref = ref.float()
|
22 |
+
cuda = cuda.float()
|
23 |
+
print(name, torch.max(torch.abs(ref - cuda) / torch.abs(ref + 1e-7)).item())
|
24 |
+
|
25 |
+
def test_normal():
|
26 |
+
pos_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
27 |
+
pos_ref = pos_cuda.clone().detach().requires_grad_(True)
|
28 |
+
view_pos_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
29 |
+
view_pos_ref = view_pos_cuda.clone().detach().requires_grad_(True)
|
30 |
+
perturbed_nrm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
31 |
+
perturbed_nrm_ref = perturbed_nrm_cuda.clone().detach().requires_grad_(True)
|
32 |
+
smooth_nrm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
33 |
+
smooth_nrm_ref = smooth_nrm_cuda.clone().detach().requires_grad_(True)
|
34 |
+
smooth_tng_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
35 |
+
smooth_tng_ref = smooth_tng_cuda.clone().detach().requires_grad_(True)
|
36 |
+
geom_nrm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
37 |
+
geom_nrm_ref = geom_nrm_cuda.clone().detach().requires_grad_(True)
|
38 |
+
target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')
|
39 |
+
|
40 |
+
ref = ru.prepare_shading_normal(pos_ref, view_pos_ref, perturbed_nrm_ref, smooth_nrm_ref, smooth_tng_ref, geom_nrm_ref, True, use_python=True)
|
41 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
42 |
+
ref_loss.backward()
|
43 |
+
|
44 |
+
cuda = ru.prepare_shading_normal(pos_cuda, view_pos_cuda, perturbed_nrm_cuda, smooth_nrm_cuda, smooth_tng_cuda, geom_nrm_cuda, True)
|
45 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
46 |
+
cuda_loss.backward()
|
47 |
+
|
48 |
+
print("-------------------------------------------------------------")
|
49 |
+
print(" bent normal")
|
50 |
+
print("-------------------------------------------------------------")
|
51 |
+
relative_loss("res:", ref, cuda)
|
52 |
+
relative_loss("pos:", pos_ref.grad, pos_cuda.grad)
|
53 |
+
relative_loss("view_pos:", view_pos_ref.grad, view_pos_cuda.grad)
|
54 |
+
relative_loss("perturbed_nrm:", perturbed_nrm_ref.grad, perturbed_nrm_cuda.grad)
|
55 |
+
relative_loss("smooth_nrm:", smooth_nrm_ref.grad, smooth_nrm_cuda.grad)
|
56 |
+
relative_loss("smooth_tng:", smooth_tng_ref.grad, smooth_tng_cuda.grad)
|
57 |
+
relative_loss("geom_nrm:", geom_nrm_ref.grad, geom_nrm_cuda.grad)
|
58 |
+
|
59 |
+
def test_schlick():
|
60 |
+
f0_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
61 |
+
f0_ref = f0_cuda.clone().detach().requires_grad_(True)
|
62 |
+
f90_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
63 |
+
f90_ref = f90_cuda.clone().detach().requires_grad_(True)
|
64 |
+
cosT_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True) * 2.0
|
65 |
+
cosT_cuda = cosT_cuda.clone().detach().requires_grad_(True)
|
66 |
+
cosT_ref = cosT_cuda.clone().detach().requires_grad_(True)
|
67 |
+
target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')
|
68 |
+
|
69 |
+
ref = ru._fresnel_shlick(f0_ref, f90_ref, cosT_ref, use_python=True)
|
70 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
71 |
+
ref_loss.backward()
|
72 |
+
|
73 |
+
cuda = ru._fresnel_shlick(f0_cuda, f90_cuda, cosT_cuda)
|
74 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
75 |
+
cuda_loss.backward()
|
76 |
+
|
77 |
+
print("-------------------------------------------------------------")
|
78 |
+
print(" Fresnel shlick")
|
79 |
+
print("-------------------------------------------------------------")
|
80 |
+
relative_loss("res:", ref, cuda)
|
81 |
+
relative_loss("f0:", f0_ref.grad, f0_cuda.grad)
|
82 |
+
relative_loss("f90:", f90_ref.grad, f90_cuda.grad)
|
83 |
+
relative_loss("cosT:", cosT_ref.grad, cosT_cuda.grad)
|
84 |
+
|
85 |
+
def test_ndf_ggx():
|
86 |
+
alphaSqr_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
|
87 |
+
alphaSqr_cuda = alphaSqr_cuda.clone().detach().requires_grad_(True)
|
88 |
+
alphaSqr_ref = alphaSqr_cuda.clone().detach().requires_grad_(True)
|
89 |
+
cosT_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True) * 3.0 - 1
|
90 |
+
cosT_cuda = cosT_cuda.clone().detach().requires_grad_(True)
|
91 |
+
cosT_ref = cosT_cuda.clone().detach().requires_grad_(True)
|
92 |
+
target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')
|
93 |
+
|
94 |
+
ref = ru._ndf_ggx(alphaSqr_ref, cosT_ref, use_python=True)
|
95 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
96 |
+
ref_loss.backward()
|
97 |
+
|
98 |
+
cuda = ru._ndf_ggx(alphaSqr_cuda, cosT_cuda)
|
99 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
100 |
+
cuda_loss.backward()
|
101 |
+
|
102 |
+
print("-------------------------------------------------------------")
|
103 |
+
print(" Ndf GGX")
|
104 |
+
print("-------------------------------------------------------------")
|
105 |
+
relative_loss("res:", ref, cuda)
|
106 |
+
relative_loss("alpha:", alphaSqr_ref.grad, alphaSqr_cuda.grad)
|
107 |
+
relative_loss("cosT:", cosT_ref.grad, cosT_cuda.grad)
|
108 |
+
|
109 |
+
def test_lambda_ggx():
|
110 |
+
alphaSqr_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
|
111 |
+
alphaSqr_ref = alphaSqr_cuda.clone().detach().requires_grad_(True)
|
112 |
+
cosT_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True) * 3.0 - 1
|
113 |
+
cosT_cuda = cosT_cuda.clone().detach().requires_grad_(True)
|
114 |
+
cosT_ref = cosT_cuda.clone().detach().requires_grad_(True)
|
115 |
+
target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')
|
116 |
+
|
117 |
+
ref = ru._lambda_ggx(alphaSqr_ref, cosT_ref, use_python=True)
|
118 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
119 |
+
ref_loss.backward()
|
120 |
+
|
121 |
+
cuda = ru._lambda_ggx(alphaSqr_cuda, cosT_cuda)
|
122 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
123 |
+
cuda_loss.backward()
|
124 |
+
|
125 |
+
print("-------------------------------------------------------------")
|
126 |
+
print(" Lambda GGX")
|
127 |
+
print("-------------------------------------------------------------")
|
128 |
+
relative_loss("res:", ref, cuda)
|
129 |
+
relative_loss("alpha:", alphaSqr_ref.grad, alphaSqr_cuda.grad)
|
130 |
+
relative_loss("cosT:", cosT_ref.grad, cosT_cuda.grad)
|
131 |
+
|
132 |
+
def test_masking_smith():
|
133 |
+
alphaSqr_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
|
134 |
+
alphaSqr_ref = alphaSqr_cuda.clone().detach().requires_grad_(True)
|
135 |
+
cosThetaI_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
|
136 |
+
cosThetaI_ref = cosThetaI_cuda.clone().detach().requires_grad_(True)
|
137 |
+
cosThetaO_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
|
138 |
+
cosThetaO_ref = cosThetaO_cuda.clone().detach().requires_grad_(True)
|
139 |
+
target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')
|
140 |
+
|
141 |
+
ref = ru._masking_smith(alphaSqr_ref, cosThetaI_ref, cosThetaO_ref, use_python=True)
|
142 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
143 |
+
ref_loss.backward()
|
144 |
+
|
145 |
+
cuda = ru._masking_smith(alphaSqr_cuda, cosThetaI_cuda, cosThetaO_cuda)
|
146 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
147 |
+
cuda_loss.backward()
|
148 |
+
|
149 |
+
print("-------------------------------------------------------------")
|
150 |
+
print(" Smith masking term")
|
151 |
+
print("-------------------------------------------------------------")
|
152 |
+
relative_loss("res:", ref, cuda)
|
153 |
+
relative_loss("alpha:", alphaSqr_ref.grad, alphaSqr_cuda.grad)
|
154 |
+
relative_loss("cosThetaI:", cosThetaI_ref.grad, cosThetaI_cuda.grad)
|
155 |
+
relative_loss("cosThetaO:", cosThetaO_ref.grad, cosThetaO_cuda.grad)
|
156 |
+
|
157 |
+
def test_lambert():
|
158 |
+
normals_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
159 |
+
normals_ref = normals_cuda.clone().detach().requires_grad_(True)
|
160 |
+
wi_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
161 |
+
wi_ref = wi_cuda.clone().detach().requires_grad_(True)
|
162 |
+
target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')
|
163 |
+
|
164 |
+
ref = ru.lambert(normals_ref, wi_ref, use_python=True)
|
165 |
+
ref_loss = torch.nn.MSELoss()(ref, target)
|
166 |
+
ref_loss.backward()
|
167 |
+
|
168 |
+
cuda = ru.lambert(normals_cuda, wi_cuda)
|
169 |
+
cuda_loss = torch.nn.MSELoss()(cuda, target)
|
170 |
+
cuda_loss.backward()
|
171 |
+
|
172 |
+
print("-------------------------------------------------------------")
|
173 |
+
print(" Lambert")
|
174 |
+
print("-------------------------------------------------------------")
|
175 |
+
relative_loss("res:", ref, cuda)
|
176 |
+
relative_loss("nrm:", normals_ref.grad, normals_cuda.grad)
|
177 |
+
relative_loss("wi:", wi_ref.grad, wi_cuda.grad)
|
178 |
+
|
179 |
+
def test_frostbite():
|
180 |
+
normals_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
181 |
+
normals_ref = normals_cuda.clone().detach().requires_grad_(True)
|
182 |
+
wi_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
183 |
+
wi_ref = wi_cuda.clone().detach().requires_grad_(True)
|
184 |
+
wo_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
|
185 |
+
    wo_ref = wo_cuda.clone().detach().requires_grad_(True)
    rough_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
    rough_ref = rough_cuda.clone().detach().requires_grad_(True)
    target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')

    ref = ru.frostbite_diffuse(normals_ref, wi_ref, wo_ref, rough_ref, use_python=True)
    ref_loss = torch.nn.MSELoss()(ref, target)
    ref_loss.backward()

    cuda = ru.frostbite_diffuse(normals_cuda, wi_cuda, wo_cuda, rough_cuda)
    cuda_loss = torch.nn.MSELoss()(cuda, target)
    cuda_loss.backward()

    print("-------------------------------------------------------------")
    print("    Frostbite")
    print("-------------------------------------------------------------")
    relative_loss("res:", ref, cuda)
    relative_loss("nrm:", normals_ref.grad, normals_cuda.grad)
    relative_loss("wo:", wo_ref.grad, wo_cuda.grad)
    relative_loss("wi:", wi_ref.grad, wi_cuda.grad)
    relative_loss("rough:", rough_ref.grad, rough_cuda.grad)

def test_pbr_specular():
    col_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    col_ref = col_cuda.clone().detach().requires_grad_(True)
    nrm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    nrm_ref = nrm_cuda.clone().detach().requires_grad_(True)
    wi_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    wi_ref = wi_cuda.clone().detach().requires_grad_(True)
    wo_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    wo_ref = wo_cuda.clone().detach().requires_grad_(True)
    alpha_cuda = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
    alpha_ref = alpha_cuda.clone().detach().requires_grad_(True)
    target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')

    ref = ru.pbr_specular(col_ref, nrm_ref, wo_ref, wi_ref, alpha_ref, use_python=True)
    ref_loss = torch.nn.MSELoss()(ref, target)
    ref_loss.backward()

    cuda = ru.pbr_specular(col_cuda, nrm_cuda, wo_cuda, wi_cuda, alpha_cuda)
    cuda_loss = torch.nn.MSELoss()(cuda, target)
    cuda_loss.backward()

    print("-------------------------------------------------------------")
    print("    Pbr specular")
    print("-------------------------------------------------------------")

    relative_loss("res:", ref, cuda)
    if col_ref.grad is not None:
        relative_loss("col:", col_ref.grad, col_cuda.grad)
    if nrm_ref.grad is not None:
        relative_loss("nrm:", nrm_ref.grad, nrm_cuda.grad)
    if wi_ref.grad is not None:
        relative_loss("wi:", wi_ref.grad, wi_cuda.grad)
    if wo_ref.grad is not None:
        relative_loss("wo:", wo_ref.grad, wo_cuda.grad)
    if alpha_ref.grad is not None:
        relative_loss("alpha:", alpha_ref.grad, alpha_cuda.grad)

def test_pbr_bsdf(bsdf):
    kd_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    kd_ref = kd_cuda.clone().detach().requires_grad_(True)
    arm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    arm_ref = arm_cuda.clone().detach().requires_grad_(True)
    pos_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    pos_ref = pos_cuda.clone().detach().requires_grad_(True)
    nrm_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    nrm_ref = nrm_cuda.clone().detach().requires_grad_(True)
    view_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    view_ref = view_cuda.clone().detach().requires_grad_(True)
    light_cuda = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    light_ref = light_cuda.clone().detach().requires_grad_(True)
    target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')

    ref = ru.pbr_bsdf(kd_ref, arm_ref, pos_ref, nrm_ref, view_ref, light_ref, use_python=True, bsdf=bsdf)
    ref_loss = torch.nn.MSELoss()(ref, target)
    ref_loss.backward()

    cuda = ru.pbr_bsdf(kd_cuda, arm_cuda, pos_cuda, nrm_cuda, view_cuda, light_cuda, bsdf=bsdf)
    cuda_loss = torch.nn.MSELoss()(cuda, target)
    cuda_loss.backward()

    print("-------------------------------------------------------------")
    print("    Pbr BSDF")
    print("-------------------------------------------------------------")

    relative_loss("res:", ref, cuda)
    if kd_ref.grad is not None:
        relative_loss("kd:", kd_ref.grad, kd_cuda.grad)
    if arm_ref.grad is not None:
        relative_loss("arm:", arm_ref.grad, arm_cuda.grad)
    if pos_ref.grad is not None:
        relative_loss("pos:", pos_ref.grad, pos_cuda.grad)
    if nrm_ref.grad is not None:
        relative_loss("nrm:", nrm_ref.grad, nrm_cuda.grad)
    if view_ref.grad is not None:
        relative_loss("view:", view_ref.grad, view_cuda.grad)
    if light_ref.grad is not None:
        relative_loss("light:", light_ref.grad, light_cuda.grad)

test_normal()

test_schlick()
test_ndf_ggx()
test_lambda_ggx()
test_masking_smith()

test_lambert()
test_frostbite()
test_pbr_specular()
test_pbr_bsdf('lambert')
test_pbr_bsdf('frostbite')
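Note on the pattern above: each test clones its CUDA inputs into detached *_ref copies, evaluates the Python reference path (use_python=True) and the CUDA kernel on identical data, backpropagates an MSE loss against a random target through both, and prints the maximum relative error of the outputs and of each input gradient. As a minimal sketch (not part of the committed tests), the same comparison could be wrapped in an assertion so a kernel regression fails loudly instead of only printing; the tol threshold below is an assumption, not a value from this repo:

    import torch

    def assert_close(name, ref, cuda, tol=1e-4):
        # Hypothetical helper: same max-relative-error metric as the
        # relative_loss() helper used throughout these tests, with a small
        # epsilon in the denominator to avoid division by zero.
        ref, cuda = ref.float(), cuda.float()
        rel = torch.max(torch.abs(ref - cuda) / torch.abs(ref + 1e-7)).item()
        print(name, rel)
        assert rel < tol, f"{name} relative error {rel:.3e} exceeds tolerance {tol:.1e}"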
video3d/render/renderutils/tests/test_cubemap.py
ADDED
@@ -0,0 +1,47 @@
# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import torch

import os
import sys
sys.path.insert(0, os.path.join(sys.path[0], '../..'))
import renderutils as ru

RES = 4
DTYPE = torch.float32

def relative_loss(name, ref, cuda):
    ref = ref.float()
    cuda = cuda.float()
    print(name, torch.max(torch.abs(ref - cuda) / torch.abs(ref + 1e-7)).item())

def test_cubemap():
    cubemap_cuda = torch.rand(6, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
    cubemap_ref = cubemap_cuda.clone().detach().requires_grad_(True)
    weights = torch.rand(3, 3, 1, dtype=DTYPE, device='cuda')
    target = torch.rand(6, RES, RES, 3, dtype=DTYPE, device='cuda')

    ref = ru.filter_cubemap(cubemap_ref, weights, use_python=True)
    ref_loss = torch.nn.MSELoss()(ref, target)
    ref_loss.backward()

    cuda = ru.filter_cubemap(cubemap_cuda, weights, use_python=False)
    cuda_loss = torch.nn.MSELoss()(cuda, target)
    cuda_loss.backward()

    print("-------------------------------------------------------------")
    print("    Cubemap:")
    print("-------------------------------------------------------------")

    relative_loss("flt:", ref, cuda)
    relative_loss("cubemap:", cubemap_ref.grad, cubemap_cuda.grad)


test_cubemap()
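Both test scripts allocate their tensors directly on device='cuda', so they require a CUDA-capable GPU, and the sys.path.insert(0, ...) line points the import of renderutils two directories above the script, which suggests they are meant to be run directly as standalone scripts, e.g. (path taken from this commit; invocation is an assumption):

    python video3d/render/renderutils/tests/test_cubemap.py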