GLM

Runtime error

App Files Files Community

Work committed on Mar 31

Commit

206b602

•

1 Parent(s): 9997a4d

update from lgm to lgm hf

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
acc_configs/gpu1.yaml +0 -15
acc_configs/gpu4.yaml +0 -15
acc_configs/gpu6.yaml +0 -15
acc_configs/gpu8.yaml +0 -15
app.py +56 -25
convert.py +0 -462
core/__pycache__/__init__.cpython-39.pyc +0 -0
core/__pycache__/attention.cpython-39.pyc +0 -0
core/__pycache__/gs.cpython-39.pyc +0 -0
core/__pycache__/models.cpython-39.pyc +0 -0
core/__pycache__/options.cpython-39.pyc +0 -0
core/__pycache__/provider_objaverse.cpython-39.pyc +0 -0
core/__pycache__/unet.cpython-39.pyc +0 -0
core/__pycache__/utils.cpython-39.pyc +0 -0
core/models.py +7 -4
core/options.py +7 -7
core/unet.py +7 -7
data_test/anya_rgba.png +0 -0
data_test/bird.jpg +0 -0
data_test/bird_rgba.png +0 -0
data_test/boy.jpg +0 -0
data_test/cat_statue.jpg +0 -0
data_test/catstatue_rgba.png +0 -0
data_test/dragontoy.jpg +0 -0
data_test/frog_sweater.jpg +0 -0
data_test/gso_rabbit.jpg +0 -0
diff-gaussian-rasterization/.gitignore +7 -0
diff-gaussian-rasterization/.gitmodules +3 -0
diff-gaussian-rasterization/CMakeLists.txt +36 -0
diff-gaussian-rasterization/LICENSE.md +83 -0
diff-gaussian-rasterization/README.md +35 -0
diff-gaussian-rasterization/cuda_rasterizer/auxiliary.h +175 -0
diff-gaussian-rasterization/cuda_rasterizer/backward.cu +712 -0
diff-gaussian-rasterization/cuda_rasterizer/backward.h +70 -0
diff-gaussian-rasterization/cuda_rasterizer/config.h +19 -0
diff-gaussian-rasterization/cuda_rasterizer/forward.cu +466 -0
diff-gaussian-rasterization/cuda_rasterizer/forward.h +68 -0
diff-gaussian-rasterization/cuda_rasterizer/rasterizer.h +94 -0
diff-gaussian-rasterization/cuda_rasterizer/rasterizer_impl.cu +447 -0
diff-gaussian-rasterization/cuda_rasterizer/rasterizer_impl.h +73 -0
diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py +224 -0
diff-gaussian-rasterization/ext.cpp +19 -0
diff-gaussian-rasterization/rasterize_points.cu +229 -0
diff-gaussian-rasterization/rasterize_points.h +70 -0
diff-gaussian-rasterization/setup.py +34 -0
diff-gaussian-rasterization/third_party/glm/.appveyor.yml +92 -0
diff-gaussian-rasterization/third_party/glm/.gitignore +61 -0
diff-gaussian-rasterization/third_party/glm/.travis.yml +388 -0
diff-gaussian-rasterization/third_party/glm/CMakeLists.txt +45 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

acc_configs/gpu1.yaml DELETED Viewed

@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: 'NO'
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false

acc_configs/gpu4.yaml DELETED Viewed

@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: fp16
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false

acc_configs/gpu6.yaml DELETED Viewed

@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: fp16
-num_machines: 1
-num_processes: 6
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false

acc_configs/gpu8.yaml DELETED Viewed

@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 8
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
 import tyro
 import imageio
 import numpy as np
@@ -11,6 +13,12 @@ from safetensors.torch import load_file
 import rembg
 import gradio as gr
 import kiui
 from kiui.op import recenter
 from kiui.cam import orbit_camera
@@ -19,12 +27,26 @@ from core.options import AllConfigs, Options
 from core.models import LGM
 from mvdream.pipeline_mvdream import MVDreamPipeline
 IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
 GRADIO_VIDEO_PATH = 'gradio_output.mp4'
 GRADIO_PLY_PATH = 'gradio_output.ply'
-opt = tyro.cli(AllConfigs)
 # model
 model = LGM(opt)
@@ -45,7 +67,7 @@ model = model.half().to(device)
 model.eval()
 tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
-proj_matrix = torch.zeros(4, 4, dtype=torch.float32, device=device)
 proj_matrix[0, 0] = 1 / tan_half_fov
 proj_matrix[1, 1] = 1 / tan_half_fov
 proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
@@ -73,6 +95,7 @@ pipe_image = pipe_image.to(device)
 bg_remover = rembg.new_session()
 # process function
 def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42):
     # seed
@@ -105,7 +128,7 @@ def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_ste
         image = image.astype(np.float32) / 255.0
         image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
         mv_image = pipe_image(prompt, image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0,  elevation=input_elevation)
     mv_image_grid = np.concatenate([
         np.concatenate([mv_image[1], mv_image[2]], axis=1),
         np.concatenate([mv_image[3], mv_image[0]], axis=1),
@@ -124,21 +147,21 @@ def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_ste
         with torch.autocast(device_type='cuda', dtype=torch.float16):
             # generate gaussians
             gaussians = model.forward_gaussians(input_image)
         # save gaussians
         model.gs.save_ply(gaussians, output_ply_path)
-        # render 360 video
         images = []
         elevation = 0
         if opt.fancy_video:
             azimuth = np.arange(0, 720, 4, dtype=np.int32)
             for azi in tqdm.tqdm(azimuth):
                 cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
                 cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
                 # cameras needed by gaussian rasterizer
                 cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
                 cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
@@ -151,11 +174,11 @@ def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_ste
         else:
             azimuth = np.arange(0, 360, 2, dtype=np.int32)
             for azi in tqdm.tqdm(azimuth):
                 cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
                 cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
                 # cameras needed by gaussian rasterizer
                 cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
                 cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
@@ -179,7 +202,8 @@ _DESCRIPTION = '''
 <a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/LGM"><img src='https://img.shields.io/github/stars/3DTopia/LGM?style=social'/></a>
 </div>
-* Input can be only text, only image, or both image and text.
 * If you find the output unsatisfying, try using different seeds!
 '''
@@ -189,7 +213,7 @@ with block:
         with gr.Column(scale=1):
             gr.Markdown('# ' + _TITLE)
     gr.Markdown(_DESCRIPTION)
     with gr.Row(variant='panel'):
         with gr.Column(scale=1):
             # input image
@@ -207,43 +231,50 @@ with block:
             # gen button
             button_gen = gr.Button("Generate")
         with gr.Column(scale=1):
             with gr.Tab("Video"):
                 # final video results
                 output_video = gr.Video(label="video")
                 # ply file
-                output_file = gr.File(label="ply")
             with gr.Tab("Multi-view Image"):
                 # multi-view results
                 output_image = gr.Image(interactive=False, show_label=False)
         button_gen.click(process, inputs=[input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed], outputs=[output_image, output_video, output_file])
     gr.Examples(
         examples=[
-            "data_test/anya_rgba.png",
-            "data_test/bird_rgba.png",
-            "data_test/catstatue_rgba.png",
         ],
         inputs=[input_image],
         outputs=[output_image, output_video, output_file],
         fn=lambda x: process(input_image=x, prompt=''),
-        cache_examples=False,
         label='Image-to-3D Examples'
     )
     gr.Examples(
         examples=[
-            "a motorbike",
-            "a hamburger",
-            "a furry red fox head",
         ],
         inputs=[input_text],
         outputs=[output_image, output_video, output_file],
         fn=lambda x: process(input_image=None, prompt=x),
-        cache_examples=False,
         label='Text-to-3D Examples'
     )
-block.launch(server_name="0.0.0.0", share=False)

 import os
+import shlex
+import subprocess
 import tyro
 import imageio
 import numpy as np
 import rembg
 import gradio as gr
+# download checkpoints
+from huggingface_hub import hf_hub_download
+ckpt_path = hf_hub_download(repo_id="ashawkey/LGM", filename="model_fp16.safetensors")
+subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))
 import kiui
 from kiui.op import recenter
 from kiui.cam import orbit_camera
 from core.models import LGM
 from mvdream.pipeline_mvdream import MVDreamPipeline
+import spaces
 IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
 GRADIO_VIDEO_PATH = 'gradio_output.mp4'
 GRADIO_PLY_PATH = 'gradio_output.ply'
+# opt = tyro.cli(AllConfigs)
+opt = Options(
+    input_size=256,
+    up_channels=(1024, 1024, 512, 256, 128), # one more decoder
+    up_attention=(True, True, True, False, False),
+    splat_size=128,
+    output_size=512, # render & supervise Gaussians at a higher resolution.
+    batch_size=8,
+    num_views=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+    resume=ckpt_path,
+)
 # model
 model = LGM(opt)
 model.eval()
 tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
+proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
 proj_matrix[0, 0] = 1 / tan_half_fov
 proj_matrix[1, 1] = 1 / tan_half_fov
 proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
 bg_remover = rembg.new_session()
 # process function
+@spaces.GPU
 def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42):
     # seed
         image = image.astype(np.float32) / 255.0
         image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
         mv_image = pipe_image(prompt, image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0,  elevation=input_elevation)
     mv_image_grid = np.concatenate([
         np.concatenate([mv_image[1], mv_image[2]], axis=1),
         np.concatenate([mv_image[3], mv_image[0]], axis=1),
         with torch.autocast(device_type='cuda', dtype=torch.float16):
             # generate gaussians
             gaussians = model.forward_gaussians(input_image)
         # save gaussians
         model.gs.save_ply(gaussians, output_ply_path)
+        # render 360 video
         images = []
         elevation = 0
         if opt.fancy_video:
             azimuth = np.arange(0, 720, 4, dtype=np.int32)
             for azi in tqdm.tqdm(azimuth):
                 cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
                 cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
                 # cameras needed by gaussian rasterizer
                 cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
                 cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
         else:
             azimuth = np.arange(0, 360, 2, dtype=np.int32)
             for azi in tqdm.tqdm(azimuth):
                 cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
                 cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
                 # cameras needed by gaussian rasterizer
                 cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
                 cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
 <a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/LGM"><img src='https://img.shields.io/github/stars/3DTopia/LGM?style=social'/></a>
 </div>
+* Input can be only text, only image, or both image and text.
+* Output is a `ply` file containing the 3D Gaussians, please check our [repo](https://github.com/3DTopia/LGM/blob/main/readme.md) for visualization and mesh conversion.
 * If you find the output unsatisfying, try using different seeds!
 '''
         with gr.Column(scale=1):
             gr.Markdown('# ' + _TITLE)
     gr.Markdown(_DESCRIPTION)
     with gr.Row(variant='panel'):
         with gr.Column(scale=1):
             # input image
             # gen button
             button_gen = gr.Button("Generate")
         with gr.Column(scale=1):
             with gr.Tab("Video"):
                 # final video results
                 output_video = gr.Video(label="video")
                 # ply file
+                output_file = gr.File(label="3D Gaussians (ply format)")
             with gr.Tab("Multi-view Image"):
                 # multi-view results
                 output_image = gr.Image(interactive=False, show_label=False)
         button_gen.click(process, inputs=[input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed], outputs=[output_image, output_video, output_file])
     gr.Examples(
         examples=[
+            "data_test/frog_sweater.jpg",
+            "data_test/bird.jpg",
+            "data_test/boy.jpg",
+            "data_test/cat_statue.jpg",
+            "data_test/dragontoy.jpg",
+            "data_test/gso_rabbit.jpg",
         ],
         inputs=[input_image],
         outputs=[output_image, output_video, output_file],
         fn=lambda x: process(input_image=x, prompt=''),
+        cache_examples=True,
         label='Image-to-3D Examples'
     )
     gr.Examples(
         examples=[
+            "teddy bear",
+            "hamburger",
+            "oldman's head sculpture",
+            "headphone",
+            "motorbike",
+            "mech suit"
         ],
         inputs=[input_text],
         outputs=[output_image, output_video, output_file],
         fn=lambda x: process(input_image=None, prompt=x),
+        cache_examples=True,
         label='Text-to-3D Examples'
     )
+block.launch()

convert.py DELETED Viewed

@@ -1,462 +0,0 @@
-import os
-import tyro
-import tqdm
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from core.options import AllConfigs, Options
-from core.gs import GaussianRenderer
-import mcubes
-import nerfacc
-import nvdiffrast.torch as dr
-import kiui
-from kiui.mesh import Mesh
-from kiui.mesh_utils import clean_mesh, decimate_mesh
-from kiui.mesh_utils import laplacian_smooth_loss, normal_consistency
-from kiui.op import uv_padding, safe_normalize, inverse_sigmoid
-from kiui.cam import orbit_camera, get_perspective
-from kiui.nn import MLP, trunc_exp
-from kiui.gridencoder import GridEncoder
-def get_rays(pose, h, w, fovy, opengl=True):
-    x, y = torch.meshgrid(
-        torch.arange(w, device=pose.device),
-        torch.arange(h, device=pose.device),
-        indexing="xy",
-    )
-    x = x.flatten()
-    y = y.flatten()
-    cx = w * 0.5
-    cy = h * 0.5
-    focal = h * 0.5 / np.tan(0.5 * np.deg2rad(fovy))
-    camera_dirs = F.pad(
-        torch.stack(
-            [
-                (x - cx + 0.5) / focal,
-                (y - cy + 0.5) / focal * (-1.0 if opengl else 1.0),
-            ],
-            dim=-1,
-        ),
-        (0, 1),
-        value=(-1.0 if opengl else 1.0),
-    )  # [hw, 3]
-    rays_d = camera_dirs @ pose[:3, :3].transpose(0, 1)  # [hw, 3]
-    rays_o = pose[:3, 3].unsqueeze(0).expand_as(rays_d) # [hw, 3]
-    rays_d = safe_normalize(rays_d)
-    return rays_o, rays_d
-# Triple renderer of gaussians, gaussian, and diso mesh.
-# gaussian --> nerf --> mesh
-class Converter(nn.Module):
-    def __init__(self, opt: Options):
-        super().__init__()
-        self.opt = opt
-        self.device = torch.device("cuda")
-        # gs renderer
-        self.tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
-        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32, device=self.device)
-        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
-        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
-        self.proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
-        self.proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
-        self.proj_matrix[2, 3] = 1
-        self.gs_renderer = GaussianRenderer(opt)
-        self.gaussians = self.gs_renderer.load_ply(opt.test_path).to(self.device)
-        # nerf renderer
-        if not self.opt.force_cuda_rast:
-            self.glctx = dr.RasterizeGLContext()
-        else:
-            self.glctx = dr.RasterizeCudaContext()
-        self.step = 0
-        self.render_step_size = 5e-3
-        self.aabb = torch.tensor([-1.0, -1.0, -1.0, 1.0, 1.0, 1.0], device=self.device)
-        self.estimator = nerfacc.OccGridEstimator(roi_aabb=self.aabb, resolution=64, levels=1)
-        self.encoder_density = GridEncoder(num_levels=12) # VMEncoder(output_dim=16, mode='sum')
-        self.encoder = GridEncoder(num_levels=12)
-        self.mlp_density = MLP(self.encoder_density.output_dim, 1, 32, 2, bias=False)
-        self.mlp = MLP(self.encoder.output_dim, 3, 32, 2, bias=False)
-        # mesh renderer
-        self.proj = torch.from_numpy(get_perspective(self.opt.fovy)).float().to(self.device)
-        self.v = self.f = None
-        self.vt = self.ft = None
-        self.deform = None
-        self.albedo = None
-    @torch.no_grad()
-    def render_gs(self, pose):
-        cam_poses = torch.from_numpy(pose).unsqueeze(0).to(self.device)
-        cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
-        # cameras needed by gaussian rasterizer
-        cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
-        cam_view_proj = cam_view @ self.proj_matrix # [V, 4, 4]
-        cam_pos = - cam_poses[:, :3, 3] # [V, 3]
-        out = self.gs_renderer.render(self.gaussians.unsqueeze(0), cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0))
-        image = out['image'].squeeze(1).squeeze(0) # [C, H, W]
-        alpha = out['alpha'].squeeze(2).squeeze(1).squeeze(0) # [H, W]
-        return image, alpha
-    def get_density(self, xs):
-        # xs: [..., 3]
-        prefix = xs.shape[:-1]
-        xs = xs.view(-1, 3)
-        feats = self.encoder_density(xs)
-        density = trunc_exp(self.mlp_density(feats))
-        density = density.view(*prefix, 1)
-        return density
-    def render_nerf(self, pose):
-        pose = torch.from_numpy(pose.astype(np.float32)).to(self.device)
-        # get rays
-        resolution = self.opt.output_size
-        rays_o, rays_d = get_rays(pose, resolution, resolution, self.opt.fovy)
-        # update occ grid
-        if self.training:
-            def occ_eval_fn(xs):
-                sigmas = self.get_density(xs)
-                return self.render_step_size * sigmas
-            self.estimator.update_every_n_steps(self.step, occ_eval_fn=occ_eval_fn, occ_thre=0.01, n=8)
-            self.step += 1
-        # render
-        def sigma_fn(t_starts, t_ends, ray_indices):
-            t_origins = rays_o[ray_indices]
-            t_dirs = rays_d[ray_indices]
-            xs = t_origins + t_dirs * (t_starts + t_ends)[:, None] / 2.0
-            sigmas = self.get_density(xs)
-            return sigmas.squeeze(-1)
-        with torch.no_grad():
-            ray_indices, t_starts, t_ends = self.estimator.sampling(
-                rays_o,
-                rays_d,
-                sigma_fn=sigma_fn,
-                near_plane=0.01,
-                far_plane=100,
-                render_step_size=self.render_step_size,
-                stratified=self.training,
-                cone_angle=0,
-            )
-        t_origins = rays_o[ray_indices]
-        t_dirs = rays_d[ray_indices]
-        xs = t_origins + t_dirs * (t_starts + t_ends)[:, None] / 2.0
-        sigmas = self.get_density(xs).squeeze(-1)
-        rgbs = torch.sigmoid(self.mlp(self.encoder(xs)))
-        n_rays=rays_o.shape[0]
-        weights, trans, alphas = nerfacc.render_weight_from_density(t_starts, t_ends, sigmas, ray_indices=ray_indices, n_rays=n_rays)
-        color = nerfacc.accumulate_along_rays(weights, values=rgbs, ray_indices=ray_indices, n_rays=n_rays)
-        alpha = nerfacc.accumulate_along_rays(weights, values=None, ray_indices=ray_indices, n_rays=n_rays)
-        color = color + 1 * (1.0 - alpha)
-        color = color.view(resolution, resolution, 3).clamp(0, 1).permute(2, 0, 1).contiguous()
-        alpha = alpha.view(resolution, resolution).clamp(0, 1).contiguous()
-        return color, alpha
-    def fit_nerf(self, iters=512, resolution=128):
-        self.opt.output_size = resolution
-        optimizer = torch.optim.Adam([
-            {'params': self.encoder_density.parameters(), 'lr': 1e-2},
-            {'params': self.encoder.parameters(), 'lr': 1e-2},
-            {'params': self.mlp_density.parameters(), 'lr': 1e-3},
-            {'params': self.mlp.parameters(), 'lr': 1e-3},
-        ])
-        print(f"[INFO] fitting nerf...")
-        pbar = tqdm.trange(iters)
-        for i in pbar:
-            ver = np.random.randint(-45, 45)
-            hor = np.random.randint(-180, 180)
-            rad = np.random.uniform(1.5, 3.0)
-            pose = orbit_camera(ver, hor, rad)
-            image_gt, alpha_gt = self.render_gs(pose)
-            image_pred, alpha_pred = self.render_nerf(pose)
-            # if i % 200 == 0:
-            #     kiui.vis.plot_image(image_gt, alpha_gt, image_pred, alpha_pred)
-            loss_mse = F.mse_loss(image_pred, image_gt) + 0.1 * F.mse_loss(alpha_pred, alpha_gt)
-            loss = loss_mse #+ 0.1 * self.encoder_density.tv_loss() #+ 0.0001 * self.encoder_density.density_loss()
-            loss.backward()
-            self.encoder_density.grad_total_variation(1e-8)
-            optimizer.step()
-            optimizer.zero_grad()
-            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
-        print(f"[INFO] finished fitting nerf!")
-    def render_mesh(self, pose):
-        h = w = self.opt.output_size
-        v = self.v + self.deform
-        f = self.f
-        pose = torch.from_numpy(pose.astype(np.float32)).to(v.device)
-        # get v_clip and render rgb
-        v_cam = torch.matmul(F.pad(v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
-        v_clip = v_cam @ self.proj.T
-        rast, rast_db = dr.rasterize(self.glctx, v_clip, f, (h, w))
-        alpha = torch.clamp(rast[..., -1:], 0, 1).contiguous() # [1, H, W, 1]
-        alpha = dr.antialias(alpha, rast, v_clip, f).clamp(0, 1).squeeze(-1).squeeze(0) # [H, W] important to enable gradients!
-        if self.albedo is None:
-            xyzs, _ = dr.interpolate(v.unsqueeze(0), rast, f) # [1, H, W, 3]
-            xyzs = xyzs.view(-1, 3)
-            mask = (alpha > 0).view(-1)
-            image = torch.zeros_like(xyzs, dtype=torch.float32)
-            if mask.any():
-                masked_albedo = torch.sigmoid(self.mlp(self.encoder(xyzs[mask].detach(), bound=1)))
-                image[mask] = masked_albedo.float()
-        else:
-            texc, texc_db = dr.interpolate(self.vt.unsqueeze(0), rast, self.ft, rast_db=rast_db, diff_attrs='all')
-            image = torch.sigmoid(dr.texture(self.albedo.unsqueeze(0), texc, uv_da=texc_db)) # [1, H, W, 3]
-        image = image.view(1, h, w, 3)
-        # image = dr.antialias(image, rast, v_clip, f).clamp(0, 1)
-        image = image.squeeze(0).permute(2, 0, 1).contiguous() # [3, H, W]
-        image = alpha * image + (1 - alpha)
-        return image, alpha
-    def fit_mesh(self, iters=2048, resolution=512, decimate_target=5e4):
-        self.opt.output_size = resolution
-        # init mesh from nerf
-        grid_size = 256
-        sigmas = np.zeros([grid_size, grid_size, grid_size], dtype=np.float32)
-        S = 128
-        density_thresh = 10
-        X = torch.linspace(-1, 1, grid_size).split(S)
-        Y = torch.linspace(-1, 1, grid_size).split(S)
-        Z = torch.linspace(-1, 1, grid_size).split(S)
-        for xi, xs in enumerate(X):
-            for yi, ys in enumerate(Y):
-                for zi, zs in enumerate(Z):
-                    xx, yy, zz = torch.meshgrid(xs, ys, zs, indexing='ij')
-                    pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [S, 3]
-                    val = self.get_density(pts.to(self.device))
-                    sigmas[xi * S: xi * S + len(xs), yi * S: yi * S + len(ys), zi * S: zi * S + len(zs)] = val.reshape(len(xs), len(ys), len(zs)).detach().cpu().numpy() # [S, 1] --> [x, y, z]
-        print(f'[INFO] marching cubes thresh: {density_thresh} ({sigmas.min()} ~ {sigmas.max()})')
-        vertices, triangles = mcubes.marching_cubes(sigmas, density_thresh)
-        vertices = vertices / (grid_size - 1.0) * 2 - 1
-        # clean
-        vertices = vertices.astype(np.float32)
-        triangles = triangles.astype(np.int32)
-        vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.01)
-        if triangles.shape[0] > decimate_target:
-            vertices, triangles = decimate_mesh(vertices, triangles, decimate_target, optimalplacement=False)
-        self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
-        self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
-        self.deform = nn.Parameter(torch.zeros_like(self.v)).to(self.device)
-        # fit mesh from gs
-        lr_factor = 1
-        optimizer = torch.optim.Adam([
-            {'params': self.encoder.parameters(), 'lr': 1e-3 * lr_factor},
-            {'params': self.mlp.parameters(), 'lr': 1e-3 * lr_factor},
-            {'params': self.deform, 'lr': 1e-4},
-        ])
-        print(f"[INFO] fitting mesh...")
-        pbar = tqdm.trange(iters)
-        for i in pbar:
-            ver = np.random.randint(-10, 10)
-            hor = np.random.randint(-180, 180)
-            rad = self.opt.cam_radius # np.random.uniform(1, 2)
-            pose = orbit_camera(ver, hor, rad)
-            image_gt, alpha_gt = self.render_gs(pose)
-            image_pred, alpha_pred = self.render_mesh(pose)
-            loss_mse = F.mse_loss(image_pred, image_gt) + 0.1 * F.mse_loss(alpha_pred, alpha_gt)
-            # loss_lap = laplacian_smooth_loss(self.v + self.deform, self.f)
-            loss_normal = normal_consistency(self.v + self.deform, self.f)
-            loss_offsets = (self.deform ** 2).sum(-1).mean()
-            loss = loss_mse + 0.001 * loss_normal + 0.1 * loss_offsets
-            loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-            # remesh periodically
-            if i > 0 and i % 512 == 0:
-                vertices = (self.v + self.deform).detach().cpu().numpy()
-                triangles = self.f.detach().cpu().numpy()
-                vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.01)
-                if triangles.shape[0] > decimate_target:
-                    vertices, triangles = decimate_mesh(vertices, triangles, decimate_target, optimalplacement=False)
-                self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
-                self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
-                self.deform = nn.Parameter(torch.zeros_like(self.v)).to(self.device)
-                lr_factor *= 0.5
-                optimizer = torch.optim.Adam([
-                    {'params': self.encoder.parameters(), 'lr': 1e-3 * lr_factor},
-                    {'params': self.mlp.parameters(), 'lr': 1e-3 * lr_factor},
-                    {'params': self.deform, 'lr': 1e-4},
-                ])
-            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
-        # last clean
-        vertices = (self.v + self.deform).detach().cpu().numpy()
-        triangles = self.f.detach().cpu().numpy()
-        vertices, triangles = clean_mesh(vertices, triangles, remesh=False)
-        self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
-        self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
-        self.deform = nn.Parameter(torch.zeros_like(self.v).to(self.device))
-        print(f"[INFO] finished fitting mesh!")
-    # uv mesh refine
-    def fit_mesh_uv(self, iters=512, resolution=512, texture_resolution=1024, padding=2):
-        self.opt.output_size = resolution
-        # unwrap uv
-        print(f"[INFO] uv unwrapping...")
-        mesh = Mesh(v=self.v, f=self.f, albedo=None, device=self.device)
-        mesh.auto_normal()
-        mesh.auto_uv()
-        self.vt = mesh.vt
-        self.ft = mesh.ft
-        # render uv maps
-        h = w = texture_resolution
-        uv = mesh.vt * 2.0 - 1.0 # uvs to range [-1, 1]
-        uv = torch.cat((uv, torch.zeros_like(uv[..., :1]), torch.ones_like(uv[..., :1])), dim=-1) # [N, 4]
-        rast, _ = dr.rasterize(self.glctx, uv.unsqueeze(0), mesh.ft, (h, w)) # [1, h, w, 4]
-        xyzs, _ = dr.interpolate(mesh.v.unsqueeze(0), rast, mesh.f) # [1, h, w, 3]
-        mask, _ = dr.interpolate(torch.ones_like(mesh.v[:, :1]).unsqueeze(0), rast, mesh.f) # [1, h, w, 1]
-        # masked query
-        xyzs = xyzs.view(-1, 3)
-        mask = (mask > 0).view(-1)
-        albedo = torch.zeros(h * w, 3, device=self.device, dtype=torch.float32)
-        if mask.any():
-            print(f"[INFO] querying texture...")
-            xyzs = xyzs[mask] # [M, 3]
-            # batched inference to avoid OOM
-            batch = []
-            head = 0
-            while head < xyzs.shape[0]:
-                tail = min(head + 640000, xyzs.shape[0])
-                batch.append(torch.sigmoid(self.mlp(self.encoder(xyzs[head:tail]))).float())
-                head += 640000
-            albedo[mask] = torch.cat(batch, dim=0)
-        albedo = albedo.view(h, w, -1)
-        mask = mask.view(h, w)
-        albedo = uv_padding(albedo, mask, padding)
-        # optimize texture
-        self.albedo = nn.Parameter(inverse_sigmoid(albedo)).to(self.device)
-        optimizer = torch.optim.Adam([
-            {'params': self.albedo, 'lr': 1e-3},
-        ])
-        print(f"[INFO] fitting mesh texture...")
-        pbar = tqdm.trange(iters)
-        for i in pbar:
-            # shrink to front view as we care more about it...
-            ver = np.random.randint(-5, 5)
-            hor = np.random.randint(-15, 15)
-            rad = self.opt.cam_radius # np.random.uniform(1, 2)
-            pose = orbit_camera(ver, hor, rad)
-            image_gt, alpha_gt = self.render_gs(pose)
-            image_pred, alpha_pred = self.render_mesh(pose)
-            loss_mse = F.mse_loss(image_pred, image_gt)
-            loss = loss_mse
-            loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
-        print(f"[INFO] finished fitting mesh texture!")
-    @torch.no_grad()
-    def export_mesh(self, path):
-        mesh = Mesh(v=self.v, f=self.f, vt=self.vt, ft=self.ft, albedo=torch.sigmoid(self.albedo), device=self.device)
-        mesh.auto_normal()
-        mesh.write(path)
-opt = tyro.cli(AllConfigs)
-# load a saved ply and convert to mesh
-assert opt.test_path.endswith('.ply'), '--test_path must be a .ply file saved by infer.py'
-converter = Converter(opt).cuda()
-converter.fit_nerf()
-converter.fit_mesh()
-converter.fit_mesh_uv()
-converter.export_mesh(opt.test_path.replace('.ply', '.glb'))

core/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (123 Bytes). View file

core/__pycache__/attention.cpython-39.pyc ADDED Viewed

Binary file (4.36 kB). View file

core/__pycache__/gs.cpython-39.pyc ADDED Viewed

Binary file (5.48 kB). View file

core/__pycache__/models.cpython-39.pyc ADDED Viewed

Binary file (4.47 kB). View file

core/__pycache__/options.cpython-39.pyc ADDED Viewed

Binary file (2.46 kB). View file

core/__pycache__/provider_objaverse.cpython-39.pyc ADDED Viewed

Binary file (7.74 kB). View file

core/__pycache__/unet.cpython-39.pyc ADDED Viewed

Binary file (7.45 kB). View file

core/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (2.54 kB). View file

core/models.py CHANGED Viewed

@@ -131,9 +131,12 @@ class LGM(nn.Module):
         results['gaussians'] = gaussians
-        # always use white bg
-        bg_color = torch.ones(3, dtype=torch.float32, device=gaussians.device)
         # use the other views for rendering and supervision
         results = self.gs.render(gaussians, data['cam_view'], data['cam_view_proj'], data['cam_pos'], bg_color=bg_color)
         pred_images = results['image'] # [B, V, C, output_size, output_size]
@@ -168,4 +171,4 @@ class LGM(nn.Module):
             psnr = -10 * torch.log10(torch.mean((pred_images.detach() - gt_images) ** 2))
             results['psnr'] = psnr
-        return results

         results['gaussians'] = gaussians
+        # random bg for training
+        if self.training:
+            bg_color = torch.rand(3, dtype=torch.float32, device=gaussians.device)
+        else:
+            bg_color = torch.ones(3, dtype=torch.float32, device=gaussians.device)
         # use the other views for rendering and supervision
         results = self.gs.render(gaussians, data['cam_view'], data['cam_view_proj'], data['cam_pos'], bg_color=bg_color)
         pred_images = results['image'] # [B, V, C, output_size, output_size]
             psnr = -10 * torch.log10(torch.mean((pred_images.detach() - gt_images) ** 2))
             results['psnr'] = psnr
+        return results

core/options.py CHANGED Viewed

@@ -9,16 +9,16 @@ class Options:
     # Unet image input size
     input_size: int = 256
     # Unet definition
-    down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024, 1024)
-    down_attention: Tuple[bool, ...] = (False, False, False, True, True, True)
     mid_attention: bool = True
-    up_channels: Tuple[int, ...] = (1024, 1024, 512, 256)
-    up_attention: Tuple[bool, ...] = (True, True, True, False)
     # Unet output size, dependent on the input_size and U-Net structure!
     splat_size: int = 64
     # gaussian render size
     output_size: int = 256
     ### dataset
     # data mode (only support s3 now)
     data_mode: Literal['s3'] = 's3'
@@ -40,7 +40,7 @@ class Options:
     ### training
     # workspace
     workspace: str = './workspace'
-    # resume
     resume: Optional[str] = None
     # batch size (per-GPU)
     batch_size: int = 8
@@ -117,4 +117,4 @@ config_defaults['tiny'] = Options(
     mixed_precision='bf16',
 )
-AllConfigs = tyro.extras.subcommand_type_from_defaults(config_defaults, config_doc)

     # Unet image input size
     input_size: int = 256
     # Unet definition
+    down_channels: Tuple[int] = (64, 128, 256, 512, 1024, 1024)
+    down_attention: Tuple[bool] = (False, False, False, True, True, True)
     mid_attention: bool = True
+    up_channels: Tuple[int] = (1024, 1024, 512, 256)
+    up_attention: Tuple[bool] = (True, True, True, False)
     # Unet output size, dependent on the input_size and U-Net structure!
     splat_size: int = 64
     # gaussian render size
     output_size: int = 256
     ### dataset
     # data mode (only support s3 now)
     data_mode: Literal['s3'] = 's3'
     ### training
     # workspace
     workspace: str = './workspace'
+    # resume
     resume: Optional[str] = None
     # batch size (per-GPU)
     batch_size: int = 8
     mixed_precision='bf16',
 )
+AllConfigs = tyro.extras.subcommand_type_from_defaults(config_defaults, config_doc)

core/unet.py CHANGED Viewed

@@ -3,10 +3,10 @@ import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
-from typing import Tuple, Literal
 from functools import partial
-from core.attention import MemEffAttention
 class MVAttention(nn.Module):
     def __init__(
@@ -236,11 +236,11 @@ class UNet(nn.Module):
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024),
-        down_attention: Tuple[bool, ...] = (False, False, False, True, True),
         mid_attention: bool = True,
-        up_channels: Tuple[int, ...] = (1024, 512, 256),
-        up_attention: Tuple[bool, ...] = (True, True, False),
         layers_per_block: int = 2,
         skip_scale: float = np.sqrt(0.5),
     ):
@@ -316,4 +316,4 @@ class UNet(nn.Module):
         x = F.silu(x)
         x = self.conv_out(x) # [B, Cout, H', W']
-        return x

 import torch.nn.functional as F
 import numpy as np
+from typing import Tuple, Optional, Literal
 from functools import partial
+from core.attention import MemEffAttention, MemEffCrossAttention
 class MVAttention(nn.Module):
     def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
+        down_channels: Tuple[int] = (64, 128, 256, 512, 1024),
+        down_attention: Tuple[bool] = (False, False, False, True, True),
         mid_attention: bool = True,
+        up_channels: Tuple[int] = (1024, 512, 256),
+        up_attention: Tuple[bool] = (True, True, False),
         layers_per_block: int = 2,
         skip_scale: float = np.sqrt(0.5),
     ):
         x = F.silu(x)
         x = self.conv_out(x) # [B, Cout, H', W']
+        return x

data_test/anya_rgba.png DELETED Viewed

Binary file (32.9 kB)

data_test/bird.jpg ADDED Viewed

data_test/bird_rgba.png DELETED Viewed

Binary file (56.2 kB)

data_test/boy.jpg ADDED Viewed

data_test/cat_statue.jpg ADDED Viewed

data_test/catstatue_rgba.png DELETED Viewed

Binary file (45.5 kB)

data_test/dragontoy.jpg ADDED Viewed

data_test/frog_sweater.jpg ADDED Viewed

data_test/gso_rabbit.jpg ADDED Viewed

diff-gaussian-rasterization/.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+build/
+diff_gaussian_rasterization.egg-info/
+dist/
+__pycache__
+*.so

diff-gaussian-rasterization/.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "third_party/glm"]
+	path = third_party/glm
+	url = https://github.com/g-truc/glm.git

diff-gaussian-rasterization/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+cmake_minimum_required(VERSION 3.20)
+project(DiffRast LANGUAGES CUDA CXX)
+# Build host and device code as C++17 and fail at configure time if the
+# toolchain cannot provide it, instead of silently using an older standard.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+# Rasterizer library: forward/backward CUDA kernels plus their headers.
+add_library(CudaRasterizer
+	cuda_rasterizer/backward.h
+	cuda_rasterizer/backward.cu
+	cuda_rasterizer/forward.h
+	cuda_rasterizer/forward.cu
+	cuda_rasterizer/auxiliary.h
+	cuda_rasterizer/rasterizer_impl.cu
+	cuda_rasterizer/rasterizer_impl.h
+	cuda_rasterizer/rasterizer.h
+)
+# GPU architectures (compute capabilities 7.5 and 8.6) that device code is generated for.
+set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "75;86")
+# Consumers only need the rasterizer headers; GLM and the CUDA toolkit headers
+# are implementation details of this target.
+target_include_directories(CudaRasterizer PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer")
+target_include_directories(CudaRasterizer PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/third_party/glm" ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

diff-gaussian-rasterization/LICENSE.md ADDED Viewed

	@@ -0,0 +1,83 @@

+Gaussian-Splatting License
+===========================
+**Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**.
+The *Software* is in the process of being registered with the Agence pour la Protection des
+Programmes (APP).
+The *Software* is still being developed by the *Licensor*.
+*Licensor*'s goal is to allow the research community to use, test and evaluate
+the *Software*.
+## 1.  Definitions
+*Licensee* means any person or entity that uses the *Software* and distributes
+its *Work*.
+*Licensor* means the owners of the *Software*, i.e Inria and MPII
+*Software* means the original work of authorship made available under this
+License ie gaussian-splatting.
+*Work* means the *Software* and any additions to or derivative works of the
+*Software* that are made available under this License.
+## 2.  Purpose
+This license is intended to define the rights granted to the *Licensee* by
+Licensors under the *Software*.
+## 3.  Rights granted
+For the above reasons Licensors have decided to distribute the *Software*.
+Licensors grant non-exclusive rights to use the *Software* for research purposes
+to research users (both academic and industrial), free of charge, without right
+to sublicense.. The *Software* may be used "non-commercially", i.e., for research
+and/or evaluation purposes only.
+Subject to the terms and conditions of this License, you are granted a
+non-exclusive, royalty-free, license to reproduce, prepare derivative works of,
+publicly display, publicly perform and distribute its *Work* and any resulting
+derivative works in any form.
+## 4.  Limitations
+**4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do
+so under this License, (b) you include a complete copy of this License with
+your distribution, and (c) you retain without modification any copyright,
+patent, trademark, or attribution notices that are present in the *Work*.
+**4.2 Derivative Works.** You may specify that additional or different terms apply
+to the use, reproduction, and distribution of your derivative works of the *Work*
+("Your Terms") only if (a) Your Terms provide that the use limitation in
+Section 2 applies to your derivative works, and (b) you identify the specific
+derivative works that are subject to Your Terms. Notwithstanding Your Terms,
+this License (including the redistribution requirements in Section 3.1) will
+continue to apply to the *Work* itself.
+**4.3** Any other use without of prior consent of Licensors is prohibited. Research
+users explicitly acknowledge having received from Licensors all information
+allowing to appreciate the adequacy between of the *Software* and their needs and
+to undertake all necessary precautions for its execution and use.
+**4.4** The *Software* is provided both as a compiled library file and as source
+code. In case of using the *Software* for a publication or other results obtained
+through the use of the *Software*, users are strongly encouraged to cite the
+corresponding publications as explained in the documentation of the *Software*.
+## 5.  Disclaimer
+THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
+WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
+UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL
+CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
+OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
+USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
+ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
+AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.

diff-gaussian-rasterization/README.md ADDED Viewed

	@@ -0,0 +1,35 @@

+# Differential Gaussian Rasterization
+**NOTE**: this is a modified version to support depth & alpha rendering (both forward and backward) from the [original repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization).
+```python
+rendered_image, radii, rendered_depth, rendered_alpha = rasterizer(
+    means3D=means3D,
+    means2D=means2D,
+    shs=shs,
+    colors_precomp=colors_precomp,
+    opacities=opacity,
+    scales=scales,
+    rotations=rotations,
+    cov3D_precomp=cov3D_precomp,
+)
+```
+Used as the rasterization engine for the paper "3D Gaussian Splatting for Real-Time Radiance Field Rendering". If you can make use of it in your own research, please be so kind as to cite us.
+<section class="section" id="BibTeX">
+  <div class="container is-max-desktop content">
+    <h2 class="title">BibTeX</h2>
+    <pre><code>@Article{kerbl3Dgaussians,
+      author       = {Kerbl, Bernhard and Kopanas, Georgios and Leimk{\"u}hler, Thomas and Drettakis, George},
+      title        = {3D Gaussian Splatting for Real-Time Radiance Field Rendering},
+      journal      = {ACM Transactions on Graphics},
+      number       = {4},
+      volume       = {42},
+      month        = {July},
+      year         = {2023},
+      url          = {https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/}
+}</code></pre>
+  </div>
+</section>

diff-gaussian-rasterization/cuda_rasterizer/auxiliary.h ADDED Viewed

	@@ -0,0 +1,175 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
+#define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
+#include "config.h"
+#include "stdio.h"
+#define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
+#define NUM_WARPS (BLOCK_SIZE/32)
+// Spherical harmonics coefficients
+__device__ const float SH_C0 = 0.28209479177387814f;
+__device__ const float SH_C1 = 0.4886025119029199f;
+__device__ const float SH_C2[] = {
+	1.0925484305920792f,
+	-1.0925484305920792f,
+	0.31539156525252005f,
+	-1.0925484305920792f,
+	0.5462742152960396f
+};
+__device__ const float SH_C3[] = {
+	-0.5900435899266435f,
+	2.890611442640554f,
+	-0.4570457994644658f,
+	0.3731763325901154f,
+	-0.4570457994644658f,
+	1.445305721320277f,
+	-0.5900435899266435f
+};
+__forceinline__ __device__ float ndc2Pix(float v, int S) // Returns ((v + 1) * S - 1) / 2; presumably v is a normalized device coordinate and S the image extent in pixels - confirm with callers.
+{
+	return ((v + 1.0) * S - 1.0) * 0.5; // the unsuffixed literals are double, so this is evaluated in double and narrowed to float on return
+}
+__forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) // Writes to rect_min/rect_max the range of BLOCK_X x BLOCK_Y tiles touched by the square of half-size max_radius around p, clamped to [0, grid.x] x [0, grid.y].
+{
+	rect_min = { // lower bound: truncating division by the tile size, clamped to the grid
+		min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
+		min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))
+	};
+	rect_max = { // upper bound: adding BLOCK - 1 before dividing rounds up to the next tile boundary
+		min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
+		min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))
+	};
+}
+__forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix)
+{
+	// Transform p as a point (implicit w = 1) by the first three rows of a
+	// 4x4 matrix stored with a column stride of 4 (element = matrix[col * 4 + row]).
+	float3 result;
+	result.x = matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12];
+	result.y = matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13];
+	result.z = matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14];
+	return result;
+}
+__forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix)
+{
+	// Transform p as a point (implicit w = 1) by all four rows of a 4x4 matrix
+	// stored with a column stride of 4; the fourth row yields the homogeneous w.
+	float4 result;
+	result.x = matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12];
+	result.y = matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13];
+	result.z = matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14];
+	result.w = matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15];
+	return result;
+}
+__forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix)
+{
+	// Transform p as a direction: same 3x3 part as transformPoint4x3, but the
+	// translation entries (matrix[12..14]) are not added.
+	float3 result;
+	result.x = matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z;
+	result.y = matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z;
+	result.z = matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z;
+	return result;
+}
+__forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix)
+{
+	// Multiply p by the transpose of the 3x3 part used in transformVec4x3:
+	// each output component reads three consecutive floats of one column.
+	float3 result;
+	result.x = matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z;
+	result.y = matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z;
+	result.z = matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z;
+	return result;
+}
+__forceinline__ __device__ float dnormvdz(float3 v, float3 dv) // Returns only the z component of the expression computed by the float3 dnormvdv overload.
+{
+	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; // squared length of v
+	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); // 1 / |v|^3; there is no guard for a zero-length v
+	float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
+	return dnormvdz;
+}
+__forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) // Returns (|v|^2 * dv - v * dot(v, dv)) / |v|^3, i.e. dv pushed through the Jacobian of v / |v|.
+{
+	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; // squared length of v
+	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); // 1 / |v|^3; there is no guard for a zero-length v
+	float3 dnormvdv;
+	dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32;
+	dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32;
+	dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
+	return dnormvdv;
+}
+__forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) // Four-component counterpart of the float3 overload: (|v|^2 * dv - v * dot(v, dv)) / |v|^3.
+{
+	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; // squared length of v
+	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); // 1 / |v|^3; there is no guard for a zero-length v
+	float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; // element-wise products
+	float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; // dot(v, dv)
+	float4 dnormvdv;
+	dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32;
+	dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32;
+	dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32;
+	dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32;
+	return dnormvdv;
+}
+__forceinline__ __device__ float sigmoid(float x)
+{
+	// Logistic function 1 / (1 + e^-x), evaluated in single precision.
+	const float denominator = 1.0f + expf(-x);
+	return 1.0f / denominator;
+}
+__forceinline__ __device__ bool in_frustum(int idx, // Returns true when point idx has view-space z > 0.2; its view-space position is written to p_view either way.
+	const float* orig_points,
+	const float* viewmatrix,
+	const float* projmatrix,
+	bool prefiltered,
+	float3& p_view)
+{
+	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; // points are packed as 3 consecutive floats
+	// Bring points to screen space
+	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
+	float p_w = 1.0f / (p_hom.w + 0.0000001f); // small offset avoids dividing by exactly zero
+	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; // only referenced by the commented-out bounds test below
+	p_view = transformPoint4x3(p_orig, viewmatrix);
+	if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3)))
+	{
+		if (prefiltered)
+		{
+			printf("Point is filtered although prefiltered is set. This shouldn't happen!");
+			__trap(); // abort the kernel: with prefiltered set, no point is expected to fail the test
+		}
+		return false;
+	}
+	return true;
+}
+#define CHECK_CUDA(A, debug) /* Runs A; when debug is true, synchronizes the device and, on any CUDA error, logs file/line to std::cerr and throws std::runtime_error. Expands to two statements (A; if ...) and is not wrapped in do/while. */ \
+A; if(debug) { \
+auto ret = cudaDeviceSynchronize(); \
+if (ret != cudaSuccess) { \
+std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__ << ": " << cudaGetErrorString(ret); \
+throw std::runtime_error(cudaGetErrorString(ret)); \
+} \
+}
+#endif

diff-gaussian-rasterization/cuda_rasterizer/backward.cu ADDED Viewed

	@@ -0,0 +1,712 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "backward.h"
+#include "auxiliary.h"
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+// Backward pass for conversion of spherical harmonics to RGB for
+// each Gaussian.
+__device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs) // For Gaussian idx: writes dL_dshs for SH bands 0..deg (max_coeffs vec3 slots per Gaussian) and adds the view-direction part of the gradient to dL_dmeans[idx].
+{
+	// Compute intermediate values, as it is done during forward
+	glm::vec3 pos = means[idx];
+	glm::vec3 dir_orig = pos - campos;
+	glm::vec3 dir = dir_orig / glm::length(dir_orig); // no guard for pos == campos (zero length)
+	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; // cast drops const; sh is only read in this function
+	// Use PyTorch rule for clamping: if clamping was applied,
+	// gradient becomes 0.
+	glm::vec3 dL_dRGB = dL_dcolor[idx];
+	dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; // clamped holds one flag per colour channel
+	dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1;
+	dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1;
+	glm::vec3 dRGBdx(0, 0, 0);
+	glm::vec3 dRGBdy(0, 0, 0);
+	glm::vec3 dRGBdz(0, 0, 0);
+	float x = dir.x;
+	float y = dir.y;
+	float z = dir.z;
+	// Target location for this Gaussian to write SH gradients to
+	glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs;
+	// No tricks here, just high school-level calculus.
+	float dRGBdsh0 = SH_C0;
+	dL_dsh[0] = dRGBdsh0 * dL_dRGB;
+	if (deg > 0)
+	{
+		float dRGBdsh1 = -SH_C1 * y;
+		float dRGBdsh2 = SH_C1 * z;
+		float dRGBdsh3 = -SH_C1 * x;
+		dL_dsh[1] = dRGBdsh1 * dL_dRGB;
+		dL_dsh[2] = dRGBdsh2 * dL_dRGB;
+		dL_dsh[3] = dRGBdsh3 * dL_dRGB;
+		dRGBdx = -SH_C1 * sh[3];
+		dRGBdy = -SH_C1 * sh[1];
+		dRGBdz = SH_C1 * sh[2];
+		if (deg > 1)
+		{
+			float xx = x * x, yy = y * y, zz = z * z;
+			float xy = x * y, yz = y * z, xz = x * z;
+			float dRGBdsh4 = SH_C2[0] * xy;
+			float dRGBdsh5 = SH_C2[1] * yz;
+			float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy);
+			float dRGBdsh7 = SH_C2[3] * xz;
+			float dRGBdsh8 = SH_C2[4] * (xx - yy);
+			dL_dsh[4] = dRGBdsh4 * dL_dRGB;
+			dL_dsh[5] = dRGBdsh5 * dL_dRGB;
+			dL_dsh[6] = dRGBdsh6 * dL_dRGB;
+			dL_dsh[7] = dRGBdsh7 * dL_dRGB;
+			dL_dsh[8] = dRGBdsh8 * dL_dRGB;
+			dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8];
+			dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8];
+			dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7];
+			if (deg > 2)
+			{
+				float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy);
+				float dRGBdsh10 = SH_C3[1] * xy * z;
+				float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy);
+				float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy);
+				float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy);
+				float dRGBdsh14 = SH_C3[5] * z * (xx - yy);
+				float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy);
+				dL_dsh[9] = dRGBdsh9 * dL_dRGB;
+				dL_dsh[10] = dRGBdsh10 * dL_dRGB;
+				dL_dsh[11] = dRGBdsh11 * dL_dRGB;
+				dL_dsh[12] = dRGBdsh12 * dL_dRGB;
+				dL_dsh[13] = dRGBdsh13 * dL_dRGB;
+				dL_dsh[14] = dRGBdsh14 * dL_dRGB;
+				dL_dsh[15] = dRGBdsh15 * dL_dRGB;
+				dRGBdx += (
+					SH_C3[0] * sh[9] * 3.f * 2.f * xy +
+					SH_C3[1] * sh[10] * yz +
+					SH_C3[2] * sh[11] * -2.f * xy +
+					SH_C3[3] * sh[12] * -3.f * 2.f * xz +
+					SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) +
+					SH_C3[5] * sh[14] * 2.f * xz +
+					SH_C3[6] * sh[15] * 3.f * (xx - yy));
+				dRGBdy += (
+					SH_C3[0] * sh[9] * 3.f * (xx - yy) +
+					SH_C3[1] * sh[10] * xz +
+					SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) +
+					SH_C3[3] * sh[12] * -3.f * 2.f * yz +
+					SH_C3[4] * sh[13] * -2.f * xy +
+					SH_C3[5] * sh[14] * -2.f * yz +
+					SH_C3[6] * sh[15] * -3.f * 2.f * xy);
+				dRGBdz += (
+					SH_C3[1] * sh[10] * xy +
+					SH_C3[2] * sh[11] * 4.f * 2.f * yz +
+					SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) +
+					SH_C3[4] * sh[13] * 4.f * 2.f * xz +
+					SH_C3[5] * sh[14] * (xx - yy));
+			}
+		}
+	}
+	// The view direction is an input to the computation. View direction
+	// is influenced by the Gaussian's mean, so SHs gradients
+	// must propagate back into 3D position.
+	glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB));
+	// Account for normalization of direction
+	float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z });
+	// Gradients of loss w.r.t. Gaussian means, but only the portion
+	// that is caused because the mean affects the view-dependent color.
+	// Additional mean gradient is accumulated in below methods.
+	dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); // accumulates into the existing value rather than overwriting it
+}
+// Backward version of INVERSE 2D covariance matrix computation
+// (due to length launched as separate kernel before other
+// backward steps contained in preprocess)
+__global__ void computeCov2DCUDA(int P, // One thread per Gaussian (P in total): writes dL_dcov[6 * idx .. 6 * idx + 5] and overwrites dL_dmeans[idx].
+	const float3* means,
+	const int* radii,
+	const float* cov3Ds,
+	const float h_x, float h_y,
+	const float tan_fovx, float tan_fovy,
+	const float* view_matrix,
+	const float* dL_dconics,
+	float3* dL_dmeans,
+	float* dL_dcov)
+{
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= P || !(radii[idx] > 0)) // skip surplus threads and Gaussians with a non-positive radius
+		return;
+	// Reading location of 3D covariance for this Gaussian
+	const float* cov3D = cov3Ds + 6 * idx;
+	// Fetch gradients, recompute 2D covariance and relevant
+	// intermediate forward results needed in the backward.
+	float3 mean = means[idx];
+	float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; // stride of 4 floats per Gaussian; slot 2 is not read here
+	float3 t = transformPoint4x3(mean, view_matrix);
+	const float limx = 1.3f * tan_fovx;
+	const float limy = 1.3f * tan_fovy;
+	const float txtz = t.x / t.z;
+	const float tytz = t.y / t.z;
+	t.x = min(limx, max(-limx, txtz)) * t.z;
+	t.y = min(limy, max(-limy, tytz)) * t.z;
+	const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; // 0 where the clamp above was active, so no gradient flows through t.x
+	const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1; // same for t.y
+	glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z),
+		0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z),
+		0, 0, 0);
+	glm::mat3 W = glm::mat3(
+		view_matrix[0], view_matrix[4], view_matrix[8],
+		view_matrix[1], view_matrix[5], view_matrix[9],
+		view_matrix[2], view_matrix[6], view_matrix[10]);
+	glm::mat3 Vrk = glm::mat3( // symmetric 3x3 rebuilt from the 6 stored covariance entries
+		cov3D[0], cov3D[1], cov3D[2],
+		cov3D[1], cov3D[3], cov3D[4],
+		cov3D[2], cov3D[4], cov3D[5]);
+	glm::mat3 T = W * J;
+	glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T;
+	// Use helper variables for 2D covariance entries. More compact.
+	float a = cov2D[0][0] += 0.3f; // 0.3 is added to both diagonal entries before inversion
+	float b = cov2D[0][1];
+	float c = cov2D[1][1] += 0.3f;
+	float denom = a * c - b * b; // determinant of the 2x2 covariance
+	float dL_da = 0, dL_db = 0, dL_dc = 0;
+	float denom2inv = 1.0f / ((denom * denom) + 0.0000001f);
+	if (denom2inv != 0)
+	{
+		// Gradients of loss w.r.t. entries of 2D covariance matrix,
+		// given gradients of loss w.r.t. conic matrix (inverse covariance matrix).
+		// e.g., dL / da = dL / d_conic_a * d_conic_a / d_a
+		dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z);
+		dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x);
+		dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z);
+		// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
+		// given gradients w.r.t. 2D covariance matrix (diagonal).
+		// cov2D = transpose(T) * transpose(Vrk) * T;
+		dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc);
+		dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc);
+		dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc);
+		// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
+		// given gradients w.r.t. 2D covariance matrix (off-diagonal).
+		// Off-diagonal elements appear twice --> double the gradient.
+		// cov2D = transpose(T) * transpose(Vrk) * T;
+		dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc;
+		dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc;
+		dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc;
+	}
+	else
+	{
+		for (int i = 0; i < 6; i++)
+			dL_dcov[6 * idx + i] = 0;
+	}
+	// Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T
+	// cov2D = transpose(T) * transpose(Vrk) * T;
+	float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da +
+		(T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db;
+	float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da +
+		(T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db;
+	float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da +
+		(T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db;
+	float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc +
+		(T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db;
+	float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc +
+		(T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db;
+	float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc +
+		(T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db;
+	// Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix
+	// T = W * J
+	float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02;
+	float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02;
+	float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12;
+	float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12;
+	float tz = 1.f / t.z;
+	float tz2 = tz * tz;
+	float tz3 = tz2 * tz;
+	// Gradients of loss w.r.t. transformed Gaussian mean t
+	float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02;
+	float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12;
+	float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12;
+	// Account for transformation of mean to t
+	// t = transformPoint4x3(mean, view_matrix);
+	float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix);
+	// Gradients of loss w.r.t. Gaussian means, but only the portion
+	// that is caused because the mean affects the covariance matrix.
+	// Additional mean gradient is accumulated in BACKWARD::preprocess.
+	dL_dmeans[idx] = dL_dmean; // plain store: any previous value in dL_dmeans[idx] is replaced
+}
+// Backward pass for the conversion of scale and rotation to a
+// 3D covariance matrix for each Gaussian.
+//
+// Inputs:
+//   idx         index of the Gaussian being processed
+//   scale, mod  per-axis scale and a scalar multiplier applied to it
+//   rot         quaternion read as (r, x, y, z) = (rot.x, rot.y, rot.z, rot.w);
+//               it is used as-is, the normalization is commented out
+//   dL_dcov3Ds  loss gradients w.r.t. the 6 stored covariance entries,
+//               6 floats per Gaussian
+// Outputs (overwritten at position idx):
+//   dL_dscales  gradient w.r.t. the scaled axes s = mod * scale
+//   dL_drots    gradient w.r.t. the quaternion
+// NOTE(review): dL_dscales is not multiplied by mod, so it equals the gradient
+// w.r.t. `scale` only when mod == 1 - confirm this is intended.
+__device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots)
+{
+	// Recompute (intermediate) results for the 3D covariance computation.
+	glm::vec4 q = rot;// / glm::length(rot);
+	float r = q.x;
+	float x = q.y;
+	float y = q.z;
+	float z = q.w;
+	glm::mat3 R = glm::mat3(
+		1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
+		2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
+		2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
+	);
+	glm::mat3 S = glm::mat3(1.0f);
+	glm::vec3 s = mod * scale;
+	S[0][0] = s.x;
+	S[1][1] = s.y;
+	S[2][2] = s.z;
+	glm::mat3 M = S * R;
+	const float* dL_dcov3D = dL_dcov3Ds + 6 * idx;
+	// Convert per-element covariance loss gradients to matrix form.
+	// Each off-diagonal gradient is split evenly between its two symmetric slots.
+	glm::mat3 dL_dSigma = glm::mat3(
+		dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2],
+		0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4],
+		0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5]
+	);
+	// Compute loss gradient w.r.t. matrix M
+	// dSigma_dM = 2 * M
+	glm::mat3 dL_dM = 2.0f * M * dL_dSigma;
+	glm::mat3 Rt = glm::transpose(R);
+	glm::mat3 dL_dMt = glm::transpose(dL_dM);
+	// Gradients of loss w.r.t. scale
+	glm::vec3* dL_dscale = dL_dscales + idx;
+	dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]);
+	dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]);
+	dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]);
+	// Fold the scale into dL_dMt before differentiating w.r.t. the rotation.
+	dL_dMt[0] *= s.x;
+	dL_dMt[1] *= s.y;
+	dL_dMt[2] *= s.z;
+	// Gradients of loss w.r.t. normalized quaternion
+	glm::vec4 dL_dq;
+	dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]);
+	dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]);
+	dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]);
+	dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]);
+	// Gradients of loss w.r.t. unnormalized quaternion
+	// (stored directly; the dnormvdv normalization step is disabled)
+	float4* dL_drot = (float4*)(dL_drots + idx);
+	*dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w });
+}
+// Backward pass of the preprocessing steps, except for the 2D covariance
+// computation and inversion (those are handled by a previous kernel call).
+// Each thread handles one Gaussian; threads with idx >= P or with
+// radii[idx] <= 0 return without touching any output.
+//
+// For the remaining Gaussians the kernel
+//   1. adds to dL_dmeans[idx] the gradient that reaches the 3D mean through
+//      the projected 2D mean (dL_dmean2D),
+//   2. adds to dL_dmeans[idx] the gradient that reaches the 3D mean through
+//      the depth value (dL_ddepth),
+//   3. if shs is non-null, calls computeColorFromSH to propagate dL_dcolor to
+//      dL_dsh (and to dL_dmeans),
+//   4. if scales is non-null, calls computeCov3D to propagate dL_dcov3D to
+//      dL_dscale and dL_drot.
+//
+// view and proj are read as flat 16-float arrays; the indexing below
+// (proj[0], proj[4], proj[8], proj[12] for the first output component)
+// matches the layout consumed by transformPoint4x4.
+template<int C>
+__global__ void preprocessCUDA(
+	int P, int D, int M,
+	const float3* means,
+	const int* radii,
+	const float* shs,
+	const bool* clamped,
+	const glm::vec3* scales,
+	const glm::vec4* rotations,
+	const float scale_modifier,
+	const float* view,
+	const float* proj,
+	const glm::vec3* campos,
+	const float3* dL_dmean2D,
+	glm::vec3* dL_dmeans,
+	float* dL_dcolor,
+	float* dL_ddepth,
+	float* dL_dcov3D,
+	float* dL_dsh,
+	glm::vec3* dL_dscale,
+	glm::vec4* dL_drot)
+{
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= P || !(radii[idx] > 0))
+		return;
+	float3 m = means[idx];
+	// Taking care of gradients from the screenspace points
+	float4 m_hom = transformPoint4x4(m, proj);
+	// Reciprocal of the homogeneous w, with the same epsilon as the forward pass
+	float m_w = 1.0f / (m_hom.w + 0.0000001f);
+	// Compute loss gradient w.r.t. 3D means due to gradients of 2D means
+	// from rendering procedure
+	const float3 grad_mean2D = dL_dmean2D[idx];
+	glm::vec3 dL_dmean;
+	float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w;
+	float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w;
+	dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * grad_mean2D.x + (proj[1] * m_w - proj[3] * mul2) * grad_mean2D.y;
+	dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * grad_mean2D.x + (proj[5] * m_w - proj[7] * mul2) * grad_mean2D.y;
+	dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * grad_mean2D.x + (proj[9] * m_w - proj[11] * mul2) * grad_mean2D.y;
+	// That's the second part of the mean gradient. Previous computation
+	// of cov2D and following SH conversion also affects it.
+	dL_dmeans[idx] += dL_dmean;
+	// Compute loss gradient w.r.t. 3D means due to gradients of depth
+	// from rendering procedure. mul3 is the third component of
+	// view^T * [x, y, z, 1] (w is taken to be 1).
+	// NOTE(review): for a rigid view matrix view[3], view[7] and view[11]
+	// are presumably 0, which makes the "- view[k] * mul3" terms vanish - confirm.
+	const float grad_depth = dL_ddepth[idx];
+	glm::vec3 dL_dmean2;
+	float mul3 = view[2] * m.x + view[6] * m.y + view[10] * m.z + view[14];
+	dL_dmean2.x = (view[2] - view[3] * mul3) * grad_depth;
+	dL_dmean2.y = (view[6] - view[7] * mul3) * grad_depth;
+	dL_dmean2.z = (view[10] - view[11] * mul3) * grad_depth;
+	// That's the third part of the mean gradient.
+	dL_dmeans[idx] += dL_dmean2;
+	// Compute gradient updates due to computing colors from SHs
+	if (shs)
+		computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh);
+	// Compute gradient updates due to computing covariance from scale/rotation
+	if (scales)
+		computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot);
+}
+// Backward version of the rendering procedure: each thread owns one pixel of a BLOCK_X x BLOCK_Y tile, walks that tile's Gaussians from back to front and scatters gradients with atomicAdd.
+template <uint32_t C>
+__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
+renderCUDA(
+	const uint2* __restrict__ ranges, // per-tile range: .x = first, .y = one-past-last index into point_list
+	const uint32_t* __restrict__ point_list, // Gaussian IDs, addressed through ranges
+	int W, int H, // image width / height in pixels
+	const float* __restrict__ bg_color, // background color, C components
+	const float2* __restrict__ points_xy_image, // per-Gaussian 2D position, compared against pixel coordinates
+	const float4* __restrict__ conic_opacity, // per-Gaussian conic in .x/.y/.z, opacity in .w
+	const float* __restrict__ colors, // per-Gaussian colors, read as [id * C + channel]
+	const float* __restrict__ depths, // per-Gaussian depth
+	const float* __restrict__ alphas, // per-pixel alpha; T_final is taken as 1 - alphas[pix_id]
+	const uint32_t* __restrict__ n_contrib, // per-pixel position of the last contributing Gaussian
+	const float* __restrict__ dL_dpixels, // loss gradient w.r.t. pixel color, read as [channel * H * W + pix_id]
+	const float* __restrict__ dL_dpixel_depths, // per-pixel loss gradient w.r.t. pixel depth
+	const float* __restrict__ dL_dalphas, // per-pixel loss gradient w.r.t. pixel alpha
+	float3* __restrict__ dL_dmean2D, // out, atomically accumulated: only .x and .y are written here
+	float4* __restrict__ dL_dconic2D, // out, atomically accumulated: only .x, .y and .w are written here
+	float* __restrict__ dL_dopacity, // out, atomically accumulated, one value per Gaussian
+	float* __restrict__ dL_dcolors, // out, atomically accumulated, written as [id * C + channel]
+	float* __restrict__ dL_ddepths // out, atomically accumulated, one value per Gaussian
+)
+{
+	// We rasterize again. Compute necessary block info.
+	auto block = cg::this_thread_block();
+	const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
+	const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
+	const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; // NOTE(review): not referenced again in this kernel
+	const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
+	const uint32_t pix_id = W * pix.y + pix.x;
+	const float2 pixf = { (float)pix.x, (float)pix.y };
+	const bool inside = pix.x < W&& pix.y < H;
+	const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
+	const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); // BLOCK_SIZE is defined outside this excerpt
+	bool done = !inside; // threads outside the image still help with loading but never accumulate
+	int toDo = range.y - range.x;
+	__shared__ int collected_id[BLOCK_SIZE];
+	__shared__ float2 collected_xy[BLOCK_SIZE];
+	__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
+	__shared__ float collected_colors[C * BLOCK_SIZE]; // laid out as [channel * BLOCK_SIZE + slot]
+	__shared__ float collected_depths[BLOCK_SIZE];
+	// In the forward, we stored the final value for T, the
+	// product of all (1 - alpha) factors.
+	const float T_final = inside ? (1 - alphas[pix_id]) : 0;
+	float T = T_final;
+	// We start from the back. The ID of the last contributing
+	// Gaussian is known from each pixel from the forward.
+	uint32_t contributor = toDo;
+	const int last_contributor = inside ? n_contrib[pix_id] : 0;
+	float accum_rec[C] = { 0 };
+	float dL_dpixel[C]; // only initialized when inside; never read otherwise because done == true
+	float accum_depth_rec = 0;
+	float dL_dpixel_depth;
+	float accum_alpha_rec = 0;
+	float dL_dalpha;
+	if (inside) {
+		for (int i = 0; i < C; i++)
+			dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
+		dL_dpixel_depth = dL_dpixel_depths[pix_id];
+		dL_dalpha = dL_dalphas[pix_id];
+	}
+	float last_alpha = 0;
+	float last_color[C] = { 0 };
+	float last_depth = 0;
+	// Gradient of pixel coordinate w.r.t. normalized
+	// screen-space viewport coordinates (-1 to 1)
+	const float ddelx_dx = 0.5 * W;
+	const float ddely_dy = 0.5 * H;
+	// Traverse all Gaussians
+	for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
+	{
+		// Load auxiliary data into shared memory, start in the BACK
+		// and load them in reverse order.
+		block.sync(); // previous batch must be fully consumed before shared memory is overwritten
+		const int progress = i * BLOCK_SIZE + block.thread_rank();
+		if (range.x + progress < range.y)
+		{
+			const int coll_id = point_list[range.y - progress - 1];
+			collected_id[block.thread_rank()] = coll_id;
+			collected_xy[block.thread_rank()] = points_xy_image[coll_id];
+			collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
+			for (int i = 0; i < C; i++) // this i shadows the batch counter of the enclosing loop
+				collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i];
+			collected_depths[block.thread_rank()] = depths[coll_id];
+		}
+		block.sync(); // batch is now visible to every thread of the block
+		// Iterate over Gaussians
+		for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
+		{
+			// Keep track of current Gaussian ID. Skip, if this one
+			// is behind the last contributor for this pixel.
+			contributor--;
+			if (contributor >= last_contributor)
+				continue;
+			// Compute blending values, as before.
+			const float2 xy = collected_xy[j];
+			const float2 d = { xy.x - pixf.x, xy.y - pixf.y };
+			const float4 con_o = collected_conic_opacity[j];
+			const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
+			if (power > 0.0f)
+				continue;
+			const float G = exp(power);
+			const float alpha = min(0.99f, con_o.w * G);
+			if (alpha < 1.0f / 255.0f)
+				continue;
+			T = T / (1.f - alpha); // undo this Gaussian's (1 - alpha) factor to recover the transmittance in front of it
+			const float dchannel_dcolor = alpha * T;
+			const float dpixel_depth_ddepth = alpha * T;
+			// Propagate gradients to per-Gaussian colors and keep
+			// gradients w.r.t. alpha (blending factor for a Gaussian/pixel
+			// pair).
+			float dL_dopa = 0.0f;
+			const int global_id = collected_id[j];
+			for (int ch = 0; ch < C; ch++)
+			{
+				const float c = collected_colors[ch * BLOCK_SIZE + j];
+				// Update last color (to be used in the next iteration)
+				accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch];
+				last_color[ch] = c;
+				const float dL_dchannel = dL_dpixel[ch];
+				dL_dopa += (c - accum_rec[ch]) * dL_dchannel;
+				// Update the gradients w.r.t. color of the Gaussian.
+				// Atomic, since this pixel is just one of potentially
+				// many that were affected by this Gaussian.
+				atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
+			}
+			// Propagate gradients from pixel depth to opacity
+			const float c_d = collected_depths[j];
+			accum_depth_rec = last_alpha * last_depth + (1.f - last_alpha) * accum_depth_rec;
+			last_depth = c_d;
+			dL_dopa += (c_d - accum_depth_rec) * dL_dpixel_depth;
+			atomicAdd(&(dL_ddepths[global_id]), dpixel_depth_ddepth * dL_dpixel_depth);
+			// Propagate gradients from pixel alpha (weights_sum) to opacity
+			accum_alpha_rec = last_alpha + (1.f - last_alpha) * accum_alpha_rec;
+			dL_dopa += (1 - accum_alpha_rec) * dL_dalpha; //- (alpha - accum_alpha_rec) * dL_dalpha;
+			dL_dopa *= T; // scales the color, depth and alpha contributions accumulated so far
+			// Update last alpha (to be used in the next iteration)
+			last_alpha = alpha;
+			// Account for fact that alpha also influences how much of
+			// the background color is added if nothing left to blend
+			float bg_dot_dpixel = 0;
+			for (int i = 0; i < C; i++) // same value for every Gaussian of this pixel; recomputed each iteration
+				bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
+			dL_dopa += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
+			// Helpful reusable temporary variables
+			const float dL_dG = con_o.w * dL_dopa;
+			const float gdx = G * d.x;
+			const float gdy = G * d.y;
+			const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
+			const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
+			// Update gradients w.r.t. 2D mean position of the Gaussian
+			atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx);
+			atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy);
+			// Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
+			atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
+			atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
+			atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG); // third conic term is stored in .w, .z is left untouched
+			// Update gradients w.r.t. opacity of the Gaussian
+			atomicAdd(&(dL_dopacity[global_id]), G * dL_dopa);
+		}
+	}
+}
+// Host-side driver for the backward pass of the per-Gaussian preprocessing.
+// Two kernels are launched back to back, each with one thread per Gaussian
+// (blocks of 256 threads, rounded up so that all P Gaussians are covered):
+//   1. computeCov2DCUDA propagates gradients through the 2D conic matrix
+//      computation. It is fairly long, hence its own kernel. Once it has run,
+//      the loss gradient w.r.t. the 3D means has been modified and the
+//      gradient w.r.t. the 3D covariance matrix has been computed.
+//   2. preprocessCUDA handles the remaining steps: it finishes the 3D mean
+//      gradients, propagates color gradients to SH (if desired) and
+//      propagates 3D covariance gradients to scale and rotation.
+void BACKWARD::preprocess(
+	int P, int D, int M,
+	const float3* means3D,
+	const int* radii,
+	const float* shs,
+	const bool* clamped,
+	const glm::vec3* scales,
+	const glm::vec4* rotations,
+	const float scale_modifier,
+	const float* cov3Ds,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const float focal_x, float focal_y,
+	const float tan_fovx, float tan_fovy,
+	const glm::vec3* campos,
+	const float3* dL_dmean2D,
+	const float* dL_dconic,
+	glm::vec3* dL_dmean3D,
+	float* dL_dcolor,
+	float* dL_ddepth,
+	float* dL_dcov3D,
+	float* dL_dsh,
+	glm::vec3* dL_dscale,
+	glm::vec4* dL_drot)
+{
+	// Shared launch configuration for both kernels.
+	constexpr int threads_per_block = 256;
+	const int num_blocks = (P + threads_per_block - 1) / threads_per_block;
+	// Step 1: conic / 2D covariance path.
+	computeCov2DCUDA <<<num_blocks, threads_per_block>>> (
+		P, means3D, radii, cov3Ds,
+		focal_x, focal_y,
+		tan_fovx, tan_fovy,
+		viewmatrix,
+		dL_dconic,
+		(float3*)dL_dmean3D,
+		dL_dcov3D);
+	// Step 2: means, SH colors, scale and rotation.
+	preprocessCUDA<NUM_CHANNELS> <<<num_blocks, threads_per_block>>> (
+		P, D, M,
+		means3D, radii,
+		shs, clamped,
+		scales, rotations, scale_modifier,
+		viewmatrix, projmatrix,
+		campos,
+		dL_dmean2D,
+		dL_dmean3D,
+		dL_dcolor, dL_ddepth,
+		dL_dcov3D,
+		dL_dsh,
+		dL_dscale, dL_drot);
+}
+// Host-side launcher for the backward rendering kernel. The launch
+// configuration (grid, block) is supplied by the caller and every other
+// argument is handed to renderCUDA<NUM_CHANNELS> unchanged, in the same order.
+void BACKWARD::render(
+	const dim3 grid, const dim3 block,
+	const uint2* ranges,
+	const uint32_t* point_list,
+	int W, int H,
+	const float* bg_color,
+	const float2* means2D,
+	const float4* conic_opacity,
+	const float* colors,
+	const float* depths,
+	const float* alphas,
+	const uint32_t* n_contrib,
+	const float* dL_dpixels,
+	const float* dL_dpixel_depths,
+	const float* dL_dalphas,
+	float3* dL_dmean2D,
+	float4* dL_dconic2D,
+	float* dL_dopacity,
+	float* dL_dcolors,
+	float* dL_ddepths)
+{
+	renderCUDA<NUM_CHANNELS> <<<grid, block>>> (
+		// tile ranges and image size
+		ranges, point_list, W, H,
+		// forward-pass inputs and results
+		bg_color, means2D, conic_opacity, colors, depths, alphas, n_contrib,
+		// incoming per-pixel gradients
+		dL_dpixels, dL_dpixel_depths, dL_dalphas,
+		// outgoing per-Gaussian gradients
+		dL_dmean2D, dL_dconic2D, dL_dopacity, dL_dcolors, dL_ddepths);
+}

diff-gaussian-rasterization/cuda_rasterizer/backward.h ADDED Viewed

	@@ -0,0 +1,70 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
+#define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+namespace BACKWARD
+{
+	void render( // host launcher for the backward rendering kernel (renderCUDA<NUM_CHANNELS>)
+		const dim3 grid, dim3 block, // CUDA launch configuration used as-is for the kernel
+		const uint2* ranges, // per-tile index ranges into point_list
+		const uint32_t* point_list, // Gaussian IDs addressed through ranges
+		int W, int H, // image width / height in pixels
+		const float* bg_color, // background color, one value per channel
+		const float2* means2D, // per-Gaussian 2D positions
+		const float4* conic_opacity, // per-Gaussian conic (.x/.y/.z) and opacity (.w)
+		const float* colors, // per-Gaussian colors
+		const float* depths, // per-Gaussian depths
+		const float* alphas, // per-pixel alpha
+		const uint32_t* n_contrib, // per-pixel last-contributor position
+		const float* dL_dpixels, // incoming gradient w.r.t. pixel colors
+		const float* dL_dpixel_depths, // incoming gradient w.r.t. pixel depths
+		const float* dL_dalphas, // incoming gradient w.r.t. pixel alphas
+		float3* dL_dmean2D, // out: gradient w.r.t. 2D means (accumulated atomically by the kernel)
+		float4* dL_dconic2D, // out: gradient w.r.t. conics (accumulated atomically by the kernel)
+		float* dL_dopacity, // out: gradient w.r.t. opacities (accumulated atomically by the kernel)
+		float* dL_dcolors, // out: gradient w.r.t. colors (accumulated atomically by the kernel)
+		float* dL_ddepths); // out: gradient w.r.t. depths (accumulated atomically by the kernel)
+	void preprocess( // host launcher: runs computeCov2DCUDA, then preprocessCUDA<NUM_CHANNELS>, one thread per Gaussian
+		int P, int D, int M, // P: number of Gaussians; D and M are forwarded to the SH backward helper
+		const float3* means, // per-Gaussian 3D means
+		const int* radii, // per-Gaussian radii; entries <= 0 are skipped by preprocessCUDA
+		const float* shs, // SH coefficients; SH backward is skipped when null
+		const bool* clamped, // clamping flags forwarded to the SH backward helper
+		const glm::vec3* scales, // per-Gaussian scales; scale/rotation backward is skipped when null
+		const glm::vec4* rotations, // per-Gaussian rotation quaternions
+		const float scale_modifier, // factor multiplied onto every scale
+		const float* cov3Ds, // 3D covariances, forwarded to computeCov2DCUDA
+		const float* view, // view matrix as 16 floats
+		const float* proj, // projection matrix as 16 floats
+		const float focal_x, float focal_y, // forwarded to computeCov2DCUDA
+		const float tan_fovx, float tan_fovy, // forwarded to computeCov2DCUDA
+		const glm::vec3* campos, // camera position (dereferenced on the device)
+		const float3* dL_dmean2D, // incoming gradient w.r.t. 2D means
+		const float* dL_dconics, // incoming gradient w.r.t. conics
+		glm::vec3* dL_dmeans, // in/out: gradient w.r.t. 3D means, added to by both kernels
+		float* dL_dcolor, // gradient w.r.t. colors, consumed by the SH backward helper
+		float* dL_ddepth, // gradient w.r.t. depths, read by preprocessCUDA
+		float* dL_dcov3D, // gradient w.r.t. 3D covariances, 6 floats per Gaussian
+		float* dL_dsh, // out: gradient w.r.t. SH coefficients
+		glm::vec3* dL_dscale, // out: gradient w.r.t. scales
+		glm::vec4* dL_drot); // out: gradient w.r.t. rotation quaternions
+}
+#endif

diff-gaussian-rasterization/cuda_rasterizer/config.h ADDED Viewed

	@@ -0,0 +1,19 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
+#define CUDA_RASTERIZER_CONFIG_H_INCLUDED
+#define NUM_CHANNELS 3 // Default 3, RGB; used as the channel-count template argument of the preprocess/render kernels
+#define BLOCK_X 16 // tile width in pixels; BLOCK_X * BLOCK_Y is also the kernels' __launch_bounds__
+#define BLOCK_Y 16 // tile height in pixels
+#endif

diff-gaussian-rasterization/cuda_rasterizer/forward.cu ADDED Viewed

	@@ -0,0 +1,466 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "forward.h"
+#include "auxiliary.h"
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+// Forward method for converting the input spherical harmonics
+// coefficients of each Gaussian to a simple RGB color. Also records in clamped[3 * idx + c] whether channel c was negative before clamping.
+__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
+{
+	// The implementation is loosely based on code for
+	// "Differentiable Point-Based Radiance Fields for
+	// Efficient View Synthesis" by Zhang et al. (2022)
+	glm::vec3 pos = means[idx];
+	glm::vec3 dir = pos - campos;
+	dir = dir / glm::length(dir); // NOTE(review): no guard for pos == campos (zero-length direction)
+	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; // shs holds max_coeffs vec3 coefficients per Gaussian
+	glm::vec3 result = SH_C0 * sh[0]; // degree-0 term
+	if (deg > 0)
+	{
+		float x = dir.x;
+		float y = dir.y;
+		float z = dir.z;
+		result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; // degree-1 terms, sh[1..3]
+		if (deg > 1)
+		{
+			float xx = x * x, yy = y * y, zz = z * z;
+			float xy = x * y, yz = y * z, xz = x * z;
+			result = result + // degree-2 terms, sh[4..8]
+				SH_C2[0] * xy * sh[4] +
+				SH_C2[1] * yz * sh[5] +
+				SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
+				SH_C2[3] * xz * sh[7] +
+				SH_C2[4] * (xx - yy) * sh[8];
+			if (deg > 2)
+			{
+				result = result + // degree-3 terms, sh[9..15]; max_coeffs is presumably >= 16 here - confirm
+					SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
+					SH_C3[1] * xy * z * sh[10] +
+					SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
+					SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
+					SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
+					SH_C3[5] * z * (xx - yy) * sh[14] +
+					SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
+			}
+		}
+	}
+	result += 0.5f; // constant offset applied to all three channels before clamping
+	// RGB colors are clamped to positive values. If values are
+	// clamped, we need to keep track of this for the backward pass.
+	clamped[3 * idx + 0] = (result.x < 0);
+	clamped[3 * idx + 1] = (result.y < 0);
+	clamped[3 * idx + 2] = (result.z < 0);
+	return glm::max(result, 0.0f);
+}
+// Forward version of 2D covariance matrix computation. Returns the three unique entries (cov[0][0], cov[0][1], cov[1][1]) of the filtered 2x2 screen-space covariance.
+__device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix)
+{
+	// The following models the steps outlined by equations 29
+	// and 31 in "EWA Splatting" (Zwicker et al., 2002).
+	// Additionally considers aspect / scaling of viewport.
+	// Transposes used to account for row-/column-major conventions.
+	float3 t = transformPoint4x3(mean, viewmatrix);
+	const float limx = 1.3f * tan_fovx;
+	const float limy = 1.3f * tan_fovy;
+	const float txtz = t.x / t.z;
+	const float tytz = t.y / t.z;
+	t.x = min(limx, max(-limx, txtz)) * t.z; // clamp x/z to [-limx, limx], then scale back by z
+	t.y = min(limy, max(-limy, tytz)) * t.z; // clamp y/z to [-limy, limy], then scale back by z
+	glm::mat3 J = glm::mat3( // Jacobian built from the clamped t; the last three entries are zero
+		focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
+		0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
+		0, 0, 0);
+	glm::mat3 W = glm::mat3( // the nine viewmatrix entries at indices 0-2, 4-6 and 8-10
+		viewmatrix[0], viewmatrix[4], viewmatrix[8],
+		viewmatrix[1], viewmatrix[5], viewmatrix[9],
+		viewmatrix[2], viewmatrix[6], viewmatrix[10]);
+	glm::mat3 T = W * J;
+	glm::mat3 Vrk = glm::mat3( // symmetric 3x3 matrix rebuilt from the 6 stored cov3D values
+		cov3D[0], cov3D[1], cov3D[2],
+		cov3D[1], cov3D[3], cov3D[4],
+		cov3D[2], cov3D[4], cov3D[5]);
+	glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
+	// Apply low-pass filter: every Gaussian should be at least
+	// one pixel wide/high. Discard 3rd row and column.
+	cov[0][0] += 0.3f;
+	cov[1][1] += 0.3f;
+	return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
+}
+// Forward method for converting scale and rotation properties of each
+// Gaussian to a 3D covariance matrix in world space. The quaternion is used
+// as given: normalization is commented out below, so rot is presumably expected to be unit length already - confirm against the caller.
+__device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
+{
+	// Create scaling matrix
+	glm::mat3 S = glm::mat3(1.0f);
+	S[0][0] = mod * scale.x;
+	S[1][1] = mod * scale.y;
+	S[2][2] = mod * scale.z;
+	// Quaternion is taken as-is (no normalization is performed here); components are read as (r, x, y, z) = (q.x, q.y, q.z, q.w)
+	glm::vec4 q = rot;// / glm::length(rot);
+	float r = q.x;
+	float x = q.y;
+	float y = q.z;
+	float z = q.w;
+	// Compute rotation matrix from quaternion
+	glm::mat3 R = glm::mat3(
+		1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
+		2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
+		2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
+	);
+	glm::mat3 M = S * R;
+	// Compute 3D world covariance matrix Sigma
+	glm::mat3 Sigma = glm::transpose(M) * M;
+	// Covariance is symmetric, only store upper right (6 floats written to cov3D)
+	cov3D[0] = Sigma[0][0];
+	cov3D[1] = Sigma[0][1];
+	cov3D[2] = Sigma[0][2];
+	cov3D[3] = Sigma[1][1];
+	cov3D[4] = Sigma[1][2];
+	cov3D[5] = Sigma[2][2];
+}
+// Perform initial steps for each Gaussian prior to rasterization: culling, projection, 2D covariance/conic, screen-space extent and color. One thread per Gaussian.
+template<int C>
+__global__ void preprocessCUDA(int P, int D, int M, // P: number of Gaussians; D and M are forwarded to computeColorFromSH as deg and max_coeffs
+	const float* orig_points, // 3D positions, 3 floats per Gaussian
+	const glm::vec3* scales, // only read when cov3D_precomp is null
+	const float scale_modifier, // factor applied to scales in computeCov3D
+	const glm::vec4* rotations, // only read when cov3D_precomp is null
+	const float* opacities, // per-Gaussian opacity, copied into conic_opacity[idx].w
+	const float* shs, // SH coefficients, only used when colors_precomp is null
+	bool* clamped, // out: written by computeColorFromSH, 3 flags per Gaussian
+	const float* cov3D_precomp, // optional precomputed 3D covariances, 6 floats per Gaussian
+	const float* colors_precomp, // only tested against nullptr here; when non-null rgb is not written
+	const float* viewmatrix,
+	const float* projmatrix,
+	const glm::vec3* cam_pos, // dereferenced on the device
+	const int W, int H, // image size in pixels, used by ndc2Pix
+	const float tan_fovx, float tan_fovy,
+	const float focal_x, float focal_y,
+	int* radii, // out: 0 if the Gaussian is dropped, otherwise its screen-space radius
+	float2* points_xy_image, // out: pixel-space position of the projected mean
+	float* depths, // out: p_view.z as filled in by in_frustum
+	float* cov3Ds, // out: 6 floats per Gaussian, written only when cov3D_precomp is null
+	float* rgb, // out: C floats per Gaussian; only the first three are written here
+	float4* conic_opacity, // out: (conic.x, conic.y, conic.z, opacity)
+	const dim3 grid, // passed to getRect together with the radius
+	uint32_t* tiles_touched, // out: area of the tile rectangle returned by getRect, 0 if dropped
+	bool prefiltered) // forwarded to in_frustum
+{
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= P)
+		return;
+	// Initialize radius and touched tiles to 0. If this isn't changed,
+	// this Gaussian will not be processed further.
+	radii[idx] = 0;
+	tiles_touched[idx] = 0;
+	// Perform near culling, quit if outside.
+	float3 p_view;
+	if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
+		return;
+	// Transform point by projecting
+	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
+	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
+	float p_w = 1.0f / (p_hom.w + 0.0000001f); // small epsilon keeps the division finite when w is 0
+	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
+	// If 3D covariance matrix is precomputed, use it, otherwise compute
+	// from scaling and rotation parameters.
+	const float* cov3D;
+	if (cov3D_precomp != nullptr)
+	{
+		cov3D = cov3D_precomp + idx * 6;
+	}
+	else
+	{
+		computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
+		cov3D = cov3Ds + idx * 6;
+	}
+	// Compute 2D screen-space covariance matrix
+	float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
+	// Invert covariance (EWA algorithm)
+	float det = (cov.x * cov.z - cov.y * cov.y);
+	if (det == 0.0f)
+		return;
+	float det_inv = 1.f / det;
+	float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
+	// Compute extent in screen space (by finding eigenvalues of
+	// 2D covariance matrix). Use extent to compute a bounding rectangle
+	// of screen-space tiles that this Gaussian overlaps with. Quit if
+	// rectangle covers 0 tiles.
+	float mid = 0.5f * (cov.x + cov.z);
+	float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); // discriminant is floored at 0.1
+	float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
+	float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); // 3 * sqrt(largest eigenvalue), rounded up
+	float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
+	uint2 rect_min, rect_max;
+	getRect(point_image, my_radius, rect_min, rect_max, grid);
+	if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
+		return;
+	// If colors have been precomputed, use them, otherwise convert
+	// spherical harmonics coefficients to RGB color.
+	if (colors_precomp == nullptr)
+	{
+		glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
+		rgb[idx * C + 0] = result.x;
+		rgb[idx * C + 1] = result.y;
+		rgb[idx * C + 2] = result.z;
+	}
+	// Store some useful helper data for the next steps.
+	depths[idx] = p_view.z;
+	radii[idx] = my_radius; // float radius is converted to int on assignment
+	points_xy_image[idx] = point_image;
+	// Inverse 2D covariance and opacity neatly pack into one float4
+	conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
+	tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
+}
+// Main rasterization method. Collaboratively works on one tile per
+// block, each thread treats one pixel. Alternates between fetching
+// and rasterizing data. Writes color, alpha, depth and the last-contributor position for every pixel inside the image.
+template <uint32_t CHANNELS>
+__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
+renderCUDA(
+	const uint2* __restrict__ ranges, // per-tile range: .x = first, .y = one-past-last index into point_list
+	const uint32_t* __restrict__ point_list, // Gaussian IDs, addressed through ranges
+	int W, int H, // image width / height in pixels
+	const float2* __restrict__ points_xy_image, // per-Gaussian 2D position, compared against pixel coordinates
+	const float* __restrict__ features, // per-Gaussian colors, read as [id * CHANNELS + channel]
+	const float* __restrict__ depths, // per-Gaussian depth
+	const float4* __restrict__ conic_opacity, // per-Gaussian conic in .x/.y/.z, opacity in .w
+	float* __restrict__ out_alpha, // out: per-pixel sum of alpha * T weights
+	uint32_t* __restrict__ n_contrib, // out: per-pixel position of the last Gaussian that contributed
+	const float* __restrict__ bg_color, // background color, CHANNELS components
+	float* __restrict__ out_color, // out: written as [channel * H * W + pix_id]
+	float* __restrict__ out_depth) // out: per-pixel alpha-weighted depth
+{
+	// Identify current tile and associated min/max pixel range.
+	auto block = cg::this_thread_block();
+	uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
+	uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
+	uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; // NOTE(review): not referenced again in this kernel
+	uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
+	uint32_t pix_id = W * pix.y + pix.x;
+	float2 pixf = { (float)pix.x, (float)pix.y };
+	// Check if this thread is associated with a valid pixel or outside.
+	bool inside = pix.x < W&& pix.y < H;
+	// Done threads can help with fetching, but don't rasterize
+	bool done = !inside;
+	// Load start/end range of IDs to process in bit sorted list.
+	uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
+	const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); // BLOCK_SIZE is defined outside this excerpt
+	int toDo = range.y - range.x;
+	// Allocate storage for batches of collectively fetched data.
+	__shared__ int collected_id[BLOCK_SIZE];
+	__shared__ float2 collected_xy[BLOCK_SIZE];
+	__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
+	// Initialize helper variables
+	float T = 1.0f; // running product of (1 - alpha) for this pixel
+	uint32_t contributor = 0;
+	uint32_t last_contributor = 0;
+	float C[CHANNELS] = { 0 };
+	float weight = 0;
+	float D = 0;
+	// Iterate over batches until all done or range is complete
+	for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
+	{
+		// End if entire block votes that it is done rasterizing
+		int num_done = __syncthreads_count(done);
+		if (num_done == BLOCK_SIZE)
+			break;
+		// Collectively fetch per-Gaussian data from global to shared
+		int progress = i * BLOCK_SIZE + block.thread_rank();
+		if (range.x + progress < range.y)
+		{
+			int coll_id = point_list[range.x + progress];
+			collected_id[block.thread_rank()] = coll_id;
+			collected_xy[block.thread_rank()] = points_xy_image[coll_id];
+			collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
+		}
+		block.sync(); // batch is now visible to every thread of the block
+		// Iterate over current batch
+		for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
+		{
+			// Keep track of current position in range
+			contributor++;
+			// Resample using conic matrix (cf. "Surface
+			// Splatting" by Zwicker et al., 2001)
+			float2 xy = collected_xy[j];
+			float2 d = { xy.x - pixf.x, xy.y - pixf.y };
+			float4 con_o = collected_conic_opacity[j];
+			float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
+			if (power > 0.0f)
+				continue;
+			// Eq. (2) from 3D Gaussian splatting paper.
+			// Obtain alpha by multiplying with Gaussian opacity
+			// and its exponential falloff from mean.
+			// Avoid numerical instabilities (see paper appendix).
+			float alpha = min(0.99f, con_o.w * exp(power));
+			if (alpha < 1.0f / 255.0f)
+				continue;
+			float test_T = T * (1 - alpha);
+			if (test_T < 0.0001f)
+			{
+				done = true; // this Gaussian is not blended; the pixel stops here
+				continue;
+			}
+			// Eq. (3) from 3D Gaussian splatting paper.
+			for (int ch = 0; ch < CHANNELS; ch++)
+				C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
+			weight += alpha * T;
+			D += depths[collected_id[j]] * alpha * T;
+			T = test_T;
+			// Keep track of last range entry to update this
+			// pixel.
+			last_contributor = contributor;
+		}
+	}
+	// All threads that treat valid pixel write out their final
+	// rendering data to the frame and auxiliary buffers.
+	if (inside)
+	{
+		n_contrib[pix_id] = last_contributor;
+		for (int ch = 0; ch < CHANNELS; ch++)
+			out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; // remaining transmittance shows the background
+		out_alpha[pix_id] = weight; //1 - T;
+		out_depth[pix_id] = D;
+	}
+}
+// Host-side launcher for the forward rasterization kernel. The launch
+// configuration (grid, block) comes from the caller; all remaining arguments
+// are handed to renderCUDA<NUM_CHANNELS> unchanged and in the same order.
+void FORWARD::render(
+	const dim3 grid, dim3 block,
+	const uint2* ranges,
+	const uint32_t* point_list,
+	int W, int H,
+	const float2* means2D,
+	const float* colors,
+	const float* depths,
+	const float4* conic_opacity,
+	float* out_alpha,
+	uint32_t* n_contrib,
+	const float* bg_color,
+	float* out_color,
+	float* out_depth)
+{
+	renderCUDA<NUM_CHANNELS> <<<grid, block>>> (
+		// tile ranges and image size
+		ranges, point_list, W, H,
+		// per-Gaussian inputs
+		means2D, colors, depths, conic_opacity,
+		// per-pixel outputs and background
+		out_alpha, n_contrib, bg_color, out_color, out_depth);
+}
+// Host-side launcher for the forward preprocessing kernel. One thread per
+// Gaussian is started (blocks of 256 threads, rounded up to cover all P).
+// Note that the kernel takes the tan_fov pair before the focal pair, i.e. in
+// the opposite order to this function's own parameter list.
+void FORWARD::preprocess(int P, int D, int M,
+	const float* means3D,
+	const glm::vec3* scales,
+	const float scale_modifier,
+	const glm::vec4* rotations,
+	const float* opacities,
+	const float* shs,
+	bool* clamped,
+	const float* cov3D_precomp,
+	const float* colors_precomp,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const glm::vec3* cam_pos,
+	const int W, int H,
+	const float focal_x, float focal_y,
+	const float tan_fovx, float tan_fovy,
+	int* radii,
+	float2* means2D,
+	float* depths,
+	float* cov3Ds,
+	float* rgb,
+	float4* conic_opacity,
+	const dim3 grid,
+	uint32_t* tiles_touched,
+	bool prefiltered)
+{
+	constexpr int threads_per_block = 256;
+	const int num_blocks = (P + threads_per_block - 1) / threads_per_block;
+	preprocessCUDA<NUM_CHANNELS> <<<num_blocks, threads_per_block>>> (
+		P, D, M,
+		// per-Gaussian inputs
+		means3D, scales, scale_modifier, rotations, opacities,
+		shs, clamped,
+		cov3D_precomp, colors_precomp,
+		// camera
+		viewmatrix, projmatrix, cam_pos,
+		W, H,
+		tan_fovx, tan_fovy,
+		focal_x, focal_y,
+		// per-Gaussian outputs
+		radii, means2D, depths, cov3Ds, rgb, conic_opacity,
+		grid, tiles_touched,
+		prefiltered);
+}

diff-gaussian-rasterization/cuda_rasterizer/forward.h ADDED Viewed

	@@ -0,0 +1,68 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED
+#define CUDA_RASTERIZER_FORWARD_H_INCLUDED
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+namespace FORWARD
+{
+	// Perform initial steps for each Gaussian prior to rasterization. Host launcher for preprocessCUDA<NUM_CHANNELS>, one thread per Gaussian.
+	void preprocess(int P, int D, int M, // P: number of Gaussians; D and M are passed on to the SH color evaluation
+		const float* orig_points, // 3D positions, 3 floats per Gaussian
+		const glm::vec3* scales, // per-Gaussian scales, used when cov3D_precomp is null
+		const float scale_modifier, // factor applied to every scale
+		const glm::vec4* rotations, // per-Gaussian rotation quaternions, used when cov3D_precomp is null
+		const float* opacities, // per-Gaussian opacity
+		const float* shs, // SH coefficients, used when colors_precomp is null
+		bool* clamped, // out: per-channel clamping flags from the SH evaluation
+		const float* cov3D_precomp, // optional precomputed 3D covariances, 6 floats per Gaussian
+		const float* colors_precomp, // optional precomputed colors; when non-null the kernel does not write colors
+		const float* viewmatrix,
+		const float* projmatrix,
+		const glm::vec3* cam_pos, // dereferenced on the device
+		const int W, int H, // image size in pixels
+		const float focal_x, float focal_y,
+		const float tan_fovx, float tan_fovy,
+		int* radii, // out: screen-space radius, 0 for dropped Gaussians
+		float2* points_xy_image, // out: pixel-space position of each projected mean
+		float* depths, // out: view-space depth
+		float* cov3Ds, // out: computed 3D covariances, 6 floats per Gaussian
+		float* colors, // out: RGB computed from SH
+		float4* conic_opacity, // out: conic in .x/.y/.z and opacity in .w
+		const dim3 grid, // tile grid, used for the tile bounding rectangle
+		uint32_t* tiles_touched, // out: number of tiles in the bounding rectangle
+		bool prefiltered); // forwarded to the frustum test
+	// Main rasterization method. Host launcher for renderCUDA<NUM_CHANNELS> with the given grid/block configuration.
+	void render(
+		const dim3 grid, dim3 block, // CUDA launch configuration used as-is for the kernel
+		const uint2* ranges, // per-tile index ranges into point_list
+		const uint32_t* point_list, // Gaussian IDs addressed through ranges
+		int W, int H, // image width / height in pixels
+		const float2* points_xy_image, // per-Gaussian 2D positions
+		const float* features, // per-Gaussian colors
+		const float* depths, // per-Gaussian depths
+		const float4* conic_opacity, // per-Gaussian conic and opacity
+		float* out_alpha, // out: per-pixel accumulated alpha weight
+		uint32_t* n_contrib, // out: per-pixel last-contributor position
+		const float* bg_color, // background color, one value per channel
+		float* out_color, // out: image in channel-planar layout (channel * H * W + pixel)
+		float* out_depth); // out: per-pixel alpha-weighted depth
+}
+#endif

diff-gaussian-rasterization/cuda_rasterizer/rasterizer.h ADDED Viewed

	@@ -0,0 +1,94 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_H_INCLUDED
+#define CUDA_RASTERIZER_H_INCLUDED
+#include <vector>
+#include <functional>
+namespace CudaRasterizer
+{
+	class Rasterizer // static-only interface; the implementations are not part of this header
+	{
+	public:
+		static void markVisible( // NOTE(review): presumably fills present[i] with a visibility flag per Gaussian - confirm in the implementation
+			int P, // presumably the number of Gaussians - confirm
+			float* means3D,
+			float* viewmatrix,
+			float* projmatrix,
+			bool* present);
+		static int forward( // NOTE(review): meaning of the int return value is not visible here - confirm in the implementation
+			std::function<char* (size_t)> geometryBuffer, // callback: takes a byte count, returns a char* buffer
+			std::function<char* (size_t)> binningBuffer, // callback: takes a byte count, returns a char* buffer
+			std::function<char* (size_t)> imageBuffer, // callback: takes a byte count, returns a char* buffer
+			const int P, int D, int M, // presumably Gaussian count, SH degree and SH coefficients per Gaussian - confirm
+			const float* background,
+			const int width, int height,
+			const float* means3D,
+			const float* shs,
+			const float* colors_precomp,
+			const float* opacities,
+			const float* scales,
+			const float scale_modifier,
+			const float* rotations,
+			const float* cov3D_precomp,
+			const float* viewmatrix,
+			const float* projmatrix,
+			const float* cam_pos,
+			const float tan_fovx, float tan_fovy,
+			const bool prefiltered,
+			float* out_color,
+			float* out_depth,
+			float* out_alpha,
+			int* radii = nullptr, // optional; defaults to nullptr
+			bool debug = false); // optional; defaults to false
+		static void backward( // NOTE(review): dL_* parameters after dL_dalphas look like outputs (non-const) - confirm in the implementation
+			const int P, int D, int M, int R, // NOTE(review): meaning of R is not visible in this header - confirm
+			const float* background,
+			const int width, int height,
+			const float* means3D,
+			const float* shs,
+			const float* colors_precomp,
+			const float* alphas,
+			const float* scales,
+			const float scale_modifier,
+			const float* rotations,
+			const float* cov3D_precomp,
+			const float* viewmatrix,
+			const float* projmatrix,
+			const float* campos,
+			const float tan_fovx, float tan_fovy,
+			const int* radii,
+			char* geom_buffer, // raw buffers, presumably the ones obtained through the forward() callbacks - confirm
+			char* binning_buffer,
+			char* image_buffer,
+			const float* dL_dpix, // incoming gradients (const)
+			const float* dL_dpix_depth,
+			const float* dL_dalphas,
+			float* dL_dmean2D,
+			float* dL_dconic,
+			float* dL_dopacity,
+			float* dL_dcolor,
+			float* dL_ddepth,
+			float* dL_dmean3D,
+			float* dL_dcov3D,
+			float* dL_dsh,
+			float* dL_dscale,
+			float* dL_drot,
+			bool debug); // no default here, unlike forward()
+	};
+};
+#endif
+#endif

diff-gaussian-rasterization/cuda_rasterizer/rasterizer_impl.cu ADDED Viewed

	@@ -0,0 +1,447 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "rasterizer_impl.h"
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <numeric>
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+#include "auxiliary.h"
+#include "forward.h"
+#include "backward.h"
+// CPU helper: binary search for the bit position just above the most
+// significant set bit of n (e.g. 4 -> 3, 256 -> 9; 0 and 1 both give 1).
+uint32_t getHigherMsb(uint32_t n)
+{
+	// Start in the middle of the word and halve the stride every round.
+	uint32_t pos = sizeof(n) * 4;
+	for (uint32_t stride = pos / 2; stride > 0; stride /= 2)
+	{
+		pos = (n >> pos) ? pos + stride : pos - stride;
+	}
+	// One final correction if a set bit still sits at or above pos.
+	return (n >> pos) ? pos + 1 : pos;
+}
+// Kernel wrapper around the auxiliary coarse frustum containment test.
+// Each thread with rank < P stores the test result for its Gaussian.
+__global__ void checkFrustum(int P,
+	const float* orig_points,
+	const float* viewmatrix,
+	const float* projmatrix,
+	bool* present)
+{
+	auto gaussian_id = cg::this_grid().thread_rank();
+	if (gaussian_id < P)
+	{
+		// in_frustum also fills a float3 out-parameter; its value is unused here.
+		float3 view_pos;
+		present[gaussian_id] = in_frustum(gaussian_id, orig_points, viewmatrix, projmatrix, false, view_pos);
+	}
+}
+// Generates one key/value pair for every Gaussian / tile overlap and writes it at the offset given by `offsets`.
+// Run once per Gaussian (1:N mapping): threads with rank >= P exit immediately.
+__global__ void duplicateWithKeys(
+	int P,
+	const float2* points_xy,
+	const float* depths,
+	const uint32_t* offsets, // forward() passes the inclusive prefix sum of tiles_touched
+	uint64_t* gaussian_keys_unsorted,
+	uint32_t* gaussian_values_unsorted,
+	int* radii,
+	dim3 grid)
+{
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= P)
+		return;
+	// Generate no key/value pair for invisible Gaussians (radius of 0 or less)
+	if (radii[idx] > 0)
+	{
+		// Find this Gaussian's offset in buffer for writing keys/values: the previous entry of the inclusive scan.
+		uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
+		uint2 rect_min, rect_max;
+		getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
+		// For each tile that the bounding rect overlaps, emit a
+		// key/value pair. The key is |  tile ID  |      depth      |,
+		// and the value is the ID of the Gaussian. Sorting the values
+		// with this key yields Gaussian IDs in a list, such that they
+		// are first sorted by tile and then by depth.
+		for (int y = rect_min.y; y < rect_max.y; y++)
+		{
+			for (int x = rect_min.x; x < rect_max.x; x++)
+			{
+				uint64_t key = y * grid.x + x; // row-major tile index
+				key <<= 32; // tile index occupies the upper 32 bits
+				key |= *((uint32_t*)&depths[idx]); // lower 32 bits: raw bit pattern of the float depth
+				gaussian_keys_unsorted[off] = key;
+				gaussian_values_unsorted[off] = idx;
+				off++;
+			}
+		}
+	}
+}
+// Check keys to see if it is at the start/end of one tile's range in
+// the full sorted list. If yes, write start/end of this tile into ranges[tile] (.x = first index, .y = one past the last).
+// Run once per instanced (duplicated) Gaussian ID; L is the length of the sorted list.
+__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges)
+{
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= L)
+		return;
+	// Read tile ID from key (upper 32 bits). Update start/end of tile range if at limit.
+	uint64_t key = point_list_keys[idx];
+	uint32_t currtile = key >> 32;
+	if (idx == 0)
+		ranges[currtile].x = 0; // the very first entry opens its tile's range
+	else
+	{
+		uint32_t prevtile = point_list_keys[idx - 1] >> 32;
+		if (currtile != prevtile)
+		{
+			ranges[prevtile].y = idx; // tile boundary: close the previous range ...
+			ranges[currtile].x = idx; // ... and open the current one
+		}
+	}
+	if (idx == L - 1)
+		ranges[currtile].y = L; // the very last entry closes its tile's range
+}
+// Flag every Gaussian as visible/invisible by running the frustum test kernel.
+void CudaRasterizer::Rasterizer::markVisible(
+	int P,
+	float* means3D,
+	float* viewmatrix,
+	float* projmatrix,
+	bool* present)
+{
+	// 256 threads per block, with the block count rounded up to cover all P entries.
+	const int threads_per_block = 256;
+	const int num_blocks = (P + threads_per_block - 1) / threads_per_block;
+	checkFrustum << <num_blocks, threads_per_block >> > (
+		P, means3D, viewmatrix, projmatrix, present);
+}
+CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P)
+{ // Carves the per-Gaussian arrays out of `chunk` one after another, each aligned to 128 bytes; `chunk` ends up past the last one.
+	GeometryState geom;
+	obtain(chunk, geom.depths, P, 128);
+	obtain(chunk, geom.clamped, P * 3, 128);
+	obtain(chunk, geom.internal_radii, P, 128);
+	obtain(chunk, geom.means2D, P, 128);
+	obtain(chunk, geom.cov3D, P * 6, 128);
+	obtain(chunk, geom.conic_opacity, P, 128);
+	obtain(chunk, geom.rgb, P * 3, 128);
+	obtain(chunk, geom.tiles_touched, P, 128);
+	cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); // null temp storage: CUB only reports the required size in scan_size
+	obtain(chunk, geom.scanning_space, geom.scan_size, 128);
+	obtain(chunk, geom.point_offsets, P, 128);
+	return geom;
+}
+CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N)
+{ // Carves two N-element arrays out of `chunk`, each aligned to 128 bytes; callers in this file pass N = width * height.
+	ImageState img;
+	obtain(chunk, img.n_contrib, N, 128);
+	obtain(chunk, img.ranges, N, 128);
+	return img;
+}
+CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P)
+{ // Carves the key/value lists (sorted and unsorted) plus the sort scratch space out of `chunk`; here P is the list length.
+	BinningState binning;
+	obtain(chunk, binning.point_list, P, 128);
+	obtain(chunk, binning.point_list_unsorted, P, 128);
+	obtain(chunk, binning.point_list_keys, P, 128);
+	obtain(chunk, binning.point_list_keys_unsorted, P, 128);
+	cub::DeviceRadixSort::SortPairs( // null temp storage: CUB only reports the required size in sorting_size
+		nullptr, binning.sorting_size,
+		binning.point_list_keys_unsorted, binning.point_list_keys,
+		binning.point_list_unsorted, binning.point_list, P);
+	obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128);
+	return binning;
+}
+// Forward rendering procedure for differentiable rasterization of Gaussians. Stages: per-Gaussian preprocessing,
+// prefix sum of touched-tile counts, [ tile | depth ] key generation and sort, per-tile range lookup, blending. Returns num_rendered.
+int CudaRasterizer::Rasterizer::forward(
+	std::function<char* (size_t)> geometryBuffer, // each functor is called once with a byte count and returns the chunk to use
+	std::function<char* (size_t)> binningBuffer,
+	std::function<char* (size_t)> imageBuffer,
+	const int P, int D, int M,
+	const float* background,
+	const int width, int height,
+	const float* means3D,
+	const float* shs,
+	const float* colors_precomp,
+	const float* opacities,
+	const float* scales,
+	const float scale_modifier,
+	const float* rotations,
+	const float* cov3D_precomp,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const float* cam_pos,
+	const float tan_fovx, float tan_fovy,
+	const bool prefiltered,
+	float* out_color,
+	float* out_depth,
+	float* out_alpha,
+	int* radii, // optional: when nullptr, geomState.internal_radii is used instead
+	bool debug)
+{
+	const float focal_y = height / (2.0f * tan_fovy);
+	const float focal_x = width / (2.0f * tan_fovx);
+	size_t chunk_size = required<GeometryState>(P); // bytes needed for the per-Gaussian state
+	char* chunkptr = geometryBuffer(chunk_size);
+	GeometryState geomState = GeometryState::fromChunk(chunkptr, P);
+	if (radii == nullptr)
+	{
+		radii = geomState.internal_radii;
+	}
+	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); // tiles per axis, rounded up
+	dim3 block(BLOCK_X, BLOCK_Y, 1);
+	// Dynamically resize image-based auxiliary buffers during training
+	size_t img_chunk_size = required<ImageState>(width * height);
+	char* img_chunkptr = imageBuffer(img_chunk_size);
+	ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height);
+	if (NUM_CHANNELS != 3 && colors_precomp == nullptr)
+	{
+		throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
+	}
+	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
+	CHECK_CUDA(FORWARD::preprocess(
+		P, D, M,
+		means3D,
+		(glm::vec3*)scales,
+		scale_modifier,
+		(glm::vec4*)rotations,
+		opacities,
+		shs,
+		geomState.clamped,
+		cov3D_precomp,
+		colors_precomp,
+		viewmatrix, projmatrix,
+		(glm::vec3*)cam_pos,
+		width, height,
+		focal_x, focal_y,
+		tan_fovx, tan_fovy,
+		radii,
+		geomState.means2D,
+		geomState.depths,
+		geomState.cov3D,
+		geomState.rgb,
+		geomState.conic_opacity,
+		tile_grid,
+		geomState.tiles_touched,
+		prefiltered
+	), debug)
+	// Compute prefix sum over full list of touched tile counts by Gaussians
+	// E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
+	CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug)
+	// Retrieve total number of Gaussian instances to launch and resize aux buffers
+	int num_rendered;
+	CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); // last entry of the inclusive scan is the total
+	size_t binning_chunk_size = required<BinningState>(num_rendered);
+	char* binning_chunkptr = binningBuffer(binning_chunk_size);
+	BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered);
+	// For each instance to be rendered, produce adequate [ tile | depth ] key
+	// and corresponding duplicated Gaussian indices to be sorted
+	duplicateWithKeys << <(P + 255) / 256, 256 >> > (
+		P,
+		geomState.means2D,
+		geomState.depths,
+		geomState.point_offsets,
+		binningState.point_list_keys_unsorted,
+		binningState.point_list_unsorted,
+		radii,
+		tile_grid)
+	CHECK_CUDA(, debug)
+	int bit = getHigherMsb(tile_grid.x * tile_grid.y); // bound on the bits the tile part of a key can occupy
+	// Sort complete list of (duplicated) Gaussian indices by keys
+	CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
+		binningState.list_sorting_space,
+		binningState.sorting_size,
+		binningState.point_list_keys_unsorted, binningState.point_list_keys,
+		binningState.point_list_unsorted, binningState.point_list,
+		num_rendered, 0, 32 + bit), debug) // only key bits [0, 32 + bit) take part in the sort
+	CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); // tiles that receive no Gaussian keep the range (0, 0)
+	// Identify start and end of per-tile workloads in sorted list
+	if (num_rendered > 0)
+		identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
+			num_rendered,
+			binningState.point_list_keys,
+			imgState.ranges);
+	CHECK_CUDA(, debug);
+	// Let each tile blend its range of Gaussians independently in parallel
+	const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; // precomputed colors take precedence over the preprocessed rgb
+	CHECK_CUDA(FORWARD::render(
+		tile_grid, block,
+		imgState.ranges,
+		binningState.point_list,
+		width, height,
+		geomState.means2D,
+		feature_ptr,
+		geomState.depths,
+		geomState.conic_opacity,
+		out_alpha,
+		imgState.n_contrib,
+		background,
+		out_color,
+		out_depth), debug);
+	return num_rendered;
+}
+// Produce necessary gradients for optimization, corresponding
+// to forward render pass. The three char* buffers must be the chunks filled by that pass; they are re-interpreted with fromChunk.
+void CudaRasterizer::Rasterizer::backward(
+	const int P, int D, int M, int R, // R is the list length used to lay out the binning state
+	const float* background,
+	const int width, int height,
+	const float* means3D,
+	const float* shs,
+	const float* colors_precomp,
+	const float* alphas,
+	const float* scales,
+	const float scale_modifier,
+	const float* rotations,
+	const float* cov3D_precomp,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const float* campos,
+	const float tan_fovx, float tan_fovy,
+	const int* radii, // optional: when nullptr, geomState.internal_radii is used instead
+	char* geom_buffer,
+	char* binning_buffer,
+	char* img_buffer,
+	const float* dL_dpix,
+	const float* dL_dpix_depth,
+	const float* dL_dalphas,
+	float* dL_dmean2D,
+	float* dL_dconic,
+	float* dL_dopacity,
+	float* dL_dcolor,
+	float* dL_ddepth,
+	float* dL_dmean3D,
+	float* dL_dcov3D,
+	float* dL_dsh,
+	float* dL_dscale,
+	float* dL_drot,
+	bool debug)
+{
+	GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); // same layout computation as in forward()
+	BinningState binningState = BinningState::fromChunk(binning_buffer, R);
+	ImageState imgState = ImageState::fromChunk(img_buffer, width * height);
+	if (radii == nullptr)
+	{
+		radii = geomState.internal_radii;
+	}
+	const float focal_y = height / (2.0f * tan_fovy);
+	const float focal_x = width / (2.0f * tan_fovx);
+	const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+	const dim3 block(BLOCK_X, BLOCK_Y, 1);
+	// Compute loss gradients w.r.t. 2D mean position, conic matrix,
+	// opacity and RGB of Gaussians from per-pixel loss gradients.
+	// If we were given precomputed colors and not SHs, use them.
+	const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb;
+	const float* depth_ptr = geomState.depths;
+	CHECK_CUDA(BACKWARD::render(
+		tile_grid,
+		block,
+		imgState.ranges,
+		binningState.point_list,
+		width, height,
+		background,
+		geomState.means2D,
+		geomState.conic_opacity,
+		color_ptr,
+		depth_ptr,
+		alphas,
+		imgState.n_contrib,
+		dL_dpix,
+		dL_dpix_depth,
+		dL_dalphas,
+		(float3*)dL_dmean2D, // reinterpreted as float3; the torch wrapper allocates it as a (P, 3) tensor
+		(float4*)dL_dconic, // reinterpreted as float4; the torch wrapper allocates it as a (P, 2, 2) tensor
+		dL_dopacity,
+		dL_dcolor,
+		dL_ddepth), debug)
+	// Take care of the rest of preprocessing. Was the precomputed covariance
+	// given to us or a scales/rot pair? If precomputed, pass that. If not,
+	// use the one we computed ourselves.
+	const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D;
+	CHECK_CUDA(BACKWARD::preprocess(P, D, M,
+		(float3*)means3D,
+		radii,
+		shs,
+		geomState.clamped,
+		(glm::vec3*)scales,
+		(glm::vec4*)rotations,
+		scale_modifier,
+		cov3D_ptr,
+		viewmatrix,
+		projmatrix,
+		focal_x, focal_y,
+		tan_fovx, tan_fovy,
+		(glm::vec3*)campos,
+		(float3*)dL_dmean2D,
+		dL_dconic,
+		(glm::vec3*)dL_dmean3D,
+		dL_dcolor,
+		dL_ddepth,
+		dL_dcov3D,
+		dL_dsh,
+		(glm::vec3*)dL_dscale,
+		(glm::vec4*)dL_drot), debug)
+}

diff-gaussian-rasterization/cuda_rasterizer/rasterizer_impl.h ADDED Viewed

	@@ -0,0 +1,73 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#pragma once
+#include <iostream>
+#include <vector>
+#include "rasterizer.h"
+#include <cuda_runtime_api.h>
+namespace CudaRasterizer
+{
+	template <typename T>
+	static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) // carves `count` objects of T out of `chunk`
+	{
+		std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1); // round the cursor up; the mask assumes alignment is a power of two
+		ptr = reinterpret_cast<T*>(offset);
+		chunk = reinterpret_cast<char*>(ptr + count); // advance the cursor past the carved array
+	}
+	struct GeometryState // per-Gaussian arrays; the pointers are assigned by fromChunk
+	{
+		size_t scan_size; // temp-storage byte count reported by cub::DeviceScan::InclusiveSum
+		float* depths;
+		char* scanning_space;
+		bool* clamped;
+		int* internal_radii; // used as the radii array when the caller passes nullptr
+		float2* means2D;
+		float* cov3D;
+		float4* conic_opacity;
+		float* rgb;
+		uint32_t* point_offsets; // inclusive prefix sum of tiles_touched
+		uint32_t* tiles_touched;
+		static GeometryState fromChunk(char*& chunk, size_t P);
+	};
+	struct ImageState // image-sized arrays; fromChunk gives both N elements
+	{
+		uint2* ranges; // per tile: x = first index, y = one past the last index in the sorted list
+		uint32_t* n_contrib;
+		static ImageState fromChunk(char*& chunk, size_t N);
+	};
+	struct BinningState // key/value lists for the tile/depth sort
+	{
+		size_t sorting_size; // temp-storage byte count reported by cub::DeviceRadixSort::SortPairs
+		uint64_t* point_list_keys_unsorted;
+		uint64_t* point_list_keys;
+		uint32_t* point_list_unsorted;
+		uint32_t* point_list;
+		char* list_sorting_space;
+		static BinningState fromChunk(char*& chunk, size_t P);
+	};
+	template<typename T>
+	size_t required(size_t P) // byte count a chunk must have so that T::fromChunk(chunk, P) fits
+	{
+		char* size = nullptr; // dry run from address 0: the final cursor value equals the bytes consumed
+		T::fromChunk(size, P);
+		return ((size_t)size) + 128; // extra 128 bytes, presumably slack for aligning a real base pointer -- TODO confirm
+	}
+};

diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py ADDED Viewed

	@@ -0,0 +1,224 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+from typing import NamedTuple
+import torch.nn as nn
+import torch
+from . import _C
+def cpu_deep_copy_tuple(input_tuple):
+    copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple]
+    return tuple(copied_tensors)
+def rasterize_gaussians(
+    means3D,
+    means2D,
+    sh,
+    colors_precomp,
+    opacities,
+    scales,
+    rotations,
+    cov3Ds_precomp,
+    raster_settings,
+):
+    return _RasterizeGaussians.apply(
+        means3D,
+        means2D,
+        sh,
+        colors_precomp,
+        opacities,
+        scales,
+        rotations,
+        cov3Ds_precomp,
+        raster_settings,
+    )
+class _RasterizeGaussians(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        means3D,
+        means2D,
+        sh,
+        colors_precomp,
+        opacities,
+        scales,
+        rotations,
+        cov3Ds_precomp,
+        raster_settings,
+    ):
+        # Restructure arguments the way that the C++ lib expects them
+        args = (
+            raster_settings.bg,
+            means3D,
+            colors_precomp,
+            opacities,
+            scales,
+            rotations,
+            raster_settings.scale_modifier,
+            cov3Ds_precomp,
+            raster_settings.viewmatrix,
+            raster_settings.projmatrix,
+            raster_settings.tanfovx,
+            raster_settings.tanfovy,
+            raster_settings.image_height,
+            raster_settings.image_width,
+            sh,
+            raster_settings.sh_degree,
+            raster_settings.campos,
+            raster_settings.prefiltered,
+            raster_settings.debug
+        )
+        # Invoke C++/CUDA rasterizer
+        if raster_settings.debug:
+            cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted
+            try:
+                num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
+            except Exception as ex:
+                torch.save(cpu_args, "snapshot_fw.dump")
+                print("\nAn error occured in forward. Please forward snapshot_fw.dump for debugging.")
+                raise ex
+        else:
+            num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
+        # Keep relevant tensors for backward
+        ctx.raster_settings = raster_settings
+        ctx.num_rendered = num_rendered
+        ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha)
+        return color, radii, depth, alpha
+    @staticmethod
+    def backward(ctx, grad_color, grad_radii, grad_depth, grad_alpha):
+        # Restore necessary values from context
+        num_rendered = ctx.num_rendered
+        raster_settings = ctx.raster_settings
+        colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha = ctx.saved_tensors
+        # Restructure args as C++ method expects them
+        args = (raster_settings.bg,
+                means3D,
+                radii,
+                colors_precomp,
+                scales,
+                rotations,
+                raster_settings.scale_modifier,
+                cov3Ds_precomp,
+                raster_settings.viewmatrix,
+                raster_settings.projmatrix,
+                raster_settings.tanfovx,
+                raster_settings.tanfovy,
+                grad_color,
+                grad_depth,
+                grad_alpha,
+                sh,
+                raster_settings.sh_degree,
+                raster_settings.campos,
+                geomBuffer,
+                num_rendered,
+                binningBuffer,
+                imgBuffer,
+                alpha,
+                raster_settings.debug)
+        # Compute gradients for relevant tensors by invoking backward method
+        if raster_settings.debug:
+            cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted
+            try:
+                grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args)
+            except Exception as ex:
+                torch.save(cpu_args, "snapshot_bw.dump")
+                print("\nAn error occured in backward. Writing snapshot_bw.dump for debugging.\n")
+                raise ex
+        else:
+             grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args)
+        grads = (
+            grad_means3D,
+            grad_means2D,
+            grad_sh,
+            grad_colors_precomp,
+            grad_opacities,
+            grad_scales,
+            grad_rotations,
+            grad_cov3Ds_precomp,
+            None,
+        )
+        return grads
+class GaussianRasterizationSettings(NamedTuple):
+    image_height: int
+    image_width: int
+    tanfovx : float
+    tanfovy : float
+    bg : torch.Tensor
+    scale_modifier : float
+    viewmatrix : torch.Tensor
+    projmatrix : torch.Tensor
+    sh_degree : int
+    campos : torch.Tensor
+    prefiltered : bool
+    debug : bool
+class GaussianRasterizer(nn.Module):
+    def __init__(self, raster_settings):
+        super().__init__()
+        self.raster_settings = raster_settings
+    def markVisible(self, positions):
+        # Mark visible points (based on frustum culling for camera) with a boolean
+        with torch.no_grad():
+            raster_settings = self.raster_settings
+            visible = _C.mark_visible(
+                positions,
+                raster_settings.viewmatrix,
+                raster_settings.projmatrix)
+        return visible
+    def forward(self, means3D, means2D, opacities, shs = None, colors_precomp = None, scales = None, rotations = None, cov3D_precomp = None):
+        raster_settings = self.raster_settings
+        if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None):
+            raise Exception('Please provide excatly one of either SHs or precomputed colors!')
+        if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None):
+            raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!')
+        if shs is None:
+            shs = torch.Tensor([])
+        if colors_precomp is None:
+            colors_precomp = torch.Tensor([])
+        if scales is None:
+            scales = torch.Tensor([])
+        if rotations is None:
+            rotations = torch.Tensor([])
+        if cov3D_precomp is None:
+            cov3D_precomp = torch.Tensor([])
+        # Invoke C++/CUDA rasterization routine
+        return rasterize_gaussians(
+            means3D,
+            means2D,
+            shs,
+            colors_precomp,
+            opacities,
+            scales,
+            rotations,
+            cov3D_precomp,
+            raster_settings,
+        )

diff-gaussian-rasterization/ext.cpp ADDED Viewed

	@@ -0,0 +1,19 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include <torch/extension.h>
+#include "rasterize_points.h"
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Python bindings; the package's __init__.py calls these through `_C`
+  m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); // forward pass
+  m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); // backward pass
+  m.def("mark_visible", &markVisible); // frustum visibility test
+}

diff-gaussian-rasterization/rasterize_points.cu ADDED Viewed

	@@ -0,0 +1,229 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include <math.h>
+#include <torch/extension.h>
+#include <cstdio>
+#include <sstream>
+#include <iostream>
+#include <tuple>
+#include <stdio.h>
+#include <cuda_runtime_api.h>
+#include <memory>
+#include "cuda_rasterizer/config.h"
+#include "cuda_rasterizer/rasterizer.h"
+#include <fstream>
+#include <string>
+#include <functional>
+// Builds the allocation callback handed to the rasterizer: the callback resizes
+// `t` to N elements and returns its data pointer as char*. The callback holds a
+// reference to `t`, so the tensor must outlive it.
+std::function<char*(size_t N)> resizeFunctional(torch::Tensor& t) {
+    return [&t](size_t N) {
+        t.resize_({(long long)N});
+        return reinterpret_cast<char*>(t.contiguous().data_ptr());
+    };
+}
+// Torch-facing forward pass. Allocates the output images (color, depth,
+// alpha), the per-point radii and three resizable byte buffers, then runs
+// CudaRasterizer::Rasterizer::forward when there is at least one point.
+// Returns (num_rendered, color, depth, alpha, radii, geomBuffer,
+// binningBuffer, imgBuffer); num_rendered stays 0 for an empty input.
+// Raises (AT_ERROR) when means3D is not of shape (num_points, 3).
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+RasterizeGaussiansCUDA(
+	const torch::Tensor& background,
+	const torch::Tensor& means3D,
+	const torch::Tensor& colors,
+	const torch::Tensor& opacity,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const float scale_modifier,
+	const torch::Tensor& cov3D_precomp,
+	const torch::Tensor& viewmatrix,
+	const torch::Tensor& projmatrix,
+	const float tan_fovx,
+	const float tan_fovy,
+	const int image_height,
+	const int image_width,
+	const torch::Tensor& sh,
+	const int degree,
+	const torch::Tensor& campos,
+	const bool prefiltered,
+	const bool debug)
+{
+  if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
+    AT_ERROR("means3D must have dimensions (num_points, 3)");
+  }
+  const int P = means3D.size(0);
+  const int H = image_height;
+  const int W = image_width;
+  auto int_opts = means3D.options().dtype(torch::kInt32);
+  auto float_opts = means3D.options().dtype(torch::kFloat32);
+  torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts);
+  torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts);
+  torch::Tensor out_alpha = torch::full({1, H, W}, 0.0, float_opts);
+  // int_opts was previously computed but unused; radii rebuilt the same options inline.
+  torch::Tensor radii = torch::full({P}, 0, int_opts);
+  // Scratch buffers start empty and are grown on demand by the callbacks below.
+  torch::Device device(torch::kCUDA);
+  torch::TensorOptions options(torch::kByte);
+  torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
+  torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
+  torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
+  std::function<char*(size_t)> geomFunc = resizeFunctional(geomBuffer);
+  std::function<char*(size_t)> binningFunc = resizeFunctional(binningBuffer);
+  std::function<char*(size_t)> imgFunc = resizeFunctional(imgBuffer);
+  int rendered = 0;
+  if(P != 0)
+  {
+    // M is the second dimension of sh, or 0 when no SH tensor was supplied.
+    int M = 0;
+    if(sh.size(0) != 0)
+    {
+      M = sh.size(1);
+    }
+    // data_ptr<T>() replaces the deprecated data<T>() throughout; the
+    // .contiguous() temporaries live until the call returns.
+    rendered = CudaRasterizer::Rasterizer::forward(
+      geomFunc,
+      binningFunc,
+      imgFunc,
+      P, degree, M,
+      background.contiguous().data_ptr<float>(),
+      W, H,
+      means3D.contiguous().data_ptr<float>(),
+      sh.contiguous().data_ptr<float>(),
+      colors.contiguous().data_ptr<float>(),
+      opacity.contiguous().data_ptr<float>(),
+      scales.contiguous().data_ptr<float>(),
+      scale_modifier,
+      rotations.contiguous().data_ptr<float>(),
+      cov3D_precomp.contiguous().data_ptr<float>(),
+      viewmatrix.contiguous().data_ptr<float>(),
+      projmatrix.contiguous().data_ptr<float>(),
+      campos.contiguous().data_ptr<float>(),
+      tan_fovx,
+      tan_fovy,
+      prefiltered,
+      out_color.contiguous().data_ptr<float>(),
+      out_depth.contiguous().data_ptr<float>(),
+      out_alpha.contiguous().data_ptr<float>(),
+      radii.contiguous().data_ptr<int>(),
+      debug);
+  }
+  return std::make_tuple(rendered, out_color, out_depth, out_alpha, radii, geomBuffer, binningBuffer, imgBuffer);
+}
+// Torch-facing backward pass. Allocates zero-filled gradient tensors and runs
+// CudaRasterizer::Rasterizer::backward when there is at least one point.
+// H and W are taken from dL_dout_color (dimensions 1 and 2); R is the
+// instance count returned by the forward pass. Returns (dL_dmeans2D,
+// dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales,
+// dL_drotations).
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+ RasterizeGaussiansBackwardCUDA(
+	const torch::Tensor& background,
+	const torch::Tensor& means3D,
+	const torch::Tensor& radii,
+	const torch::Tensor& colors,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const float scale_modifier,
+	const torch::Tensor& cov3D_precomp,
+	const torch::Tensor& viewmatrix,
+	const torch::Tensor& projmatrix,
+	const float tan_fovx,
+	const float tan_fovy,
+	const torch::Tensor& dL_dout_color,
+	const torch::Tensor& dL_dout_depth,
+	const torch::Tensor& dL_dout_alpha,
+	const torch::Tensor& sh,
+	const int degree,
+	const torch::Tensor& campos,
+	const torch::Tensor& geomBuffer,
+	const int R,
+	const torch::Tensor& binningBuffer,
+	const torch::Tensor& imageBuffer,
+	const torch::Tensor& alphas,
+	const bool debug)
+{
+  const int P = means3D.size(0);
+  const int H = dL_dout_color.size(1);
+  const int W = dL_dout_color.size(2);
+  // M is the second dimension of sh, or 0 when no SH tensor was supplied.
+  int M = 0;
+  if(sh.size(0) != 0)
+  {
+    M = sh.size(1);
+  }
+  torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
+  torch::Tensor dL_ddepths = torch::zeros({P, 1}, means3D.options());
+  torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
+  torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
+  torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
+  torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
+  torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
+  if(P != 0)
+  {
+    // scales and rotations now go through .contiguous() like every other
+    // input (and like the forward wrapper); previously a non-contiguous view
+    // would have been read with the wrong strides. data_ptr<T>() replaces the
+    // deprecated data<T>(). The freshly allocated gradient tensors are
+    // already contiguous, so .contiguous() returns them unchanged.
+    CudaRasterizer::Rasterizer::backward(P, degree, M, R,
+      background.contiguous().data_ptr<float>(),
+      W, H,
+      means3D.contiguous().data_ptr<float>(),
+      sh.contiguous().data_ptr<float>(),
+      colors.contiguous().data_ptr<float>(),
+      alphas.contiguous().data_ptr<float>(),
+      scales.contiguous().data_ptr<float>(),
+      scale_modifier,
+      rotations.contiguous().data_ptr<float>(),
+      cov3D_precomp.contiguous().data_ptr<float>(),
+      viewmatrix.contiguous().data_ptr<float>(),
+      projmatrix.contiguous().data_ptr<float>(),
+      campos.contiguous().data_ptr<float>(),
+      tan_fovx,
+      tan_fovy,
+      radii.contiguous().data_ptr<int>(),
+      reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()),
+      reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
+      reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()),
+      dL_dout_color.contiguous().data_ptr<float>(),
+      dL_dout_depth.contiguous().data_ptr<float>(),
+      dL_dout_alpha.contiguous().data_ptr<float>(),
+      dL_dmeans2D.contiguous().data_ptr<float>(),
+      dL_dconic.contiguous().data_ptr<float>(),
+      dL_dopacity.contiguous().data_ptr<float>(),
+      dL_dcolors.contiguous().data_ptr<float>(),
+      dL_ddepths.contiguous().data_ptr<float>(),
+      dL_dmeans3D.contiguous().data_ptr<float>(),
+      dL_dcov3D.contiguous().data_ptr<float>(),
+      dL_dsh.contiguous().data_ptr<float>(),
+      dL_dscales.contiguous().data_ptr<float>(),
+      dL_drotations.contiguous().data_ptr<float>(),
+      debug);
+  }
+  return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
+}
+// Builds a boolean mask with one entry per row of means3D and lets
+// CudaRasterizer::Rasterizer::markVisible fill it in.
+//
+// means3D    - tensor whose first dimension gives the point count P; read as float.
+// viewmatrix - read as float and forwarded unchanged to the rasterizer.
+// projmatrix - read as float and forwarded unchanged to the rasterizer.
+//
+// Returns a bool tensor of length P created with means3D's tensor options.
+// When P == 0 the rasterizer is not called and the (empty) mask is returned as-is.
+// NOTE(review): presumably an entry is set to true when the point is visible
+// for the given matrices — confirm in cuda_rasterizer.
+torch::Tensor markVisible(
+		torch::Tensor& means3D,
+		torch::Tensor& viewmatrix,
+		torch::Tensor& projmatrix)
+{
+  const int P = means3D.size(0);
+  // Start from an all-false mask; only the dtype differs from means3D's options.
+  torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool));
+  if(P != 0)
+  {
+	// data_ptr<T>() is the supported accessor; Tensor::data<T>() is deprecated,
+	// and the rest of this file already uses data_ptr<T>() for scales/rotations.
+	CudaRasterizer::Rasterizer::markVisible(P,
+		means3D.contiguous().data_ptr<float>(),
+		viewmatrix.contiguous().data_ptr<float>(),
+		projmatrix.contiguous().data_ptr<float>(),
+		present.contiguous().data_ptr<bool>());
+  }
+  return present;
+}

diff-gaussian-rasterization/rasterize_points.h ADDED Viewed

	@@ -0,0 +1,70 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#pragma once
+#include <torch/extension.h>
+#include <cstdio>
+#include <tuple>
+#include <string>
+// Forward rasterization entry point (declaration only).
+// Takes the scene tensors, camera matrices, field-of-view tangents, output
+// image size, spherical-harmonics data and two flags (prefiltered, debug).
+// Returns a tuple holding one int followed by seven tensors.
+// NOTE(review): the meaning and order of the returned values is not visible
+// from this header — confirm against the definition before relying on it.
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+RasterizeGaussiansCUDA(
+	const torch::Tensor& background,
+	const torch::Tensor& means3D,
+    const torch::Tensor& colors,
+    const torch::Tensor& opacity,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const float scale_modifier,
+	const torch::Tensor& cov3D_precomp,
+	const torch::Tensor& viewmatrix,
+	const torch::Tensor& projmatrix,
+	const float tan_fovx,
+	const float tan_fovy,
+    const int image_height,
+    const int image_width,
+	const torch::Tensor& sh,
+	const int degree,
+	const torch::Tensor& campos,
+	const bool prefiltered,
+	const bool debug);
+// Backward rasterization entry point (declaration only).
+// Receives the forward inputs, the incoming gradients for the colour, depth
+// and alpha outputs (dL_dout_color, dL_dout_depth, dL_dout_alpha) and the
+// geometry/binning/image buffers, and returns a tuple of eight tensors.
+// NOTE(review): the return statement visible in the implementation yields, in
+// order, dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh,
+// dL_dscales, dL_drotations — confirm it belongs to this function.
+// NOTE(review): the parameter is named `alpha` here while the implementation
+// appears to read a tensor named `alphas` — confirm the names are meant to differ.
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+ RasterizeGaussiansBackwardCUDA(
+ 	const torch::Tensor& background,
+	const torch::Tensor& means3D,
+	const torch::Tensor& radii,
+    const torch::Tensor& colors,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const float scale_modifier,
+	const torch::Tensor& cov3D_precomp,
+	const torch::Tensor& viewmatrix,
+    const torch::Tensor& projmatrix,
+	const float tan_fovx,
+	const float tan_fovy,
+    const torch::Tensor& dL_dout_color,
+	const torch::Tensor& dL_dout_depth,
+	const torch::Tensor& dL_dout_alpha,
+	const torch::Tensor& sh,
+	const int degree,
+	const torch::Tensor& campos,
+	const torch::Tensor& geomBuffer,
+	const int R,
+	const torch::Tensor& binningBuffer,
+	const torch::Tensor& imageBuffer,
+	const torch::Tensor& alpha,
+	const bool debug);
+// Returns a bool tensor with one entry per row of means3D, filled in by the
+// CUDA rasterizer from the given view and projection matrices. The tensors are
+// taken by non-const reference.
+// NOTE(review): the definition does not appear to modify its arguments, so
+// const references may suffice — confirm before changing the signature.
+torch::Tensor markVisible(
+		torch::Tensor& means3D,
+		torch::Tensor& viewmatrix,
+		torch::Tensor& projmatrix);

diff-gaussian-rasterization/setup.py ADDED Viewed

	@@ -0,0 +1,34 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+import os
+os.path.dirname(os.path.abspath(__file__))
+setup(
+    name="diff_gaussian_rasterization",
+    packages=['diff_gaussian_rasterization'],
+    ext_modules=[
+        CUDAExtension(
+            name="diff_gaussian_rasterization._C",
+            sources=[
+            "cuda_rasterizer/rasterizer_impl.cu",
+            "cuda_rasterizer/forward.cu",
+            "cuda_rasterizer/backward.cu",
+            "rasterize_points.cu",
+            "ext.cpp"],
+            extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]})
+        ],
+    cmdclass={
+        'build_ext': BuildExtension
+    }
+)

diff-gaussian-rasterization/third_party/glm/.appveyor.yml ADDED Viewed

	@@ -0,0 +1,92 @@

+# AppVeyor CI configuration for GLM.
+# The job matrix is the product of platform x configuration x image x
+# environment.matrix, minus the combinations listed under matrix.exclude.
+shallow_clone: true
+platform:
+  - x86
+  - x64
+configuration:
+  - Debug
+  - Release
+image:
+  - Visual Studio 2013
+  - Visual Studio 2015
+  - Visual Studio 2017
+  - Visual Studio 2019
+# Each entry is one set of extra CMake flags; before_build hands GLM_ARGUMENTS
+# to the cmake configure command.
+environment:
+  matrix:
+    - GLM_ARGUMENTS: -DGLM_TEST_FORCE_PURE=ON
+    - GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_SSE2=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON
+    - GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON
+    - GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_14=ON
+    - GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_17=ON
+# Combinations removed from the matrix. An entry matches every job that has all
+# of the listed keys set to the listed values.
+matrix:
+    exclude:
+    # Visual Studio 2013: no AVX flag sets and no Debug builds.
+    - image: Visual Studio 2013
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON
+    - image: Visual Studio 2013
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_14=ON
+    - image: Visual Studio 2013
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_17=ON
+    - image: Visual Studio 2013
+      configuration: Debug
+    # Visual Studio 2015: x64 Release only, without the SSE2 and the C++14/17 AVX flag sets.
+    - image: Visual Studio 2015
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_SSE2=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON
+    - image: Visual Studio 2015
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_14=ON
+    - image: Visual Studio 2015
+      GLM_ARGUMENTS: -DGLM_TEST_ENABLE_SIMD_AVX=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_CXX_17=ON
+    - image: Visual Studio 2015
+      platform: x86
+    - image: Visual Studio 2015
+      configuration: Debug
+    # Visual Studio 2017: x64 Release only.
+    - image: Visual Studio 2017
+      platform: x86
+    - image: Visual Studio 2017
+      configuration: Debug
+    # Visual Studio 2019: x86 only.
+    # NOTE(review): presumably excluded because before_build appends " Win64" to
+    # the generator name for x64, which the "Visual Studio 16 2019" generator may
+    # not accept — confirm.
+    - image: Visual Studio 2019
+      platform: x64
+# Only the master branch is built.
+branches:
+  only:
+    - master
+# Creates the build directory, derives the CMake generator name from the
+# AppVeyor job name and platform, then runs the CMake configure step with the
+# per-job GLM_ARGUMENTS flags.
+before_build:
+  - ps: |
+      mkdir build
+      cd build
+      if ("$env:APPVEYOR_JOB_NAME" -match "Image: Visual Studio 2013") {
+          $env:generator="Visual Studio 12 2013"
+      }
+      if ("$env:APPVEYOR_JOB_NAME" -match "Image: Visual Studio 2015") {
+          $env:generator="Visual Studio 14 2015"
+      }
+      if ("$env:APPVEYOR_JOB_NAME" -match "Image: Visual Studio 2017") {
+          $env:generator="Visual Studio 15 2017"
+      }
+      if ("$env:APPVEYOR_JOB_NAME" -match "Image: Visual Studio 2019") {
+          $env:generator="Visual Studio 16 2019"
+      }
+      if ($env:PLATFORM -eq "x64") {
+          $env:generator="$env:generator Win64"
+      }
+      echo generator="$env:generator"
+      # GLM_ARGUMENTS may hold several space-separated -D flags. Passed as one
+      # quoted string, everything after the first '=' becomes the value of the
+      # first define; split it so each flag reaches cmake as its own argument.
+      $glm_arguments = "$env:GLM_ARGUMENTS" -split '\s+' | Where-Object { $_ }
+      cmake .. -G "$env:generator" -DCMAKE_INSTALL_PREFIX="$env:APPVEYOR_BUILD_FOLDER/install" -DGLM_QUIET=ON -DGLM_TEST_ENABLE=ON $glm_arguments
+# Build the configured tree, then build its install target (the install prefix
+# was set at configure time in before_build).
+build_script:
+  - cmake --build . --parallel --config %CONFIGURATION% -- /m /v:minimal
+  - cmake --build . --target install --parallel --config %CONFIGURATION% -- /m /v:minimal
+# Run the test suite, then configure and build test\cmake in a fresh directory
+# with CMAKE_PREFIX_PATH pointing at the install prefix used above.
+test_script:
+  - ctest --parallel 4 --verbose -C %CONFIGURATION%
+  - cd ..
+  - ps: |
+      mkdir build_test_cmake
+      cd build_test_cmake
+      cmake ..\test\cmake\ -G "$env:generator" -DCMAKE_PREFIX_PATH="$env:APPVEYOR_BUILD_FOLDER/install"
+  - cmake --build . --parallel --config %CONFIGURATION% -- /m /v:minimal
+# No deployment step.
+deploy: off

diff-gaussian-rasterization/third_party/glm/.gitignore ADDED Viewed

	@@ -0,0 +1,61 @@

+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+# Precompiled Headers
+*.gch
+*.pch
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+# Fortran module files
+*.mod
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+# Executables
+*.exe
+*.out
+*.app
+# CMake
+CMakeCache.txt
+CMakeFiles
+cmake_install.cmake
+install_manifest.txt
+*.cmake
+!glmConfig.cmake
+!glmConfig-version.cmake
+# ^ May need to add future .cmake files as exceptions
+# Test logs
+Testing/*
+# Test input
+test/gtc/*.dds
+# Project Files
+Makefile
+*.cbp
+*.user
+# Misc.
+*.log
+# local build(s)
+build*
+/.vs
+/.vscode
+/CMakeSettings.json
+.DS_Store
+*.swp

diff-gaussian-rasterization/third_party/glm/.travis.yml ADDED Viewed

	@@ -0,0 +1,388 @@

+language: cpp
+branches:
+  only:
+    - master
+    - stable
+jobs:
+  include:
+    - name: "Xcode 7.3 C++98 pure release"
+      os: osx
+      osx_image: xcode7.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_FORCE_PURE=ON"
+    - name: "Xcode 7.3 C++98 sse2 release"
+      os: osx
+      osx_image: xcode7.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+    - name: "Xcode 7.3 C++98 ms release"
+      os: osx
+      osx_image: xcode7.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON"
+    - name: "XCode 7.3 C++11 pure release"
+      os: osx
+      osx_image: xcode7.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_FORCE_PURE=ON"
+    - name: "XCode 7.3 C++11 sse2 release"
+      os: osx
+      osx_image: xcode7.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE3=ON"
+    - name: "XCode 10.3 C++11 sse2 release"
+      os: osx
+      osx_image: xcode10.3
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE3=ON"
+    - name: "XCode 12.2 C++11 sse2 release"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE3=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++11 sse2 debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE3=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++11 avx debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_AVX=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++14 avx debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_AVX=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++14 pure debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++17 pure debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++17 sse2 debug"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++17 sse2 release"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "XCode 12.2 C++17 avx release"
+      os: osx
+      osx_image: xcode12.2
+      env:
+        - MATRIX_EVAL=""
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_AVX=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 4.9 C++98 pure release"
+      os: linux
+      dist: Xenial
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+      env:
+        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 4.9 C++98 pure debug"
+      os: linux
+      dist: Xenial
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+      env:
+        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 4.9 C++98 ms debug"
+      os: linux
+      dist: Xenial
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+      env:
+        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_98=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 4.9 C++11 ms debug"
+      os: linux
+      dist: Xenial
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+      env:
+        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 4.9 C++11 pure debug"
+      os: linux
+      dist: Xenial
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+      env:
+        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_11=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 6 C++14 pure debug"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-6
+      env:
+        - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 6 C++14 ms debug"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-6
+      env:
+        - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 7 C++17 ms debug"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-7
+      env:
+        - MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 7 C++17 pure debug"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-7
+      env:
+        - MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 10 C++17 pure debug"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-10
+      env:
+        - MATRIX_EVAL="CC=gcc-10 && CXX=g++-10"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "GCC 10 C++17 pure release"
+      os: linux
+      dist: bionic
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-10
+      env:
+        - MATRIX_EVAL="CC=gcc-10 && CXX=g++-10"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++14 pure release"
+      os: linux
+      dist: Xenial
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++14 pure debug"
+      os: linux
+      dist: Xenial
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++14 sse2 debug"
+      os: linux
+      dist: Xenial
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++14 sse2 debug"
+      os: linux
+      dist: focal
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_14=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++17 sse2 debug"
+      os: linux
+      dist: focal
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_SSE2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++17 avx2 debug"
+      os: linux
+      dist: focal
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_ENABLE_LANG_EXTENSIONS=ON -DGLM_TEST_ENABLE_SIMD_AVX2=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++17 pure debug"
+      os: linux
+      dist: focal
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Debug -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+    - name: "Clang C++17 pure release"
+      os: linux
+      dist: focal
+      env:
+        - MATRIX_EVAL="CC=clang && CXX=clang++"
+        - CMAKE_BUILD_ENV="-DCMAKE_BUILD_TYPE=Release -DGLM_TEST_ENABLE=ON -DGLM_TEST_ENABLE_CXX_17=ON -DGLM_TEST_FORCE_PURE=ON"
+        - CTEST_ENV="--parallel 4 --output-on-failure"
+        - CMAKE_ENV="--parallel"
+# Print the CMake version, then evaluate the per-job MATRIX_EVAL string (the
+# jobs above use it to assign CC and CXX, or leave it empty).
+before_script:
+    - cmake --version
+    - eval "${MATRIX_EVAL}"
+# Configure, build, test and install GLM, then configure and build test/cmake
+# with CMAKE_PREFIX_PATH pointing at the install prefix.
+# The C++ compiler comes from ${CXX}: MATRIX_EVAL assigns CC/CXX and nothing in
+# this file defines COMPILER, so the previous $COMPILER expanded to nothing.
+# NOTE(review): jobs with an empty MATRIX_EVAL rely on the CI environment
+# providing CXX, just as the first script line already relies on CC — confirm.
+script:
+  - ${CC} --version
+  - mkdir ./build
+  - cd ./build
+  - cmake -DCMAKE_INSTALL_PREFIX=$TRAVIS_BUILD_DIR/install -DCMAKE_CXX_COMPILER=${CXX} ${CMAKE_BUILD_ENV} ..
+  - cmake --build . ${CMAKE_ENV}
+  - ctest ${CTEST_ENV}
+  - cmake --build . --target install ${CMAKE_ENV}
+  - cd $TRAVIS_BUILD_DIR
+  - mkdir ./build_test_cmake
+  - cd ./build_test_cmake
+  - cmake -DCMAKE_CXX_COMPILER=${CXX} $TRAVIS_BUILD_DIR/test/cmake/ -DCMAKE_PREFIX_PATH=$TRAVIS_BUILD_DIR/install
+  - cmake --build .

diff-gaussian-rasterization/third_party/glm/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,45 @@

+# Top-level build script for GLM: reads the version from setup.hpp, adds the
+# glm/ subdirectory and its glm::glm alias, and, when GLM is the top-level
+# project, adds packaging, install rules and tests. An `uninstall` target is
+# added unless one already exists.
+#
+# 3.2 stays the minimum; the policy range lets newer CMake releases, which no
+# longer accept pre-3.5 policy versions, still configure this project. The
+# range also sets the policy version, so a separate cmake_policy(VERSION 3.2)
+# call is not needed (and would itself be rejected by those releases).
+cmake_minimum_required(VERSION 3.2...3.29 FATAL_ERROR)
+# Extract each GLM_VERSION_* number from its #define in setup.hpp. The file
+# content is quoted so semicolons in it are not treated as list separators.
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/glm/detail/setup.hpp" GLM_SETUP_FILE)
+string(REGEX MATCH "#define[ ]+GLM_VERSION_MAJOR[ ]+([0-9]+)" _ "${GLM_SETUP_FILE}")
+set(GLM_VERSION_MAJOR "${CMAKE_MATCH_1}")
+string(REGEX MATCH "#define[ ]+GLM_VERSION_MINOR[ ]+([0-9]+)" _ "${GLM_SETUP_FILE}")
+set(GLM_VERSION_MINOR "${CMAKE_MATCH_1}")
+string(REGEX MATCH "#define[ ]+GLM_VERSION_PATCH[ ]+([0-9]+)" _ "${GLM_SETUP_FILE}")
+set(GLM_VERSION_PATCH "${CMAKE_MATCH_1}")
+string(REGEX MATCH "#define[ ]+GLM_VERSION_REVISION[ ]+([0-9]+)" _ "${GLM_SETUP_FILE}")
+set(GLM_VERSION_REVISION "${CMAKE_MATCH_1}")
+set(GLM_VERSION "${GLM_VERSION_MAJOR}.${GLM_VERSION_MINOR}.${GLM_VERSION_PATCH}.${GLM_VERSION_REVISION}")
+project(glm VERSION ${GLM_VERSION} LANGUAGES CXX)
+message(STATUS "GLM: Version ${GLM_VERSION}")
+add_subdirectory(glm)
+add_library(glm::glm ALIAS glm)
+# Packaging, install rules and tests apply only when GLM is the top-level
+# project. Both sides are quoted so a path is never re-read as a variable name.
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+	# Defines CMAKE_INSTALL_INCLUDEDIR / CMAKE_INSTALL_LIBDIR used below.
+	# NOTE(review): glm/CMakeLists.txt may already include this module; including
+	# it again is harmless.
+	include(GNUInstallDirs)
+	include(CPack)
+	install(DIRECTORY glm DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" PATTERN "CMakeLists.txt" EXCLUDE)
+	install(EXPORT glm FILE glmConfig.cmake DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/glm" NAMESPACE glm::)
+	include(CMakePackageConfigHelpers)
+	write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/glmConfigVersion.cmake" COMPATIBILITY AnyNewerVersion)
+	install(FILES "${CMAKE_CURRENT_BINARY_DIR}/glmConfigVersion.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/glm")
+	include(CTest)
+	if(BUILD_TESTING)
+		add_subdirectory(test)
+	endif()
+endif()
+# Provide an `uninstall` target unless a parent project already defines one.
+# The script is generated into and run from the *current* binary directory;
+# using CMAKE_BINARY_DIR for the run path pointed at a non-existent file
+# whenever GLM was added as a subdirectory of another project.
+if(NOT TARGET uninstall)
+	configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
+	               "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" @ONLY)
+	add_custom_target(uninstall
+	                  "${CMAKE_COMMAND}" -P
+	                  "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake")
+endif()