from typing import *
from numbers import Number
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from .utils import image_uv


__all__ = [
    'get_rays',
    'get_image_rays',
    'get_mipnerf_cones',
    'volume_rendering',
    'bin_sample',
    'importance_sample',
    'nerf_render_rays',
    'mipnerf_render_rays',
    'nerf_render_view',
    'mipnerf_render_view',
    'InstantNGP',
]


def get_rays(extrinsics: Tensor, intrinsics: Tensor, uv: Tensor) -> Tuple[Tensor, Tensor]:
    """
    Args:
        extrinsics: (..., 4, 4) extrinsics matrices.
        intrinsics: (..., 3, 3) intrinsics matrices.
        uv: (..., n_rays, 2) uv coordinates of the rays. 

    Returns:
        rays_o: (..., 1,      3) ray origins
        rays_d: (..., n_rays, 3) ray directions. 
            NOTE: ray directions are NOT normalized. They are scaled such that rays_o + rays_d * z gives the world coordinates, where z is the depth.
    """
    uvz = torch.cat([uv, torch.ones_like(uv[..., :1])], dim=-1).to(extrinsics)                                                          # (n_batch, n_views, n_rays, 3)

    with torch.cuda.amp.autocast(enabled=False):
        inv_transformation = (intrinsics @ extrinsics[..., :3, :3]).inverse()
        inv_extrinsics = extrinsics.inverse()
    rays_d = uvz @ inv_transformation.transpose(-1, -2)                                                  
    rays_o = inv_extrinsics[..., None, :3, 3]                                                                                           # (n_batch, n_views, 1, 3)
    return rays_o, rays_d
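

# Usage sketch (illustrative only, not part of the module API): shoot rays through two
# pixel centers of a single camera. The identity pose, the normalized intrinsics and the
# uv coordinates below are made-up values chosen just to show the expected shapes.
def _example_get_rays() -> Tuple[Tensor, Tensor]:
    extrinsics = torch.eye(4)[None]                                     # (1, 4, 4) world-to-camera
    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3) normalized intrinsics
    uv = torch.tensor([[[0.25, 0.5], [0.75, 0.5]]])                     # (1, n_rays=2, 2) uv in [0, 1]
    rays_o, rays_d = get_rays(extrinsics, intrinsics, uv)               # (1, 1, 3), (1, 2, 3)
    return rays_o, rays_d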


def get_image_rays(extrinsics: Tensor, intrinsics: Tensor, width: int, height: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        extrinsics: (..., 4, 4) extrinsics matrices.
        intrinsics: (..., 3, 3) intrinsics matrices.
        width: width of the image.
        height: height of the image.
    
    Returns:
        rays_o: (..., 1,      1,     3) ray origins
        rays_d: (..., height, width, 3) ray directions. 
            NOTE: ray directions are NOT normalized. They are scaled such that rays_o + rays_d * z gives the world coordinates, where z is the depth.
    """
    uv = image_uv(height, width).to(extrinsics).flatten(0, 1)
    rays_o, rays_d = get_rays(extrinsics, intrinsics, uv)
    rays_o = rays_o.unflatten(-2, (1, 1))
    rays_d = rays_d.unflatten(-2, (height, width))
    return rays_o, rays_d
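

# Usage sketch (illustrative only): one ray per pixel for a whole image; the camera values
# are the same made-up ones as above and the resolution is arbitrary.
def _example_get_image_rays() -> Tuple[Tensor, Tensor]:
    extrinsics = torch.eye(4)[None]                                     # (1, 4, 4)
    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3)
    rays_o, rays_d = get_image_rays(extrinsics, intrinsics, width=64, height=48)
    return rays_o, rays_d                                               # (1, 1, 1, 3), (1, 48, 64, 3)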


def get_mipnerf_cones(rays_o: Tensor, rays_d: Tensor, z_vals: Tensor, pixel_width: Tensor) -> Tuple[Tensor, Tensor]:
    """
    Args:
        rays_o: (..., n_rays, 3) ray origins
        rays_d: (..., n_rays, 3) ray directions.
        z_vals: (..., n_rays, n_samples) z values.
        pixel_width: (...) pixel width. = 1 / (normalized focal length * width)
    
    Returns:
        mu: (..., n_rays, n_samples, 3) cone mu.
        sigma: (..., n_rays, n_samples, 3, 3) cone sigma.
    """
    t_mu = (z_vals[..., 1:] + z_vals[..., :-1]).mul_(0.5)
    t_delta = (z_vals[..., 1:] - z_vals[..., :-1]).mul_(0.5)
    t_mu_square = t_mu.square()
    t_delta_square = t_delta.square()
    t_delta_quad = t_delta_square.square()
    mu_t = t_mu + 2.0 * t_mu * t_delta_square / (3.0 * t_mu_square + t_delta_square)
    sigma_t = t_delta_square / 3.0 - (4.0 / 15.0) * t_delta_quad / (3.0 * t_mu_square + t_delta_square).square() * (12.0 * t_mu_square - t_delta_square)
    sigma_r = (pixel_width[..., None, None].square() / 3.0) * (t_mu_square / 4.0 + (5.0 / 12.0) * t_delta_square - (4.0 / 15.0) * t_delta_quad / (3.0 * t_mu_square + t_delta_square))
    points_mu = rays_o[..., None, :] + rays_d[..., None, :] * mu_t[..., None]
    d_dt = rays_d[..., :, None] * rays_d[..., None, :]      # (..., n_rays, 3, 3)
    points_sigma = sigma_t[..., None, None] * d_dt[..., None, :, :] + sigma_r[..., None, None] * (torch.eye(3).to(rays_o) - d_dt[..., None, :, :])
    return points_mu, points_sigma


def get_pixel_width(intrinsics: Tensor, width: int, height: int) -> Tensor:
    """
    Args:
        intrinsics: (..., 3, 3) intrinsics matrices.
        width: width of the image.
        height: height of the image.
    
    Returns:
        pixel_width: (...) pixel width. = 1 / (normalized focal length * width)
    """
    assert width == height, "Currently, only square images are supported."
    pixel_width = torch.reciprocal((intrinsics[..., 0, 0] * intrinsics[..., 1, 1]).sqrt() * width)
    return pixel_width
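

# Usage sketch (illustrative only): compute the pixel width of a square image and build the
# mip-NeRF cone Gaussians along a single ray. All numeric values are made-up assumptions.
def _example_mipnerf_cones() -> Tuple[Tensor, Tensor]:
    intrinsics = torch.tensor([[[2.0, 0.0, 0.5],
                                [0.0, 2.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3) normalized intrinsics
    pixel_width = get_pixel_width(intrinsics, width=128, height=128)    # (1,)
    rays_o = torch.zeros(1, 1, 3)                                       # (1, n_rays=1, 3)
    rays_d = torch.tensor([[[0.0, 0.0, 1.0]]])                          # (1, 1, 3)
    z_vals = torch.linspace(0.1, 4.0, 17).expand(1, 1, 17)              # (1, 1, n_samples=17)
    mu, sigma = get_mipnerf_cones(rays_o, rays_d, z_vals, pixel_width)  # (1, 1, 16, 3), (1, 1, 16, 3, 3)
    return mu, sigma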


def volume_rendering(color: Tensor, sigma: Tensor, z_vals: Tensor, ray_length: Tensor, rgb: bool = True, depth: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Given color, sigma and z_vals (linear depth of the sampling points), render the volume.

    NOTE: By default, color and sigma should have one less sample than z_vals, in correspondence with the average value in intervals.
    If queried color are aligned with z_vals, we use trapezoidal rule to calculate the average values in intervals.

    Args:
        color: (..., n_samples or n_samples - 1, 3) color values.
        sigma: (..., n_samples or n_samples - 1) density values.
        z_vals: (..., n_samples) z values.
        ray_length: (...) length of the ray

    Returns:
        rgb: (..., 3) rendered color values.
        depth: (...) rendered depth values.
        weights (..., n_samples) weights.
    """
    dists = (z_vals[..., 1:] - z_vals[..., :-1]) * ray_length[..., None]
    if color.shape[-2] == z_vals.shape[-1]:
        color = (color[..., 1:, :] + color[..., :-1, :]).mul_(0.5)
        sigma = (sigma[..., 1:] + sigma[..., :-1]).mul_(0.5)                                        
    sigma_delta = sigma * dists                                                      
    transparency = (-torch.cat([torch.zeros_like(sigma_delta[..., :1]), sigma_delta[..., :-1]], dim=-1).cumsum(dim=-1)).exp_()     # First cumsum then exp for numerical stability
    alpha = 1.0 - (-sigma_delta).exp_()
    weights = alpha * transparency
    rgb = torch.sum(weights[..., None] * color, dim=-2) if rgb else None
    if depth:
        z_vals = (z_vals[..., 1:] + z_vals[..., :-1]).mul_(0.5)
        depth = torch.sum(weights * z_vals, dim=-1) / weights.sum(dim=-1).clamp_min_(1e-8)
    else:
        depth = None
    return rgb, depth, weights
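

# Usage sketch (illustrative only): integrate a single ray with a synthetic density bump.
# The colors and the Gaussian density profile below are made-up values, not reference data.
def _example_volume_rendering() -> Tuple[Tensor, Tensor, Tensor]:
    z_vals = torch.linspace(0.5, 3.5, 33).expand(1, 33)                 # (n_rays=1, n_samples=33)
    color = torch.rand(1, 33, 3)                                        # aligned with z_vals -> trapezoidal rule
    sigma = torch.exp(-(z_vals - 2.0).square() * 8.0) * 20.0            # density bump around z = 2
    ray_length = torch.ones(1)                                          # unit-length ray directions
    rgb, depth, weights = volume_rendering(color, sigma, z_vals, ray_length)
    return rgb, depth, weights                                          # (1, 3), (1,), (1, 32)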


def neus_volume_rendering(color: Tensor, sdf: Tensor, s: Union[Number, Tensor], z_vals: Tensor, rgb: bool = True, depth: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Given color, sdf values and z_vals (linear depth of the sampling points), do volume rendering. (NeuS)

    Args:
        color: (..., n_samples or n_samples - 1, 3) color values.
        sdf: (..., n_samples) sdf values.
        s: (..., n_samples) S values of S-density function in NeuS. The standard deviation of such S-density distribution is 1 / s.
        z_vals: (..., n_samples) z values.

    Returns:
        rgb: (..., 3) rendered color values.
        depth: (...) rendered depth values.
        weights (..., n_samples) weights.
    """

    if color.shape[-2] == z_vals.shape[-1]:
        color = (color[..., 1:, :] + color[..., :-1, :]).mul_(0.5)

    sigmoid_sdf = torch.sigmoid(s * sdf)
    alpha = F.relu(1 - sigmoid_sdf[..., 1:] / sigmoid_sdf[..., :-1])
    transparency = torch.cumprod(torch.cat([torch.ones_like(alpha[..., :1]), 1.0 - alpha + 1e-7], dim=-1), dim=-1)[..., :-1]
    weights = alpha * transparency

    rgb = torch.sum(weights[..., None] * color, dim=-2) if rgb else None
    if depth:
        z_vals = (z_vals[..., 1:] + z_vals[..., :-1]).mul_(0.5)
        depth = torch.sum(weights * z_vals, dim=-1) / weights.sum(dim=-1).clamp_min_(1e-8)
    else:
        depth = None
    return rgb, depth, weights


def bin_sample(size: Union[torch.Size, Tuple[int, ...]], n_samples: int, min_value: Number, max_value: Number, spacing: Literal['linear', 'inverse_linear'], dtype: torch.dtype = None, device: torch.device = None) -> Tensor:
    """
    Uniformly (or uniformly in inverse space) sample z values in `n_samples` bins in range [min_value, max_value].
    Args:
        size: size of the rays
        n_samples: number of samples to be sampled, also the number of bins
        min_value: minimum value of the range
        max_value: maximum value of the range
        spacing: 'linear' or 'inverse_linear'. If 'inverse_linear', the sampling is uniform in inverse depth.
    
    Returns:
        z_rand: (*size, n_samples) sampled z values, sorted in ascending order.
    """
    if spacing == 'linear':
        pass
    elif spacing == 'inverse_linear':
        min_value = 1.0 / min_value
        max_value = 1.0 / max_value
    bin_length = (max_value - min_value) / n_samples
    z_rand = (torch.rand(*size, n_samples, device=device, dtype=dtype) - 0.5) * bin_length + torch.linspace(min_value + bin_length * 0.5, max_value - bin_length * 0.5, n_samples, device=device, dtype=dtype)   
    if spacing == 'inverse_linear':
        z_rand = 1.0 / z_rand
    return z_rand
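

# Usage sketch (illustrative only): stratified depth samples for a batch of 4 rays, once
# uniform in depth and once uniform in inverse depth. The range [0.1, 10.0] is arbitrary.
def _example_bin_sample() -> Tuple[Tensor, Tensor]:
    z_linear = bin_sample((4,), 16, min_value=0.1, max_value=10.0, spacing='linear')            # (4, 16)
    z_inverse = bin_sample((4,), 16, min_value=0.1, max_value=10.0, spacing='inverse_linear')   # (4, 16)
    return z_linear, z_inverse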


def importance_sample(z_vals: Tensor, weights: Tensor, n_samples: int) -> Tuple[Tensor, Tensor]:
    """
    Importance sample z values.

    NOTE: By default, weights should have one less sample than z_vals, in correspondence with the intervals.
    If weights has the same number of samples as z_vals, we use trapezoidal rule to calculate the average weights in intervals.

    Args:
        z_vals: (..., n_rays, n_input_samples) z values, sorted in ascending order.
        weights: (..., n_rays, n_input_samples or n_input_samples - 1) weights.
        n_samples: number of output samples for importance sampling.
    
    Returns:
        z_importance: (..., n_rays, n_samples) importance sampled z values, unsorted.
    """
    if weights.shape[-1] == z_vals.shape[-1]:
        weights = (weights[..., 1:] + weights[..., :-1]).mul_(0.5)
    bins_a, bins_b = z_vals[..., :-1], z_vals[..., 1:]

    pdf = weights / torch.sum(weights, dim=-1, keepdim=True)                          # (..., n_rays, n_input_samples - 1)
    cdf = torch.cumsum(pdf, dim=-1)
    u = torch.rand(*z_vals.shape[:-1], n_samples, device=z_vals.device, dtype=z_vals.dtype)
    
    inds = torch.searchsorted(cdf, u, right=True).clamp(0, cdf.shape[-1] - 1)         # (..., n_rays, n_samples)
    
    bins_a = torch.gather(bins_a, dim=-1, index=inds)
    bins_b = torch.gather(bins_b, dim=-1, index=inds)
    z_importance = bins_a + (bins_b - bins_a) * torch.rand_like(u)
    return z_importance
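

# Usage sketch (illustrative only): concentrate new samples where the coarse weights are
# large. The Gaussian weight profile peaking at z = 5 is a made-up stand-in for real weights.
def _example_importance_sample() -> Tensor:
    z_coarse = torch.linspace(0.1, 10.0, 33).expand(2, 33)              # (n_rays=2, 33), sorted
    weights = torch.exp(-(z_coarse - 5.0).square())                     # aligned with z_coarse -> averaged internally
    z_fine = importance_sample(z_coarse, weights, n_samples=16)         # (2, 16), unsorted
    return z_fine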


def nerf_render_rays(
    nerf: Union[Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]], Tuple[Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]], Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]]]],
    rays_o: Tensor, rays_d: Tensor,
    *, 
    return_dict: bool = False,
    n_coarse: int = 64, n_fine: int = 64,
    near: float = 0.1, far: float = 100.0,
    z_spacing: Literal['linear', 'inverse_linear'] = 'linear',
):
    """
    NeRF rendering of rays. Note that it supports arbitrary batch dimensions (denoted as `...`)

    Args:
        nerf: nerf model, which takes (points, directions) as input and returns (color, density) as output.
            If nerf is a tuple, it should be (nerf_coarse, nerf_fine), where nerf_coarse and nerf_fine are two nerf models for coarse and fine stages respectively.
            
            nerf args:
                points: (..., n_rays, n_samples, 3)
                directions: (..., n_rays, n_samples, 3)
            nerf returns:
                color: (..., n_rays, n_samples, 3) color values.
                density: (..., n_rays, n_samples) density values.
                
        rays_o: (..., n_rays, 3) ray origins
        rays_d: (..., n_rays, 3) ray directions.
    
    Returns 
        if return_dict is False, return only the rendered rgb and depth as a shortcut. (If there are separate coarse and fine results, return fine results)
            rgb: (..., n_rays, 3) rendered color values. 
            depth: (..., n_rays) rendered depth values.
        else, return a dict. If `n_fine == 0`, the dict contains only the coarse results; if `nerf` is a single shared model, it contains the merged coarse-and-fine results:
        ```
        {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..}
        ```
        If there are two models for coarse and fine stages, the dict contains both coarse and fine results:
        ```
        {
            "coarse": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..},
            "fine": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..}
        }
        ```
    """
    if isinstance(nerf, tuple):
        nerf_coarse, nerf_fine = nerf
    else:
        nerf_coarse = nerf_fine = nerf
    # 1. Coarse: bin sampling
    z_coarse = bin_sample(rays_d.shape[:-1], n_coarse, near, far, device=rays_o.device, dtype=rays_o.dtype, spacing=z_spacing)                       # (n_batch, n_views, n_rays, n_samples)
    points_coarse = rays_o[..., None, :] + rays_d[..., None, :] * z_coarse[..., None]                                                                # (n_batch, n_views, n_rays, n_samples, 3)
    ray_length = rays_d.norm(dim=-1)

    #    Query color and density                   
    color_coarse, density_coarse = nerf_coarse(points_coarse, rays_d[..., None, :].expand_as(points_coarse))               # (n_batch, n_views, n_rays, n_samples, 3), (n_batch, n_views, n_rays, n_samples)
    
    #    Volume rendering
    with torch.no_grad():
        rgb_coarse, depth_coarse, weights = volume_rendering(color_coarse, density_coarse, z_coarse, ray_length)            # (n_batch, n_views, n_rays, 3), (n_batch, n_views, n_rays, 1), (n_batch, n_views, n_rays, n_samples)
    
    if n_fine == 0:
        if return_dict:
            return {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights, 'z_vals': z_coarse, 'color': color_coarse, 'density': density_coarse}
        else:
            return rgb_coarse, depth_coarse
    
    # 2. Fine: Importance sampling
    if nerf_coarse is nerf_fine:
        # If coarse and fine stages share the same model, the points of coarse stage can be reused, 
        # and we only need to query the importance samples of fine stage.
        z_fine = importance_sample(z_coarse, weights, n_fine)               
        points_fine = rays_o[..., None, :] + rays_d[..., None, :] * z_fine[..., None]                      
        color_fine, density_fine = nerf_fine(points_fine, rays_d[..., None, :].expand_as(points_fine))

        # Merge & volume rendering
        z_vals = torch.cat([z_coarse, z_fine], dim=-1)          
        color = torch.cat([color_coarse, color_fine], dim=-2)
        density = torch.cat([density_coarse, density_fine], dim=-1)     
        z_vals, sort_inds = torch.sort(z_vals, dim=-1)                   
        color = torch.gather(color, dim=-2, index=sort_inds[..., None].expand_as(color))
        density = torch.gather(density, dim=-1, index=sort_inds)
        rgb, depth, weights = volume_rendering(color, density, z_vals, ray_length)
        
        if return_dict:
            return {'rgb': rgb, 'depth': depth, 'weights': weights, 'z_vals': z_vals, 'color': color, 'density': density}
        else:
            return rgb, depth
    else:
        # If coarse and fine stages use different models, we need to query the importance samples of both stages.
        z_fine = importance_sample(z_coarse, weights, n_fine)
        z_vals = torch.cat([z_coarse, z_fine], dim=-1)
        z_vals, _ = torch.sort(z_vals, dim=-1)
        points = rays_o[..., None, :] + rays_d[..., None, :] * z_vals[..., None]
        color, density = nerf_fine(points, rays_d[..., None, :].expand_as(points))
        rgb, depth, weights_fine = volume_rendering(color, density, z_vals, ray_length)

        if return_dict:
            return {
                'coarse': {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights, 'z_vals': z_coarse, 'color': color_coarse, 'density': density_coarse},
                'fine': {'rgb': rgb, 'depth': depth, 'weights': weights_fine, 'z_vals': z_vals, 'color': color, 'density': density}
            }
        else:
            return rgb, depth
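

# Usage sketch (illustrative only): render a few rays with a dummy radiance field standing in
# for a trained NeRF. The closure, camera values and near/far range are made-up assumptions;
# this only demonstrates the expected callable signature and output shapes.
def _example_nerf_render_rays() -> Tuple[Tensor, Tensor]:
    def toy_nerf(points: Tensor, directions: Tensor) -> Tuple[Tensor, Tensor]:
        color = torch.sigmoid(points)                                   # (..., n_rays, n_samples, 3)
        density = torch.relu(1.0 - points.norm(dim=-1))                 # (..., n_rays, n_samples)
        return color, density

    extrinsics = torch.eye(4)[None]                                     # (1, 4, 4)
    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3)
    rays_o, rays_d = get_rays(extrinsics, intrinsics, torch.rand(1, 8, 2))
    rgb, depth = nerf_render_rays(toy_nerf, rays_o, rays_d, n_coarse=32, n_fine=32, near=0.5, far=4.0)
    return rgb, depth                                                   # (1, 8, 3), (1, 8)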


def mipnerf_render_rays(
    mipnerf: Callable[[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]],
    rays_o: Tensor, rays_d: Tensor, pixel_width: Tensor, 
    *, 
    return_dict: bool = False,
    n_coarse: int = 64, n_fine: int = 64, uniform_ratio: float = 0.4,
    near: float = 0.1, far: float = 100.0,
    z_spacing: Literal['linear', 'inverse_linear'] = 'linear',
) -> Union[Tuple[Tensor, Tensor], Dict[str, Tensor]]:
    """
    MipNeRF rendering.

    Args:
        mipnerf: mipnerf model, which takes (points_mu, points_sigma) as input and returns (color, density) as output.

            mipnerf args:
                points_mu: (..., n_rays, n_samples, 3) cone mu.
                points_sigma: (..., n_rays, n_samples, 3, 3) cone sigma.
                directions: (..., n_rays, n_samples, 3)
            mipnerf returns:
                color: (..., n_rays, n_samples, 3) color values.
                density: (..., n_rays, n_samples) density values.

        rays_o: (..., n_rays, 3) ray origins
        rays_d: (..., n_rays, 3) ray directions.
        pixel_width: (..., n_rays) pixel width. How to compute? pixel_width = 1 / (normalized focal length * width)
    
    Returns 
        if return_dict is False, return rendered results only: (If `n_fine == 0`, return coarse results, otherwise return fine results)
            rgb: (..., n_rays, 3) rendered color values. 
            depth: (..., n_rays) rendered depth values.
        else, return a dict. If `n_fine == 0`, the dict only contains coarse results:
        ```
        {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..}
        ```
        If n_fine > 0, the dict contains both coarse and fine results :
        ```
        {
            "coarse": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..},
            "fine": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..}
        }
        ```
    """
    # 1. Coarse: bin sampling
    z_coarse = bin_sample(rays_d.shape[:-1], n_coarse, near, far, spacing=z_spacing, device=rays_o.device, dtype=rays_o.dtype)
    points_mu_coarse, points_sigma_coarse = get_mipnerf_cones(rays_o, rays_d, z_coarse, pixel_width)
    ray_length = rays_d.norm(dim=-1)

    #    Query color and density
    color_coarse, density_coarse = mipnerf(points_mu_coarse, points_sigma_coarse, rays_d[..., None, :].expand_as(points_mu_coarse))             # (n_batch, n_views, n_rays, n_samples, 3), (n_batch, n_views, n_rays, n_samples)

    #    Volume rendering
    rgb_coarse, depth_coarse, weights_coarse = volume_rendering(color_coarse, density_coarse, z_coarse, ray_length)                             # (n_batch, n_views, n_rays, 3), (n_batch, n_views, n_rays, 1), (n_batch, n_views, n_rays, n_samples)

    if n_fine == 0:
        if return_dict:
            return {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights_coarse, 'z_vals': z_coarse, 'color': color_coarse, 'density': density_coarse}
        else:
            return rgb_coarse, depth_coarse

    # 2. Fine: Importance sampling. (NOTE: the coarse and fine stages always share the same model, but the coarse-stage points cannot be reused.)
    with torch.no_grad():
        weights_coarse = (1.0 - uniform_ratio) * weights_coarse + uniform_ratio / weights_coarse.shape[-1]
    z_fine = importance_sample(z_coarse, weights_coarse, n_fine)
    z_fine, _ = torch.sort(z_fine, dim=-1)
    points_mu_fine, points_sigma_fine = get_mipnerf_cones(rays_o, rays_d, z_fine, pixel_width)                                                           
    color_fine, density_fine = mipnerf(points_mu_fine, points_sigma_fine, rays_d[..., None, :].expand_as(points_mu_fine))

    #   Volume rendering                    
    rgb_fine, depth_fine, weights_fine = volume_rendering(color_fine, density_fine, z_fine, ray_length)

    if return_dict:
        return {
            'coarse': {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights_coarse, 'z_vals': z_coarse, 'color': color_coarse, 'density': density_coarse},
            'fine': {'rgb': rgb_fine, 'depth': depth_fine, 'weights': weights_fine, 'z_vals': z_fine, 'color': color_fine, 'density': density_fine}
        }
    else:
        return rgb_fine, depth_fine
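

# Usage sketch (illustrative only): same idea as the NeRF sketch above, but the dummy model
# takes cone Gaussians (mu, sigma, directions). All values are made-up assumptions.
def _example_mipnerf_render_rays() -> Tuple[Tensor, Tensor]:
    def toy_mipnerf(mu: Tensor, sigma: Tensor, directions: Tensor) -> Tuple[Tensor, Tensor]:
        color = torch.sigmoid(mu)                                       # (..., n_rays, n_samples, 3)
        density = torch.relu(1.0 - mu.norm(dim=-1))                     # (..., n_rays, n_samples)
        return color, density

    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3)
    rays_o, rays_d = get_rays(torch.eye(4)[None], intrinsics, torch.rand(1, 8, 2))
    pixel_width = get_pixel_width(intrinsics, width=128, height=128)    # (1,)
    rgb, depth = mipnerf_render_rays(toy_mipnerf, rays_o, rays_d, pixel_width, n_coarse=32, n_fine=32, near=0.5, far=4.0)
    return rgb, depth                                                   # (1, 8, 3), (1, 8)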


def neus_render_rays(
    neus: Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]],
    s: Union[Number, Tensor],
    rays_o: Tensor, rays_d: Tensor, 
    *, 
    compute_normal: bool = True,
    return_dict: bool = False,
    n_coarse: int = 64, n_fine: int = 64,
    near: float = 0.1, far: float = 100.0,
    z_spacing: Literal['linear', 'inverse_linear'] = 'linear',
):
    """
    TODO
    NeuS rendering of rays. Note that it supports arbitrary batch dimensions (denoted as `...`)

    Args:
        neus: neus model, which takes (points, directions) as input and returns (color, sdf) as output.

            neus args:
                points: (..., n_rays, n_samples, 3)
                directions: (..., n_rays, n_samples, 3)
            neus returns:
                color: (..., n_rays, n_samples, 3) color values.
                sdf: (..., n_rays, n_samples) signed distance values.
                
        rays_o: (..., n_rays, 3) ray origins
        rays_d: (..., n_rays, 3) ray directions.
        s: S value of the S-density function in NeuS; the standard deviation of the S-density is 1 / s.
    
    Returns 
        if return_dict is False, return rendered results only: (If `n_fine == 0`, return coarse results, otherwise return fine results)
            rgb: (..., n_rays, 3) rendered color values. 
            depth: (..., n_rays) rendered depth values.
        else, return a dict. If `n_fine == 0`, the dict only contains coarse results:
        ```
        {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'sdf': ..., 'normal': ...}
        ```
        If n_fine > 0, the dict contains both coarse and fine results:
        ```
        {
            "coarse": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..},
            "fine": {'rgb': .., 'depth': .., 'weights': .., 'z_vals': .., 'color': .., 'density': ..}
        }
        ```
    """

    # 1. Coarse: bin sampling
    z_coarse = bin_sample(rays_d.shape[:-1], n_coarse, near, far, device=rays_o.device, dtype=rays_o.dtype, spacing=z_spacing)                       # (n_batch, n_views, n_rays, n_samples)
    points_coarse = rays_o[..., None, :] + rays_d[..., None, :] * z_coarse[..., None]                                                                # (n_batch, n_views, n_rays, n_samples, 3)

    #    Query color and density                   
    color_coarse, sdf_coarse = neus(points_coarse, rays_d[..., None, :].expand_as(points_coarse))                                  # (n_batch, n_views, n_rays, n_samples, 3), (n_batch, n_views, n_rays, n_samples)
    
    #    Volume rendering
    with torch.no_grad():
        rgb_coarse, depth_coarse, weights = neus_volume_rendering(color_coarse, sdf_coarse, s, z_coarse)            # (n_batch, n_views, n_rays, 3), (n_batch, n_views, n_rays, 1), (n_batch, n_views, n_rays, n_samples)
    
    if n_fine == 0:
        if return_dict:
            return {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights, 'z_vals': z_coarse, 'color': color_coarse, 'sdf': sdf_coarse}
        else:
            return rgb_coarse, depth_coarse
    
    # If coarse and fine stages share the same model, the points of coarse stage can be reused, 
    # and we only need to query the importance samples of fine stage.
    z_fine = importance_sample(z_coarse, weights, n_fine)               
    points_fine = rays_o[..., None, :] + rays_d[..., None, :] * z_fine[..., None]                      
    color_fine, sdf_fine = neus(points_fine, rays_d[..., None, :].expand_as(points_fine))

    # Merge & volume rendering
    z_vals = torch.cat([z_coarse, z_fine], dim=-1)          
    color = torch.cat([color_coarse, color_fine], dim=-2)
    sdf = torch.cat([sdf_coarse, sdf_fine], dim=-1)     
    z_vals, sort_inds = torch.sort(z_vals, dim=-1)                   
    color = torch.gather(color, dim=-2, index=sort_inds[..., None].expand_as(color))
    sdf = torch.gather(sdf, dim=-1, index=sort_inds)
    rgb, depth, weights_fine = neus_volume_rendering(color, sdf, s, z_vals)

    if return_dict:
        return {
            'coarse': {'rgb': rgb_coarse, 'depth': depth_coarse, 'weights': weights, 'z_vals': z_coarse, 'color': color_coarse, 'sdf': sdf_coarse},
            'fine': {'rgb': rgb, 'depth': depth, 'weights': weights_fine, 'z_vals': z_vals, 'color': color, 'sdf': sdf}
        }
    else:
        return rgb, depth


def nerf_render_view(
    nerf: Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]],
    extrinsics: Tensor, 
    intrinsics: Tensor, 
    width: int,
    height: int,
    *,
    patchify: bool = False,
    patch_size: Tuple[int, int] = (64, 64),
    **options: Dict[str, Any]
) -> Tuple[Tensor, Tensor]:
    """
    NeRF rendering of views. Note that it supports arbitrary batch dimensions (denoted as `...`)

    Args:
        nerf: nerf model, which takes (points, directions) as input and returns (color, density) as output.
        extrinsics: (..., 4, 4) extrinsics matrices of the rendered views.
        intrinsics: (..., 3, 3) intrinsics matrices of the rendered views.
        width: image width of the rendered views.
        height: image height of the rendered views.
        patchify: if True and the image is too large, render it patch by patch.
        **options: rendering options.
    
    Returns:
        rgb: (..., channels, height, width) rendered color values.
        depth: (..., height, width) rendered depth values.
    """
    if patchify:
        # Patchified rendering
        max_patch_width, max_patch_height = patch_size
        n_rows, n_columns = math.ceil(height / max_patch_height), math.ceil(width / max_patch_width)

        rgb_rows, depth_rows = [], []
        for i_row in range(n_rows):
            rgb_row, depth_row = [], []
            for i_column in range(n_columns):
                patch_shape = patch_height, patch_width = min(max_patch_height, height - i_row * max_patch_height), min(max_patch_width, width - i_column * max_patch_width)
                uv = image_uv(height, width, i_column * max_patch_width, i_row * max_patch_height, i_column * max_patch_width + patch_width, i_row * max_patch_height + patch_height).to(extrinsics)
                uv = uv.flatten(0, 1)                                               # (patch_height * patch_width, 2)
                ray_o_, ray_d_ = get_rays(extrinsics, intrinsics, uv)
                rgb_, depth_ = nerf_render_rays(nerf, ray_o_, ray_d_, **options, return_dict=False)
                rgb_ = rgb_.transpose(-1, -2).unflatten(-1, patch_shape)            # (..., 3, patch_height, patch_width)
                depth_ = depth_.unflatten(-1, patch_shape)                          # (..., patch_height, patch_width)
                
                rgb_row.append(rgb_)
                depth_row.append(depth_)
            rgb_rows.append(torch.cat(rgb_row, dim=-1))
            depth_rows.append(torch.cat(depth_row, dim=-1))
        rgb = torch.cat(rgb_rows, dim=-2)
        depth = torch.cat(depth_rows, dim=-2)

        return rgb, depth
    else:
        # Full rendering
        uv = image_uv(height, width).to(extrinsics)
        uv = uv.flatten(0, 1)                                                       # (height * width, 2)
        ray_o_, ray_d_ = get_rays(extrinsics, intrinsics, uv)
        rgb, depth = nerf_render_rays(nerf, ray_o_, ray_d_, **options, return_dict=False)
        rgb = rgb.transpose(-1, -2).unflatten(-1, (height, width))                  # (..., 3, height, width)
        depth = depth.unflatten(-1, (height, width))                                # (..., height, width)
        
        return rgb, depth
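

# Usage sketch (illustrative only): render a small full image with a dummy radiance field.
# The resolution, camera values and near/far range are made-up assumptions.
def _example_nerf_render_view() -> Tuple[Tensor, Tensor]:
    def toy_nerf(points: Tensor, directions: Tensor) -> Tuple[Tensor, Tensor]:
        return torch.sigmoid(points), torch.relu(1.0 - points.norm(dim=-1))

    extrinsics = torch.eye(4)[None]                                     # (1, 4, 4)
    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]])                      # (1, 3, 3)
    rgb, depth = nerf_render_view(toy_nerf, extrinsics, intrinsics, width=32, height=32,
                                  n_coarse=32, n_fine=0, near=0.5, far=4.0)
    return rgb, depth                                                   # (1, 3, 32, 32), (1, 32, 32)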
    

def mipnerf_render_view(
    mipnerf: Callable[[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]],
    extrinsics: Tensor, 
    intrinsics: Tensor, 
    width: int,
    height: int,
    *,
    patchify: bool = False,
    patch_size: Tuple[int, int] = (64, 64),
    **options: Dict[str, Any]
) -> Tuple[Tensor, Tensor]:
    """
    MipNeRF rendering of views. Note that it supports arbitrary batch dimensions (denoted as `...`)

    Args:
        mipnerf: mipnerf model, which takes (points_mu, points_sigma, directions) as input and returns (color, density) as output.
        extrinsics: (..., 4, 4) extrinsics matrices of the rendered views.
        intrinsics: (..., 3, 3) intrinsics matrices of the rendered views.
        width: image width of the rendered views.
        height: image height of the rendered views.
        patchify: if True and the image is too large, render it patch by patch.
        **options: rendering options.
    
    Returns:
        rgb: (..., 3, height, width) rendered color values.
        depth: (..., height, width) rendered depth values.
    """
    pixel_width = get_pixel_width(intrinsics, width, height)

    if patchify:
        # Patchified rendering
        max_patch_width, max_patch_height = patch_size
        n_rows, n_columns = math.ceil(height / max_patch_height), math.ceil(width / max_patch_width)

        rgb_rows, depth_rows = [], []
        for i_row in range(n_rows):
            rgb_row, depth_row = [], []
            for i_column in range(n_columns):
                patch_shape = patch_height, patch_width = min(max_patch_height, height - i_row * max_patch_height), min(max_patch_width, width - i_column * max_patch_width)
                uv = image_uv(height, width, i_column * max_patch_width, i_row * max_patch_height, i_column * max_patch_width + patch_width, i_row * max_patch_height + patch_height).to(extrinsics)
                uv = uv.flatten(0, 1)                                               # (patch_height * patch_width, 2)
                ray_o_, ray_d_ = get_rays(extrinsics, intrinsics, uv)
                rgb_, depth_ = mipnerf_render_rays(mipnerf, ray_o_, ray_d_, pixel_width, **options) 
                rgb_ = rgb_.transpose(-1, -2).unflatten(-1, patch_shape)            # (..., 3, patch_height, patch_width)
                depth_ = depth_.unflatten(-1, patch_shape)                          # (..., patch_height, patch_width)
                
                rgb_row.append(rgb_)
                depth_row.append(depth_)
            rgb_rows.append(torch.cat(rgb_row, dim=-1))
            depth_rows.append(torch.cat(depth_row, dim=-1))
        rgb = torch.cat(rgb_rows, dim=-2)
        depth = torch.cat(depth_rows, dim=-2)

        return rgb, depth
    else:
        # Full rendering
        uv = image_uv(height, width).to(extrinsics)
        uv = uv.flatten(0, 1)                                                       # (height * width, 2)
        ray_o_, ray_d_ = get_rays(extrinsics, intrinsics, uv)
        rgb, depth = mipnerf_render_rays(mipnerf, ray_o_, ray_d_, pixel_width, **options) 
        rgb = rgb.transpose(-1, -2).unflatten(-1, (height, width))                  # (..., 3, height, width)
        depth = depth.unflatten(-1, (height, width))                                # (..., height, width)
        
        return rgb, depth


class InstantNGP(nn.Module):
    """
    An implementation of InstantNGP, Müller et al., https://nvlabs.github.io/instant-ngp/.
    Requires `tinycudann` package.
    Install it by:
    ```
    pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
    ```
    """
    def __init__(self,
        view_dependent: bool = True,
        base_resolution: int = 16,
        finest_resolution: int = 2048,
        n_levels: int = 16,
        num_layers_density: int = 2,
        hidden_dim_density: int = 64,
        num_layers_color: int = 3,
        hidden_dim_color: int = 64,
        log2_hashmap_size: int = 19,
        bound: float = 1.0,
        color_channels: int = 3,
    ):
        super().__init__()
        import tinycudann
        N_FEATURES_PER_LEVEL = 2
        GEO_FEAT_DIM = 15

        self.bound = bound
        self.color_channels = color_channels

        # density network
        self.num_layers_density = num_layers_density
        self.hidden_dim_density = hidden_dim_density

        per_level_scale = (finest_resolution / base_resolution) ** (1 / (n_levels - 1))

        self.encoder = tinycudann.Encoding(
            n_input_dims=3,
            encoding_config={
                "otype": "HashGrid",
                "n_levels": n_levels,
                "n_features_per_level": N_FEATURES_PER_LEVEL,
                "log2_hashmap_size": log2_hashmap_size,
                "base_resolution": base_resolution,
                "per_level_scale": per_level_scale,
            },
        )

        self.density_net = tinycudann.Network(
            n_input_dims=N_FEATURES_PER_LEVEL * n_levels,
            n_output_dims=1 + GEO_FEAT_DIM,
            network_config={
                "otype": "FullyFusedMLP",
                "activation": "ReLU",
                "output_activation": "None",
                "n_neurons": hidden_dim_density,
                "n_hidden_layers": num_layers_density - 1,
            },
        )

        # color network
        self.num_layers_color = num_layers_color        
        self.hidden_dim_color = hidden_dim_color
        
        self.view_dependent = view_dependent
        if view_dependent:
            self.encoder_dir = tinycudann.Encoding(
                n_input_dims=3,
                encoding_config={
                    "otype": "SphericalHarmonics",
                    "degree": 4,
                },
            )
            self.in_dim_color = self.encoder_dir.n_output_dims + GEO_FEAT_DIM
        else:
            self.in_dim_color = GEO_FEAT_DIM

        self.color_net = tinycudann.Network(
            n_input_dims=self.in_dim_color,
            n_output_dims=color_channels,
            network_config={
                "otype": "FullyFusedMLP",
                "activation": "ReLU",
                "output_activation": "None",
                "n_neurons": hidden_dim_color,
                "n_hidden_layers": num_layers_color - 1,
            },
        )
    
    def forward(self, x: torch.Tensor, d: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: (..., 3) points
            d: (..., 3) directions
        Returns:
            color: (..., color_channels) color values.
            density: (...) density values.
        """
        batch_shape = x.shape[:-1]
        x, d = x.reshape(-1, 3), d.reshape(-1, 3)

        # density
        x = (x + self.bound) / (2 * self.bound)     # to [0, 1]
        x = self.encoder(x)
        density, geo_feat = self.density_net(x).split([1, 15], dim=-1)
        density = F.softplus(density).squeeze(-1)

        # color
        if self.view_dependent:
            d = (F.normalize(d, dim=-1) + 1) / 2    # tcnn SH encoding requires inputs to be in [0, 1]
            d = self.encoder_dir(d)
            h = torch.cat([d, geo_feat], dim=-1)
        else:
            h = geo_feat
        color = self.color_net(h)

        return color.reshape(*batch_shape, self.color_channels), density.reshape(*batch_shape)
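

# Usage sketch (illustrative only): plug an InstantNGP field into `nerf_render_view`.
# Requires a CUDA device and the `tinycudann` package mentioned in the class docstring;
# the pose, intrinsics, resolution and near/far values are made-up assumptions.
def _example_instant_ngp() -> Tuple[Tensor, Tensor]:
    field = InstantNGP(bound=1.0).cuda()
    extrinsics = torch.eye(4, device='cuda')[None]                      # (1, 4, 4)
    intrinsics = torch.tensor([[[1.0, 0.0, 0.5],
                                [0.0, 1.0, 0.5],
                                [0.0, 0.0, 1.0]]], device='cuda')       # (1, 3, 3)
    rgb, depth = nerf_render_view(field, extrinsics, intrinsics, width=64, height=64,
                                  patchify=True, patch_size=(32, 32),
                                  n_coarse=64, n_fine=64, near=0.1, far=2.0)
    return rgb, depth                                                   # (1, 3, 64, 64), (1, 64, 64)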