File size: 8,425 Bytes

180521e

"""
Back-projection utilities: depth map → 3D point cloud.

DepthPro outputs metric depth (meters) and an estimated focal length.
Using the standard pinhole camera model, each pixel can be back-projected
into a 3D point relative to the camera centre.
"""

from __future__ import annotations

from typing import Optional, Tuple

import numpy as np


def depth_to_point_cloud(
    depth: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
    *,
    mask: Optional[np.ndarray] = None,
    sample_step: int = 1,
) -> np.ndarray:
    """
    Back-project a metric depth map into a 3D point cloud.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) array of metric depths in meters.  Converted to float32.
        Pixels whose depth is not strictly positive or not finite
        (``0``, negative, ``NaN``, ``inf``) are dropped.
    focal_length : float
        Focal length in pixels (for the resolution of *depth*).
        DepthPro returns this automatically via ``DepthResult.focal_length``.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates.  Defaults to the image centre
        ``(W/2, H/2)``.
    mask : np.ndarray, optional
        (H, W) array.  Only pixels where the mask is truthy are kept, so
        boolean masks as well as 0/1 or 0/255 integer masks are accepted.
        Useful for removing sky/background, invalid depths, etc.
    sample_step : int, default 1
        Spatial sub-sampling step (>= 1).  ``2`` keeps every 2nd pixel in
        each direction (75 % reduction), ``4`` keeps every 4th
        (93.75 % reduction).  Handy for real-time viz.

    Returns
    -------
    points : np.ndarray
        (N, 3) float32 array of 3D points in the camera coordinate frame,
        in row-major pixel order.  ``+Z`` points forward (into the scene),
        ``+X`` is right, ``+Y`` is down (standard image convention).
        The array has shape ``(0, 3)`` when no pixel survives filtering.

    Raises
    ------
    ValueError
        If *depth* is not 2-D, *sample_step* is not a positive integer,
        *focal_length* is not a positive finite number, or *mask* does not
        have the same shape as *depth*.

    Notes
    -----
    DepthPro assumes square pixels (aspect ratio = 1) and therefore a single
    focal length value is sufficient: ``fx == fy == focal_length``.

    The standard pinhole projection equations are::

        X = (u - cx) * Z / fx
        Y = (v - cy) * Z / fy
        Z = depth[v, u]

    where ``(u, v)`` are pixel column/row indices.
    """
    depth = np.asarray(depth, dtype=np.float32)
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2-D (H, W) array, got shape {depth.shape}")
    if not isinstance(sample_step, (int, np.integer)) or sample_step < 1:
        raise ValueError(f"sample_step must be a positive integer, got {sample_step!r}")
    H, W = depth.shape

    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx <= 0.0:
        raise ValueError(f"focal_length must be a positive finite number, got {focal_length!r}")

    # Pixel coordinates of the sub-sampled grid and the matching depths.
    v_idx = np.arange(0, H, sample_step)
    u_idx = np.arange(0, W, sample_step)
    Z = depth[::sample_step, ::sample_step]

    # Keep only usable depths: strictly positive and finite (inf/NaN would
    # otherwise leak into the output as inf/NaN coordinates).
    valid = np.isfinite(Z) & (Z > 0.0)
    if mask is not None:
        mask = np.asarray(mask)
        if mask.shape != (H, W):
            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
        # Sub-sample the mask like the depth; the bool cast lets integer
        # masks through (an in-place ``&=`` with uint8 would fail to cast).
        valid &= mask[::sample_step, ::sample_step].astype(bool)

    # Row-major positions of the surviving samples within the sampled grid;
    # avoids materialising a full (H, W) meshgrid.
    v_sel, u_sel = np.nonzero(valid)
    u = u_idx[u_sel]
    v = v_idx[v_sel]
    Z = Z[valid]

    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy

    points = np.stack([X, Y, Z], axis=-1).astype(np.float32)
    return points


def rgbd_to_point_cloud(
    depth: np.ndarray,
    rgb: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
    *,
    mask: Optional[np.ndarray] = None,
    sample_step: int = 1,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Back-project an RGB-D pair into a coloured 3D point cloud.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) metric depth map.  Pixels whose depth is not strictly
        positive or not finite (``0``, negative, ``NaN``, ``inf``) are
        dropped.
    rgb : np.ndarray
        (H, W, 3) uint8 RGB image.  Other dtypes are cast to uint8 without
        any rescaling.
    focal_length : float
        Estimated focal length in pixels.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates.  Defaults to the image centre
        ``(W/2, H/2)``.
    mask : np.ndarray, optional
        (H, W) array selecting pixels to keep.  Any truthy value keeps the
        pixel, so boolean as well as 0/1 or 0/255 integer masks work.
    sample_step : int, default 1
        Spatial sub-sampling step (>= 1); ``2`` keeps every 2nd pixel in
        each direction.

    Returns
    -------
    points : np.ndarray
        (N, 3) float32 3D points in row-major pixel order (``+X`` right,
        ``+Y`` down, ``+Z`` forward).
    colors : np.ndarray
        (N, 3) uint8 RGB colours aligned with *points*.

    Raises
    ------
    ValueError
        If *depth* is not 2-D, *depth* and *rgb* differ in height/width,
        *sample_step* is not a positive integer, *focal_length* is not a
        positive finite number, or *mask* does not match the depth shape.
    """
    depth = np.asarray(depth)
    rgb = np.asarray(rgb)
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2-D (H, W) array, got shape {depth.shape}")
    if depth.shape[:2] != rgb.shape[:2]:
        raise ValueError(
            f"depth shape {depth.shape} and rgb shape {rgb.shape} must have same H×W"
        )
    if not isinstance(sample_step, (int, np.integer)) or sample_step < 1:
        raise ValueError(f"sample_step must be a positive integer, got {sample_step!r}")

    H, W = depth.shape
    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx <= 0.0:
        raise ValueError(f"focal_length must be a positive finite number, got {focal_length!r}")

    # Pixel coordinates of the sub-sampled grid plus matching depth/colour.
    v_idx = np.arange(0, H, sample_step)
    u_idx = np.arange(0, W, sample_step)
    Z = depth[::sample_step, ::sample_step]
    colors_sampled = rgb[::sample_step, ::sample_step]

    # Keep only usable depths: strictly positive and finite.
    valid = np.isfinite(Z) & (Z > 0.0)
    if mask is not None:
        mask = np.asarray(mask)
        # Without this check an oversized mask would be silently sampled at
        # the wrong pixels and an undersized one would raise IndexError.
        if mask.shape != (H, W):
            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
        # The bool cast lets integer masks through (an in-place ``&=`` with
        # a uint8 array would fail to cast back to bool).
        valid &= mask[::sample_step, ::sample_step].astype(bool)

    # Row-major positions of the surviving samples within the sampled grid.
    v_sel, u_sel = np.nonzero(valid)
    u = u_idx[u_sel]
    v = v_idx[v_sel]
    Z = Z[valid]
    colors = colors_sampled[valid]

    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy

    points = np.stack([X, Y, Z], axis=-1).astype(np.float32)
    colors = np.asarray(colors, dtype=np.uint8)
    return points, colors


def _masked_gradient(points: np.ndarray, valid: np.ndarray, axis: int) -> np.ndarray:
    """Differentiate an (H, W, 3) point map along *axis*, ignoring invalid pixels.

    Each pixel receives the mean of its forward and backward differences,
    counting only differences whose two endpoints are both valid.  With both
    neighbours valid this is the central difference
    ``(P[i+1] - P[i-1]) / 2``; next to a border or a hole it degrades to the
    one-sided difference, and it is zero when no valid neighbour exists or
    the pixel itself is invalid.
    """
    pts = np.moveaxis(points, axis, 0)
    ok = np.moveaxis(valid, axis, 0)

    # step[i] = P[i+1] - P[i], usable only if both endpoints are valid.
    step_ok = ok[1:] & ok[:-1]
    step = np.where(step_ok[..., None], pts[1:] - pts[:-1], 0.0)

    total = np.zeros_like(pts)
    count = np.zeros(ok.shape, dtype=pts.dtype)
    total[:-1] += step  # forward difference seen from pixel i
    count[:-1] += step_ok
    total[1:] += step  # backward difference seen from pixel i + 1
    count[1:] += step_ok

    grad = total / np.maximum(count, 1.0)[..., None]
    return np.moveaxis(grad, 0, axis)


def normals_from_depth(
    depth: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
) -> np.ndarray:
    """
    Compute per-pixel surface normals directly from the depth map.

    This is a fast, approximate normal estimator that works well for
    visualisation or as input to downstream surface-reconstruction methods
    (e.g. Poisson, NKSR).

    Every pixel is back-projected with the pinhole model; the normal is the
    normalised cross product of the finite-difference tangents of the 3D
    point map along the image ``u`` (column) and ``v`` (row) directions.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) metric depth map.  Non-positive or non-finite depths are
        treated as invalid.
    focal_length : float
        Focal length in pixels.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates.  Defaults to the image centre
        ``(W/2, H/2)``.

    Returns
    -------
    normals : np.ndarray
        (H, W, 3) float32 array of unit normals.
        ``normals[v, u]`` is the normal at pixel ``(u, v)``.  The normals
        are **not** re-oriented towards the camera: the sign is that of
        ``cross(dP/du, dP/dv)``, i.e. ``+Z`` for a fronto-parallel surface.
        Pixels with invalid depth, or without a valid neighbour along either
        image axis, get the zero vector.

    Raises
    ------
    ValueError
        If *depth* is not 2-D or *focal_length* is not a positive finite
        number.
    """
    depth = np.asarray(depth, dtype=np.float64)
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2-D (H, W) array, got shape {depth.shape}")
    H, W = depth.shape

    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx <= 0.0:
        raise ValueError(f"focal_length must be a positive finite number, got {focal_length!r}")

    # Zero out unusable depths so NaN/inf cannot spread through the
    # arithmetic; `valid` keeps them out of the differences below.
    valid = np.isfinite(depth) & (depth > 0.0)
    Z = np.where(valid, depth, 0.0)

    # Back-project every pixel to a 3D point (pinhole model).
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    points = np.stack([(u - cx) * Z / fx, (v - cy) * Z / fy, Z], axis=-1)

    # Surface tangents along image columns (u) and rows (v).
    tangent_u = _masked_gradient(points, valid, axis=1)
    tangent_v = _masked_gradient(points, valid, axis=0)

    normals = np.cross(tangent_u, tangent_v)

    # Normalise.  Degenerate pixels keep the zero vector; dividing only
    # where the norm is non-negligible avoids divide-by-zero warnings.  The
    # work is done in float64 with a tiny threshold so that close-range /
    # long-focal-length inputs (very small tangents) are not zeroed out.
    norm = np.linalg.norm(normals, axis=-1, keepdims=True)
    unit = np.zeros_like(normals)
    np.divide(normals, norm, out=unit, where=norm > 1e-20)

    return unit.astype(np.float32)