Spaces:
Sleeping
Sleeping
| """ | |
| Depth context constructor — the core innovation of the system. | |
| Translates a depth map + detection results into a structured text preamble | |
| that gives the VLM spatial awareness: how far objects are, how large they | |
| are, and where they sit horizontally in the scene. | |
| Specification: | |
| 1. Distance — median per-object disparity mapped [0,255] → [20,200] cm | |
| 2. Size — pinhole projection: w_cm = 2 * depth * tan(FoV/2) * (w_px / W) | |
| 3. Position — horizontal thirds (left / centre / right) | |
| 4. Ordering — nearest-first sort | |
| 5. Layout — foreground / midground / background pixel percentages | |
| """ | |
| import math | |
| from typing import Sequence, Union | |
| import numpy as np | |
| from .config import DEPTH_MAX_CM, DEPTH_MIN_CM, HFOV_DEG | |
# ---------------------------------------------------------------------------
# Module-level constants derived from config
# ---------------------------------------------------------------------------
# Horizontal scale factor for pinhole projection: 2 * tan(HFOV / 2), where
# HFOV_DEG is the horizontal field of view in degrees (converted to radians
# here). Multiplying by a distance gives the full scene width visible at that
# distance.
# Full formula: w_cm = _H_SCALE * depth_cm * (w_px / W)
_H_SCALE: float = 2.0 * math.tan(math.radians(HFOV_DEG / 2.0))
# Disparity thresholds that partition [0, 255] into depth zones.
# Depth Anything V2 outputs disparity (higher value = closer object).
#   disparity >= 170   -> foreground (<= ~73 cm)
#   85 <= disp < 170   -> midground  (~73-140 cm)
#   disparity < 85     -> background (> ~140 cm)
# NOTE(review): with the 20-200 cm range quoted in this module's docstrings,
# the linear disparity-to-cm mapping puts disparity 170 at ~80 cm rather than
# ~73 cm (85 does map to ~140 cm). Confirm the figures against
# DEPTH_MIN_CM / DEPTH_MAX_CM in config.
_FG_DISP: int = 170
_MG_DISP: int = 85
| # --------------------------------------------------------------------------- | |
| # Private helpers | |
| # --------------------------------------------------------------------------- | |
def _disparity_to_cm(disparity: float) -> float:
    """Convert a normalised disparity reading into an approximate distance.

    The mapping is linear and inverted, because the depth model reports
    disparity (larger value = nearer): a disparity of 255 yields
    DEPTH_MIN_CM and a disparity of 0 yields DEPTH_MAX_CM. Values outside
    [0, 255] are not clamped; they extrapolate along the same line.

    Args:
        disparity: Disparity value, expected in [0, 255].

    Returns:
        Estimated distance in centimetres.
    """
    depth_span_cm = DEPTH_MAX_CM - DEPTH_MIN_CM
    nearness = disparity / 255.0  # 0.0 = farthest, 1.0 = nearest
    return DEPTH_MAX_CM - nearness * depth_span_cm
def _horizontal_position(cx: float, img_w: int) -> str:
    """Name the vertical band of the image that a pixel column falls in.

    The image width is split into three equal bands. Each band includes its
    left edge and excludes its right edge, so a coordinate exactly on a
    boundary belongs to the band on its right.

    Args:
        cx: Horizontal centre of the bounding box in pixels.
        img_w: Image width in pixels.

    Returns:
        'left', 'centre', or 'right'.
    """
    band_width = img_w / 3.0
    # Exclusive upper bound of each band, checked from left to right.
    band_limits = (("left", band_width), ("centre", 2.0 * band_width))
    for name, upper_bound in band_limits:
        if cx < upper_bound:
            return name
    return "right"
def _pinhole_size(
    depth_cm: float, w_px: float, h_px: float, img_w: int
) -> tuple[float, float]:
    """Convert a bounding box's pixel extent into approximate centimetres.

    Pinhole model: w_cm = 2 * depth * tan(FoV/2) * (w_px / W). A single
    centimetres-per-pixel factor is derived from the horizontal field of
    view and image width, then applied to both axes, so no separate
    vertical FoV constant is needed.

    Args:
        depth_cm: Object distance in centimetres.
        w_px: Bounding-box width in pixels.
        h_px: Bounding-box height in pixels.
        img_w: Image width in pixels.

    Returns:
        (width_cm, height_cm) tuple.
    """
    # Physical size covered by one pixel at the object's distance.
    cm_per_px = _H_SCALE * depth_cm / img_w
    width_cm = cm_per_px * w_px
    height_cm = cm_per_px * h_px
    return width_cm, height_cm
def _room_depth_estimate(depth_map: np.ndarray) -> float:
    """Approximate how far the scene extends, based on its farthest pixels.

    The median disparity of all pixels below the midground threshold is
    converted to centimetres. If no pixel is that far away (an entirely
    close-up scene), the median over the whole map is used instead. The
    disparity values are relative rather than metric, so the result is a
    scene-consistent estimate, not an absolute measurement.

    Args:
        depth_map: uint8 array (H, W); higher value = closer.

    Returns:
        Estimated background distance in centimetres.
    """
    is_background = depth_map < _MG_DISP
    if is_background.any():
        sample = depth_map[is_background]
    else:
        # No background pixels at all: fall back to every pixel.
        sample = depth_map.flatten()
    median_disparity = float(np.median(sample))
    return _disparity_to_cm(median_disparity)
def _scene_layout(depth_map: np.ndarray) -> tuple[float, float, float]:
    """Report what share of the frame lies in each depth zone.

    Pixels are assigned to zones by the two disparity thresholds: at or
    above the foreground threshold, between the two thresholds, or below
    the midground threshold.

    Args:
        depth_map: uint8 array of shape (H, W); higher value = closer.

    Returns:
        (fg_pct, mg_pct, bg_pct) floats that sum to 100.
    """
    pixel_count = float(depth_map.size)
    near_mask = depth_map >= _FG_DISP
    mid_mask = (depth_map >= _MG_DISP) & (depth_map < _FG_DISP)
    far_mask = depth_map < _MG_DISP
    fg_pct, mg_pct, bg_pct = (
        float(np.sum(zone) / pixel_count * 100.0)
        for zone in (near_mask, mid_mask, far_mask)
    )
    return fg_pct, mg_pct, bg_pct
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def build_depth_context(
    frame_rgb: np.ndarray,
    depth_np: np.ndarray,
    boxes: Union[np.ndarray, Sequence],
    classes: Sequence[str],
    confidences: Sequence[float],
) -> str:
    """Build the depth-context preamble prepended to every VLM query.

    Each detection is measured individually: the median disparity inside its
    (clipped) bounding box is converted to a distance, the box extent is
    converted to a physical size via pinhole projection, and the box centre
    is classified into a horizontal third. Detections are listed
    nearest-first, followed by a scene-level zone summary.

    Args:
        frame_rgb: RGB frame of shape (H, W, 3); only its height and width
            are used.
        depth_np: Depth/disparity map, expected to be uint8 of shape (H, W).
            Higher value means closer to the camera.
        boxes: Bounding boxes as (N, 4) array or list of [x1, y1, x2, y2]
            in pixel coordinates.
        classes: Class label for each detected object, length N.
        confidences: Detection confidence for each object, length N. The
            value is stored with each record but is not currently rendered
            in the returned text.

    Returns:
        Multi-line text preamble. The object section is omitted when there
        are no detections. Example (leading whitespace may differ)::

            Depth sensor data for this scene:
             - Space extends approximately 1.6 m in depth
             - Objects detected (nearest first):
             cup at ~35 cm (centre), ~8x10 cm
             laptop at ~65 cm (right), ~35x25 cm
             - Scene proportions: 32% close (<80 cm), 45% mid-range, 23% far/background

    Raises:
        IndexError: If ``classes`` or ``confidences`` has fewer entries than
            ``boxes``.
    """
    H, W = frame_rgb.shape[:2]
    boxes_arr = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)

    # -- Per-object measurements -------------------------------------------
    records: list[dict] = []
    for i, (x1, y1, x2, y2) in enumerate(boxes_arr):
        # Clip the box to valid image coordinates before indexing the map.
        px1 = int(max(0.0, x1))
        py1 = int(max(0.0, y1))
        px2 = int(min(float(W - 1), x2))
        py2 = int(min(float(H - 1), y2))

        # Mid-range fallback for boxes that do not overlap the frame.
        median_disp = 127.0
        # A box entirely left of / above the frame leaves a negative end
        # index. It must not reach the slice: NumPy would read it as
        # "count from the end" and sample pixels unrelated to the box.
        if px2 >= px1 and py2 >= py1:
            roi = depth_np[py1 : py2 + 1, px1 : px2 + 1]
            if roi.size > 0:
                # Median is more robust than mean against depth-map
                # boundary artefacts.
                median_disp = float(np.median(roi))

        depth_cm = _disparity_to_cm(median_disp)
        # Physical size uses the unclipped box extent.
        w_cm, h_cm = _pinhole_size(depth_cm, float(x2 - x1), float(y2 - y1), W)
        records.append(
            {
                "label": classes[i],
                "confidence": float(confidences[i]),
                "depth_cm": depth_cm,
                "w_cm": w_cm,
                "h_cm": h_cm,
                "position": _horizontal_position((x1 + x2) / 2.0, W),
            }
        )

    # -- Nearest-first sort ------------------------------------------------
    records.sort(key=lambda r: r["depth_cm"])

    # -- Scene-level measurements ------------------------------------------
    fg_pct, mg_pct, bg_pct = _scene_layout(depth_np)
    room_depth_m = _room_depth_estimate(depth_np) / 100.0
    # Derive the "close" boundary from the same mapping used for objects so
    # the label always agrees with the configured depth range.
    close_limit_cm = _disparity_to_cm(float(_FG_DISP))

    # -- Assemble preamble text --------------------------------------------
    lines: list[str] = [
        "Depth sensor data for this scene:",
        f" - Space extends approximately {room_depth_m:.1f} m in depth",
    ]
    if records:
        lines.append(" - Objects detected (nearest first):")
        lines.extend(
            f" {rec['label']} at ~{rec['depth_cm']:.0f} cm "
            f"({rec['position']}), ~{rec['w_cm']:.0f}x{rec['h_cm']:.0f} cm"
            for rec in records
        )
    lines.append(
        f" - Scene proportions: "
        f"{fg_pct:.0f}% close (<{close_limit_cm:.0f} cm), "
        f"{mg_pct:.0f}% mid-range, "
        f"{bg_pct:.0f}% far/background"
    )
    return "\n".join(lines)